diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,112149 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.930638359749891, + "eval_steps": 1000, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.3255813953488372e-09, + "logits/chosen": -2.471824884414673, + "logits/rejected": -2.079159736633301, + "logps/chosen": -174.749267578125, + "logps/rejected": -308.16778564453125, + "loss": 0.5964, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27328822016716003, + "rewards/margins": 0.4016978144645691, + "rewards/rejected": -0.6749860048294067, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.6511627906976744e-09, + "logits/chosen": -2.6063313484191895, + "logits/rejected": -2.2856028079986572, + "logps/chosen": -318.5167236328125, + "logps/rejected": -396.08905029296875, + "loss": 0.6508, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13937486708164215, + "rewards/margins": 0.16503377258777618, + "rewards/rejected": -0.30440863966941833, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 6.976744186046511e-09, + "logits/chosen": -2.4662411212921143, + "logits/rejected": -2.6295621395111084, + "logps/chosen": -146.83151245117188, + "logps/rejected": -161.9111328125, + "loss": 0.7889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.643913209438324, + "rewards/margins": -0.0388026162981987, + "rewards/rejected": -0.6051105856895447, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 9.302325581395349e-09, + "logits/chosen": -2.2128384113311768, + "logits/rejected": -2.203862428665161, + "logps/chosen": -225.96243286132812, + "logps/rejected": -244.6072540283203, + "loss": 0.9154, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.37408754229545593, + "rewards/margins": -0.31671643257141113, + "rewards/rejected": -0.057371072471141815, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 1.1627906976744186e-08, + "logits/chosen": -2.065877676010132, + "logits/rejected": -2.1803784370422363, + "logps/chosen": -213.409423828125, + "logps/rejected": -203.0699920654297, + "loss": 0.5983, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3307132124900818, + "rewards/margins": 0.4796845018863678, + "rewards/rejected": -0.810397744178772, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1.3953488372093022e-08, + "logits/chosen": -2.319749355316162, + "logits/rejected": -2.6371703147888184, + "logps/chosen": -439.01751708984375, + "logps/rejected": -241.67933654785156, + "loss": 1.5479, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5972545146942139, + "rewards/margins": -1.078753113746643, + "rewards/rejected": -0.5185015201568604, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 1.627906976744186e-08, + "logits/chosen": -2.9757614135742188, + "logits/rejected": -2.895737886428833, + "logps/chosen": -194.6126708984375, + "logps/rejected": -215.57421875, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13256311416625977, + "rewards/margins": 0.2143106460571289, + "rewards/rejected": -0.34687376022338867, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 1.8604651162790698e-08, + "logits/chosen": -2.0621490478515625, + "logits/rejected": -2.3770477771759033, + "logps/chosen": -661.87451171875, + "logps/rejected": -352.9231872558594, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18366073071956635, + "rewards/margins": 0.3257562518119812, + "rewards/rejected": -0.5094169974327087, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 2.0930232558139533e-08, + "logits/chosen": -2.7137913703918457, + "logits/rejected": -2.715798854827881, + "logps/chosen": -211.98388671875, + "logps/rejected": -210.24530029296875, + "loss": 0.6558, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8401208519935608, + "rewards/margins": 0.10100153833627701, + "rewards/rejected": -0.9411224722862244, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 2.3255813953488372e-08, + "logits/chosen": -2.268031120300293, + "logits/rejected": -2.333078384399414, + "logps/chosen": -304.6269226074219, + "logps/rejected": -218.21595764160156, + "loss": 1.0356, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9625828266143799, + "rewards/margins": -0.43421831727027893, + "rewards/rejected": -0.5283644795417786, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2.5581395348837208e-08, + "logits/chosen": -2.46273136138916, + "logits/rejected": -2.5069494247436523, + "logps/chosen": -280.9668273925781, + "logps/rejected": -242.62289428710938, + "loss": 0.8667, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5893173813819885, + "rewards/margins": -0.16260433197021484, + "rewards/rejected": -0.42671307921409607, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 2.7906976744186043e-08, + "logits/chosen": -2.474379301071167, + "logits/rejected": -2.4130866527557373, + "logps/chosen": -340.4402160644531, + "logps/rejected": -490.8125, + "loss": 0.7761, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4141865670681, + "rewards/margins": -0.07961148768663406, + "rewards/rejected": -0.3345750868320465, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 3.023255813953488e-08, + "logits/chosen": -2.4873900413513184, + "logits/rejected": -2.613281726837158, + "logps/chosen": -284.20635986328125, + "logps/rejected": -145.10678100585938, + "loss": 0.9958, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3096553683280945, + "rewards/margins": -0.4070754647254944, + "rewards/rejected": 0.09742006659507751, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 3.255813953488372e-08, + "logits/chosen": -2.4154539108276367, + "logits/rejected": -2.6339876651763916, + "logps/chosen": -536.8396606445312, + "logps/rejected": -440.3067626953125, + "loss": 0.8262, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.42787837982177734, + "rewards/margins": -0.2260442078113556, + "rewards/rejected": -0.20183415710926056, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 3.4883720930232553e-08, + "logits/chosen": -2.0477490425109863, + "logits/rejected": -2.0686001777648926, + "logps/chosen": -247.60975646972656, + "logps/rejected": -182.30712890625, + "loss": 0.824, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4310076832771301, + "rewards/margins": -0.18956337869167328, + "rewards/rejected": -0.24144430458545685, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 3.7209302325581396e-08, + "logits/chosen": -2.0259194374084473, + "logits/rejected": -2.2158186435699463, + "logps/chosen": -146.38795471191406, + "logps/rejected": -211.36460876464844, + "loss": 1.0976, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0345203876495361, + "rewards/margins": 0.4986010193824768, + "rewards/rejected": -1.5331213474273682, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 3.953488372093023e-08, + "logits/chosen": -3.0699312686920166, + "logits/rejected": -2.992260456085205, + "logps/chosen": -230.7408447265625, + "logps/rejected": -158.0206756591797, + "loss": 0.8654, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41682323813438416, + "rewards/margins": -0.2596265971660614, + "rewards/rejected": -0.15719667077064514, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 4.1860465116279067e-08, + "logits/chosen": -2.431905508041382, + "logits/rejected": -2.119258403778076, + "logps/chosen": -179.03073120117188, + "logps/rejected": -186.11746215820312, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.624506950378418, + "rewards/margins": 0.5342190861701965, + "rewards/rejected": -1.1587260961532593, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 4.418604651162791e-08, + "logits/chosen": -2.3878049850463867, + "logits/rejected": -2.3388524055480957, + "logps/chosen": -184.7909698486328, + "logps/rejected": -249.22142028808594, + "loss": 0.7476, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.28819751739501953, + "rewards/margins": -0.08883972465991974, + "rewards/rejected": -0.1993577927350998, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 4.6511627906976744e-08, + "logits/chosen": -2.1187872886657715, + "logits/rejected": -2.462371349334717, + "logps/chosen": -407.9158935546875, + "logps/rejected": -218.08628845214844, + "loss": 1.049, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.8088630437850952, + "rewards/margins": -0.5424136519432068, + "rewards/rejected": -0.2664493918418884, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 4.883720930232558e-08, + "logits/chosen": -2.1358978748321533, + "logits/rejected": -1.8003429174423218, + "logps/chosen": -178.56448364257812, + "logps/rejected": -273.8068542480469, + "loss": 0.6224, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10442107915878296, + "rewards/margins": 0.20315207540988922, + "rewards/rejected": -0.307573139667511, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 5.1162790697674416e-08, + "logits/chosen": -2.0717709064483643, + "logits/rejected": -2.526026487350464, + "logps/chosen": -480.04022216796875, + "logps/rejected": -304.8790283203125, + "loss": 0.6126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.426477313041687, + "rewards/margins": 0.19031721353530884, + "rewards/rejected": -0.6167945265769958, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 5.348837209302326e-08, + "logits/chosen": -2.432738780975342, + "logits/rejected": -2.3010363578796387, + "logps/chosen": -228.48782348632812, + "logps/rejected": -256.86456298828125, + "loss": 0.5999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19084897637367249, + "rewards/margins": 0.27129706740379333, + "rewards/rejected": -0.4621460437774658, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 5.5813953488372087e-08, + "logits/chosen": -2.284633159637451, + "logits/rejected": -2.101175308227539, + "logps/chosen": -177.31121826171875, + "logps/rejected": -201.95387268066406, + "loss": 1.2117, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6518417596817017, + "rewards/margins": -0.3435242772102356, + "rewards/rejected": -1.3083174228668213, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 5.813953488372093e-08, + "logits/chosen": -2.59863543510437, + "logits/rejected": -2.7144598960876465, + "logps/chosen": -255.5670928955078, + "logps/rejected": -311.2051696777344, + "loss": 0.6775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33376044034957886, + "rewards/margins": 0.19081330299377441, + "rewards/rejected": -0.5245736837387085, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 6.046511627906976e-08, + "logits/chosen": -2.3561620712280273, + "logits/rejected": -2.2605764865875244, + "logps/chosen": -172.1944580078125, + "logps/rejected": -180.14633178710938, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.059106722474098206, + "rewards/margins": 0.6985692977905273, + "rewards/rejected": -0.7576760053634644, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 6.27906976744186e-08, + "logits/chosen": -1.923173189163208, + "logits/rejected": -1.8035862445831299, + "logps/chosen": -324.13397216796875, + "logps/rejected": -296.1949157714844, + "loss": 0.9933, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4739299714565277, + "rewards/margins": -0.30800122022628784, + "rewards/rejected": -0.16592879593372345, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 6.511627906976744e-08, + "logits/chosen": -1.7430894374847412, + "logits/rejected": -1.587222695350647, + "logps/chosen": -125.14323425292969, + "logps/rejected": -168.17587280273438, + "loss": 0.8748, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.505617618560791, + "rewards/margins": 0.04966023564338684, + "rewards/rejected": -0.5552778840065002, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 6.744186046511628e-08, + "logits/chosen": -2.3875133991241455, + "logits/rejected": -2.4412190914154053, + "logps/chosen": -180.6735076904297, + "logps/rejected": -149.4240264892578, + "loss": 0.6651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7512054443359375, + "rewards/margins": 0.7399396896362305, + "rewards/rejected": -1.4911452531814575, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 6.976744186046511e-08, + "logits/chosen": -2.1003315448760986, + "logits/rejected": -2.2294981479644775, + "logps/chosen": -319.9154968261719, + "logps/rejected": -232.8546600341797, + "loss": 0.753, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13115616142749786, + "rewards/margins": -0.06040288507938385, + "rewards/rejected": -0.07075329124927521, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 7.209302325581396e-08, + "logits/chosen": -2.8773646354675293, + "logits/rejected": -2.7694449424743652, + "logps/chosen": -186.46194458007812, + "logps/rejected": -255.46096801757812, + "loss": 0.8024, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3301490843296051, + "rewards/margins": -0.10115490108728409, + "rewards/rejected": -0.22899417579174042, + "step": 31 + }, + { + "epoch": 0.0, + "learning_rate": 7.441860465116279e-08, + "logits/chosen": -2.1780011653900146, + "logits/rejected": -2.2342939376831055, + "logps/chosen": -343.21746826171875, + "logps/rejected": -264.72637939453125, + "loss": 0.6627, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5595120191574097, + "rewards/margins": 0.10767830908298492, + "rewards/rejected": -0.6671902537345886, + "step": 32 + }, + { + "epoch": 0.0, + "learning_rate": 7.674418604651163e-08, + "logits/chosen": -2.9698286056518555, + "logits/rejected": -3.0241994857788086, + "logps/chosen": -183.80117797851562, + "logps/rejected": -177.83920288085938, + "loss": 0.7493, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14829738438129425, + "rewards/margins": 0.0063615962862968445, + "rewards/rejected": -0.1546589881181717, + "step": 33 + }, + { + "epoch": 0.0, + "learning_rate": 7.906976744186046e-08, + "logits/chosen": -2.6177778244018555, + "logits/rejected": -2.711055040359497, + "logps/chosen": -221.38626098632812, + "logps/rejected": -210.04791259765625, + "loss": 0.8767, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34128063917160034, + "rewards/margins": -0.2563408613204956, + "rewards/rejected": -0.08493972569704056, + "step": 34 + }, + { + "epoch": 0.0, + "learning_rate": 8.13953488372093e-08, + "logits/chosen": -2.139709234237671, + "logits/rejected": -2.1862668991088867, + "logps/chosen": -404.7296447753906, + "logps/rejected": -282.5640563964844, + "loss": 0.6201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18404540419578552, + "rewards/margins": 0.23818421363830566, + "rewards/rejected": -0.4222296476364136, + "step": 35 + }, + { + "epoch": 0.0, + "learning_rate": 8.372093023255813e-08, + "logits/chosen": -1.888787865638733, + "logits/rejected": -1.9491325616836548, + "logps/chosen": -590.8215942382812, + "logps/rejected": -364.3123779296875, + "loss": 0.9904, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7168409824371338, + "rewards/margins": -0.27501195669174194, + "rewards/rejected": -0.44182896614074707, + "step": 36 + }, + { + "epoch": 0.0, + "learning_rate": 8.604651162790698e-08, + "logits/chosen": -2.8992369174957275, + "logits/rejected": -2.9710233211517334, + "logps/chosen": -299.6357421875, + "logps/rejected": -316.66363525390625, + "loss": 0.774, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27237987518310547, + "rewards/margins": -0.09465236961841583, + "rewards/rejected": -0.17772750556468964, + "step": 37 + }, + { + "epoch": 0.0, + "learning_rate": 8.837209302325582e-08, + "logits/chosen": -2.4523823261260986, + "logits/rejected": -2.5172343254089355, + "logps/chosen": -289.5021057128906, + "logps/rejected": -251.57266235351562, + "loss": 0.6092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12071393430233002, + "rewards/margins": 0.19816556572914124, + "rewards/rejected": -0.31887951493263245, + "step": 38 + }, + { + "epoch": 0.0, + "learning_rate": 9.069767441860464e-08, + "logits/chosen": -2.383002758026123, + "logits/rejected": -2.4668784141540527, + "logps/chosen": -417.6801452636719, + "logps/rejected": -317.9768371582031, + "loss": 0.7849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5020383596420288, + "rewards/margins": -0.1064617931842804, + "rewards/rejected": -0.395576536655426, + "step": 39 + }, + { + "epoch": 0.0, + "learning_rate": 9.302325581395349e-08, + "logits/chosen": -2.4345924854278564, + "logits/rejected": -2.3385424613952637, + "logps/chosen": -310.6422424316406, + "logps/rejected": -241.9683837890625, + "loss": 0.8526, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8365793228149414, + "rewards/margins": 0.3219362497329712, + "rewards/rejected": -2.158515691757202, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 9.534883720930232e-08, + "logits/chosen": -2.4973034858703613, + "logits/rejected": -2.285468578338623, + "logps/chosen": -209.2900390625, + "logps/rejected": -206.86367797851562, + "loss": 0.8699, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4569118618965149, + "rewards/margins": -0.2597395181655884, + "rewards/rejected": -0.1971723437309265, + "step": 41 + }, + { + "epoch": 0.0, + "learning_rate": 9.767441860465116e-08, + "logits/chosen": -2.52140736579895, + "logits/rejected": -2.4069154262542725, + "logps/chosen": -255.2979736328125, + "logps/rejected": -202.064453125, + "loss": 0.5774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.062088657170534134, + "rewards/margins": 0.2910660207271576, + "rewards/rejected": -0.35315465927124023, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 1e-07, + "logits/chosen": -2.484564781188965, + "logits/rejected": -2.4409337043762207, + "logps/chosen": -179.82351684570312, + "logps/rejected": -179.67767333984375, + "loss": 1.1582, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5952649712562561, + "rewards/margins": -0.596484363079071, + "rewards/rejected": 0.0012193676084280014, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 1.0232558139534883e-07, + "logits/chosen": -2.62892484664917, + "logits/rejected": -2.2565321922302246, + "logps/chosen": -177.9101104736328, + "logps/rejected": -251.5445098876953, + "loss": 0.6459, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03073548898100853, + "rewards/margins": 0.1703680008649826, + "rewards/rejected": -0.20110349357128143, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 1.0465116279069767e-07, + "logits/chosen": -2.327939987182617, + "logits/rejected": -2.0135440826416016, + "logps/chosen": -246.9445343017578, + "logps/rejected": -416.3923645019531, + "loss": 0.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4190826117992401, + "rewards/margins": 0.06963533163070679, + "rewards/rejected": -0.4887179136276245, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 1.0697674418604652e-07, + "logits/chosen": -2.1443333625793457, + "logits/rejected": -2.0008716583251953, + "logps/chosen": -250.57481384277344, + "logps/rejected": -226.42410278320312, + "loss": 0.7956, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.34612005949020386, + "rewards/margins": -0.13149793446063995, + "rewards/rejected": -0.2146221101284027, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 1.0930232558139534e-07, + "logits/chosen": -2.745948076248169, + "logits/rejected": -2.4107697010040283, + "logps/chosen": -281.4881591796875, + "logps/rejected": -269.353515625, + "loss": 0.8439, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6514642238616943, + "rewards/margins": -0.22894853353500366, + "rewards/rejected": -0.4225156903266907, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 1.1162790697674417e-07, + "logits/chosen": -2.198420286178589, + "logits/rejected": -2.2828078269958496, + "logps/chosen": -167.5601806640625, + "logps/rejected": -156.71365356445312, + "loss": 0.8383, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.158750057220459, + "rewards/margins": 0.2781144976615906, + "rewards/rejected": -2.4368646144866943, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 1.1395348837209302e-07, + "logits/chosen": -2.0897417068481445, + "logits/rejected": -2.4239277839660645, + "logps/chosen": -379.0882873535156, + "logps/rejected": -334.62799072265625, + "loss": 0.9831, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6604304313659668, + "rewards/margins": -0.3971719443798065, + "rewards/rejected": -0.2632584869861603, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 1.1627906976744186e-07, + "logits/chosen": -2.442202091217041, + "logits/rejected": -2.3841967582702637, + "logps/chosen": -304.512451171875, + "logps/rejected": -279.9529724121094, + "loss": 0.7445, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5183042287826538, + "rewards/margins": -0.04576530307531357, + "rewards/rejected": -0.47253894805908203, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.1860465116279068e-07, + "logits/chosen": -2.252142906188965, + "logits/rejected": -1.9597101211547852, + "logps/chosen": -181.8081512451172, + "logps/rejected": -330.64056396484375, + "loss": 0.5954, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09730133414268494, + "rewards/margins": 0.514672577381134, + "rewards/rejected": -0.6119738817214966, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 1.2093023255813953e-07, + "logits/chosen": -2.501095771789551, + "logits/rejected": -2.6473937034606934, + "logps/chosen": -346.1275939941406, + "logps/rejected": -219.6533203125, + "loss": 0.9755, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5505790710449219, + "rewards/margins": -0.3939691185951233, + "rewards/rejected": -0.1566099226474762, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 1.2325581395348838e-07, + "logits/chosen": -2.6609301567077637, + "logits/rejected": -2.6081488132476807, + "logps/chosen": -306.408203125, + "logps/rejected": -260.86627197265625, + "loss": 0.6929, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.18002676963806152, + "rewards/margins": 0.034969575703144073, + "rewards/rejected": -0.214996337890625, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 1.255813953488372e-07, + "logits/chosen": -2.687702178955078, + "logits/rejected": -2.5199923515319824, + "logps/chosen": -261.8502197265625, + "logps/rejected": -238.46640014648438, + "loss": 0.5445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5038968324661255, + "rewards/margins": 0.7838702201843262, + "rewards/rejected": -1.2877670526504517, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 1.2790697674418602e-07, + "logits/chosen": -2.3474626541137695, + "logits/rejected": -2.340203285217285, + "logps/chosen": -374.0192565917969, + "logps/rejected": -368.4148864746094, + "loss": 0.786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4484564960002899, + "rewards/margins": -0.07293173670768738, + "rewards/rejected": -0.37552469968795776, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 1.3023255813953487e-07, + "logits/chosen": -2.3067140579223633, + "logits/rejected": -2.245939016342163, + "logps/chosen": -278.44842529296875, + "logps/rejected": -341.2032470703125, + "loss": 0.7258, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.42136216163635254, + "rewards/margins": -0.04227380454540253, + "rewards/rejected": -0.3790883421897888, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 1.3255813953488372e-07, + "logits/chosen": -2.502161741256714, + "logits/rejected": -2.46122145652771, + "logps/chosen": -327.3040771484375, + "logps/rejected": -251.4073028564453, + "loss": 0.6466, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.00623655691742897, + "rewards/margins": 0.15304572880268097, + "rewards/rejected": -0.15928226709365845, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 1.3488372093023257e-07, + "logits/chosen": -1.8423786163330078, + "logits/rejected": -2.146819591522217, + "logps/chosen": -539.5610961914062, + "logps/rejected": -328.5838317871094, + "loss": 0.7059, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28162315487861633, + "rewards/margins": 0.10821359604597092, + "rewards/rejected": -0.38983675837516785, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 1.372093023255814e-07, + "logits/chosen": -2.3620684146881104, + "logits/rejected": -2.5322587490081787, + "logps/chosen": -280.43353271484375, + "logps/rejected": -188.90223693847656, + "loss": 0.6032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02366514876484871, + "rewards/margins": 0.23062409460544586, + "rewards/rejected": -0.2542892396450043, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 1.3953488372093021e-07, + "logits/chosen": -2.473050355911255, + "logits/rejected": -2.5788612365722656, + "logps/chosen": -179.7606658935547, + "logps/rejected": -175.0801239013672, + "loss": 0.6764, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15987436473369598, + "rewards/margins": 0.27256450057029724, + "rewards/rejected": -0.4324389100074768, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 1.4186046511627906e-07, + "logits/chosen": -2.0576164722442627, + "logits/rejected": -1.8822991847991943, + "logps/chosen": -332.81524658203125, + "logps/rejected": -298.131103515625, + "loss": 0.853, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9428043365478516, + "rewards/margins": -0.11977139860391617, + "rewards/rejected": -0.8230329155921936, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 1.441860465116279e-07, + "logits/chosen": -2.597367286682129, + "logits/rejected": -2.418121337890625, + "logps/chosen": -481.1905822753906, + "logps/rejected": -314.07086181640625, + "loss": 0.8734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5394330024719238, + "rewards/margins": -0.0797206461429596, + "rewards/rejected": -0.45971229672431946, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 1.4651162790697673e-07, + "logits/chosen": -2.741827964782715, + "logits/rejected": -2.795870542526245, + "logps/chosen": -317.8959045410156, + "logps/rejected": -182.69577026367188, + "loss": 0.7737, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18465913832187653, + "rewards/margins": -0.006775900721549988, + "rewards/rejected": -0.17788323760032654, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 1.4883720930232558e-07, + "logits/chosen": -1.9943172931671143, + "logits/rejected": -2.168597459793091, + "logps/chosen": -483.20599365234375, + "logps/rejected": -412.0091552734375, + "loss": 0.6568, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7220395803451538, + "rewards/margins": 0.2720763385295868, + "rewards/rejected": -0.9941158890724182, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 1.511627906976744e-07, + "logits/chosen": -2.6222081184387207, + "logits/rejected": -2.667799949645996, + "logps/chosen": -72.76377868652344, + "logps/rejected": -141.27818298339844, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15879850089550018, + "rewards/margins": 0.5578910708427429, + "rewards/rejected": -0.7166895866394043, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 1.5348837209302325e-07, + "logits/chosen": -2.5911662578582764, + "logits/rejected": -2.658141613006592, + "logps/chosen": -178.01080322265625, + "logps/rejected": -188.33279418945312, + "loss": 0.5075, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16364695131778717, + "rewards/margins": 0.6743106842041016, + "rewards/rejected": -0.8379575610160828, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 1.558139534883721e-07, + "logits/chosen": -1.8621940612792969, + "logits/rejected": -2.5851268768310547, + "logps/chosen": -301.75958251953125, + "logps/rejected": -233.52359008789062, + "loss": 0.7691, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8451168537139893, + "rewards/margins": 0.09759727120399475, + "rewards/rejected": -0.9427141547203064, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 1.5813953488372092e-07, + "logits/chosen": -2.4558424949645996, + "logits/rejected": -2.2277941703796387, + "logps/chosen": -216.7931365966797, + "logps/rejected": -234.21463012695312, + "loss": 0.8939, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8377339243888855, + "rewards/margins": -0.1661604642868042, + "rewards/rejected": -0.6715735197067261, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 1.6046511627906975e-07, + "logits/chosen": -2.363039255142212, + "logits/rejected": -2.160311460494995, + "logps/chosen": -356.49432373046875, + "logps/rejected": -399.37176513671875, + "loss": 0.6304, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2711089849472046, + "rewards/margins": 0.2946273684501648, + "rewards/rejected": -0.5657364130020142, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 1.627906976744186e-07, + "logits/chosen": -3.031538248062134, + "logits/rejected": -3.012721538543701, + "logps/chosen": -205.16964721679688, + "logps/rejected": -218.19842529296875, + "loss": 0.6057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09832841157913208, + "rewards/margins": 0.3259468674659729, + "rewards/rejected": -0.424275279045105, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 1.6511627906976742e-07, + "logits/chosen": -2.079557180404663, + "logits/rejected": -2.255338191986084, + "logps/chosen": -310.7487487792969, + "logps/rejected": -236.25978088378906, + "loss": 0.7614, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5114730596542358, + "rewards/margins": -0.07345346361398697, + "rewards/rejected": -0.43801963329315186, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 1.6744186046511627e-07, + "logits/chosen": -2.3511509895324707, + "logits/rejected": -2.0647401809692383, + "logps/chosen": -335.1462097167969, + "logps/rejected": -420.6827087402344, + "loss": 0.6492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3721458613872528, + "rewards/margins": 0.1769292652606964, + "rewards/rejected": -0.5490751266479492, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 1.6976744186046512e-07, + "logits/chosen": -2.3079817295074463, + "logits/rejected": -2.32616925239563, + "logps/chosen": -188.2433319091797, + "logps/rejected": -237.46751403808594, + "loss": 0.6356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11632364243268967, + "rewards/margins": 0.1776544749736786, + "rewards/rejected": -0.29397812485694885, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 1.7209302325581396e-07, + "logits/chosen": -2.3757071495056152, + "logits/rejected": -2.3956480026245117, + "logps/chosen": -221.7606658935547, + "logps/rejected": -181.85850524902344, + "loss": 0.776, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1650571972131729, + "rewards/margins": -0.011085748672485352, + "rewards/rejected": -0.15397146344184875, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 1.7441860465116279e-07, + "logits/chosen": -1.8253350257873535, + "logits/rejected": -1.9870967864990234, + "logps/chosen": -185.83123779296875, + "logps/rejected": -210.3099365234375, + "loss": 0.8035, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48037785291671753, + "rewards/margins": -0.07544104009866714, + "rewards/rejected": -0.404936820268631, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 1.7674418604651164e-07, + "logits/chosen": -2.8733153343200684, + "logits/rejected": -2.8293955326080322, + "logps/chosen": -224.70474243164062, + "logps/rejected": -145.5535430908203, + "loss": 0.9047, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5753843784332275, + "rewards/margins": -0.1298551708459854, + "rewards/rejected": -0.44552919268608093, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 1.7906976744186043e-07, + "logits/chosen": -2.5145156383514404, + "logits/rejected": -2.584648609161377, + "logps/chosen": -214.8340301513672, + "logps/rejected": -285.63616943359375, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1631683111190796, + "rewards/margins": 0.5445789098739624, + "rewards/rejected": -0.707747220993042, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 1.8139534883720928e-07, + "logits/chosen": -2.116382598876953, + "logits/rejected": -2.2611207962036133, + "logps/chosen": -139.19638061523438, + "logps/rejected": -164.49725341796875, + "loss": 2.3158, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4383955001831055, + "rewards/margins": -1.4515013694763184, + "rewards/rejected": -0.986893892288208, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 1.8372093023255813e-07, + "logits/chosen": -2.3509247303009033, + "logits/rejected": -2.111485242843628, + "logps/chosen": -170.42820739746094, + "logps/rejected": -262.1083679199219, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7016013264656067, + "rewards/margins": 0.05304943770170212, + "rewards/rejected": -0.7546507716178894, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 1.8604651162790698e-07, + "logits/chosen": -2.5512146949768066, + "logits/rejected": -2.405355930328369, + "logps/chosen": -302.0247802734375, + "logps/rejected": -332.14385986328125, + "loss": 0.6452, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21803753077983856, + "rewards/margins": 0.2606792151927948, + "rewards/rejected": -0.47871676087379456, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 1.883720930232558e-07, + "logits/chosen": -2.649334192276001, + "logits/rejected": -2.5358147621154785, + "logps/chosen": -201.7420196533203, + "logps/rejected": -186.45254516601562, + "loss": 0.6067, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23455172777175903, + "rewards/margins": 0.5783436298370361, + "rewards/rejected": -0.8128952383995056, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 1.9069767441860465e-07, + "logits/chosen": -2.448350429534912, + "logits/rejected": -2.317558765411377, + "logps/chosen": -256.4176025390625, + "logps/rejected": -312.09637451171875, + "loss": 0.5629, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2117278128862381, + "rewards/margins": 0.3453845977783203, + "rewards/rejected": -0.5571123957633972, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 1.930232558139535e-07, + "logits/chosen": -2.4099225997924805, + "logits/rejected": -2.5384576320648193, + "logps/chosen": -395.0341796875, + "logps/rejected": -222.17771911621094, + "loss": 0.9035, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3871869444847107, + "rewards/margins": -0.2399856150150299, + "rewards/rejected": -0.14720135927200317, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 1.9534883720930232e-07, + "logits/chosen": -2.3440794944763184, + "logits/rejected": -2.2873172760009766, + "logps/chosen": -220.02389526367188, + "logps/rejected": -284.811767578125, + "loss": 0.8451, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.28907763957977295, + "rewards/margins": -0.22321955859661102, + "rewards/rejected": -0.0658581331372261, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 1.9767441860465114e-07, + "logits/chosen": -2.413980007171631, + "logits/rejected": -2.5064969062805176, + "logps/chosen": -337.5306091308594, + "logps/rejected": -304.260498046875, + "loss": 0.9087, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.807115912437439, + "rewards/margins": -0.2305152714252472, + "rewards/rejected": -0.5766006112098694, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 2e-07, + "logits/chosen": -2.4838058948516846, + "logits/rejected": -2.0719006061553955, + "logps/chosen": -131.3878173828125, + "logps/rejected": -252.70587158203125, + "loss": 0.623, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02195039391517639, + "rewards/margins": 0.24605891108512878, + "rewards/rejected": -0.22410848736763, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 2.0232558139534881e-07, + "logits/chosen": -2.2932047843933105, + "logits/rejected": -2.3045012950897217, + "logps/chosen": -145.06350708007812, + "logps/rejected": -107.056884765625, + "loss": 0.8407, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2594671845436096, + "rewards/margins": -0.22573591768741608, + "rewards/rejected": -0.03373127430677414, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 2.0465116279069766e-07, + "logits/chosen": -2.6768062114715576, + "logits/rejected": -2.4649171829223633, + "logps/chosen": -354.6230773925781, + "logps/rejected": -442.5867004394531, + "loss": 0.596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3420978784561157, + "rewards/margins": 0.3304292857646942, + "rewards/rejected": -0.6725271344184875, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 2.069767441860465e-07, + "logits/chosen": -2.3527350425720215, + "logits/rejected": -2.3163013458251953, + "logps/chosen": -232.95733642578125, + "logps/rejected": -156.99827575683594, + "loss": 0.8092, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36036771535873413, + "rewards/margins": 0.1983512043952942, + "rewards/rejected": -0.5587188601493835, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 2.0930232558139533e-07, + "logits/chosen": -2.6749043464660645, + "logits/rejected": -2.6445324420928955, + "logps/chosen": -335.8284606933594, + "logps/rejected": -277.89697265625, + "loss": 0.5855, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3249202370643616, + "rewards/margins": 0.4027801752090454, + "rewards/rejected": -0.7277003526687622, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 2.1162790697674418e-07, + "logits/chosen": -2.066859245300293, + "logits/rejected": -2.3091135025024414, + "logps/chosen": -287.7630615234375, + "logps/rejected": -233.48422241210938, + "loss": 0.5942, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33292651176452637, + "rewards/margins": 0.33223992586135864, + "rewards/rejected": -0.665166437625885, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 2.1395348837209303e-07, + "logits/chosen": -2.086124897003174, + "logits/rejected": -1.982297420501709, + "logps/chosen": -209.50343322753906, + "logps/rejected": -193.66485595703125, + "loss": 1.1486, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.9113097190856934, + "rewards/margins": -0.490678608417511, + "rewards/rejected": -0.42063108086586, + "step": 92 + }, + { + "epoch": 0.01, + "learning_rate": 2.1627906976744183e-07, + "logits/chosen": -2.570356845855713, + "logits/rejected": -2.636735677719116, + "logps/chosen": -183.26925659179688, + "logps/rejected": -170.32650756835938, + "loss": 0.8528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8393935561180115, + "rewards/margins": -0.02400238811969757, + "rewards/rejected": -0.8153911828994751, + "step": 93 + }, + { + "epoch": 0.01, + "learning_rate": 2.1860465116279068e-07, + "logits/chosen": -2.406005620956421, + "logits/rejected": -2.591987133026123, + "logps/chosen": -218.12379455566406, + "logps/rejected": -201.21875, + "loss": 0.5105, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.163734570145607, + "rewards/margins": 0.5999506711959839, + "rewards/rejected": -0.7636852860450745, + "step": 94 + }, + { + "epoch": 0.01, + "learning_rate": 2.2093023255813952e-07, + "logits/chosen": -2.2681570053100586, + "logits/rejected": -2.3083791732788086, + "logps/chosen": -242.9326629638672, + "logps/rejected": -310.1129455566406, + "loss": 0.5452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14550848305225372, + "rewards/margins": 0.3707062602043152, + "rewards/rejected": -0.5162147283554077, + "step": 95 + }, + { + "epoch": 0.01, + "learning_rate": 2.2325581395348835e-07, + "logits/chosen": -2.6006369590759277, + "logits/rejected": -2.2266733646392822, + "logps/chosen": -117.85919189453125, + "logps/rejected": -216.32598876953125, + "loss": 0.6745, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5499044060707092, + "rewards/margins": 0.27583783864974976, + "rewards/rejected": -0.825742244720459, + "step": 96 + }, + { + "epoch": 0.01, + "learning_rate": 2.255813953488372e-07, + "logits/chosen": -1.6210027933120728, + "logits/rejected": -2.2731971740722656, + "logps/chosen": -555.565673828125, + "logps/rejected": -290.25115966796875, + "loss": 1.204, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9664278030395508, + "rewards/margins": -0.6703994274139404, + "rewards/rejected": -0.29602834582328796, + "step": 97 + }, + { + "epoch": 0.01, + "learning_rate": 2.2790697674418604e-07, + "logits/chosen": -2.392503499984741, + "logits/rejected": -2.2693846225738525, + "logps/chosen": -141.28466796875, + "logps/rejected": -231.76806640625, + "loss": 0.5255, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0303176641464233, + "rewards/margins": 0.6791374087333679, + "rewards/rejected": -1.709455132484436, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 2.302325581395349e-07, + "logits/chosen": -2.2897956371307373, + "logits/rejected": -2.125539779663086, + "logps/chosen": -371.5478515625, + "logps/rejected": -412.0826416015625, + "loss": 0.7491, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6182222962379456, + "rewards/margins": -0.024349048733711243, + "rewards/rejected": -0.5938732028007507, + "step": 99 + }, + { + "epoch": 0.01, + "learning_rate": 2.3255813953488372e-07, + "logits/chosen": -1.8030691146850586, + "logits/rejected": -2.1702818870544434, + "logps/chosen": -447.7699890136719, + "logps/rejected": -252.9459686279297, + "loss": 0.886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49596014618873596, + "rewards/margins": -0.2117862105369568, + "rewards/rejected": -0.2841739356517792, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 2.3488372093023254e-07, + "logits/chosen": -2.4537315368652344, + "logits/rejected": -2.274648666381836, + "logps/chosen": -348.35028076171875, + "logps/rejected": -277.3187255859375, + "loss": 0.6709, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22102771699428558, + "rewards/margins": 0.10838757455348969, + "rewards/rejected": -0.3294152617454529, + "step": 101 + }, + { + "epoch": 0.01, + "learning_rate": 2.3720930232558136e-07, + "logits/chosen": -2.1595985889434814, + "logits/rejected": -2.1457972526550293, + "logps/chosen": -143.21351623535156, + "logps/rejected": -197.78945922851562, + "loss": 0.8263, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42699629068374634, + "rewards/margins": -0.18382105231285095, + "rewards/rejected": -0.243175208568573, + "step": 102 + }, + { + "epoch": 0.01, + "learning_rate": 2.3953488372093024e-07, + "logits/chosen": -2.7874536514282227, + "logits/rejected": -2.850274085998535, + "logps/chosen": -410.5032958984375, + "logps/rejected": -342.883544921875, + "loss": 0.5105, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22132186591625214, + "rewards/margins": 0.49288198351860046, + "rewards/rejected": -0.7142038345336914, + "step": 103 + }, + { + "epoch": 0.01, + "learning_rate": 2.4186046511627906e-07, + "logits/chosen": -1.8586337566375732, + "logits/rejected": -1.871573805809021, + "logps/chosen": -201.969482421875, + "logps/rejected": -206.4261474609375, + "loss": 0.7765, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36011138558387756, + "rewards/margins": -0.09883344173431396, + "rewards/rejected": -0.261277973651886, + "step": 104 + }, + { + "epoch": 0.01, + "learning_rate": 2.441860465116279e-07, + "logits/chosen": -1.9981329441070557, + "logits/rejected": -2.097548007965088, + "logps/chosen": -170.4879608154297, + "logps/rejected": -178.36611938476562, + "loss": 0.6712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0482807159423828, + "rewards/margins": 0.1192755326628685, + "rewards/rejected": -1.1675562858581543, + "step": 105 + }, + { + "epoch": 0.01, + "learning_rate": 2.4651162790697676e-07, + "logits/chosen": -2.8918509483337402, + "logits/rejected": -2.8618173599243164, + "logps/chosen": -163.16221618652344, + "logps/rejected": -220.1534423828125, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49197301268577576, + "rewards/margins": 0.0901225209236145, + "rewards/rejected": -0.5820955038070679, + "step": 106 + }, + { + "epoch": 0.01, + "learning_rate": 2.488372093023256e-07, + "logits/chosen": -2.243861198425293, + "logits/rejected": -2.241596221923828, + "logps/chosen": -272.9605712890625, + "logps/rejected": -212.6696319580078, + "loss": 0.8962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8061225414276123, + "rewards/margins": -0.11616721749305725, + "rewards/rejected": -0.6899554133415222, + "step": 107 + }, + { + "epoch": 0.01, + "learning_rate": 2.511627906976744e-07, + "logits/chosen": -2.4468531608581543, + "logits/rejected": -2.6342649459838867, + "logps/chosen": -302.49273681640625, + "logps/rejected": -250.5012969970703, + "loss": 0.9525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6266262531280518, + "rewards/margins": -0.2894216477870941, + "rewards/rejected": -0.33720460534095764, + "step": 108 + }, + { + "epoch": 0.01, + "learning_rate": 2.534883720930232e-07, + "logits/chosen": -1.8889508247375488, + "logits/rejected": -1.847444772720337, + "logps/chosen": -347.6550598144531, + "logps/rejected": -330.3041076660156, + "loss": 1.243, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2972767353057861, + "rewards/margins": -0.44702115654945374, + "rewards/rejected": -0.8502554893493652, + "step": 109 + }, + { + "epoch": 0.01, + "learning_rate": 2.5581395348837204e-07, + "logits/chosen": -2.876573324203491, + "logits/rejected": -2.8644440174102783, + "logps/chosen": -210.26419067382812, + "logps/rejected": -189.56478881835938, + "loss": 1.0044, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49664145708084106, + "rewards/margins": -0.3557063937187195, + "rewards/rejected": -0.1409350335597992, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 2.581395348837209e-07, + "logits/chosen": -2.6873810291290283, + "logits/rejected": -2.6178128719329834, + "logps/chosen": -156.33543395996094, + "logps/rejected": -181.47691345214844, + "loss": 0.6194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.253256231546402, + "rewards/margins": 0.21962662041187286, + "rewards/rejected": -0.47288286685943604, + "step": 111 + }, + { + "epoch": 0.01, + "learning_rate": 2.6046511627906974e-07, + "logits/chosen": -2.4136996269226074, + "logits/rejected": -2.627744436264038, + "logps/chosen": -77.70036315917969, + "logps/rejected": -160.221923828125, + "loss": 0.5063, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4397839605808258, + "rewards/margins": 0.7447307109832764, + "rewards/rejected": -1.1845147609710693, + "step": 112 + }, + { + "epoch": 0.01, + "learning_rate": 2.627906976744186e-07, + "logits/chosen": -2.1982645988464355, + "logits/rejected": -2.1680996417999268, + "logps/chosen": -284.7992858886719, + "logps/rejected": -381.1324157714844, + "loss": 0.5805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15814796090126038, + "rewards/margins": 0.31709104776382446, + "rewards/rejected": -0.4752390384674072, + "step": 113 + }, + { + "epoch": 0.01, + "learning_rate": 2.6511627906976744e-07, + "logits/chosen": -1.969825029373169, + "logits/rejected": -1.9821308851242065, + "logps/chosen": -254.40682983398438, + "logps/rejected": -240.384033203125, + "loss": 0.8084, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2654780447483063, + "rewards/margins": -0.10995368659496307, + "rewards/rejected": -0.155524343252182, + "step": 114 + }, + { + "epoch": 0.01, + "learning_rate": 2.6744186046511626e-07, + "logits/chosen": -1.8670458793640137, + "logits/rejected": -1.8168087005615234, + "logps/chosen": -424.1560363769531, + "logps/rejected": -379.2054138183594, + "loss": 0.5846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.963326632976532, + "rewards/margins": 0.3254411518573761, + "rewards/rejected": -1.2887678146362305, + "step": 115 + }, + { + "epoch": 0.01, + "learning_rate": 2.6976744186046514e-07, + "logits/chosen": -2.0922317504882812, + "logits/rejected": -1.9747600555419922, + "logps/chosen": -196.18214416503906, + "logps/rejected": -254.14593505859375, + "loss": 0.7107, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38838067650794983, + "rewards/margins": 0.14774498343467712, + "rewards/rejected": -0.536125659942627, + "step": 116 + }, + { + "epoch": 0.01, + "learning_rate": 2.720930232558139e-07, + "logits/chosen": -2.570683002471924, + "logits/rejected": -2.6108639240264893, + "logps/chosen": -197.09706115722656, + "logps/rejected": -157.64486694335938, + "loss": 1.0065, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7618712186813354, + "rewards/margins": -0.4027426540851593, + "rewards/rejected": -0.3591286242008209, + "step": 117 + }, + { + "epoch": 0.01, + "learning_rate": 2.744186046511628e-07, + "logits/chosen": -2.60833477973938, + "logits/rejected": -2.522894859313965, + "logps/chosen": -243.58566284179688, + "logps/rejected": -214.76171875, + "loss": 0.4848, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10729708522558212, + "rewards/margins": 0.991612434387207, + "rewards/rejected": -0.8843153119087219, + "step": 118 + }, + { + "epoch": 0.01, + "learning_rate": 2.767441860465116e-07, + "logits/chosen": -2.2670044898986816, + "logits/rejected": -2.2630767822265625, + "logps/chosen": -168.00486755371094, + "logps/rejected": -187.48045349121094, + "loss": 0.7306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1719190776348114, + "rewards/margins": 0.003288939595222473, + "rewards/rejected": -0.17520800232887268, + "step": 119 + }, + { + "epoch": 0.01, + "learning_rate": 2.7906976744186043e-07, + "logits/chosen": -2.029635190963745, + "logits/rejected": -2.5271973609924316, + "logps/chosen": -410.35430908203125, + "logps/rejected": -224.42367553710938, + "loss": 0.7204, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.474770724773407, + "rewards/margins": 0.07804751396179199, + "rewards/rejected": -0.5528181791305542, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 2.813953488372093e-07, + "logits/chosen": -2.1471428871154785, + "logits/rejected": -2.099607467651367, + "logps/chosen": -376.0931396484375, + "logps/rejected": -256.83087158203125, + "loss": 0.4298, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.009108111262321472, + "rewards/margins": 0.683029055595398, + "rewards/rejected": -0.6739209294319153, + "step": 121 + }, + { + "epoch": 0.01, + "learning_rate": 2.837209302325581e-07, + "logits/chosen": -1.8627537488937378, + "logits/rejected": -2.174872636795044, + "logps/chosen": -577.5985107421875, + "logps/rejected": -456.60791015625, + "loss": 0.4403, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.033387936651706696, + "rewards/margins": 0.6115955114364624, + "rewards/rejected": -0.5782076120376587, + "step": 122 + }, + { + "epoch": 0.01, + "learning_rate": 2.8604651162790695e-07, + "logits/chosen": -2.4507200717926025, + "logits/rejected": -2.474109411239624, + "logps/chosen": -164.4906005859375, + "logps/rejected": -224.8709716796875, + "loss": 0.507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.604463517665863, + "rewards/margins": 1.556099772453308, + "rewards/rejected": -2.1605632305145264, + "step": 123 + }, + { + "epoch": 0.01, + "learning_rate": 2.883720930232558e-07, + "logits/chosen": -2.047478675842285, + "logits/rejected": -2.321160078048706, + "logps/chosen": -287.2242431640625, + "logps/rejected": -174.9025115966797, + "loss": 0.8151, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6188247799873352, + "rewards/margins": -0.038447946310043335, + "rewards/rejected": -0.5803768634796143, + "step": 124 + }, + { + "epoch": 0.01, + "learning_rate": 2.9069767441860464e-07, + "logits/chosen": -3.026174545288086, + "logits/rejected": -3.0613205432891846, + "logps/chosen": -56.339683532714844, + "logps/rejected": -141.6298065185547, + "loss": 0.6197, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13143277168273926, + "rewards/margins": 0.16544213891029358, + "rewards/rejected": -0.29687491059303284, + "step": 125 + }, + { + "epoch": 0.01, + "learning_rate": 2.9302325581395347e-07, + "logits/chosen": -2.4416348934173584, + "logits/rejected": -2.265141725540161, + "logps/chosen": -243.73532104492188, + "logps/rejected": -367.46075439453125, + "loss": 0.6641, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.312120646238327, + "rewards/margins": 0.17439718544483185, + "rewards/rejected": -0.4865178167819977, + "step": 126 + }, + { + "epoch": 0.01, + "learning_rate": 2.953488372093023e-07, + "logits/chosen": -2.1385931968688965, + "logits/rejected": -2.1570651531219482, + "logps/chosen": -376.7693176269531, + "logps/rejected": -324.1436767578125, + "loss": 0.6087, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21629011631011963, + "rewards/margins": 0.3393087387084961, + "rewards/rejected": -0.5555988550186157, + "step": 127 + }, + { + "epoch": 0.01, + "learning_rate": 2.9767441860465116e-07, + "logits/chosen": -2.594001054763794, + "logits/rejected": -2.5332846641540527, + "logps/chosen": -265.2559814453125, + "logps/rejected": -225.3300323486328, + "loss": 0.4987, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11694058775901794, + "rewards/margins": 0.7060776948928833, + "rewards/rejected": -0.8230182528495789, + "step": 128 + }, + { + "epoch": 0.02, + "learning_rate": 3e-07, + "logits/chosen": -2.0931873321533203, + "logits/rejected": -2.004478693008423, + "logps/chosen": -155.2783966064453, + "logps/rejected": -240.91732788085938, + "loss": 0.7649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4195050895214081, + "rewards/margins": 0.28783467411994934, + "rewards/rejected": -0.7073397636413574, + "step": 129 + }, + { + "epoch": 0.02, + "learning_rate": 2.9996456832408174e-07, + "logits/chosen": -2.7366340160369873, + "logits/rejected": -2.691190242767334, + "logps/chosen": -295.0311279296875, + "logps/rejected": -242.97836303710938, + "loss": 0.4413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11736232787370682, + "rewards/margins": 0.6299964785575867, + "rewards/rejected": -0.5126341581344604, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 2.9992913664816343e-07, + "logits/chosen": -2.417391061782837, + "logits/rejected": -2.5532987117767334, + "logps/chosen": -463.8819885253906, + "logps/rejected": -318.5346374511719, + "loss": 0.7805, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17544816434383392, + "rewards/margins": -0.02362711727619171, + "rewards/rejected": -0.1518210470676422, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 2.998937049722452e-07, + "logits/chosen": -2.3182525634765625, + "logits/rejected": -2.6131339073181152, + "logps/chosen": -341.30474853515625, + "logps/rejected": -267.3067626953125, + "loss": 0.8798, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9353861212730408, + "rewards/margins": -0.11911562830209732, + "rewards/rejected": -0.8162704706192017, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 2.998582732963269e-07, + "logits/chosen": -2.659172296524048, + "logits/rejected": -2.9109010696411133, + "logps/chosen": -389.9925231933594, + "logps/rejected": -177.78512573242188, + "loss": 0.979, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6689620018005371, + "rewards/margins": -0.3065325915813446, + "rewards/rejected": -0.3624293804168701, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 2.998228416204086e-07, + "logits/chosen": -2.85929799079895, + "logits/rejected": -2.7307815551757812, + "logps/chosen": -206.71617126464844, + "logps/rejected": -264.13751220703125, + "loss": 0.8968, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0276267528533936, + "rewards/margins": -0.05892267823219299, + "rewards/rejected": -0.968704104423523, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 2.9978740994449037e-07, + "logits/chosen": -1.618591070175171, + "logits/rejected": -1.894781470298767, + "logps/chosen": -425.32305908203125, + "logps/rejected": -257.943603515625, + "loss": 0.5959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24507570266723633, + "rewards/margins": 0.26544684171676636, + "rewards/rejected": -0.5105225443840027, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 2.997519782685721e-07, + "logits/chosen": -2.729038953781128, + "logits/rejected": -2.7132813930511475, + "logps/chosen": -161.44418334960938, + "logps/rejected": -258.60394287109375, + "loss": 0.521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044339098036289215, + "rewards/margins": 0.5262002348899841, + "rewards/rejected": -0.5705393552780151, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 2.997165465926538e-07, + "logits/chosen": -2.5893824100494385, + "logits/rejected": -2.2048580646514893, + "logps/chosen": -209.68711853027344, + "logps/rejected": -288.78070068359375, + "loss": 0.6152, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3048330545425415, + "rewards/margins": 0.313556432723999, + "rewards/rejected": -0.6183894872665405, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 2.9968111491673556e-07, + "logits/chosen": -2.4292397499084473, + "logits/rejected": -2.253242254257202, + "logps/chosen": -294.5284729003906, + "logps/rejected": -344.6085510253906, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5531035661697388, + "rewards/margins": 1.3905061483383179, + "rewards/rejected": -1.943609595298767, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 2.9964568324081726e-07, + "logits/chosen": -2.594527006149292, + "logits/rejected": -2.4379348754882812, + "logps/chosen": -142.90951538085938, + "logps/rejected": -235.33216857910156, + "loss": 0.6526, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42056623101234436, + "rewards/margins": 0.17449112236499786, + "rewards/rejected": -0.5950573682785034, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 2.99610251564899e-07, + "logits/chosen": -2.115158796310425, + "logits/rejected": -2.400864601135254, + "logps/chosen": -349.9581298828125, + "logps/rejected": -284.10321044921875, + "loss": 0.6305, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2188100814819336, + "rewards/margins": 0.8487216234207153, + "rewards/rejected": -2.0675318241119385, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 2.9957481988898076e-07, + "logits/chosen": -2.687493324279785, + "logits/rejected": -2.610898971557617, + "logps/chosen": -169.18585205078125, + "logps/rejected": -354.4629211425781, + "loss": 0.5257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3375472128391266, + "rewards/margins": 0.40620124340057373, + "rewards/rejected": -0.7437484860420227, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 2.9953938821306245e-07, + "logits/chosen": -2.8204967975616455, + "logits/rejected": -2.859799861907959, + "logps/chosen": -91.15990447998047, + "logps/rejected": -102.08074188232422, + "loss": 0.7275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34524649381637573, + "rewards/margins": 0.06920135021209717, + "rewards/rejected": -0.4144478440284729, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 2.995039565371442e-07, + "logits/chosen": -2.2518529891967773, + "logits/rejected": -2.300924301147461, + "logps/chosen": -256.9783020019531, + "logps/rejected": -238.1842041015625, + "loss": 0.589, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3805345296859741, + "rewards/margins": 0.47803041338920593, + "rewards/rejected": -0.8585649728775024, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 2.994685248612259e-07, + "logits/chosen": -2.2423295974731445, + "logits/rejected": -2.2950899600982666, + "logps/chosen": -212.7620849609375, + "logps/rejected": -216.1529083251953, + "loss": 0.5701, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23712702095508575, + "rewards/margins": 0.41263240575790405, + "rewards/rejected": -0.6497594714164734, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 2.9943309318530765e-07, + "logits/chosen": -2.735442638397217, + "logits/rejected": -2.798872709274292, + "logps/chosen": -167.79116821289062, + "logps/rejected": -233.87918090820312, + "loss": 0.5797, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15597400069236755, + "rewards/margins": 0.42198696732521057, + "rewards/rejected": -0.5779609680175781, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 2.9939766150938934e-07, + "logits/chosen": -1.4767216444015503, + "logits/rejected": -2.1149492263793945, + "logps/chosen": -628.093994140625, + "logps/rejected": -442.81524658203125, + "loss": 0.7641, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8022739887237549, + "rewards/margins": 0.36251893639564514, + "rewards/rejected": -1.1647928953170776, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 2.9936222983347114e-07, + "logits/chosen": -2.5397250652313232, + "logits/rejected": -2.699514389038086, + "logps/chosen": -329.48687744140625, + "logps/rejected": -358.07769775390625, + "loss": 0.6317, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4497101902961731, + "rewards/margins": 0.17475473880767822, + "rewards/rejected": -0.6244649291038513, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 2.9932679815755284e-07, + "logits/chosen": -2.2238693237304688, + "logits/rejected": -2.359199285507202, + "logps/chosen": -375.20623779296875, + "logps/rejected": -263.2727966308594, + "loss": 0.7545, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4355597496032715, + "rewards/margins": -0.070066437125206, + "rewards/rejected": -0.3654933571815491, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 2.992913664816346e-07, + "logits/chosen": -2.275559425354004, + "logits/rejected": -2.1012988090515137, + "logps/chosen": -267.72418212890625, + "logps/rejected": -294.7813720703125, + "loss": 0.7864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7298777103424072, + "rewards/margins": 0.41024768352508545, + "rewards/rejected": -1.1401255130767822, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 2.992559348057163e-07, + "logits/chosen": -2.740659475326538, + "logits/rejected": -2.6224112510681152, + "logps/chosen": -269.9718017578125, + "logps/rejected": -216.44573974609375, + "loss": 0.6544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6055588126182556, + "rewards/margins": 0.40005630254745483, + "rewards/rejected": -1.0056151151657104, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 2.9922050312979803e-07, + "logits/chosen": -1.780271053314209, + "logits/rejected": -2.3586924076080322, + "logps/chosen": -353.4786682128906, + "logps/rejected": -280.44384765625, + "loss": 0.6061, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6104757785797119, + "rewards/margins": 0.7683894634246826, + "rewards/rejected": -1.378865361213684, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 2.991850714538798e-07, + "logits/chosen": -1.7893050909042358, + "logits/rejected": -1.875062346458435, + "logps/chosen": -321.23736572265625, + "logps/rejected": -259.16925048828125, + "loss": 1.0307, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.624213457107544, + "rewards/margins": -0.25228065252304077, + "rewards/rejected": -0.37193283438682556, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 2.991496397779615e-07, + "logits/chosen": -2.173344612121582, + "logits/rejected": -2.4468865394592285, + "logps/chosen": -339.6275634765625, + "logps/rejected": -247.14126586914062, + "loss": 0.6515, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.259911060333252, + "rewards/margins": 0.30777859687805176, + "rewards/rejected": -1.5676896572113037, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 2.991142081020432e-07, + "logits/chosen": -2.2529501914978027, + "logits/rejected": -2.5658624172210693, + "logps/chosen": -340.80841064453125, + "logps/rejected": -239.71432495117188, + "loss": 0.8936, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8029078841209412, + "rewards/margins": -0.23260381817817688, + "rewards/rejected": -0.5703040361404419, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 2.990787764261249e-07, + "logits/chosen": -2.510453701019287, + "logits/rejected": -2.5344133377075195, + "logps/chosen": -109.82048034667969, + "logps/rejected": -171.2945556640625, + "loss": 0.514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.055533893406391144, + "rewards/margins": 0.4897593855857849, + "rewards/rejected": -0.5452932715415955, + "step": 155 + }, + { + "epoch": 0.02, + "learning_rate": 2.9904334475020667e-07, + "logits/chosen": -1.9162174463272095, + "logits/rejected": -1.6850186586380005, + "logps/chosen": -185.43527221679688, + "logps/rejected": -444.98468017578125, + "loss": 0.4895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33354297280311584, + "rewards/margins": 0.9943645000457764, + "rewards/rejected": -1.3279074430465698, + "step": 156 + }, + { + "epoch": 0.02, + "learning_rate": 2.9900791307428836e-07, + "logits/chosen": -2.2484042644500732, + "logits/rejected": -2.4392080307006836, + "logps/chosen": -261.85552978515625, + "logps/rejected": -204.5252685546875, + "loss": 0.9553, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1330440044403076, + "rewards/margins": -0.12059669196605682, + "rewards/rejected": -1.0124472379684448, + "step": 157 + }, + { + "epoch": 0.02, + "learning_rate": 2.989724813983701e-07, + "logits/chosen": -2.699685573577881, + "logits/rejected": -2.6827781200408936, + "logps/chosen": -248.12286376953125, + "logps/rejected": -275.5225830078125, + "loss": 0.707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20926009118556976, + "rewards/margins": 0.05238410830497742, + "rewards/rejected": -0.261644184589386, + "step": 158 + }, + { + "epoch": 0.02, + "learning_rate": 2.9893704972245186e-07, + "logits/chosen": -2.1270711421966553, + "logits/rejected": -2.05656099319458, + "logps/chosen": -279.4642639160156, + "logps/rejected": -299.3373107910156, + "loss": 0.6945, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3710419833660126, + "rewards/margins": 0.33852648735046387, + "rewards/rejected": -0.7095685005187988, + "step": 159 + }, + { + "epoch": 0.02, + "learning_rate": 2.989016180465336e-07, + "logits/chosen": -2.311832904815674, + "logits/rejected": -2.2269809246063232, + "logps/chosen": -258.2981872558594, + "logps/rejected": -222.5327606201172, + "loss": 0.7792, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6360909938812256, + "rewards/margins": 0.20646511018276215, + "rewards/rejected": -1.8425562381744385, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 2.988661863706153e-07, + "logits/chosen": -2.6292130947113037, + "logits/rejected": -2.7311182022094727, + "logps/chosen": -186.0697021484375, + "logps/rejected": -234.8448486328125, + "loss": 0.5622, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36750853061676025, + "rewards/margins": 0.7177603244781494, + "rewards/rejected": -1.0852688550949097, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 2.9883075469469705e-07, + "logits/chosen": -2.1932260990142822, + "logits/rejected": -1.905879020690918, + "logps/chosen": -141.46380615234375, + "logps/rejected": -347.05877685546875, + "loss": 0.6139, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9611469507217407, + "rewards/margins": 0.2585405111312866, + "rewards/rejected": -1.2196874618530273, + "step": 162 + }, + { + "epoch": 0.02, + "learning_rate": 2.987953230187788e-07, + "logits/chosen": -2.353823184967041, + "logits/rejected": -2.472700595855713, + "logps/chosen": -454.1716613769531, + "logps/rejected": -289.56573486328125, + "loss": 0.7534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7410773634910583, + "rewards/margins": 0.4614759683609009, + "rewards/rejected": -1.2025532722473145, + "step": 163 + }, + { + "epoch": 0.02, + "learning_rate": 2.987598913428605e-07, + "logits/chosen": -2.3809735774993896, + "logits/rejected": -2.438262462615967, + "logps/chosen": -361.6285400390625, + "logps/rejected": -271.9600830078125, + "loss": 0.5782, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32255005836486816, + "rewards/margins": 0.5883536338806152, + "rewards/rejected": -0.9109036922454834, + "step": 164 + }, + { + "epoch": 0.02, + "learning_rate": 2.9872445966694225e-07, + "logits/chosen": -2.086742877960205, + "logits/rejected": -2.1662802696228027, + "logps/chosen": -135.5213165283203, + "logps/rejected": -184.32989501953125, + "loss": 0.5989, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44525524973869324, + "rewards/margins": 0.3135383725166321, + "rewards/rejected": -0.7587935924530029, + "step": 165 + }, + { + "epoch": 0.02, + "learning_rate": 2.9868902799102394e-07, + "logits/chosen": -2.264157295227051, + "logits/rejected": -2.49031138420105, + "logps/chosen": -341.9414978027344, + "logps/rejected": -229.1075897216797, + "loss": 0.5416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1409548819065094, + "rewards/margins": 0.591231644153595, + "rewards/rejected": -0.732186496257782, + "step": 166 + }, + { + "epoch": 0.02, + "learning_rate": 2.986535963151057e-07, + "logits/chosen": -2.183047294616699, + "logits/rejected": -2.2306580543518066, + "logps/chosen": -447.38775634765625, + "logps/rejected": -341.69586181640625, + "loss": 0.4958, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4464993476867676, + "rewards/margins": 0.678467869758606, + "rewards/rejected": -1.1249672174453735, + "step": 167 + }, + { + "epoch": 0.02, + "learning_rate": 2.986181646391874e-07, + "logits/chosen": -2.5195505619049072, + "logits/rejected": -2.6643764972686768, + "logps/chosen": -185.33193969726562, + "logps/rejected": -169.29483032226562, + "loss": 0.7391, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3364061415195465, + "rewards/margins": 0.1545141041278839, + "rewards/rejected": -0.4909202456474304, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 2.9858273296326914e-07, + "logits/chosen": -2.520084857940674, + "logits/rejected": -2.400113821029663, + "logps/chosen": -158.7333526611328, + "logps/rejected": -169.21876525878906, + "loss": 0.7242, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21627792716026306, + "rewards/margins": 0.10012604296207428, + "rewards/rejected": -0.31640395522117615, + "step": 169 + }, + { + "epoch": 0.02, + "learning_rate": 2.985473012873509e-07, + "logits/chosen": -2.579526424407959, + "logits/rejected": -2.632833480834961, + "logps/chosen": -139.39089965820312, + "logps/rejected": -160.66226196289062, + "loss": 0.5402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26232483983039856, + "rewards/margins": 0.41159719228744507, + "rewards/rejected": -0.673922061920166, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 2.9851186961143263e-07, + "logits/chosen": -2.3546195030212402, + "logits/rejected": -2.4710028171539307, + "logps/chosen": -250.13548278808594, + "logps/rejected": -204.47930908203125, + "loss": 0.4305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0816798061132431, + "rewards/margins": 0.7516033053398132, + "rewards/rejected": -0.8332831263542175, + "step": 171 + }, + { + "epoch": 0.02, + "learning_rate": 2.9847643793551433e-07, + "logits/chosen": -2.461793899536133, + "logits/rejected": -2.8258161544799805, + "logps/chosen": -273.1517639160156, + "logps/rejected": -222.17884826660156, + "loss": 1.4372, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.4574060440063477, + "rewards/margins": -0.4638090133666992, + "rewards/rejected": -2.9935970306396484, + "step": 172 + }, + { + "epoch": 0.02, + "learning_rate": 2.984410062595961e-07, + "logits/chosen": -1.6198781728744507, + "logits/rejected": -1.8017055988311768, + "logps/chosen": -447.2364196777344, + "logps/rejected": -390.44378662109375, + "loss": 0.8512, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6869930028915405, + "rewards/margins": -0.1895975023508072, + "rewards/rejected": -0.49739551544189453, + "step": 173 + }, + { + "epoch": 0.02, + "learning_rate": 2.9840557458367777e-07, + "logits/chosen": -2.711627960205078, + "logits/rejected": -2.640700340270996, + "logps/chosen": -295.3407287597656, + "logps/rejected": -238.1574249267578, + "loss": 0.4983, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4178104102611542, + "rewards/margins": 0.8496869206428528, + "rewards/rejected": -1.2674973011016846, + "step": 174 + }, + { + "epoch": 0.02, + "learning_rate": 2.983701429077595e-07, + "logits/chosen": -1.7103465795516968, + "logits/rejected": -2.231901168823242, + "logps/chosen": -369.5630187988281, + "logps/rejected": -316.6242370605469, + "loss": 0.5946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21179567277431488, + "rewards/margins": 0.6494349837303162, + "rewards/rejected": -0.8612306118011475, + "step": 175 + }, + { + "epoch": 0.02, + "learning_rate": 2.9833471123184127e-07, + "logits/chosen": -2.610466957092285, + "logits/rejected": -2.489729166030884, + "logps/chosen": -225.26031494140625, + "logps/rejected": -223.93878173828125, + "loss": 0.3781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1574171930551529, + "rewards/margins": 1.0541033744812012, + "rewards/rejected": -1.2115206718444824, + "step": 176 + }, + { + "epoch": 0.02, + "learning_rate": 2.9829927955592297e-07, + "logits/chosen": -2.463379383087158, + "logits/rejected": -2.1589033603668213, + "logps/chosen": -132.13174438476562, + "logps/rejected": -240.66943359375, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4057082533836365, + "rewards/margins": 0.5709302425384521, + "rewards/rejected": -0.9766385555267334, + "step": 177 + }, + { + "epoch": 0.02, + "learning_rate": 2.982638478800047e-07, + "logits/chosen": -1.5857264995574951, + "logits/rejected": -1.8790276050567627, + "logps/chosen": -399.6611022949219, + "logps/rejected": -325.9180908203125, + "loss": 0.5951, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30008211731910706, + "rewards/margins": 0.2790365219116211, + "rewards/rejected": -0.5791186690330505, + "step": 178 + }, + { + "epoch": 0.02, + "learning_rate": 2.982284162040864e-07, + "logits/chosen": -2.4249980449676514, + "logits/rejected": -2.5328400135040283, + "logps/chosen": -145.7904052734375, + "logps/rejected": -178.11697387695312, + "loss": 0.543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09167562425136566, + "rewards/margins": 0.4494943618774414, + "rewards/rejected": -0.5411700010299683, + "step": 179 + }, + { + "epoch": 0.02, + "learning_rate": 2.9819298452816816e-07, + "logits/chosen": -2.637824773788452, + "logits/rejected": -2.14784574508667, + "logps/chosen": -278.68115234375, + "logps/rejected": -223.5745849609375, + "loss": 0.8634, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.012578010559082, + "rewards/margins": 0.07178112119436264, + "rewards/rejected": -1.0843591690063477, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 2.981575528522499e-07, + "logits/chosen": -1.8628668785095215, + "logits/rejected": -2.1004085540771484, + "logps/chosen": -375.42156982421875, + "logps/rejected": -239.94210815429688, + "loss": 0.6203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1471804976463318, + "rewards/margins": 0.21720591187477112, + "rewards/rejected": -0.3643864095211029, + "step": 181 + }, + { + "epoch": 0.02, + "learning_rate": 2.9812212117633166e-07, + "logits/chosen": -1.7659504413604736, + "logits/rejected": -2.2073898315429688, + "logps/chosen": -487.9418640136719, + "logps/rejected": -343.2994384765625, + "loss": 1.3051, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8398267030715942, + "rewards/margins": -0.7485359311103821, + "rewards/rejected": -1.0912909507751465, + "step": 182 + }, + { + "epoch": 0.02, + "learning_rate": 2.9808668950041335e-07, + "logits/chosen": -2.2780654430389404, + "logits/rejected": -2.2091846466064453, + "logps/chosen": -311.5879821777344, + "logps/rejected": -317.8841857910156, + "loss": 0.6217, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4547070264816284, + "rewards/margins": 0.3624263107776642, + "rewards/rejected": -0.8171333074569702, + "step": 183 + }, + { + "epoch": 0.02, + "learning_rate": 2.980512578244951e-07, + "logits/chosen": -2.9426255226135254, + "logits/rejected": -2.876356363296509, + "logps/chosen": -295.82733154296875, + "logps/rejected": -232.63882446289062, + "loss": 0.6686, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41066229343414307, + "rewards/margins": 0.3706735074520111, + "rewards/rejected": -0.7813358306884766, + "step": 184 + }, + { + "epoch": 0.02, + "learning_rate": 2.980158261485768e-07, + "logits/chosen": -2.661207675933838, + "logits/rejected": -2.7515451908111572, + "logps/chosen": -266.5347900390625, + "logps/rejected": -266.1720886230469, + "loss": 0.4387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5421686172485352, + "rewards/margins": 0.7271385788917542, + "rewards/rejected": -1.269307255744934, + "step": 185 + }, + { + "epoch": 0.02, + "learning_rate": 2.9798039447265854e-07, + "logits/chosen": -2.2362399101257324, + "logits/rejected": -2.4736719131469727, + "logps/chosen": -438.289794921875, + "logps/rejected": -400.8497314453125, + "loss": 0.7248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14261962473392487, + "rewards/margins": 0.5831397175788879, + "rewards/rejected": -0.7257592678070068, + "step": 186 + }, + { + "epoch": 0.02, + "learning_rate": 2.979449627967403e-07, + "logits/chosen": -2.177757740020752, + "logits/rejected": -2.4214768409729004, + "logps/chosen": -424.8014831542969, + "logps/rejected": -294.1171875, + "loss": 0.7395, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48851269483566284, + "rewards/margins": 0.37288495898246765, + "rewards/rejected": -0.8613976240158081, + "step": 187 + }, + { + "epoch": 0.02, + "learning_rate": 2.97909531120822e-07, + "logits/chosen": -2.5479719638824463, + "logits/rejected": -2.540255546569824, + "logps/chosen": -173.61489868164062, + "logps/rejected": -151.67105102539062, + "loss": 0.481, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06813786923885345, + "rewards/margins": 0.605137825012207, + "rewards/rejected": -0.5369999408721924, + "step": 188 + }, + { + "epoch": 0.02, + "learning_rate": 2.9787409944490374e-07, + "logits/chosen": -1.9244165420532227, + "logits/rejected": -2.060289144515991, + "logps/chosen": -545.0999755859375, + "logps/rejected": -347.0440673828125, + "loss": 0.6863, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0551561117172241, + "rewards/margins": 0.14402733743190765, + "rewards/rejected": -1.199183464050293, + "step": 189 + }, + { + "epoch": 0.02, + "learning_rate": 2.9783866776898543e-07, + "logits/chosen": -2.305917501449585, + "logits/rejected": -2.548753023147583, + "logps/chosen": -480.833251953125, + "logps/rejected": -307.805908203125, + "loss": 0.5484, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11998571455478668, + "rewards/margins": 0.49165478348731995, + "rewards/rejected": -0.3716690242290497, + "step": 190 + }, + { + "epoch": 0.02, + "learning_rate": 2.978032360930672e-07, + "logits/chosen": -2.8891310691833496, + "logits/rejected": -2.576831102371216, + "logps/chosen": -170.96961975097656, + "logps/rejected": -227.4477996826172, + "loss": 0.5154, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4644376337528229, + "rewards/margins": 1.6487505435943604, + "rewards/rejected": -2.1131880283355713, + "step": 191 + }, + { + "epoch": 0.02, + "learning_rate": 2.9776780441714893e-07, + "logits/chosen": -2.3669207096099854, + "logits/rejected": -2.4071383476257324, + "logps/chosen": -235.2662811279297, + "logps/rejected": -279.0393981933594, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8461290001869202, + "rewards/margins": 0.7511619925498962, + "rewards/rejected": -1.5972909927368164, + "step": 192 + }, + { + "epoch": 0.02, + "learning_rate": 2.977323727412306e-07, + "logits/chosen": -2.2885642051696777, + "logits/rejected": -2.519991874694824, + "logps/chosen": -253.64480590820312, + "logps/rejected": -203.57786560058594, + "loss": 0.5247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42813435196876526, + "rewards/margins": 0.6662355661392212, + "rewards/rejected": -1.094369888305664, + "step": 193 + }, + { + "epoch": 0.02, + "learning_rate": 2.976969410653124e-07, + "logits/chosen": -2.3995888233184814, + "logits/rejected": -2.4556751251220703, + "logps/chosen": -171.52984619140625, + "logps/rejected": -210.47445678710938, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.046367041766643524, + "rewards/margins": 0.7165279984474182, + "rewards/rejected": -0.7628950476646423, + "step": 194 + }, + { + "epoch": 0.02, + "learning_rate": 2.976615093893941e-07, + "logits/chosen": -2.5814974308013916, + "logits/rejected": -2.38312029838562, + "logps/chosen": -272.8090515136719, + "logps/rejected": -285.1854553222656, + "loss": 0.4581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1665532886981964, + "rewards/margins": 0.7537395358085632, + "rewards/rejected": -0.920292854309082, + "step": 195 + }, + { + "epoch": 0.02, + "learning_rate": 2.976260777134758e-07, + "logits/chosen": -2.504533529281616, + "logits/rejected": -2.6208178997039795, + "logps/chosen": -204.13047790527344, + "logps/rejected": -169.91793823242188, + "loss": 0.8929, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2568453550338745, + "rewards/margins": 0.419555127620697, + "rewards/rejected": -1.6764004230499268, + "step": 196 + }, + { + "epoch": 0.02, + "learning_rate": 2.9759064603755757e-07, + "logits/chosen": -3.0464584827423096, + "logits/rejected": -3.003575325012207, + "logps/chosen": -249.3004150390625, + "logps/rejected": -148.258544921875, + "loss": 0.4221, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18436755239963531, + "rewards/margins": 0.8750616312026978, + "rewards/rejected": -0.6906940937042236, + "step": 197 + }, + { + "epoch": 0.02, + "learning_rate": 2.975552143616393e-07, + "logits/chosen": -2.0811831951141357, + "logits/rejected": -2.285975933074951, + "logps/chosen": -316.1667785644531, + "logps/rejected": -294.22918701171875, + "loss": 1.1003, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7553806304931641, + "rewards/margins": -0.23474159836769104, + "rewards/rejected": -0.5206390023231506, + "step": 198 + }, + { + "epoch": 0.02, + "learning_rate": 2.97519782685721e-07, + "logits/chosen": -2.5461838245391846, + "logits/rejected": -2.3391404151916504, + "logps/chosen": -337.11712646484375, + "logps/rejected": -305.74029541015625, + "loss": 0.3651, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1557615101337433, + "rewards/margins": 1.2772061824798584, + "rewards/rejected": -1.4329677820205688, + "step": 199 + }, + { + "epoch": 0.02, + "learning_rate": 2.9748435100980276e-07, + "logits/chosen": -2.509634017944336, + "logits/rejected": -2.122727155685425, + "logps/chosen": -234.50094604492188, + "logps/rejected": -224.40541076660156, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17068585753440857, + "rewards/margins": 1.0835914611816406, + "rewards/rejected": -1.2542774677276611, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 2.9744891933388446e-07, + "logits/chosen": -2.180366039276123, + "logits/rejected": -2.0672030448913574, + "logps/chosen": -341.3550720214844, + "logps/rejected": -324.4943542480469, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29692578315734863, + "rewards/margins": 0.7773337960243225, + "rewards/rejected": -1.0742595195770264, + "step": 201 + }, + { + "epoch": 0.02, + "learning_rate": 2.974134876579662e-07, + "logits/chosen": -2.6591265201568604, + "logits/rejected": -2.7623841762542725, + "logps/chosen": -339.0309753417969, + "logps/rejected": -250.5465087890625, + "loss": 0.6343, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2517094910144806, + "rewards/margins": 0.2982446253299713, + "rewards/rejected": -0.5499541759490967, + "step": 202 + }, + { + "epoch": 0.02, + "learning_rate": 2.973780559820479e-07, + "logits/chosen": -2.6656172275543213, + "logits/rejected": -2.3511343002319336, + "logps/chosen": -208.1717529296875, + "logps/rejected": -239.59095764160156, + "loss": 0.575, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13784259557724, + "rewards/margins": 0.5672997236251831, + "rewards/rejected": -0.7051423192024231, + "step": 203 + }, + { + "epoch": 0.02, + "learning_rate": 2.9734262430612965e-07, + "logits/chosen": -1.9007630348205566, + "logits/rejected": -2.0440890789031982, + "logps/chosen": -247.72889709472656, + "logps/rejected": -279.59552001953125, + "loss": 0.8708, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0063238143920898, + "rewards/margins": 1.0005090236663818, + "rewards/rejected": -2.0068325996398926, + "step": 204 + }, + { + "epoch": 0.02, + "learning_rate": 2.973071926302114e-07, + "logits/chosen": -2.014699697494507, + "logits/rejected": -2.4477901458740234, + "logps/chosen": -469.59979248046875, + "logps/rejected": -278.1065979003906, + "loss": 0.777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7977591753005981, + "rewards/margins": 0.17854562401771545, + "rewards/rejected": -0.976304829120636, + "step": 205 + }, + { + "epoch": 0.02, + "learning_rate": 2.9727176095429315e-07, + "logits/chosen": -2.726283073425293, + "logits/rejected": -2.7086198329925537, + "logps/chosen": -72.46257019042969, + "logps/rejected": -136.21966552734375, + "loss": 0.3958, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06324831396341324, + "rewards/margins": 1.182030439376831, + "rewards/rejected": -1.2452788352966309, + "step": 206 + }, + { + "epoch": 0.02, + "learning_rate": 2.9723632927837484e-07, + "logits/chosen": -2.4139962196350098, + "logits/rejected": -2.327277898788452, + "logps/chosen": -389.550048828125, + "logps/rejected": -327.2835388183594, + "loss": 0.6662, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48070067167282104, + "rewards/margins": 0.6324909925460815, + "rewards/rejected": -1.1131917238235474, + "step": 207 + }, + { + "epoch": 0.02, + "learning_rate": 2.972008976024566e-07, + "logits/chosen": -1.8973721265792847, + "logits/rejected": -1.7923822402954102, + "logps/chosen": -169.95416259765625, + "logps/rejected": -277.5341796875, + "loss": 0.8652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7989225387573242, + "rewards/margins": 0.6200987696647644, + "rewards/rejected": -1.4190213680267334, + "step": 208 + }, + { + "epoch": 0.02, + "learning_rate": 2.9716546592653834e-07, + "logits/chosen": -2.751174211502075, + "logits/rejected": -2.6011664867401123, + "logps/chosen": -87.83241271972656, + "logps/rejected": -170.61825561523438, + "loss": 0.5183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43172308802604675, + "rewards/margins": 0.555282711982727, + "rewards/rejected": -0.9870057702064514, + "step": 209 + }, + { + "epoch": 0.02, + "learning_rate": 2.9713003425062003e-07, + "logits/chosen": -2.216834545135498, + "logits/rejected": -2.2389798164367676, + "logps/chosen": -191.7452850341797, + "logps/rejected": -334.9488525390625, + "loss": 0.664, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29541483521461487, + "rewards/margins": 0.3470202684402466, + "rewards/rejected": -0.6424350738525391, + "step": 210 + }, + { + "epoch": 0.02, + "learning_rate": 2.970946025747018e-07, + "logits/chosen": -1.7326728105545044, + "logits/rejected": -1.884152889251709, + "logps/chosen": -258.3929443359375, + "logps/rejected": -161.04075622558594, + "loss": 0.6175, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3610254526138306, + "rewards/margins": 0.2630250155925751, + "rewards/rejected": -1.6240503787994385, + "step": 211 + }, + { + "epoch": 0.02, + "learning_rate": 2.970591708987835e-07, + "logits/chosen": -2.704021692276001, + "logits/rejected": -2.5570011138916016, + "logps/chosen": -315.704833984375, + "logps/rejected": -191.2744140625, + "loss": 2.3518, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.783074140548706, + "rewards/margins": -1.667793869972229, + "rewards/rejected": -2.1152801513671875, + "step": 212 + }, + { + "epoch": 0.02, + "learning_rate": 2.9702373922286523e-07, + "logits/chosen": -2.2044677734375, + "logits/rejected": -2.1459813117980957, + "logps/chosen": -297.95281982421875, + "logps/rejected": -326.42425537109375, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3468668460845947, + "rewards/margins": 0.9670214653015137, + "rewards/rejected": -2.3138883113861084, + "step": 213 + }, + { + "epoch": 0.02, + "learning_rate": 2.969883075469469e-07, + "logits/chosen": -2.5468626022338867, + "logits/rejected": -2.7631421089172363, + "logps/chosen": -161.05430603027344, + "logps/rejected": -238.472900390625, + "loss": 0.7499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4447501599788666, + "rewards/margins": 0.4884619414806366, + "rewards/rejected": -0.933212161064148, + "step": 214 + }, + { + "epoch": 0.03, + "learning_rate": 2.9695287587102867e-07, + "logits/chosen": -2.435810089111328, + "logits/rejected": -2.504624366760254, + "logps/chosen": -186.88778686523438, + "logps/rejected": -237.7312774658203, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5053725838661194, + "rewards/margins": 0.5947993993759155, + "rewards/rejected": -1.1001720428466797, + "step": 215 + }, + { + "epoch": 0.03, + "learning_rate": 2.969174441951104e-07, + "logits/chosen": -2.254075050354004, + "logits/rejected": -2.1868703365325928, + "logps/chosen": -122.3339614868164, + "logps/rejected": -216.272705078125, + "loss": 0.7093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47668716311454773, + "rewards/margins": 0.585472822189331, + "rewards/rejected": -1.0621600151062012, + "step": 216 + }, + { + "epoch": 0.03, + "learning_rate": 2.9688201251919217e-07, + "logits/chosen": -2.545182466506958, + "logits/rejected": -2.6172900199890137, + "logps/chosen": -330.7689208984375, + "logps/rejected": -261.0626220703125, + "loss": 0.6623, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9052447080612183, + "rewards/margins": 0.5616000294685364, + "rewards/rejected": -1.4668447971343994, + "step": 217 + }, + { + "epoch": 0.03, + "learning_rate": 2.9684658084327386e-07, + "logits/chosen": -2.6438913345336914, + "logits/rejected": -2.718538522720337, + "logps/chosen": -475.1298828125, + "logps/rejected": -213.1118927001953, + "loss": 0.9303, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9925950765609741, + "rewards/margins": -0.061597973108291626, + "rewards/rejected": -0.9309971928596497, + "step": 218 + }, + { + "epoch": 0.03, + "learning_rate": 2.968111491673556e-07, + "logits/chosen": -2.286801338195801, + "logits/rejected": -2.335422992706299, + "logps/chosen": -299.2216796875, + "logps/rejected": -310.5240478515625, + "loss": 0.5823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4594840705394745, + "rewards/margins": 0.631011962890625, + "rewards/rejected": -1.0904960632324219, + "step": 219 + }, + { + "epoch": 0.03, + "learning_rate": 2.9677571749143736e-07, + "logits/chosen": -2.2917537689208984, + "logits/rejected": -2.4121413230895996, + "logps/chosen": -353.86700439453125, + "logps/rejected": -210.02362060546875, + "loss": 0.4395, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.004545241594314575, + "rewards/margins": 0.6690179705619812, + "rewards/rejected": -0.6735632419586182, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 2.9674028581551906e-07, + "logits/chosen": -2.103459119796753, + "logits/rejected": -2.2720165252685547, + "logps/chosen": -176.75814819335938, + "logps/rejected": -186.2899169921875, + "loss": 0.4432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5019952058792114, + "rewards/margins": 0.8146330118179321, + "rewards/rejected": -2.3166284561157227, + "step": 221 + }, + { + "epoch": 0.03, + "learning_rate": 2.967048541396008e-07, + "logits/chosen": -1.855933666229248, + "logits/rejected": -2.3924572467803955, + "logps/chosen": -390.41326904296875, + "logps/rejected": -258.12005615234375, + "loss": 0.649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6132771968841553, + "rewards/margins": 0.4132848381996155, + "rewards/rejected": -1.026561975479126, + "step": 222 + }, + { + "epoch": 0.03, + "learning_rate": 2.966694224636825e-07, + "logits/chosen": -2.9040279388427734, + "logits/rejected": -2.758218765258789, + "logps/chosen": -136.47802734375, + "logps/rejected": -199.02577209472656, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23595374822616577, + "rewards/margins": 1.14297354221344, + "rewards/rejected": -1.3789273500442505, + "step": 223 + }, + { + "epoch": 0.03, + "learning_rate": 2.9663399078776425e-07, + "logits/chosen": -1.877577304840088, + "logits/rejected": -2.1572866439819336, + "logps/chosen": -380.12945556640625, + "logps/rejected": -156.77786254882812, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.041225314140319824, + "rewards/margins": 0.10930295288562775, + "rewards/rejected": -0.15052828192710876, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 2.9659855911184595e-07, + "logits/chosen": -3.0273501873016357, + "logits/rejected": -2.907764196395874, + "logps/chosen": -311.0216064453125, + "logps/rejected": -261.26324462890625, + "loss": 0.5198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3632568120956421, + "rewards/margins": 0.4935813546180725, + "rewards/rejected": -0.8568381667137146, + "step": 225 + }, + { + "epoch": 0.03, + "learning_rate": 2.965631274359277e-07, + "logits/chosen": -2.519087314605713, + "logits/rejected": -2.3993542194366455, + "logps/chosen": -224.118408203125, + "logps/rejected": -263.0647888183594, + "loss": 0.5003, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13065998256206512, + "rewards/margins": 1.0385065078735352, + "rewards/rejected": -1.1691664457321167, + "step": 226 + }, + { + "epoch": 0.03, + "learning_rate": 2.9652769576000944e-07, + "logits/chosen": -2.1350302696228027, + "logits/rejected": -2.2670650482177734, + "logps/chosen": -371.3938293457031, + "logps/rejected": -282.5650939941406, + "loss": 0.6179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49811485409736633, + "rewards/margins": 0.4753153920173645, + "rewards/rejected": -0.9734302759170532, + "step": 227 + }, + { + "epoch": 0.03, + "learning_rate": 2.9649226408409114e-07, + "logits/chosen": -1.6433414220809937, + "logits/rejected": -2.169814109802246, + "logps/chosen": -391.1260986328125, + "logps/rejected": -236.60311889648438, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6052765846252441, + "rewards/margins": 0.4140346050262451, + "rewards/rejected": -1.0193111896514893, + "step": 228 + }, + { + "epoch": 0.03, + "learning_rate": 2.964568324081729e-07, + "logits/chosen": -2.126410484313965, + "logits/rejected": -2.1705334186553955, + "logps/chosen": -243.95066833496094, + "logps/rejected": -327.18896484375, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21082711219787598, + "rewards/margins": 1.200587272644043, + "rewards/rejected": -1.411414384841919, + "step": 229 + }, + { + "epoch": 0.03, + "learning_rate": 2.9642140073225464e-07, + "logits/chosen": -2.5843443870544434, + "logits/rejected": -2.34238600730896, + "logps/chosen": -118.04267883300781, + "logps/rejected": -295.7855224609375, + "loss": 0.6246, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5008370876312256, + "rewards/margins": 0.7272105813026428, + "rewards/rejected": -1.2280477285385132, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 2.963859690563364e-07, + "logits/chosen": -2.4147214889526367, + "logits/rejected": -2.3730053901672363, + "logps/chosen": -246.2408447265625, + "logps/rejected": -265.24371337890625, + "loss": 0.4393, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3537565767765045, + "rewards/margins": 0.9284340739250183, + "rewards/rejected": -1.2821907997131348, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 2.963505373804181e-07, + "logits/chosen": -2.780045509338379, + "logits/rejected": -2.573618173599243, + "logps/chosen": -228.7745819091797, + "logps/rejected": -192.944580078125, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0125674307346344, + "rewards/margins": 1.2672791481018066, + "rewards/rejected": -1.2547117471694946, + "step": 232 + }, + { + "epoch": 0.03, + "learning_rate": 2.9631510570449983e-07, + "logits/chosen": -2.135087251663208, + "logits/rejected": -2.096663236618042, + "logps/chosen": -174.82577514648438, + "logps/rejected": -260.0534362792969, + "loss": 0.5357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6059907674789429, + "rewards/margins": 0.9217367768287659, + "rewards/rejected": -1.527727723121643, + "step": 233 + }, + { + "epoch": 0.03, + "learning_rate": 2.962796740285815e-07, + "logits/chosen": -2.2490909099578857, + "logits/rejected": -2.2157199382781982, + "logps/chosen": -359.9885559082031, + "logps/rejected": -285.5429382324219, + "loss": 0.7898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3191032409667969, + "rewards/margins": 0.15777510404586792, + "rewards/rejected": -0.4768783450126648, + "step": 234 + }, + { + "epoch": 0.03, + "learning_rate": 2.9624424235266327e-07, + "logits/chosen": -2.2581114768981934, + "logits/rejected": -1.9997048377990723, + "logps/chosen": -228.3654327392578, + "logps/rejected": -271.7457275390625, + "loss": 0.8028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9475138187408447, + "rewards/margins": 0.20894011855125427, + "rewards/rejected": -1.1564539670944214, + "step": 235 + }, + { + "epoch": 0.03, + "learning_rate": 2.9620881067674497e-07, + "logits/chosen": -2.1485652923583984, + "logits/rejected": -2.2349109649658203, + "logps/chosen": -326.4866638183594, + "logps/rejected": -243.7642364501953, + "loss": 0.7143, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4513411223888397, + "rewards/margins": 0.2783699631690979, + "rewards/rejected": -0.7297110557556152, + "step": 236 + }, + { + "epoch": 0.03, + "learning_rate": 2.961733790008267e-07, + "logits/chosen": -2.2010984420776367, + "logits/rejected": -2.410281181335449, + "logps/chosen": -322.23992919921875, + "logps/rejected": -294.8111572265625, + "loss": 0.4889, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1166505217552185, + "rewards/margins": 0.6585292816162109, + "rewards/rejected": -0.7751798033714294, + "step": 237 + }, + { + "epoch": 0.03, + "learning_rate": 2.9613794732490847e-07, + "logits/chosen": -2.2597479820251465, + "logits/rejected": -2.2367334365844727, + "logps/chosen": -167.72113037109375, + "logps/rejected": -244.42709350585938, + "loss": 0.965, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9110950231552124, + "rewards/margins": 0.900370717048645, + "rewards/rejected": -2.8114657402038574, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 2.9610251564899016e-07, + "logits/chosen": -2.5217127799987793, + "logits/rejected": -2.4344356060028076, + "logps/chosen": -260.0079040527344, + "logps/rejected": -227.93646240234375, + "loss": 0.5447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36287394165992737, + "rewards/margins": 0.5700594186782837, + "rewards/rejected": -0.9329333901405334, + "step": 239 + }, + { + "epoch": 0.03, + "learning_rate": 2.960670839730719e-07, + "logits/chosen": -2.1234779357910156, + "logits/rejected": -2.1665754318237305, + "logps/chosen": -323.8233337402344, + "logps/rejected": -286.0159606933594, + "loss": 0.3277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07391555607318878, + "rewards/margins": 1.038224458694458, + "rewards/rejected": -1.1121399402618408, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 2.9603165229715366e-07, + "logits/chosen": -2.753492832183838, + "logits/rejected": -2.721874237060547, + "logps/chosen": -168.72793579101562, + "logps/rejected": -189.45932006835938, + "loss": 0.3772, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28331437706947327, + "rewards/margins": 1.6643120050430298, + "rewards/rejected": -1.380997657775879, + "step": 241 + }, + { + "epoch": 0.03, + "learning_rate": 2.959962206212354e-07, + "logits/chosen": -2.226271152496338, + "logits/rejected": -2.082524061203003, + "logps/chosen": -403.5612487792969, + "logps/rejected": -410.4591369628906, + "loss": 0.7343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5986248850822449, + "rewards/margins": 0.41287389397621155, + "rewards/rejected": -1.0114986896514893, + "step": 242 + }, + { + "epoch": 0.03, + "learning_rate": 2.959607889453171e-07, + "logits/chosen": -2.630030632019043, + "logits/rejected": -2.8333232402801514, + "logps/chosen": -456.001220703125, + "logps/rejected": -368.57684326171875, + "loss": 0.307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10505972057580948, + "rewards/margins": 1.203420639038086, + "rewards/rejected": -1.3084805011749268, + "step": 243 + }, + { + "epoch": 0.03, + "learning_rate": 2.9592535726939885e-07, + "logits/chosen": -2.175078868865967, + "logits/rejected": -2.2508466243743896, + "logps/chosen": -154.53163146972656, + "logps/rejected": -215.64236450195312, + "loss": 0.9409, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2359592914581299, + "rewards/margins": -0.060165584087371826, + "rewards/rejected": -1.1757936477661133, + "step": 244 + }, + { + "epoch": 0.03, + "learning_rate": 2.9588992559348055e-07, + "logits/chosen": -2.532890558242798, + "logits/rejected": -2.612595319747925, + "logps/chosen": -222.73016357421875, + "logps/rejected": -146.5400390625, + "loss": 0.7311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3925657868385315, + "rewards/margins": 0.47019314765930176, + "rewards/rejected": -0.8627589344978333, + "step": 245 + }, + { + "epoch": 0.03, + "learning_rate": 2.958544939175623e-07, + "logits/chosen": -2.3777523040771484, + "logits/rejected": -2.2148380279541016, + "logps/chosen": -138.219970703125, + "logps/rejected": -248.41384887695312, + "loss": 0.3746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20181617140769958, + "rewards/margins": 0.9688894748687744, + "rewards/rejected": -1.1707056760787964, + "step": 246 + }, + { + "epoch": 0.03, + "learning_rate": 2.95819062241644e-07, + "logits/chosen": -2.3724496364593506, + "logits/rejected": -2.488852024078369, + "logps/chosen": -199.93492126464844, + "logps/rejected": -174.43544006347656, + "loss": 1.6197, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0855000019073486, + "rewards/margins": -0.8561151623725891, + "rewards/rejected": -1.2293848991394043, + "step": 247 + }, + { + "epoch": 0.03, + "learning_rate": 2.9578363056572574e-07, + "logits/chosen": -2.487362861633301, + "logits/rejected": -2.739895820617676, + "logps/chosen": -252.55865478515625, + "logps/rejected": -235.70318603515625, + "loss": 0.7096, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5399394035339355, + "rewards/margins": 0.2241605669260025, + "rewards/rejected": -0.7640999555587769, + "step": 248 + }, + { + "epoch": 0.03, + "learning_rate": 2.957481988898075e-07, + "logits/chosen": -2.588501214981079, + "logits/rejected": -2.593179225921631, + "logps/chosen": -266.2436828613281, + "logps/rejected": -168.1346893310547, + "loss": 0.6591, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4024375081062317, + "rewards/margins": 0.3704236149787903, + "rewards/rejected": -0.772861123085022, + "step": 249 + }, + { + "epoch": 0.03, + "learning_rate": 2.957127672138892e-07, + "logits/chosen": -2.942012310028076, + "logits/rejected": -2.997328758239746, + "logps/chosen": -126.1902084350586, + "logps/rejected": -158.05067443847656, + "loss": 0.5559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13358812034130096, + "rewards/margins": 0.6038130521774292, + "rewards/rejected": -0.7374010682106018, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 2.9567733553797093e-07, + "logits/chosen": -1.9721193313598633, + "logits/rejected": -1.7903449535369873, + "logps/chosen": -154.9866943359375, + "logps/rejected": -223.43446350097656, + "loss": 0.4142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8144962787628174, + "rewards/margins": 1.9981340169906616, + "rewards/rejected": -2.8126301765441895, + "step": 251 + }, + { + "epoch": 0.03, + "learning_rate": 2.956419038620527e-07, + "logits/chosen": -2.7903285026550293, + "logits/rejected": -2.695744276046753, + "logps/chosen": -319.083740234375, + "logps/rejected": -239.67547607421875, + "loss": 0.6354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34986791014671326, + "rewards/margins": 0.4811970293521881, + "rewards/rejected": -0.8310648798942566, + "step": 252 + }, + { + "epoch": 0.03, + "learning_rate": 2.9560647218613443e-07, + "logits/chosen": -2.5284905433654785, + "logits/rejected": -2.3725132942199707, + "logps/chosen": -280.74383544921875, + "logps/rejected": -321.76336669921875, + "loss": 0.3781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4790899157524109, + "rewards/margins": 1.1579196453094482, + "rewards/rejected": -1.6370095014572144, + "step": 253 + }, + { + "epoch": 0.03, + "learning_rate": 2.955710405102161e-07, + "logits/chosen": -2.1102724075317383, + "logits/rejected": -2.428840398788452, + "logps/chosen": -144.34535217285156, + "logps/rejected": -170.6175537109375, + "loss": 0.6721, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8028558492660522, + "rewards/margins": 0.5086990594863892, + "rewards/rejected": -1.311555027961731, + "step": 254 + }, + { + "epoch": 0.03, + "learning_rate": 2.955356088342979e-07, + "logits/chosen": -1.822654128074646, + "logits/rejected": -2.174063205718994, + "logps/chosen": -578.197998046875, + "logps/rejected": -385.5157775878906, + "loss": 1.1791, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.130832314491272, + "rewards/margins": -0.6686003804206848, + "rewards/rejected": -0.46223190426826477, + "step": 255 + }, + { + "epoch": 0.03, + "learning_rate": 2.9550017715837957e-07, + "logits/chosen": -2.641218900680542, + "logits/rejected": -2.665098190307617, + "logps/chosen": -189.53826904296875, + "logps/rejected": -170.3012237548828, + "loss": 1.1807, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2722926139831543, + "rewards/margins": 0.05193984508514404, + "rewards/rejected": -1.3242323398590088, + "step": 256 + }, + { + "epoch": 0.03, + "learning_rate": 2.954647454824613e-07, + "logits/chosen": -2.4489333629608154, + "logits/rejected": -2.611192464828491, + "logps/chosen": -313.12982177734375, + "logps/rejected": -188.38650512695312, + "loss": 0.793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25091975927352905, + "rewards/margins": 0.15536347031593323, + "rewards/rejected": -0.4062832295894623, + "step": 257 + }, + { + "epoch": 0.03, + "learning_rate": 2.95429313806543e-07, + "logits/chosen": -1.9388154745101929, + "logits/rejected": -1.8804645538330078, + "logps/chosen": -510.9178161621094, + "logps/rejected": -424.8813171386719, + "loss": 0.3365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30925655364990234, + "rewards/margins": 1.298856496810913, + "rewards/rejected": -1.608113169670105, + "step": 258 + }, + { + "epoch": 0.03, + "learning_rate": 2.9539388213062476e-07, + "logits/chosen": -2.0558621883392334, + "logits/rejected": -2.029038906097412, + "logps/chosen": -407.1056213378906, + "logps/rejected": -294.9714660644531, + "loss": 0.5928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4749473035335541, + "rewards/margins": 0.32635682821273804, + "rewards/rejected": -0.8013041019439697, + "step": 259 + }, + { + "epoch": 0.03, + "learning_rate": 2.953584504547065e-07, + "logits/chosen": -2.4706008434295654, + "logits/rejected": -2.4558961391448975, + "logps/chosen": -288.6748962402344, + "logps/rejected": -236.25311279296875, + "loss": 0.62, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7156071066856384, + "rewards/margins": 0.5134493112564087, + "rewards/rejected": -1.229056477546692, + "step": 260 + }, + { + "epoch": 0.03, + "learning_rate": 2.953230187787882e-07, + "logits/chosen": -1.9513782262802124, + "logits/rejected": -2.3186841011047363, + "logps/chosen": -297.04290771484375, + "logps/rejected": -207.31565856933594, + "loss": 0.7052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7733050584793091, + "rewards/margins": 0.2747534215450287, + "rewards/rejected": -1.0480585098266602, + "step": 261 + }, + { + "epoch": 0.03, + "learning_rate": 2.9528758710286996e-07, + "logits/chosen": -2.430399179458618, + "logits/rejected": -2.4979355335235596, + "logps/chosen": -461.70343017578125, + "logps/rejected": -287.472412109375, + "loss": 0.4636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4616781175136566, + "rewards/margins": 1.174939513206482, + "rewards/rejected": -1.636617660522461, + "step": 262 + }, + { + "epoch": 0.03, + "learning_rate": 2.9525215542695165e-07, + "logits/chosen": -2.741328001022339, + "logits/rejected": -2.6582798957824707, + "logps/chosen": -276.3038330078125, + "logps/rejected": -283.74920654296875, + "loss": 0.6814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4555474519729614, + "rewards/margins": 0.4569088816642761, + "rewards/rejected": -0.9124563932418823, + "step": 263 + }, + { + "epoch": 0.03, + "learning_rate": 2.952167237510334e-07, + "logits/chosen": -1.8767638206481934, + "logits/rejected": -2.134045362472534, + "logps/chosen": -344.1347961425781, + "logps/rejected": -342.5531921386719, + "loss": 0.4246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3596734404563904, + "rewards/margins": 0.7209112048149109, + "rewards/rejected": -1.0805846452713013, + "step": 264 + }, + { + "epoch": 0.03, + "learning_rate": 2.9518129207511515e-07, + "logits/chosen": -2.8755650520324707, + "logits/rejected": -2.8715429306030273, + "logps/chosen": -392.4359436035156, + "logps/rejected": -252.48297119140625, + "loss": 0.3009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25258395075798035, + "rewards/margins": 1.3310014009475708, + "rewards/rejected": -1.0784173011779785, + "step": 265 + }, + { + "epoch": 0.03, + "learning_rate": 2.951458603991969e-07, + "logits/chosen": -2.176382303237915, + "logits/rejected": -2.4258620738983154, + "logps/chosen": -661.7243041992188, + "logps/rejected": -342.0917053222656, + "loss": 0.5881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14278724789619446, + "rewards/margins": 0.39230793714523315, + "rewards/rejected": -0.5350951552391052, + "step": 266 + }, + { + "epoch": 0.03, + "learning_rate": 2.951104287232786e-07, + "logits/chosen": -1.8555489778518677, + "logits/rejected": -2.122600793838501, + "logps/chosen": -377.9168395996094, + "logps/rejected": -287.09393310546875, + "loss": 0.9388, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8848345279693604, + "rewards/margins": -0.004211366176605225, + "rewards/rejected": -0.8806231021881104, + "step": 267 + }, + { + "epoch": 0.03, + "learning_rate": 2.9507499704736034e-07, + "logits/chosen": -2.4599874019622803, + "logits/rejected": -2.4040067195892334, + "logps/chosen": -301.8019714355469, + "logps/rejected": -254.99032592773438, + "loss": 0.5875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33585959672927856, + "rewards/margins": 0.23818278312683105, + "rewards/rejected": -0.5740423798561096, + "step": 268 + }, + { + "epoch": 0.03, + "learning_rate": 2.9503956537144204e-07, + "logits/chosen": -2.2341794967651367, + "logits/rejected": -2.505638599395752, + "logps/chosen": -501.1637878417969, + "logps/rejected": -296.4586181640625, + "loss": 0.8335, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2650803327560425, + "rewards/margins": 0.013272076845169067, + "rewards/rejected": -0.27835240960121155, + "step": 269 + }, + { + "epoch": 0.03, + "learning_rate": 2.950041336955238e-07, + "logits/chosen": -1.9049019813537598, + "logits/rejected": -2.0151309967041016, + "logps/chosen": -316.24652099609375, + "logps/rejected": -277.3941955566406, + "loss": 0.5259, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3008253276348114, + "rewards/margins": 0.4698154926300049, + "rewards/rejected": -0.7706408500671387, + "step": 270 + }, + { + "epoch": 0.03, + "learning_rate": 2.9496870201960553e-07, + "logits/chosen": -2.5724985599517822, + "logits/rejected": -2.657470703125, + "logps/chosen": -311.4435119628906, + "logps/rejected": -196.11766052246094, + "loss": 0.2728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06979581713676453, + "rewards/margins": 1.4767982959747314, + "rewards/rejected": -1.5465940237045288, + "step": 271 + }, + { + "epoch": 0.03, + "learning_rate": 2.9493327034368723e-07, + "logits/chosen": -1.5899715423583984, + "logits/rejected": -2.04009747505188, + "logps/chosen": -332.3902282714844, + "logps/rejected": -223.86692810058594, + "loss": 0.951, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1163685321807861, + "rewards/margins": 0.1418951153755188, + "rewards/rejected": -1.2582635879516602, + "step": 272 + }, + { + "epoch": 0.03, + "learning_rate": 2.94897838667769e-07, + "logits/chosen": -2.602829933166504, + "logits/rejected": -2.8829126358032227, + "logps/chosen": -618.8819580078125, + "logps/rejected": -253.289306640625, + "loss": 0.7115, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20049412548542023, + "rewards/margins": 0.8102298378944397, + "rewards/rejected": -1.0107239484786987, + "step": 273 + }, + { + "epoch": 0.03, + "learning_rate": 2.948624069918507e-07, + "logits/chosen": -1.9868472814559937, + "logits/rejected": -2.449455499649048, + "logps/chosen": -310.1795654296875, + "logps/rejected": -195.93051147460938, + "loss": 0.5633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23465867340564728, + "rewards/margins": 0.4427909851074219, + "rewards/rejected": -0.677449643611908, + "step": 274 + }, + { + "epoch": 0.03, + "learning_rate": 2.948269753159324e-07, + "logits/chosen": -2.969364881515503, + "logits/rejected": -2.962390184402466, + "logps/chosen": -157.34304809570312, + "logps/rejected": -130.302490234375, + "loss": 0.3617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08375561237335205, + "rewards/margins": 1.063370704650879, + "rewards/rejected": -1.1471264362335205, + "step": 275 + }, + { + "epoch": 0.03, + "learning_rate": 2.9479154364001417e-07, + "logits/chosen": -2.027743339538574, + "logits/rejected": -1.8373136520385742, + "logps/chosen": -256.11737060546875, + "logps/rejected": -318.2117004394531, + "loss": 0.6053, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2073525190353394, + "rewards/margins": 0.5226768255233765, + "rewards/rejected": -1.7300293445587158, + "step": 276 + }, + { + "epoch": 0.03, + "learning_rate": 2.947561119640959e-07, + "logits/chosen": -2.2515320777893066, + "logits/rejected": -2.0383710861206055, + "logps/chosen": -416.75750732421875, + "logps/rejected": -364.9051818847656, + "loss": 0.3936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5839005708694458, + "rewards/margins": 0.9212774038314819, + "rewards/rejected": -1.5051779747009277, + "step": 277 + }, + { + "epoch": 0.03, + "learning_rate": 2.947206802881776e-07, + "logits/chosen": -1.8994593620300293, + "logits/rejected": -2.0609066486358643, + "logps/chosen": -338.57421875, + "logps/rejected": -273.29254150390625, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029149174690246582, + "rewards/margins": 0.2943916320800781, + "rewards/rejected": -0.3235408067703247, + "step": 278 + }, + { + "epoch": 0.03, + "learning_rate": 2.9468524861225936e-07, + "logits/chosen": -2.3778624534606934, + "logits/rejected": -2.2536587715148926, + "logps/chosen": -270.11712646484375, + "logps/rejected": -240.03390502929688, + "loss": 0.4873, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.286642849445343, + "rewards/margins": 0.7670831680297852, + "rewards/rejected": -1.0537259578704834, + "step": 279 + }, + { + "epoch": 0.03, + "learning_rate": 2.9464981693634106e-07, + "logits/chosen": -1.9662208557128906, + "logits/rejected": -2.114424705505371, + "logps/chosen": -399.0927734375, + "logps/rejected": -238.5034637451172, + "loss": 0.4433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36703047156333923, + "rewards/margins": 1.092298984527588, + "rewards/rejected": -1.4593294858932495, + "step": 280 + }, + { + "epoch": 0.03, + "learning_rate": 2.946143852604228e-07, + "logits/chosen": -2.291977882385254, + "logits/rejected": -1.7917745113372803, + "logps/chosen": -269.79132080078125, + "logps/rejected": -449.75653076171875, + "loss": 0.318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37038087844848633, + "rewards/margins": 1.6547183990478516, + "rewards/rejected": -2.025099277496338, + "step": 281 + }, + { + "epoch": 0.03, + "learning_rate": 2.9457895358450456e-07, + "logits/chosen": -2.1801748275756836, + "logits/rejected": -2.332432270050049, + "logps/chosen": -325.0802917480469, + "logps/rejected": -388.4512634277344, + "loss": 0.5333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.367206871509552, + "rewards/margins": 0.6799616813659668, + "rewards/rejected": -1.047168493270874, + "step": 282 + }, + { + "epoch": 0.03, + "learning_rate": 2.9454352190858625e-07, + "logits/chosen": -1.3814514875411987, + "logits/rejected": -1.726662039756775, + "logps/chosen": -396.02337646484375, + "logps/rejected": -281.3721923828125, + "loss": 0.6014, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4060601592063904, + "rewards/margins": 0.6215767860412598, + "rewards/rejected": -1.027637004852295, + "step": 283 + }, + { + "epoch": 0.03, + "learning_rate": 2.94508090232668e-07, + "logits/chosen": -2.6151933670043945, + "logits/rejected": -2.6169497966766357, + "logps/chosen": -198.55926513671875, + "logps/rejected": -137.8518829345703, + "loss": 0.4985, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1619531661272049, + "rewards/margins": 0.9281085133552551, + "rewards/rejected": -1.0900616645812988, + "step": 284 + }, + { + "epoch": 0.03, + "learning_rate": 2.944726585567497e-07, + "logits/chosen": -2.9354021549224854, + "logits/rejected": -2.7473397254943848, + "logps/chosen": -205.51333618164062, + "logps/rejected": -268.3748779296875, + "loss": 0.4083, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3936096429824829, + "rewards/margins": 0.9245909452438354, + "rewards/rejected": -1.3182005882263184, + "step": 285 + }, + { + "epoch": 0.03, + "learning_rate": 2.9443722688083145e-07, + "logits/chosen": -1.4875712394714355, + "logits/rejected": -2.0510120391845703, + "logps/chosen": -409.445556640625, + "logps/rejected": -196.56214904785156, + "loss": 1.0093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6522005796432495, + "rewards/margins": -0.13873091340065002, + "rewards/rejected": -0.5134696364402771, + "step": 286 + }, + { + "epoch": 0.03, + "learning_rate": 2.944017952049132e-07, + "logits/chosen": -2.3332650661468506, + "logits/rejected": -2.0238451957702637, + "logps/chosen": -238.11366271972656, + "logps/rejected": -328.9278259277344, + "loss": 0.5911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3827953338623047, + "rewards/margins": 1.046369194984436, + "rewards/rejected": -1.4291645288467407, + "step": 287 + }, + { + "epoch": 0.03, + "learning_rate": 2.9436636352899494e-07, + "logits/chosen": -2.5052852630615234, + "logits/rejected": -2.4758758544921875, + "logps/chosen": -281.4767761230469, + "logps/rejected": -350.5293884277344, + "loss": 0.3689, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2601891756057739, + "rewards/margins": 1.2887394428253174, + "rewards/rejected": -1.5489286184310913, + "step": 288 + }, + { + "epoch": 0.03, + "learning_rate": 2.9433093185307664e-07, + "logits/chosen": -2.366420030593872, + "logits/rejected": -2.3785367012023926, + "logps/chosen": -301.2152404785156, + "logps/rejected": -276.4963684082031, + "loss": 0.3418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02587474137544632, + "rewards/margins": 1.1035606861114502, + "rewards/rejected": -1.0776861906051636, + "step": 289 + }, + { + "epoch": 0.03, + "learning_rate": 2.942955001771584e-07, + "logits/chosen": -2.093283176422119, + "logits/rejected": -2.4747838973999023, + "logps/chosen": -420.95843505859375, + "logps/rejected": -170.1190185546875, + "loss": 0.7256, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5149084329605103, + "rewards/margins": 0.5440642237663269, + "rewards/rejected": -1.0589725971221924, + "step": 290 + }, + { + "epoch": 0.03, + "learning_rate": 2.942600685012401e-07, + "logits/chosen": -2.427334785461426, + "logits/rejected": -2.646097183227539, + "logps/chosen": -376.02117919921875, + "logps/rejected": -304.498779296875, + "loss": 0.4787, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0406392477452755, + "rewards/margins": 1.0841079950332642, + "rewards/rejected": -1.1247472763061523, + "step": 291 + }, + { + "epoch": 0.03, + "learning_rate": 2.9422463682532183e-07, + "logits/chosen": -2.721784830093384, + "logits/rejected": -2.788503408432007, + "logps/chosen": -330.68353271484375, + "logps/rejected": -319.1875, + "loss": 0.3812, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06904447078704834, + "rewards/margins": 1.3375811576843262, + "rewards/rejected": -1.406625747680664, + "step": 292 + }, + { + "epoch": 0.03, + "learning_rate": 2.9418920514940353e-07, + "logits/chosen": -2.5546584129333496, + "logits/rejected": -2.2899272441864014, + "logps/chosen": -247.6539306640625, + "logps/rejected": -292.12799072265625, + "loss": 0.8762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.759085476398468, + "rewards/margins": 0.21519267559051514, + "rewards/rejected": -0.9742782115936279, + "step": 293 + }, + { + "epoch": 0.03, + "learning_rate": 2.941537734734853e-07, + "logits/chosen": -2.8096203804016113, + "logits/rejected": -2.6154065132141113, + "logps/chosen": -178.92384338378906, + "logps/rejected": -332.939453125, + "loss": 0.2987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16457784175872803, + "rewards/margins": 1.7711654901504517, + "rewards/rejected": -1.6065876483917236, + "step": 294 + }, + { + "epoch": 0.03, + "learning_rate": 2.94118341797567e-07, + "logits/chosen": -2.797778844833374, + "logits/rejected": -2.829437494277954, + "logps/chosen": -345.2289123535156, + "logps/rejected": -283.1036376953125, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3279910087585449, + "rewards/margins": 1.4313820600509644, + "rewards/rejected": -1.7593731880187988, + "step": 295 + }, + { + "epoch": 0.03, + "learning_rate": 2.940829101216487e-07, + "logits/chosen": -1.6894688606262207, + "logits/rejected": -1.6803333759307861, + "logps/chosen": -598.8785400390625, + "logps/rejected": -595.1441650390625, + "loss": 0.4342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010276809334754944, + "rewards/margins": 1.0291900634765625, + "rewards/rejected": -1.0189132690429688, + "step": 296 + }, + { + "epoch": 0.03, + "learning_rate": 2.9404747844573047e-07, + "logits/chosen": -2.36710786819458, + "logits/rejected": -2.6229710578918457, + "logps/chosen": -326.7007141113281, + "logps/rejected": -257.9637451171875, + "loss": 0.5844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14547762274742126, + "rewards/margins": 0.46267378330230713, + "rewards/rejected": -0.608151376247406, + "step": 297 + }, + { + "epoch": 0.03, + "learning_rate": 2.9401204676981216e-07, + "logits/chosen": -2.285874366760254, + "logits/rejected": -2.0000181198120117, + "logps/chosen": -259.4213562011719, + "logps/rejected": -284.89862060546875, + "loss": 0.7303, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8663985729217529, + "rewards/margins": 0.20266467332839966, + "rewards/rejected": -1.0690631866455078, + "step": 298 + }, + { + "epoch": 0.03, + "learning_rate": 2.9397661509389397e-07, + "logits/chosen": -2.6445860862731934, + "logits/rejected": -2.664821147918701, + "logps/chosen": -245.68978881835938, + "logps/rejected": -252.12106323242188, + "loss": 0.9665, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0938446521759033, + "rewards/margins": 0.18389992415905, + "rewards/rejected": -1.2777445316314697, + "step": 299 + }, + { + "epoch": 0.03, + "learning_rate": 2.9394118341797566e-07, + "logits/chosen": -2.6599860191345215, + "logits/rejected": -2.7291247844696045, + "logps/chosen": -170.41127014160156, + "logps/rejected": -158.5400390625, + "loss": 0.5093, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9475737810134888, + "rewards/margins": 0.47724682092666626, + "rewards/rejected": -1.4248205423355103, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 2.939057517420574e-07, + "logits/chosen": -2.29803204536438, + "logits/rejected": -2.607515811920166, + "logps/chosen": -364.19329833984375, + "logps/rejected": -204.98898315429688, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4560944139957428, + "rewards/margins": 0.6856675148010254, + "rewards/rejected": -1.1417618989944458, + "step": 301 + }, + { + "epoch": 0.04, + "learning_rate": 2.938703200661391e-07, + "logits/chosen": -1.950028896331787, + "logits/rejected": -2.022510290145874, + "logps/chosen": -394.421875, + "logps/rejected": -287.7738952636719, + "loss": 0.4218, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.257552832365036, + "rewards/margins": 1.3202406167984009, + "rewards/rejected": -1.062687873840332, + "step": 302 + }, + { + "epoch": 0.04, + "learning_rate": 2.9383488839022085e-07, + "logits/chosen": -2.1507041454315186, + "logits/rejected": -2.356308698654175, + "logps/chosen": -305.9482116699219, + "logps/rejected": -186.61788940429688, + "loss": 0.4265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15852631628513336, + "rewards/margins": 0.9647853374481201, + "rewards/rejected": -1.1233115196228027, + "step": 303 + }, + { + "epoch": 0.04, + "learning_rate": 2.9379945671430255e-07, + "logits/chosen": -2.5175838470458984, + "logits/rejected": -2.486604690551758, + "logps/chosen": -262.03350830078125, + "logps/rejected": -255.69540405273438, + "loss": 0.7578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4626442790031433, + "rewards/margins": -0.029388144612312317, + "rewards/rejected": -0.4332561492919922, + "step": 304 + }, + { + "epoch": 0.04, + "learning_rate": 2.937640250383843e-07, + "logits/chosen": -2.5114057064056396, + "logits/rejected": -2.107201337814331, + "logps/chosen": -241.24156188964844, + "logps/rejected": -297.77947998046875, + "loss": 0.3567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25777173042297363, + "rewards/margins": 1.7837862968444824, + "rewards/rejected": -2.041558027267456, + "step": 305 + }, + { + "epoch": 0.04, + "learning_rate": 2.9372859336246605e-07, + "logits/chosen": -2.3229546546936035, + "logits/rejected": -2.4203202724456787, + "logps/chosen": -307.3225402832031, + "logps/rejected": -375.33990478515625, + "loss": 0.2627, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3219256103038788, + "rewards/margins": 1.551868200302124, + "rewards/rejected": -1.2299425601959229, + "step": 306 + }, + { + "epoch": 0.04, + "learning_rate": 2.9369316168654774e-07, + "logits/chosen": -2.8530185222625732, + "logits/rejected": -2.6325011253356934, + "logps/chosen": -290.1589660644531, + "logps/rejected": -219.94435119628906, + "loss": 0.3821, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04211246967315674, + "rewards/margins": 1.2057253122329712, + "rewards/rejected": -1.1636128425598145, + "step": 307 + }, + { + "epoch": 0.04, + "learning_rate": 2.936577300106295e-07, + "logits/chosen": -2.1724071502685547, + "logits/rejected": -1.9063711166381836, + "logps/chosen": -239.16073608398438, + "logps/rejected": -228.0086212158203, + "loss": 0.5574, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40734410285949707, + "rewards/margins": 0.5282961130142212, + "rewards/rejected": -0.9356402158737183, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 2.936222983347112e-07, + "logits/chosen": -2.1313507556915283, + "logits/rejected": -2.1726279258728027, + "logps/chosen": -350.1370849609375, + "logps/rejected": -376.45904541015625, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08623509109020233, + "rewards/margins": 1.5222742557525635, + "rewards/rejected": -1.6085093021392822, + "step": 309 + }, + { + "epoch": 0.04, + "learning_rate": 2.9358686665879293e-07, + "logits/chosen": -2.493105173110962, + "logits/rejected": -2.646644115447998, + "logps/chosen": -185.68478393554688, + "logps/rejected": -159.75222778320312, + "loss": 0.781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7015661597251892, + "rewards/margins": 0.41815176606178284, + "rewards/rejected": -1.1197179555892944, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 2.935514349828747e-07, + "logits/chosen": -1.7797365188598633, + "logits/rejected": -1.9488158226013184, + "logps/chosen": -298.86151123046875, + "logps/rejected": -264.2230224609375, + "loss": 0.5771, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1043767929077148, + "rewards/margins": 0.5647428631782532, + "rewards/rejected": -1.6691197156906128, + "step": 311 + }, + { + "epoch": 0.04, + "learning_rate": 2.9351600330695643e-07, + "logits/chosen": -2.6082606315612793, + "logits/rejected": -2.602665901184082, + "logps/chosen": -174.48670959472656, + "logps/rejected": -133.47076416015625, + "loss": 1.0146, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.512934684753418, + "rewards/margins": -0.0038271695375442505, + "rewards/rejected": -0.5091075301170349, + "step": 312 + }, + { + "epoch": 0.04, + "learning_rate": 2.9348057163103813e-07, + "logits/chosen": -2.2726187705993652, + "logits/rejected": -2.2766458988189697, + "logps/chosen": -151.23597717285156, + "logps/rejected": -218.1590576171875, + "loss": 0.5699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.930323600769043, + "rewards/margins": 0.7915714979171753, + "rewards/rejected": -1.7218952178955078, + "step": 313 + }, + { + "epoch": 0.04, + "learning_rate": 2.934451399551199e-07, + "logits/chosen": -2.8945980072021484, + "logits/rejected": -2.8225159645080566, + "logps/chosen": -230.0916290283203, + "logps/rejected": -222.00045776367188, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5174702405929565, + "rewards/margins": 1.3400602340698242, + "rewards/rejected": -1.8575303554534912, + "step": 314 + }, + { + "epoch": 0.04, + "learning_rate": 2.9340970827920157e-07, + "logits/chosen": -2.62554931640625, + "logits/rejected": -2.6453537940979004, + "logps/chosen": -308.14825439453125, + "logps/rejected": -176.90383911132812, + "loss": 0.8522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7629151344299316, + "rewards/margins": 0.31568899750709534, + "rewards/rejected": -1.0786041021347046, + "step": 315 + }, + { + "epoch": 0.04, + "learning_rate": 2.933742766032833e-07, + "logits/chosen": -2.029466152191162, + "logits/rejected": -2.171477794647217, + "logps/chosen": -314.6988525390625, + "logps/rejected": -375.4151611328125, + "loss": 0.685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5350890755653381, + "rewards/margins": 0.5873953104019165, + "rewards/rejected": -1.1224844455718994, + "step": 316 + }, + { + "epoch": 0.04, + "learning_rate": 2.9333884492736507e-07, + "logits/chosen": -2.9043831825256348, + "logits/rejected": -2.9594264030456543, + "logps/chosen": -206.5111541748047, + "logps/rejected": -225.08241271972656, + "loss": 0.5474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4688420295715332, + "rewards/margins": 0.8206722736358643, + "rewards/rejected": -1.2895143032073975, + "step": 317 + }, + { + "epoch": 0.04, + "learning_rate": 2.9330341325144676e-07, + "logits/chosen": -2.895951271057129, + "logits/rejected": -2.8387908935546875, + "logps/chosen": -337.5228271484375, + "logps/rejected": -334.8077697753906, + "loss": 0.7819, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.659294843673706, + "rewards/margins": 0.35979580879211426, + "rewards/rejected": -1.0190907716751099, + "step": 318 + }, + { + "epoch": 0.04, + "learning_rate": 2.932679815755285e-07, + "logits/chosen": -2.223888874053955, + "logits/rejected": -2.391298532485962, + "logps/chosen": -213.8368682861328, + "logps/rejected": -175.1593017578125, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17392238974571228, + "rewards/margins": 0.6756976842880249, + "rewards/rejected": -0.8496200442314148, + "step": 319 + }, + { + "epoch": 0.04, + "learning_rate": 2.932325498996102e-07, + "logits/chosen": -2.329521656036377, + "logits/rejected": -2.2428598403930664, + "logps/chosen": -331.2684631347656, + "logps/rejected": -293.89569091796875, + "loss": 0.853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8404381275177002, + "rewards/margins": 0.40912631154060364, + "rewards/rejected": -1.2495644092559814, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 2.9319711822369196e-07, + "logits/chosen": -2.395176887512207, + "logits/rejected": -2.6548402309417725, + "logps/chosen": -200.82545471191406, + "logps/rejected": -100.20393371582031, + "loss": 1.996, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5338711738586426, + "rewards/margins": -1.283815622329712, + "rewards/rejected": -0.2500556707382202, + "step": 321 + }, + { + "epoch": 0.04, + "learning_rate": 2.931616865477737e-07, + "logits/chosen": -2.295654296875, + "logits/rejected": -2.3269121646881104, + "logps/chosen": -168.94126892089844, + "logps/rejected": -181.1977996826172, + "loss": 0.6319, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38651272654533386, + "rewards/margins": 0.4317438006401062, + "rewards/rejected": -0.8182565569877625, + "step": 322 + }, + { + "epoch": 0.04, + "learning_rate": 2.9312625487185545e-07, + "logits/chosen": -2.679442882537842, + "logits/rejected": -2.5386698246002197, + "logps/chosen": -357.37164306640625, + "logps/rejected": -204.69769287109375, + "loss": 0.4761, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49394285678863525, + "rewards/margins": 0.948100209236145, + "rewards/rejected": -1.4420431852340698, + "step": 323 + }, + { + "epoch": 0.04, + "learning_rate": 2.9309082319593715e-07, + "logits/chosen": -2.3916513919830322, + "logits/rejected": -2.5804829597473145, + "logps/chosen": -392.87078857421875, + "logps/rejected": -171.0058135986328, + "loss": 0.6124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09215235710144043, + "rewards/margins": 0.6518601179122925, + "rewards/rejected": -0.7440124750137329, + "step": 324 + }, + { + "epoch": 0.04, + "learning_rate": 2.930553915200189e-07, + "logits/chosen": -2.369706153869629, + "logits/rejected": -2.4630026817321777, + "logps/chosen": -222.11709594726562, + "logps/rejected": -211.93954467773438, + "loss": 0.8231, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40757668018341064, + "rewards/margins": 0.23469087481498718, + "rewards/rejected": -0.6422675251960754, + "step": 325 + }, + { + "epoch": 0.04, + "learning_rate": 2.930199598441006e-07, + "logits/chosen": -2.2539374828338623, + "logits/rejected": -2.287092447280884, + "logps/chosen": -121.72743225097656, + "logps/rejected": -111.5833740234375, + "loss": 0.5449, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4091636538505554, + "rewards/margins": 0.46893033385276794, + "rewards/rejected": -0.878093957901001, + "step": 326 + }, + { + "epoch": 0.04, + "learning_rate": 2.9298452816818234e-07, + "logits/chosen": -2.0524537563323975, + "logits/rejected": -2.1779425144195557, + "logps/chosen": -275.94903564453125, + "logps/rejected": -273.35821533203125, + "loss": 0.4695, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.055811215192079544, + "rewards/margins": 0.7680109143257141, + "rewards/rejected": -0.7121996879577637, + "step": 327 + }, + { + "epoch": 0.04, + "learning_rate": 2.929490964922641e-07, + "logits/chosen": -2.068142890930176, + "logits/rejected": -2.379666805267334, + "logps/chosen": -265.1993103027344, + "logps/rejected": -225.3121337890625, + "loss": 0.5916, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8519630432128906, + "rewards/margins": 0.6603106260299683, + "rewards/rejected": -1.5122735500335693, + "step": 328 + }, + { + "epoch": 0.04, + "learning_rate": 2.929136648163458e-07, + "logits/chosen": -2.3993899822235107, + "logits/rejected": -2.6701223850250244, + "logps/chosen": -240.1083221435547, + "logps/rejected": -323.28564453125, + "loss": 0.5382, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31654006242752075, + "rewards/margins": 0.6563017964363098, + "rewards/rejected": -0.9728418588638306, + "step": 329 + }, + { + "epoch": 0.04, + "learning_rate": 2.9287823314042754e-07, + "logits/chosen": -2.6669204235076904, + "logits/rejected": -2.564671277999878, + "logps/chosen": -123.85690307617188, + "logps/rejected": -205.9306640625, + "loss": 1.1335, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9808606505393982, + "rewards/margins": 0.11081727594137192, + "rewards/rejected": -1.0916779041290283, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.9284280146450923e-07, + "logits/chosen": -2.3550915718078613, + "logits/rejected": -2.516937732696533, + "logps/chosen": -391.8165283203125, + "logps/rejected": -235.18252563476562, + "loss": 0.5098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.623511016368866, + "rewards/margins": 0.6541772484779358, + "rewards/rejected": -1.2776882648468018, + "step": 331 + }, + { + "epoch": 0.04, + "learning_rate": 2.92807369788591e-07, + "logits/chosen": -2.9920287132263184, + "logits/rejected": -2.9969563484191895, + "logps/chosen": -180.14744567871094, + "logps/rejected": -219.72019958496094, + "loss": 0.4066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24392685294151306, + "rewards/margins": 0.9460574984550476, + "rewards/rejected": -1.1899843215942383, + "step": 332 + }, + { + "epoch": 0.04, + "learning_rate": 2.927719381126727e-07, + "logits/chosen": -2.2411112785339355, + "logits/rejected": -2.3199000358581543, + "logps/chosen": -296.79571533203125, + "logps/rejected": -371.7720947265625, + "loss": 0.8679, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7076969742774963, + "rewards/margins": 0.24727892875671387, + "rewards/rejected": -0.9549759030342102, + "step": 333 + }, + { + "epoch": 0.04, + "learning_rate": 2.927365064367545e-07, + "logits/chosen": -2.8356363773345947, + "logits/rejected": -2.749779224395752, + "logps/chosen": -297.3299865722656, + "logps/rejected": -207.79348754882812, + "loss": 0.4644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15958870947360992, + "rewards/margins": 0.8317511081695557, + "rewards/rejected": -0.9913398027420044, + "step": 334 + }, + { + "epoch": 0.04, + "learning_rate": 2.9270107476083617e-07, + "logits/chosen": -2.4010345935821533, + "logits/rejected": -2.444368600845337, + "logps/chosen": -196.5550079345703, + "logps/rejected": -183.0149383544922, + "loss": 0.4148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1564604490995407, + "rewards/margins": 1.3196423053741455, + "rewards/rejected": -1.4761028289794922, + "step": 335 + }, + { + "epoch": 0.04, + "learning_rate": 2.926656430849179e-07, + "logits/chosen": -2.9160611629486084, + "logits/rejected": -2.7852678298950195, + "logps/chosen": -314.6018371582031, + "logps/rejected": -218.02850341796875, + "loss": 0.5763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45732107758522034, + "rewards/margins": 1.3264633417129517, + "rewards/rejected": -1.7837843894958496, + "step": 336 + }, + { + "epoch": 0.04, + "learning_rate": 2.926302114089996e-07, + "logits/chosen": -1.8458666801452637, + "logits/rejected": -1.565752387046814, + "logps/chosen": -164.42196655273438, + "logps/rejected": -238.62448120117188, + "loss": 0.5317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.610163688659668, + "rewards/margins": 0.7577385306358337, + "rewards/rejected": -1.367902159690857, + "step": 337 + }, + { + "epoch": 0.04, + "learning_rate": 2.9259477973308137e-07, + "logits/chosen": -2.0817527770996094, + "logits/rejected": -2.321756362915039, + "logps/chosen": -344.75726318359375, + "logps/rejected": -276.75738525390625, + "loss": 0.696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3976415693759918, + "rewards/margins": 0.6477232575416565, + "rewards/rejected": -1.0453648567199707, + "step": 338 + }, + { + "epoch": 0.04, + "learning_rate": 2.925593480571631e-07, + "logits/chosen": -2.6471240520477295, + "logits/rejected": -2.897420883178711, + "logps/chosen": -193.80697631835938, + "logps/rejected": -297.48284912109375, + "loss": 0.4418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14745938777923584, + "rewards/margins": 0.9114383459091187, + "rewards/rejected": -1.0588977336883545, + "step": 339 + }, + { + "epoch": 0.04, + "learning_rate": 2.925239163812448e-07, + "logits/chosen": -2.2821202278137207, + "logits/rejected": -2.2430543899536133, + "logps/chosen": -230.7744140625, + "logps/rejected": -260.9974365234375, + "loss": 0.3477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3567458987236023, + "rewards/margins": 1.6605525016784668, + "rewards/rejected": -2.017298460006714, + "step": 340 + }, + { + "epoch": 0.04, + "learning_rate": 2.9248848470532656e-07, + "logits/chosen": -2.4030134677886963, + "logits/rejected": -2.3298275470733643, + "logps/chosen": -262.053466796875, + "logps/rejected": -201.80091857910156, + "loss": 0.6638, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10247425734996796, + "rewards/margins": 0.30203771591186523, + "rewards/rejected": -0.4045120179653168, + "step": 341 + }, + { + "epoch": 0.04, + "learning_rate": 2.9245305302940825e-07, + "logits/chosen": -2.625683069229126, + "logits/rejected": -2.8678855895996094, + "logps/chosen": -406.8061828613281, + "logps/rejected": -269.43145751953125, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1474510133266449, + "rewards/margins": 1.4309965372085571, + "rewards/rejected": -1.5784475803375244, + "step": 342 + }, + { + "epoch": 0.04, + "learning_rate": 2.9241762135349e-07, + "logits/chosen": -2.5967495441436768, + "logits/rejected": -2.370731830596924, + "logps/chosen": -167.21290588378906, + "logps/rejected": -199.54653930664062, + "loss": 0.4807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5110343098640442, + "rewards/margins": 0.6498540639877319, + "rewards/rejected": -1.160888433456421, + "step": 343 + }, + { + "epoch": 0.04, + "learning_rate": 2.923821896775717e-07, + "logits/chosen": -1.9601327180862427, + "logits/rejected": -2.131619930267334, + "logps/chosen": -370.6251220703125, + "logps/rejected": -403.8722839355469, + "loss": 0.3625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.89009690284729, + "rewards/margins": 1.1248087882995605, + "rewards/rejected": -2.0149056911468506, + "step": 344 + }, + { + "epoch": 0.04, + "learning_rate": 2.9234675800165345e-07, + "logits/chosen": -2.1298558712005615, + "logits/rejected": -2.444132089614868, + "logps/chosen": -231.8939666748047, + "logps/rejected": -254.7119903564453, + "loss": 0.4597, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04477114602923393, + "rewards/margins": 0.8427332639694214, + "rewards/rejected": -0.8875043988227844, + "step": 345 + }, + { + "epoch": 0.04, + "learning_rate": 2.923113263257352e-07, + "logits/chosen": -2.1115477085113525, + "logits/rejected": -2.266761064529419, + "logps/chosen": -402.6540832519531, + "logps/rejected": -273.7836608886719, + "loss": 0.5504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09515545517206192, + "rewards/margins": 0.42880088090896606, + "rewards/rejected": -0.523956298828125, + "step": 346 + }, + { + "epoch": 0.04, + "learning_rate": 2.9227589464981694e-07, + "logits/chosen": -2.0364744663238525, + "logits/rejected": -2.3418893814086914, + "logps/chosen": -266.392822265625, + "logps/rejected": -260.46429443359375, + "loss": 1.3493, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9322304725646973, + "rewards/margins": -0.14251938462257385, + "rewards/rejected": -0.7897111773490906, + "step": 347 + }, + { + "epoch": 0.04, + "learning_rate": 2.9224046297389864e-07, + "logits/chosen": -2.1186115741729736, + "logits/rejected": -2.2281064987182617, + "logps/chosen": -566.9127197265625, + "logps/rejected": -355.6307373046875, + "loss": 0.6121, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.518950343132019, + "rewards/margins": 0.7422138452529907, + "rewards/rejected": -1.2611641883850098, + "step": 348 + }, + { + "epoch": 0.04, + "learning_rate": 2.922050312979804e-07, + "logits/chosen": -1.6921344995498657, + "logits/rejected": -2.184948444366455, + "logps/chosen": -526.6041870117188, + "logps/rejected": -314.04559326171875, + "loss": 0.7727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4664980173110962, + "rewards/margins": 0.33103129267692566, + "rewards/rejected": -0.7975293397903442, + "step": 349 + }, + { + "epoch": 0.04, + "learning_rate": 2.9216959962206214e-07, + "logits/chosen": -2.6512913703918457, + "logits/rejected": -2.5971741676330566, + "logps/chosen": -103.80355834960938, + "logps/rejected": -171.83544921875, + "loss": 0.5209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15030202269554138, + "rewards/margins": 0.7159022092819214, + "rewards/rejected": -0.8662042617797852, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 2.9213416794614383e-07, + "logits/chosen": -2.267549753189087, + "logits/rejected": -2.382564067840576, + "logps/chosen": -337.7655029296875, + "logps/rejected": -219.90203857421875, + "loss": 0.3232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18075509369373322, + "rewards/margins": 1.231830358505249, + "rewards/rejected": -1.0510752201080322, + "step": 351 + }, + { + "epoch": 0.04, + "learning_rate": 2.920987362702256e-07, + "logits/chosen": -2.5430941581726074, + "logits/rejected": -2.6897616386413574, + "logps/chosen": -284.12066650390625, + "logps/rejected": -167.8983612060547, + "loss": 0.5895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5054863691329956, + "rewards/margins": 0.7512806057929993, + "rewards/rejected": -1.25676691532135, + "step": 352 + }, + { + "epoch": 0.04, + "learning_rate": 2.920633045943073e-07, + "logits/chosen": -1.7325901985168457, + "logits/rejected": -1.751321792602539, + "logps/chosen": -339.93890380859375, + "logps/rejected": -286.529052734375, + "loss": 0.7085, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.009088158607483, + "rewards/margins": 0.9606056809425354, + "rewards/rejected": -1.969693899154663, + "step": 353 + }, + { + "epoch": 0.04, + "learning_rate": 2.92027872918389e-07, + "logits/chosen": -1.8583483695983887, + "logits/rejected": -2.534839630126953, + "logps/chosen": -377.93359375, + "logps/rejected": -160.0740966796875, + "loss": 0.9, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.100935459136963, + "rewards/margins": 0.3936083912849426, + "rewards/rejected": -1.4945437908172607, + "step": 354 + }, + { + "epoch": 0.04, + "learning_rate": 2.919924412424707e-07, + "logits/chosen": -2.6600818634033203, + "logits/rejected": -2.6654810905456543, + "logps/chosen": -333.54119873046875, + "logps/rejected": -219.50796508789062, + "loss": 1.0827, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6857550740242004, + "rewards/margins": -0.1820569932460785, + "rewards/rejected": -0.5036981105804443, + "step": 355 + }, + { + "epoch": 0.04, + "learning_rate": 2.9195700956655247e-07, + "logits/chosen": -2.3361635208129883, + "logits/rejected": -2.1461362838745117, + "logps/chosen": -220.15480041503906, + "logps/rejected": -352.7494812011719, + "loss": 0.379, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01200837828218937, + "rewards/margins": 1.2757107019424438, + "rewards/rejected": -1.2877191305160522, + "step": 356 + }, + { + "epoch": 0.04, + "learning_rate": 2.919215778906342e-07, + "logits/chosen": -2.3405277729034424, + "logits/rejected": -2.2837538719177246, + "logps/chosen": -310.7549743652344, + "logps/rejected": -337.0315856933594, + "loss": 1.6176, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.3899741172790527, + "rewards/margins": -1.1155414581298828, + "rewards/rejected": -1.2744327783584595, + "step": 357 + }, + { + "epoch": 0.04, + "learning_rate": 2.9188614621471597e-07, + "logits/chosen": -2.6386992931365967, + "logits/rejected": -2.5254485607147217, + "logps/chosen": -245.9833526611328, + "logps/rejected": -160.0703125, + "loss": 0.4477, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10537806153297424, + "rewards/margins": 0.9918594360351562, + "rewards/rejected": -0.8864814043045044, + "step": 358 + }, + { + "epoch": 0.04, + "learning_rate": 2.9185071453879766e-07, + "logits/chosen": -2.2128055095672607, + "logits/rejected": -2.035968542098999, + "logps/chosen": -339.15765380859375, + "logps/rejected": -299.94012451171875, + "loss": 1.2344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0853787660598755, + "rewards/margins": 0.7561144828796387, + "rewards/rejected": -1.8414932489395142, + "step": 359 + }, + { + "epoch": 0.04, + "learning_rate": 2.918152828628794e-07, + "logits/chosen": -2.756640672683716, + "logits/rejected": -2.8360657691955566, + "logps/chosen": -189.69451904296875, + "logps/rejected": -297.2071533203125, + "loss": 0.3574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16082653403282166, + "rewards/margins": 0.9469164609909058, + "rewards/rejected": -1.1077430248260498, + "step": 360 + }, + { + "epoch": 0.04, + "learning_rate": 2.9177985118696116e-07, + "logits/chosen": -2.657911777496338, + "logits/rejected": -2.5685245990753174, + "logps/chosen": -187.61907958984375, + "logps/rejected": -157.60479736328125, + "loss": 1.09, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0232057571411133, + "rewards/margins": -0.1979352831840515, + "rewards/rejected": -0.8252705335617065, + "step": 361 + }, + { + "epoch": 0.04, + "learning_rate": 2.9174441951104286e-07, + "logits/chosen": -2.6753158569335938, + "logits/rejected": -2.5553839206695557, + "logps/chosen": -204.78146362304688, + "logps/rejected": -216.9765167236328, + "loss": 0.7002, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6889148950576782, + "rewards/margins": 0.1984691619873047, + "rewards/rejected": -0.8873841166496277, + "step": 362 + }, + { + "epoch": 0.04, + "learning_rate": 2.917089878351246e-07, + "logits/chosen": -2.1885862350463867, + "logits/rejected": -2.39033842086792, + "logps/chosen": -235.45162963867188, + "logps/rejected": -144.41941833496094, + "loss": 0.5936, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24951782822608948, + "rewards/margins": 0.4134438633918762, + "rewards/rejected": -0.6629617214202881, + "step": 363 + }, + { + "epoch": 0.04, + "learning_rate": 2.916735561592063e-07, + "logits/chosen": -2.741023063659668, + "logits/rejected": -2.564554452896118, + "logps/chosen": -321.51788330078125, + "logps/rejected": -343.75238037109375, + "loss": 0.2675, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3317694365978241, + "rewards/margins": 1.8310751914978027, + "rewards/rejected": -1.4993058443069458, + "step": 364 + }, + { + "epoch": 0.04, + "learning_rate": 2.9163812448328805e-07, + "logits/chosen": -2.019880771636963, + "logits/rejected": -1.74005925655365, + "logps/chosen": -342.08795166015625, + "logps/rejected": -345.8409423828125, + "loss": 0.6034, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6675082445144653, + "rewards/margins": 0.8716038465499878, + "rewards/rejected": -1.5391119718551636, + "step": 365 + }, + { + "epoch": 0.04, + "learning_rate": 2.9160269280736974e-07, + "logits/chosen": -2.4452133178710938, + "logits/rejected": -2.361947536468506, + "logps/chosen": -182.25216674804688, + "logps/rejected": -186.62860107421875, + "loss": 0.4143, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9851146936416626, + "rewards/margins": 0.953251302242279, + "rewards/rejected": -1.9383659362792969, + "step": 366 + }, + { + "epoch": 0.04, + "learning_rate": 2.915672611314515e-07, + "logits/chosen": -2.1833503246307373, + "logits/rejected": -2.6416120529174805, + "logps/chosen": -524.2366943359375, + "logps/rejected": -317.4039306640625, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2800936996936798, + "rewards/margins": 0.6419193744659424, + "rewards/rejected": -0.9220131039619446, + "step": 367 + }, + { + "epoch": 0.04, + "learning_rate": 2.9153182945553324e-07, + "logits/chosen": -1.9374914169311523, + "logits/rejected": -1.8206708431243896, + "logps/chosen": -351.09735107421875, + "logps/rejected": -376.822265625, + "loss": 0.4398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03537649288773537, + "rewards/margins": 0.6947737336158752, + "rewards/rejected": -0.7301502227783203, + "step": 368 + }, + { + "epoch": 0.04, + "learning_rate": 2.91496397779615e-07, + "logits/chosen": -1.9805552959442139, + "logits/rejected": -2.0341053009033203, + "logps/chosen": -237.97662353515625, + "logps/rejected": -323.3060302734375, + "loss": 0.6642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.055992819368839264, + "rewards/margins": 0.17544439435005188, + "rewards/rejected": -0.23143723607063293, + "step": 369 + }, + { + "epoch": 0.04, + "learning_rate": 2.914609661036967e-07, + "logits/chosen": -2.4620485305786133, + "logits/rejected": -2.445059299468994, + "logps/chosen": -381.3786315917969, + "logps/rejected": -217.01284790039062, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027956679463386536, + "rewards/margins": 1.0060575008392334, + "rewards/rejected": -1.034014105796814, + "step": 370 + }, + { + "epoch": 0.04, + "learning_rate": 2.9142553442777843e-07, + "logits/chosen": -2.438389301300049, + "logits/rejected": -2.386345863342285, + "logps/chosen": -209.4347381591797, + "logps/rejected": -285.638671875, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2624393403530121, + "rewards/margins": 0.7292307615280151, + "rewards/rejected": -0.9916701316833496, + "step": 371 + }, + { + "epoch": 0.04, + "learning_rate": 2.9139010275186013e-07, + "logits/chosen": -2.462907075881958, + "logits/rejected": -2.776033401489258, + "logps/chosen": -241.20034790039062, + "logps/rejected": -192.75881958007812, + "loss": 0.4781, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03623209521174431, + "rewards/margins": 1.3184354305267334, + "rewards/rejected": -1.3546675443649292, + "step": 372 + }, + { + "epoch": 0.04, + "learning_rate": 2.913546710759419e-07, + "logits/chosen": -2.61348557472229, + "logits/rejected": -2.6519858837127686, + "logps/chosen": -483.35443115234375, + "logps/rejected": -399.0893249511719, + "loss": 0.3253, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6226974725723267, + "rewards/margins": 1.7413036823272705, + "rewards/rejected": -2.3640010356903076, + "step": 373 + }, + { + "epoch": 0.04, + "learning_rate": 2.9131923940002363e-07, + "logits/chosen": -2.3559398651123047, + "logits/rejected": -2.443772315979004, + "logps/chosen": -276.01092529296875, + "logps/rejected": -324.1201477050781, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19327013194561005, + "rewards/margins": 1.7155585289001465, + "rewards/rejected": -1.908828616142273, + "step": 374 + }, + { + "epoch": 0.04, + "learning_rate": 2.912838077241053e-07, + "logits/chosen": -2.5892324447631836, + "logits/rejected": -2.6127986907958984, + "logps/chosen": -288.15679931640625, + "logps/rejected": -218.14593505859375, + "loss": 0.505, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6188991069793701, + "rewards/margins": 0.879996657371521, + "rewards/rejected": -1.4988958835601807, + "step": 375 + }, + { + "epoch": 0.04, + "learning_rate": 2.9124837604818707e-07, + "logits/chosen": -2.741547107696533, + "logits/rejected": -2.7076892852783203, + "logps/chosen": -208.8468017578125, + "logps/rejected": -168.34310913085938, + "loss": 0.8012, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7391061782836914, + "rewards/margins": 0.2517876625061035, + "rewards/rejected": -0.9908938407897949, + "step": 376 + }, + { + "epoch": 0.04, + "learning_rate": 2.9121294437226877e-07, + "logits/chosen": -2.7398834228515625, + "logits/rejected": -2.765960454940796, + "logps/chosen": -210.3505859375, + "logps/rejected": -157.2550048828125, + "loss": 0.6086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4951014518737793, + "rewards/margins": 0.42920827865600586, + "rewards/rejected": -0.9243097305297852, + "step": 377 + }, + { + "epoch": 0.04, + "learning_rate": 2.911775126963505e-07, + "logits/chosen": -2.597426176071167, + "logits/rejected": -2.450941562652588, + "logps/chosen": -452.9858703613281, + "logps/rejected": -402.2458190917969, + "loss": 0.5617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4296402335166931, + "rewards/margins": 0.5917314887046814, + "rewards/rejected": -1.021371603012085, + "step": 378 + }, + { + "epoch": 0.04, + "learning_rate": 2.9114208102043226e-07, + "logits/chosen": -3.081923484802246, + "logits/rejected": -2.9856417179107666, + "logps/chosen": -210.20872497558594, + "logps/rejected": -217.0289764404297, + "loss": 0.6458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4244641065597534, + "rewards/margins": 0.5986112356185913, + "rewards/rejected": -1.0230753421783447, + "step": 379 + }, + { + "epoch": 0.04, + "learning_rate": 2.9110664934451396e-07, + "logits/chosen": -2.674346923828125, + "logits/rejected": -2.4168198108673096, + "logps/chosen": -230.85279846191406, + "logps/rejected": -154.787841796875, + "loss": 0.6082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4232533574104309, + "rewards/margins": 0.39322400093078613, + "rewards/rejected": -0.8164772987365723, + "step": 380 + }, + { + "epoch": 0.04, + "learning_rate": 2.910712176685957e-07, + "logits/chosen": -2.6380908489227295, + "logits/rejected": -2.4891340732574463, + "logps/chosen": -178.94752502441406, + "logps/rejected": -223.0135498046875, + "loss": 0.7279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38809722661972046, + "rewards/margins": 0.2853945791721344, + "rewards/rejected": -0.673491895198822, + "step": 381 + }, + { + "epoch": 0.04, + "learning_rate": 2.9103578599267746e-07, + "logits/chosen": -1.5114073753356934, + "logits/rejected": -2.4028563499450684, + "logps/chosen": -353.2110595703125, + "logps/rejected": -205.11846923828125, + "loss": 1.1754, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1019339561462402, + "rewards/margins": 0.12049147486686707, + "rewards/rejected": -1.2224254608154297, + "step": 382 + }, + { + "epoch": 0.04, + "learning_rate": 2.9100035431675915e-07, + "logits/chosen": -2.36358642578125, + "logits/rejected": -2.28446626663208, + "logps/chosen": -219.1780242919922, + "logps/rejected": -249.15045166015625, + "loss": 0.6282, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03214915841817856, + "rewards/margins": 0.36723947525024414, + "rewards/rejected": -0.335090309381485, + "step": 383 + }, + { + "epoch": 0.04, + "learning_rate": 2.909649226408409e-07, + "logits/chosen": -2.4393086433410645, + "logits/rejected": -2.344698667526245, + "logps/chosen": -231.1734161376953, + "logps/rejected": -276.0218505859375, + "loss": 0.4909, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3179285526275635, + "rewards/margins": 0.7397074103355408, + "rewards/rejected": -2.057636022567749, + "step": 384 + }, + { + "epoch": 0.04, + "learning_rate": 2.9092949096492265e-07, + "logits/chosen": -1.9227142333984375, + "logits/rejected": -2.378279209136963, + "logps/chosen": -460.6540832519531, + "logps/rejected": -297.25567626953125, + "loss": 0.7016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6133627891540527, + "rewards/margins": 0.5649220943450928, + "rewards/rejected": -1.1782848834991455, + "step": 385 + }, + { + "epoch": 0.04, + "learning_rate": 2.9089405928900435e-07, + "logits/chosen": -2.261834144592285, + "logits/rejected": -2.0776326656341553, + "logps/chosen": -210.3868408203125, + "logps/rejected": -243.05325317382812, + "loss": 0.4741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21825449168682098, + "rewards/margins": 0.7398832440376282, + "rewards/rejected": -0.9581376314163208, + "step": 386 + }, + { + "epoch": 0.05, + "learning_rate": 2.908586276130861e-07, + "logits/chosen": -2.5592167377471924, + "logits/rejected": -2.2397923469543457, + "logps/chosen": -270.41009521484375, + "logps/rejected": -252.57057189941406, + "loss": 1.0451, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.095860481262207, + "rewards/margins": 0.40147170424461365, + "rewards/rejected": -2.4973320960998535, + "step": 387 + }, + { + "epoch": 0.05, + "learning_rate": 2.908231959371678e-07, + "logits/chosen": -2.467172384262085, + "logits/rejected": -2.4671850204467773, + "logps/chosen": -176.31703186035156, + "logps/rejected": -131.861083984375, + "loss": 0.6416, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47119781374931335, + "rewards/margins": 0.4313661456108093, + "rewards/rejected": -0.9025639295578003, + "step": 388 + }, + { + "epoch": 0.05, + "learning_rate": 2.9078776426124954e-07, + "logits/chosen": -2.3285129070281982, + "logits/rejected": -2.333667516708374, + "logps/chosen": -286.033935546875, + "logps/rejected": -182.059326171875, + "loss": 0.8016, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.700645387172699, + "rewards/margins": 0.21044054627418518, + "rewards/rejected": -0.9110859632492065, + "step": 389 + }, + { + "epoch": 0.05, + "learning_rate": 2.907523325853313e-07, + "logits/chosen": -2.387022018432617, + "logits/rejected": -2.7735767364501953, + "logps/chosen": -310.4424133300781, + "logps/rejected": -248.2230987548828, + "loss": 0.2051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0037674754858016968, + "rewards/margins": 1.8231241703033447, + "rewards/rejected": -1.8268916606903076, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.90716900909413e-07, + "logits/chosen": -2.4947261810302734, + "logits/rejected": -2.6018552780151367, + "logps/chosen": -409.39495849609375, + "logps/rejected": -274.84393310546875, + "loss": 0.9161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.607440710067749, + "rewards/margins": 0.10968615114688873, + "rewards/rejected": -0.7171268463134766, + "step": 391 + }, + { + "epoch": 0.05, + "learning_rate": 2.9068146923349473e-07, + "logits/chosen": -2.6144323348999023, + "logits/rejected": -2.44895076751709, + "logps/chosen": -284.3625183105469, + "logps/rejected": -261.03082275390625, + "loss": 0.678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6003652811050415, + "rewards/margins": 0.3987134099006653, + "rewards/rejected": -0.999078631401062, + "step": 392 + }, + { + "epoch": 0.05, + "learning_rate": 2.906460375575765e-07, + "logits/chosen": -2.2292399406433105, + "logits/rejected": -2.152902364730835, + "logps/chosen": -283.7800598144531, + "logps/rejected": -256.03045654296875, + "loss": 0.3496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15193067491054535, + "rewards/margins": 1.3511581420898438, + "rewards/rejected": -1.5030887126922607, + "step": 393 + }, + { + "epoch": 0.05, + "learning_rate": 2.906106058816582e-07, + "logits/chosen": -2.0130889415740967, + "logits/rejected": -2.03128719329834, + "logps/chosen": -312.2867126464844, + "logps/rejected": -338.41522216796875, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47012022137641907, + "rewards/margins": 0.5652213096618652, + "rewards/rejected": -1.0353416204452515, + "step": 394 + }, + { + "epoch": 0.05, + "learning_rate": 2.905751742057399e-07, + "logits/chosen": -2.3529956340789795, + "logits/rejected": -2.301142692565918, + "logps/chosen": -195.08575439453125, + "logps/rejected": -193.3866729736328, + "loss": 0.4212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6629807949066162, + "rewards/margins": 1.451525092124939, + "rewards/rejected": -2.1145060062408447, + "step": 395 + }, + { + "epoch": 0.05, + "learning_rate": 2.9053974252982167e-07, + "logits/chosen": -2.63325834274292, + "logits/rejected": -2.498455286026001, + "logps/chosen": -216.3518524169922, + "logps/rejected": -324.20367431640625, + "loss": 0.5293, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2945455312728882, + "rewards/margins": 0.92228102684021, + "rewards/rejected": -1.2168265581130981, + "step": 396 + }, + { + "epoch": 0.05, + "learning_rate": 2.9050431085390337e-07, + "logits/chosen": -2.695741891860962, + "logits/rejected": -2.668283700942993, + "logps/chosen": -191.29249572753906, + "logps/rejected": -209.6196746826172, + "loss": 0.4286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29533764719963074, + "rewards/margins": 1.2358360290527344, + "rewards/rejected": -1.5311737060546875, + "step": 397 + }, + { + "epoch": 0.05, + "learning_rate": 2.904688791779851e-07, + "logits/chosen": -2.4054059982299805, + "logits/rejected": -2.5963492393493652, + "logps/chosen": -307.03265380859375, + "logps/rejected": -362.26361083984375, + "loss": 1.063, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.862945556640625, + "rewards/margins": -0.06770412623882294, + "rewards/rejected": -0.7952414155006409, + "step": 398 + }, + { + "epoch": 0.05, + "learning_rate": 2.904334475020668e-07, + "logits/chosen": -2.5434956550598145, + "logits/rejected": -2.273796796798706, + "logps/chosen": -252.62017822265625, + "logps/rejected": -389.3582458496094, + "loss": 0.6428, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.814016580581665, + "rewards/margins": 0.6457063555717468, + "rewards/rejected": -1.4597229957580566, + "step": 399 + }, + { + "epoch": 0.05, + "learning_rate": 2.9039801582614856e-07, + "logits/chosen": -2.5275187492370605, + "logits/rejected": -2.4476003646850586, + "logps/chosen": -125.37290954589844, + "logps/rejected": -200.5078125, + "loss": 0.3068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16572076082229614, + "rewards/margins": 1.1224074363708496, + "rewards/rejected": -1.288128137588501, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.9036258415023026e-07, + "logits/chosen": -2.55018949508667, + "logits/rejected": -2.530451774597168, + "logps/chosen": -215.76553344726562, + "logps/rejected": -275.7241516113281, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3347064256668091, + "rewards/margins": 1.3861677646636963, + "rewards/rejected": -1.720874309539795, + "step": 401 + }, + { + "epoch": 0.05, + "learning_rate": 2.90327152474312e-07, + "logits/chosen": -1.6912660598754883, + "logits/rejected": -1.7484867572784424, + "logps/chosen": -438.5322265625, + "logps/rejected": -355.2369689941406, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5170990228652954, + "rewards/margins": 0.8915930390357971, + "rewards/rejected": -1.4086921215057373, + "step": 402 + }, + { + "epoch": 0.05, + "learning_rate": 2.9029172079839375e-07, + "logits/chosen": -2.574814796447754, + "logits/rejected": -2.5845723152160645, + "logps/chosen": -202.02037048339844, + "logps/rejected": -150.90652465820312, + "loss": 0.4926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12014584988355637, + "rewards/margins": 0.662086009979248, + "rewards/rejected": -0.7822319269180298, + "step": 403 + }, + { + "epoch": 0.05, + "learning_rate": 2.902562891224755e-07, + "logits/chosen": -2.0928592681884766, + "logits/rejected": -2.258073091506958, + "logps/chosen": -338.05963134765625, + "logps/rejected": -288.1326904296875, + "loss": 0.4268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44505953788757324, + "rewards/margins": 1.2010225057601929, + "rewards/rejected": -1.6460820436477661, + "step": 404 + }, + { + "epoch": 0.05, + "learning_rate": 2.902208574465572e-07, + "logits/chosen": -2.6006851196289062, + "logits/rejected": -2.6106038093566895, + "logps/chosen": -217.82427978515625, + "logps/rejected": -194.0026397705078, + "loss": 0.6276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6466972827911377, + "rewards/margins": 0.9627809524536133, + "rewards/rejected": -1.6094781160354614, + "step": 405 + }, + { + "epoch": 0.05, + "learning_rate": 2.9018542577063895e-07, + "logits/chosen": -1.9177751541137695, + "logits/rejected": -1.4863187074661255, + "logps/chosen": -353.9942626953125, + "logps/rejected": -466.2381286621094, + "loss": 0.6619, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6182900071144104, + "rewards/margins": 0.5348527431488037, + "rewards/rejected": -1.1531426906585693, + "step": 406 + }, + { + "epoch": 0.05, + "learning_rate": 2.901499940947207e-07, + "logits/chosen": -2.347299575805664, + "logits/rejected": -2.581388235092163, + "logps/chosen": -322.8605651855469, + "logps/rejected": -222.009765625, + "loss": 0.5515, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1542651355266571, + "rewards/margins": 0.9100422263145447, + "rewards/rejected": -1.064307451248169, + "step": 407 + }, + { + "epoch": 0.05, + "learning_rate": 2.901145624188024e-07, + "logits/chosen": -2.617509365081787, + "logits/rejected": -2.600517749786377, + "logps/chosen": -110.65100860595703, + "logps/rejected": -189.55299377441406, + "loss": 0.3341, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09722583740949631, + "rewards/margins": 1.648722529411316, + "rewards/rejected": -1.745948314666748, + "step": 408 + }, + { + "epoch": 0.05, + "learning_rate": 2.9007913074288414e-07, + "logits/chosen": -2.777291774749756, + "logits/rejected": -2.7684519290924072, + "logps/chosen": -280.8226318359375, + "logps/rejected": -223.43063354492188, + "loss": 0.6264, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18180479109287262, + "rewards/margins": 0.9385455846786499, + "rewards/rejected": -1.1203503608703613, + "step": 409 + }, + { + "epoch": 0.05, + "learning_rate": 2.9004369906696584e-07, + "logits/chosen": -2.1051454544067383, + "logits/rejected": -2.399475574493408, + "logps/chosen": -383.5094299316406, + "logps/rejected": -267.96728515625, + "loss": 1.1977, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4341328144073486, + "rewards/margins": -0.2899511754512787, + "rewards/rejected": -1.144181728363037, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.900082673910476e-07, + "logits/chosen": -2.1536006927490234, + "logits/rejected": -1.9441380500793457, + "logps/chosen": -97.18720245361328, + "logps/rejected": -231.72508239746094, + "loss": 0.3853, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29566866159439087, + "rewards/margins": 1.2901939153671265, + "rewards/rejected": -1.5858625173568726, + "step": 411 + }, + { + "epoch": 0.05, + "learning_rate": 2.899728357151293e-07, + "logits/chosen": -1.9044407606124878, + "logits/rejected": -1.8881824016571045, + "logps/chosen": -352.42572021484375, + "logps/rejected": -344.89385986328125, + "loss": 0.224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49962738156318665, + "rewards/margins": 2.233905553817749, + "rewards/rejected": -2.7335331439971924, + "step": 412 + }, + { + "epoch": 0.05, + "learning_rate": 2.8993740403921103e-07, + "logits/chosen": -2.88010835647583, + "logits/rejected": -2.8571999073028564, + "logps/chosen": -182.77574157714844, + "logps/rejected": -255.7477264404297, + "loss": 0.7982, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.76109379529953, + "rewards/margins": 0.878085732460022, + "rewards/rejected": -1.6391795873641968, + "step": 413 + }, + { + "epoch": 0.05, + "learning_rate": 2.899019723632928e-07, + "logits/chosen": -2.0980958938598633, + "logits/rejected": -2.152815580368042, + "logps/chosen": -298.0321960449219, + "logps/rejected": -252.02593994140625, + "loss": 0.5475, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10067229717969894, + "rewards/margins": 0.668516993522644, + "rewards/rejected": -0.5678446888923645, + "step": 414 + }, + { + "epoch": 0.05, + "learning_rate": 2.8986654068737447e-07, + "logits/chosen": -2.628714084625244, + "logits/rejected": -2.790391445159912, + "logps/chosen": -195.35165405273438, + "logps/rejected": -230.36630249023438, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.652872622013092, + "rewards/margins": 1.6709685325622559, + "rewards/rejected": -2.323841094970703, + "step": 415 + }, + { + "epoch": 0.05, + "learning_rate": 2.898311090114562e-07, + "logits/chosen": -2.105151653289795, + "logits/rejected": -2.255107879638672, + "logps/chosen": -397.61407470703125, + "logps/rejected": -282.79150390625, + "loss": 0.5097, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45666179060935974, + "rewards/margins": 1.1008095741271973, + "rewards/rejected": -1.5574712753295898, + "step": 416 + }, + { + "epoch": 0.05, + "learning_rate": 2.8979567733553797e-07, + "logits/chosen": -2.489713668823242, + "logits/rejected": -2.4211790561676025, + "logps/chosen": -157.96438598632812, + "logps/rejected": -286.14056396484375, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15878267586231232, + "rewards/margins": 1.4543695449829102, + "rewards/rejected": -1.613152265548706, + "step": 417 + }, + { + "epoch": 0.05, + "learning_rate": 2.897602456596197e-07, + "logits/chosen": -2.194603204727173, + "logits/rejected": -2.3707656860351562, + "logps/chosen": -442.0442810058594, + "logps/rejected": -257.8227233886719, + "loss": 0.3791, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2044883817434311, + "rewards/margins": 1.0412293672561646, + "rewards/rejected": -1.2457177639007568, + "step": 418 + }, + { + "epoch": 0.05, + "learning_rate": 2.897248139837014e-07, + "logits/chosen": -2.3681437969207764, + "logits/rejected": -2.239917278289795, + "logps/chosen": -259.718505859375, + "logps/rejected": -330.5833435058594, + "loss": 0.8768, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7860607504844666, + "rewards/margins": 0.4908476769924164, + "rewards/rejected": -1.27690851688385, + "step": 419 + }, + { + "epoch": 0.05, + "learning_rate": 2.8968938230778316e-07, + "logits/chosen": -1.9029079675674438, + "logits/rejected": -1.8731968402862549, + "logps/chosen": -268.7479248046875, + "logps/rejected": -279.08074951171875, + "loss": 0.6306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5833770036697388, + "rewards/margins": 0.4556977152824402, + "rewards/rejected": -1.0390746593475342, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 2.8965395063186486e-07, + "logits/chosen": -2.1960530281066895, + "logits/rejected": -2.480104684829712, + "logps/chosen": -349.0220031738281, + "logps/rejected": -296.8118591308594, + "loss": 0.7114, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47710901498794556, + "rewards/margins": 0.4133308529853821, + "rewards/rejected": -0.8904398679733276, + "step": 421 + }, + { + "epoch": 0.05, + "learning_rate": 2.896185189559466e-07, + "logits/chosen": -2.4694385528564453, + "logits/rejected": -2.651801109313965, + "logps/chosen": -270.26385498046875, + "logps/rejected": -175.33889770507812, + "loss": 0.6255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.773228645324707, + "rewards/margins": 0.27232787013053894, + "rewards/rejected": -1.0455564260482788, + "step": 422 + }, + { + "epoch": 0.05, + "learning_rate": 2.895830872800283e-07, + "logits/chosen": -1.3705748319625854, + "logits/rejected": -1.8169828653335571, + "logps/chosen": -330.394287109375, + "logps/rejected": -286.46734619140625, + "loss": 0.4229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49803948402404785, + "rewards/margins": 1.1607952117919922, + "rewards/rejected": -1.658834457397461, + "step": 423 + }, + { + "epoch": 0.05, + "learning_rate": 2.8954765560411005e-07, + "logits/chosen": -2.41660475730896, + "logits/rejected": -2.460914134979248, + "logps/chosen": -442.93524169921875, + "logps/rejected": -310.57049560546875, + "loss": 0.3742, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24222737550735474, + "rewards/margins": 1.2475676536560059, + "rewards/rejected": -1.4897949695587158, + "step": 424 + }, + { + "epoch": 0.05, + "learning_rate": 2.895122239281918e-07, + "logits/chosen": -2.571394443511963, + "logits/rejected": -2.3807296752929688, + "logps/chosen": -160.80511474609375, + "logps/rejected": -152.62330627441406, + "loss": 0.3994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11728984117507935, + "rewards/margins": 1.0486934185028076, + "rewards/rejected": -1.1659833192825317, + "step": 425 + }, + { + "epoch": 0.05, + "learning_rate": 2.894767922522735e-07, + "logits/chosen": -2.335704803466797, + "logits/rejected": -2.5373404026031494, + "logps/chosen": -400.5793762207031, + "logps/rejected": -374.7125244140625, + "loss": 0.3953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15045621991157532, + "rewards/margins": 1.0319464206695557, + "rewards/rejected": -1.1824026107788086, + "step": 426 + }, + { + "epoch": 0.05, + "learning_rate": 2.8944136057635524e-07, + "logits/chosen": -1.9246761798858643, + "logits/rejected": -2.2647767066955566, + "logps/chosen": -484.20635986328125, + "logps/rejected": -314.3674011230469, + "loss": 0.5874, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38436758518218994, + "rewards/margins": 0.6285203099250793, + "rewards/rejected": -1.012887954711914, + "step": 427 + }, + { + "epoch": 0.05, + "learning_rate": 2.89405928900437e-07, + "logits/chosen": -2.5076851844787598, + "logits/rejected": -2.515650987625122, + "logps/chosen": -221.2677459716797, + "logps/rejected": -333.34918212890625, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.026145324110984802, + "rewards/margins": 2.101147413253784, + "rewards/rejected": -2.1272926330566406, + "step": 428 + }, + { + "epoch": 0.05, + "learning_rate": 2.8937049722451874e-07, + "logits/chosen": -2.255150318145752, + "logits/rejected": -2.3165829181671143, + "logps/chosen": -231.63540649414062, + "logps/rejected": -191.27479553222656, + "loss": 1.1615, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.797600507736206, + "rewards/margins": -0.5019054412841797, + "rewards/rejected": -0.29569515585899353, + "step": 429 + }, + { + "epoch": 0.05, + "learning_rate": 2.8933506554860044e-07, + "logits/chosen": -1.9827651977539062, + "logits/rejected": -2.0335562229156494, + "logps/chosen": -233.53016662597656, + "logps/rejected": -123.15323638916016, + "loss": 0.6817, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5800879001617432, + "rewards/margins": 0.15834546089172363, + "rewards/rejected": -0.7384333610534668, + "step": 430 + }, + { + "epoch": 0.05, + "learning_rate": 2.892996338726822e-07, + "logits/chosen": -2.0162081718444824, + "logits/rejected": -2.465643882751465, + "logps/chosen": -289.4651794433594, + "logps/rejected": -166.14990234375, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12164521217346191, + "rewards/margins": 0.6192151308059692, + "rewards/rejected": -0.7408603429794312, + "step": 431 + }, + { + "epoch": 0.05, + "learning_rate": 2.892642021967639e-07, + "logits/chosen": -1.8190035820007324, + "logits/rejected": -2.064676523208618, + "logps/chosen": -370.3983154296875, + "logps/rejected": -320.833251953125, + "loss": 0.6127, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5361925959587097, + "rewards/margins": 0.8419095873832703, + "rewards/rejected": -1.37810218334198, + "step": 432 + }, + { + "epoch": 0.05, + "learning_rate": 2.8922877052084563e-07, + "logits/chosen": -2.0875978469848633, + "logits/rejected": -1.979960560798645, + "logps/chosen": -186.75677490234375, + "logps/rejected": -194.5227508544922, + "loss": 0.3268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03865785151720047, + "rewards/margins": 1.0173653364181519, + "rewards/rejected": -1.0560232400894165, + "step": 433 + }, + { + "epoch": 0.05, + "learning_rate": 2.891933388449273e-07, + "logits/chosen": -1.6656266450881958, + "logits/rejected": -1.692918300628662, + "logps/chosen": -581.7532348632812, + "logps/rejected": -417.2818298339844, + "loss": 0.2337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2516899108886719, + "rewards/margins": 1.714011788368225, + "rewards/rejected": -1.9657018184661865, + "step": 434 + }, + { + "epoch": 0.05, + "learning_rate": 2.891579071690091e-07, + "logits/chosen": -2.3993992805480957, + "logits/rejected": -2.3897898197174072, + "logps/chosen": -219.9913787841797, + "logps/rejected": -168.07618713378906, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4981522262096405, + "rewards/margins": 0.35705751180648804, + "rewards/rejected": -0.8552097082138062, + "step": 435 + }, + { + "epoch": 0.05, + "learning_rate": 2.891224754930908e-07, + "logits/chosen": -2.3494534492492676, + "logits/rejected": -2.560002326965332, + "logps/chosen": -247.70327758789062, + "logps/rejected": -217.59912109375, + "loss": 1.0221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0105006694793701, + "rewards/margins": -0.08980664610862732, + "rewards/rejected": -0.9206939935684204, + "step": 436 + }, + { + "epoch": 0.05, + "learning_rate": 2.890870438171725e-07, + "logits/chosen": -2.293614387512207, + "logits/rejected": -2.306755542755127, + "logps/chosen": -284.87396240234375, + "logps/rejected": -281.9623107910156, + "loss": 0.6157, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.229252815246582, + "rewards/margins": 0.6979652643203735, + "rewards/rejected": -1.9272180795669556, + "step": 437 + }, + { + "epoch": 0.05, + "learning_rate": 2.8905161214125427e-07, + "logits/chosen": -2.129169464111328, + "logits/rejected": -2.250277042388916, + "logps/chosen": -368.252197265625, + "logps/rejected": -260.0812683105469, + "loss": 0.4146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47230294346809387, + "rewards/margins": 1.3293743133544922, + "rewards/rejected": -1.8016772270202637, + "step": 438 + }, + { + "epoch": 0.05, + "learning_rate": 2.89016180465336e-07, + "logits/chosen": -1.8125989437103271, + "logits/rejected": -1.962503433227539, + "logps/chosen": -225.67848205566406, + "logps/rejected": -259.9812316894531, + "loss": 0.7039, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47003689408302307, + "rewards/margins": 0.5679247379302979, + "rewards/rejected": -1.0379616022109985, + "step": 439 + }, + { + "epoch": 0.05, + "learning_rate": 2.8898074878941776e-07, + "logits/chosen": -2.305500030517578, + "logits/rejected": -2.199249029159546, + "logps/chosen": -214.42691040039062, + "logps/rejected": -455.64117431640625, + "loss": 0.4123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24727585911750793, + "rewards/margins": 1.1467941999435425, + "rewards/rejected": -1.394070029258728, + "step": 440 + }, + { + "epoch": 0.05, + "learning_rate": 2.8894531711349946e-07, + "logits/chosen": -2.5705955028533936, + "logits/rejected": -2.6412603855133057, + "logps/chosen": -284.3816223144531, + "logps/rejected": -274.05169677734375, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48974108695983887, + "rewards/margins": 0.5115760564804077, + "rewards/rejected": -1.0013171434402466, + "step": 441 + }, + { + "epoch": 0.05, + "learning_rate": 2.889098854375812e-07, + "logits/chosen": -2.197633981704712, + "logits/rejected": -2.0193705558776855, + "logps/chosen": -467.29144287109375, + "logps/rejected": -411.1417236328125, + "loss": 0.8187, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3258193731307983, + "rewards/margins": 0.15741285681724548, + "rewards/rejected": -1.4832321405410767, + "step": 442 + }, + { + "epoch": 0.05, + "learning_rate": 2.888744537616629e-07, + "logits/chosen": -1.7728595733642578, + "logits/rejected": -2.0740301609039307, + "logps/chosen": -283.5924987792969, + "logps/rejected": -196.9116973876953, + "loss": 0.6075, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.674859881401062, + "rewards/margins": 0.539892315864563, + "rewards/rejected": -1.214752197265625, + "step": 443 + }, + { + "epoch": 0.05, + "learning_rate": 2.8883902208574465e-07, + "logits/chosen": -2.081737756729126, + "logits/rejected": -2.0653939247131348, + "logps/chosen": -289.5146179199219, + "logps/rejected": -281.1646728515625, + "loss": 0.501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28984910249710083, + "rewards/margins": 0.9263857007026672, + "rewards/rejected": -1.2162346839904785, + "step": 444 + }, + { + "epoch": 0.05, + "learning_rate": 2.8880359040982635e-07, + "logits/chosen": -2.829387664794922, + "logits/rejected": -2.8123908042907715, + "logps/chosen": -186.57882690429688, + "logps/rejected": -155.37716674804688, + "loss": 0.5977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12644381821155548, + "rewards/margins": 0.2871760129928589, + "rewards/rejected": -0.41361984610557556, + "step": 445 + }, + { + "epoch": 0.05, + "learning_rate": 2.887681587339081e-07, + "logits/chosen": -2.623011827468872, + "logits/rejected": -2.6550562381744385, + "logps/chosen": -212.9499053955078, + "logps/rejected": -233.2044677734375, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5647568702697754, + "rewards/margins": 0.45782092213630676, + "rewards/rejected": -1.0225777626037598, + "step": 446 + }, + { + "epoch": 0.05, + "learning_rate": 2.8873272705798985e-07, + "logits/chosen": -2.1888375282287598, + "logits/rejected": -2.374483585357666, + "logps/chosen": -454.57757568359375, + "logps/rejected": -346.9500732421875, + "loss": 0.6295, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27782851457595825, + "rewards/margins": 0.49870729446411133, + "rewards/rejected": -0.7765358090400696, + "step": 447 + }, + { + "epoch": 0.05, + "learning_rate": 2.8869729538207154e-07, + "logits/chosen": -2.517354965209961, + "logits/rejected": -2.5084331035614014, + "logps/chosen": -110.6280517578125, + "logps/rejected": -195.51849365234375, + "loss": 0.4698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8173120021820068, + "rewards/margins": 0.8536496162414551, + "rewards/rejected": -1.670961618423462, + "step": 448 + }, + { + "epoch": 0.05, + "learning_rate": 2.886618637061533e-07, + "logits/chosen": -2.7251298427581787, + "logits/rejected": -2.80497145652771, + "logps/chosen": -294.46685791015625, + "logps/rejected": -295.30413818359375, + "loss": 0.5665, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5064153671264648, + "rewards/margins": 0.7449488043785095, + "rewards/rejected": -2.25136399269104, + "step": 449 + }, + { + "epoch": 0.05, + "learning_rate": 2.88626432030235e-07, + "logits/chosen": -2.8603343963623047, + "logits/rejected": -2.612391948699951, + "logps/chosen": -278.3944091796875, + "logps/rejected": -298.2635498046875, + "loss": 0.4652, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4374709129333496, + "rewards/margins": 0.7024620771408081, + "rewards/rejected": -1.1399329900741577, + "step": 450 + }, + { + "epoch": 0.05, + "learning_rate": 2.885910003543168e-07, + "logits/chosen": -2.2044122219085693, + "logits/rejected": -2.2808475494384766, + "logps/chosen": -263.2228698730469, + "logps/rejected": -171.86575317382812, + "loss": 0.8299, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.520301342010498, + "rewards/margins": 0.10889454185962677, + "rewards/rejected": -0.6291958689689636, + "step": 451 + }, + { + "epoch": 0.05, + "learning_rate": 2.885555686783985e-07, + "logits/chosen": -1.9963300228118896, + "logits/rejected": -2.2340283393859863, + "logps/chosen": -339.03076171875, + "logps/rejected": -223.689453125, + "loss": 0.5081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27676182985305786, + "rewards/margins": 0.4591420590877533, + "rewards/rejected": -0.7359039187431335, + "step": 452 + }, + { + "epoch": 0.05, + "learning_rate": 2.8852013700248023e-07, + "logits/chosen": -2.3173396587371826, + "logits/rejected": -2.1254189014434814, + "logps/chosen": -291.9952697753906, + "logps/rejected": -434.4285583496094, + "loss": 0.2433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5326418280601501, + "rewards/margins": 1.6209514141082764, + "rewards/rejected": -2.1535933017730713, + "step": 453 + }, + { + "epoch": 0.05, + "learning_rate": 2.8848470532656193e-07, + "logits/chosen": -2.454075336456299, + "logits/rejected": -2.410428047180176, + "logps/chosen": -484.0467224121094, + "logps/rejected": -336.8230895996094, + "loss": 0.517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7301141619682312, + "rewards/margins": 0.5541477203369141, + "rewards/rejected": -1.2842618227005005, + "step": 454 + }, + { + "epoch": 0.05, + "learning_rate": 2.884492736506437e-07, + "logits/chosen": -2.7751338481903076, + "logits/rejected": -2.804736614227295, + "logps/chosen": -359.765625, + "logps/rejected": -262.50189208984375, + "loss": 0.4686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24237971007823944, + "rewards/margins": 0.769894003868103, + "rewards/rejected": -1.0122736692428589, + "step": 455 + }, + { + "epoch": 0.05, + "learning_rate": 2.8841384197472537e-07, + "logits/chosen": -2.1996359825134277, + "logits/rejected": -2.0379090309143066, + "logps/chosen": -225.12881469726562, + "logps/rejected": -243.345947265625, + "loss": 0.647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7290774583816528, + "rewards/margins": 0.5943965911865234, + "rewards/rejected": -1.3234740495681763, + "step": 456 + }, + { + "epoch": 0.05, + "learning_rate": 2.883784102988071e-07, + "logits/chosen": -2.2726848125457764, + "logits/rejected": -2.2914962768554688, + "logps/chosen": -409.12310791015625, + "logps/rejected": -337.31634521484375, + "loss": 0.5884, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3329915404319763, + "rewards/margins": 0.4369245767593384, + "rewards/rejected": -0.7699161767959595, + "step": 457 + }, + { + "epoch": 0.05, + "learning_rate": 2.8834297862288887e-07, + "logits/chosen": -2.439037799835205, + "logits/rejected": -2.3562841415405273, + "logps/chosen": -217.17013549804688, + "logps/rejected": -320.61383056640625, + "loss": 0.772, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.78801029920578, + "rewards/margins": 0.6818526983261108, + "rewards/rejected": -1.469862937927246, + "step": 458 + }, + { + "epoch": 0.05, + "learning_rate": 2.8830754694697056e-07, + "logits/chosen": -2.3345448970794678, + "logits/rejected": -1.9554619789123535, + "logps/chosen": -276.74346923828125, + "logps/rejected": -349.22515869140625, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37507063150405884, + "rewards/margins": 0.9351624846458435, + "rewards/rejected": -1.310233235359192, + "step": 459 + }, + { + "epoch": 0.05, + "learning_rate": 2.882721152710523e-07, + "logits/chosen": -2.6931076049804688, + "logits/rejected": -2.6111483573913574, + "logps/chosen": -229.34307861328125, + "logps/rejected": -195.31698608398438, + "loss": 0.5077, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.039030686020851135, + "rewards/margins": 0.6117849349975586, + "rewards/rejected": -0.6508156657218933, + "step": 460 + }, + { + "epoch": 0.05, + "learning_rate": 2.88236683595134e-07, + "logits/chosen": -2.171952247619629, + "logits/rejected": -2.2652018070220947, + "logps/chosen": -410.07574462890625, + "logps/rejected": -344.35809326171875, + "loss": 0.5448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7113549709320068, + "rewards/margins": 0.6973752975463867, + "rewards/rejected": -1.4087302684783936, + "step": 461 + }, + { + "epoch": 0.05, + "learning_rate": 2.8820125191921576e-07, + "logits/chosen": -2.3885738849639893, + "logits/rejected": -2.5526578426361084, + "logps/chosen": -252.19741821289062, + "logps/rejected": -276.8016357421875, + "loss": 0.5275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0005277693271636963, + "rewards/margins": 0.7073348760604858, + "rewards/rejected": -0.7078626751899719, + "step": 462 + }, + { + "epoch": 0.05, + "learning_rate": 2.881658202432975e-07, + "logits/chosen": -2.6351418495178223, + "logits/rejected": -2.8905675411224365, + "logps/chosen": -473.2274169921875, + "logps/rejected": -194.9286651611328, + "loss": 0.6359, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7356879711151123, + "rewards/margins": 0.7049455642700195, + "rewards/rejected": -1.4406334161758423, + "step": 463 + }, + { + "epoch": 0.05, + "learning_rate": 2.8813038856737925e-07, + "logits/chosen": -2.4014363288879395, + "logits/rejected": -2.565269947052002, + "logps/chosen": -203.2305908203125, + "logps/rejected": -263.9621887207031, + "loss": 0.4605, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9398926496505737, + "rewards/margins": 1.4968774318695068, + "rewards/rejected": -2.436769962310791, + "step": 464 + }, + { + "epoch": 0.05, + "learning_rate": 2.8809495689146095e-07, + "logits/chosen": -1.9531371593475342, + "logits/rejected": -1.6714953184127808, + "logps/chosen": -419.4506530761719, + "logps/rejected": -492.268310546875, + "loss": 0.7231, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4054261445999146, + "rewards/margins": 0.42764317989349365, + "rewards/rejected": -1.8330694437026978, + "step": 465 + }, + { + "epoch": 0.05, + "learning_rate": 2.880595252155427e-07, + "logits/chosen": -2.2883946895599365, + "logits/rejected": -2.557877779006958, + "logps/chosen": -129.26202392578125, + "logps/rejected": -135.85995483398438, + "loss": 0.849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.506321907043457, + "rewards/margins": 0.26170840859413147, + "rewards/rejected": -0.7680302858352661, + "step": 466 + }, + { + "epoch": 0.05, + "learning_rate": 2.880240935396244e-07, + "logits/chosen": -2.6069416999816895, + "logits/rejected": -2.7098441123962402, + "logps/chosen": -252.12705993652344, + "logps/rejected": -165.3023223876953, + "loss": 0.4092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.248795747756958, + "rewards/margins": 0.8864930868148804, + "rewards/rejected": -1.1352887153625488, + "step": 467 + }, + { + "epoch": 0.05, + "learning_rate": 2.8798866186370614e-07, + "logits/chosen": -2.7018556594848633, + "logits/rejected": -2.7645883560180664, + "logps/chosen": -166.42974853515625, + "logps/rejected": -220.08763122558594, + "loss": 1.4401, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2216691970825195, + "rewards/margins": -0.0849132239818573, + "rewards/rejected": -1.1367559432983398, + "step": 468 + }, + { + "epoch": 0.05, + "learning_rate": 2.879532301877879e-07, + "logits/chosen": -2.6121859550476074, + "logits/rejected": -2.5769598484039307, + "logps/chosen": -269.4376220703125, + "logps/rejected": -251.34103393554688, + "loss": 0.6438, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15875861048698425, + "rewards/margins": 0.8456829786300659, + "rewards/rejected": -1.004441499710083, + "step": 469 + }, + { + "epoch": 0.05, + "learning_rate": 2.879177985118696e-07, + "logits/chosen": -2.2257862091064453, + "logits/rejected": -2.1988463401794434, + "logps/chosen": -183.557373046875, + "logps/rejected": -219.3253631591797, + "loss": 0.4975, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4910590648651123, + "rewards/margins": 0.707735002040863, + "rewards/rejected": -1.1987941265106201, + "step": 470 + }, + { + "epoch": 0.05, + "learning_rate": 2.8788236683595134e-07, + "logits/chosen": -2.3143200874328613, + "logits/rejected": -2.397932291030884, + "logps/chosen": -454.4237060546875, + "logps/rejected": -352.91497802734375, + "loss": 0.4827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27401429414749146, + "rewards/margins": 0.8136337995529175, + "rewards/rejected": -1.0876481533050537, + "step": 471 + }, + { + "epoch": 0.05, + "learning_rate": 2.8784693516003303e-07, + "logits/chosen": -2.225881576538086, + "logits/rejected": -2.4619812965393066, + "logps/chosen": -236.94288635253906, + "logps/rejected": -186.68948364257812, + "loss": 0.6113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.721285343170166, + "rewards/margins": 0.8076116442680359, + "rewards/rejected": -1.5288970470428467, + "step": 472 + }, + { + "epoch": 0.06, + "learning_rate": 2.878115034841148e-07, + "logits/chosen": -2.2152652740478516, + "logits/rejected": -2.28939151763916, + "logps/chosen": -184.28762817382812, + "logps/rejected": -242.72091674804688, + "loss": 0.2799, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1465131938457489, + "rewards/margins": 1.4585176706314087, + "rewards/rejected": -1.3120044469833374, + "step": 473 + }, + { + "epoch": 0.06, + "learning_rate": 2.8777607180819653e-07, + "logits/chosen": -2.443805694580078, + "logits/rejected": -2.402092456817627, + "logps/chosen": -370.5953369140625, + "logps/rejected": -383.960205078125, + "loss": 0.3794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48763585090637207, + "rewards/margins": 1.3761087656021118, + "rewards/rejected": -1.8637446165084839, + "step": 474 + }, + { + "epoch": 0.06, + "learning_rate": 2.877406401322783e-07, + "logits/chosen": -1.9888908863067627, + "logits/rejected": -2.5703654289245605, + "logps/chosen": -298.15582275390625, + "logps/rejected": -149.9966583251953, + "loss": 0.741, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1638567447662354, + "rewards/margins": 0.12602999806404114, + "rewards/rejected": -1.289886713027954, + "step": 475 + }, + { + "epoch": 0.06, + "learning_rate": 2.8770520845635997e-07, + "logits/chosen": -2.442338705062866, + "logits/rejected": -2.4861013889312744, + "logps/chosen": -293.5130920410156, + "logps/rejected": -335.413818359375, + "loss": 0.3171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38817447423934937, + "rewards/margins": 1.1757229566574097, + "rewards/rejected": -1.5638973712921143, + "step": 476 + }, + { + "epoch": 0.06, + "learning_rate": 2.876697767804417e-07, + "logits/chosen": -2.6148037910461426, + "logits/rejected": -2.7018911838531494, + "logps/chosen": -435.9836120605469, + "logps/rejected": -243.5842742919922, + "loss": 0.4451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2591713070869446, + "rewards/margins": 0.6935673952102661, + "rewards/rejected": -0.9527387022972107, + "step": 477 + }, + { + "epoch": 0.06, + "learning_rate": 2.876343451045234e-07, + "logits/chosen": -2.1867642402648926, + "logits/rejected": -2.363856554031372, + "logps/chosen": -595.6495361328125, + "logps/rejected": -229.33285522460938, + "loss": 0.7856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6898307800292969, + "rewards/margins": 0.469849556684494, + "rewards/rejected": -1.1596803665161133, + "step": 478 + }, + { + "epoch": 0.06, + "learning_rate": 2.8759891342860517e-07, + "logits/chosen": -2.5236077308654785, + "logits/rejected": -2.6806118488311768, + "logps/chosen": -228.54183959960938, + "logps/rejected": -136.3505859375, + "loss": 0.8091, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5312069654464722, + "rewards/margins": -0.10284203290939331, + "rewards/rejected": -0.42836493253707886, + "step": 479 + }, + { + "epoch": 0.06, + "learning_rate": 2.875634817526869e-07, + "logits/chosen": -1.848218560218811, + "logits/rejected": -2.02508544921875, + "logps/chosen": -385.38470458984375, + "logps/rejected": -336.7276306152344, + "loss": 0.5346, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8404441475868225, + "rewards/margins": 0.7621071338653564, + "rewards/rejected": -1.6025512218475342, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 2.875280500767686e-07, + "logits/chosen": -2.7000069618225098, + "logits/rejected": -2.7673792839050293, + "logps/chosen": -301.5589599609375, + "logps/rejected": -436.10357666015625, + "loss": 0.3349, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03319627791643143, + "rewards/margins": 1.7067854404449463, + "rewards/rejected": -1.6735892295837402, + "step": 481 + }, + { + "epoch": 0.06, + "learning_rate": 2.8749261840085036e-07, + "logits/chosen": -2.736711025238037, + "logits/rejected": -2.7535383701324463, + "logps/chosen": -114.55261993408203, + "logps/rejected": -236.68429565429688, + "loss": 0.5251, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3576797842979431, + "rewards/margins": 1.492825984954834, + "rewards/rejected": -1.8505055904388428, + "step": 482 + }, + { + "epoch": 0.06, + "learning_rate": 2.8745718672493205e-07, + "logits/chosen": -2.719964027404785, + "logits/rejected": -2.647434949874878, + "logps/chosen": -300.51800537109375, + "logps/rejected": -205.2421875, + "loss": 0.5212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7528806924819946, + "rewards/margins": 0.9341813921928406, + "rewards/rejected": -1.68706214427948, + "step": 483 + }, + { + "epoch": 0.06, + "learning_rate": 2.874217550490138e-07, + "logits/chosen": -2.7279610633850098, + "logits/rejected": -2.6858766078948975, + "logps/chosen": -277.12933349609375, + "logps/rejected": -255.76255798339844, + "loss": 0.4553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7410680055618286, + "rewards/margins": 1.2947826385498047, + "rewards/rejected": -2.0358505249023438, + "step": 484 + }, + { + "epoch": 0.06, + "learning_rate": 2.873863233730955e-07, + "logits/chosen": -2.8642711639404297, + "logits/rejected": -2.873987913131714, + "logps/chosen": -251.55746459960938, + "logps/rejected": -184.16329956054688, + "loss": 0.4018, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32012537121772766, + "rewards/margins": 1.0811015367507935, + "rewards/rejected": -1.4012269973754883, + "step": 485 + }, + { + "epoch": 0.06, + "learning_rate": 2.873508916971773e-07, + "logits/chosen": -2.201634407043457, + "logits/rejected": -2.2737879753112793, + "logps/chosen": -188.0582733154297, + "logps/rejected": -170.73974609375, + "loss": 0.4998, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3140193223953247, + "rewards/margins": 0.8652557134628296, + "rewards/rejected": -1.1792750358581543, + "step": 486 + }, + { + "epoch": 0.06, + "learning_rate": 2.87315460021259e-07, + "logits/chosen": -1.9628938436508179, + "logits/rejected": -2.2958121299743652, + "logps/chosen": -390.8328552246094, + "logps/rejected": -257.3310546875, + "loss": 0.622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.239447683095932, + "rewards/margins": 0.3572545051574707, + "rewards/rejected": -0.5967022180557251, + "step": 487 + }, + { + "epoch": 0.06, + "learning_rate": 2.8728002834534074e-07, + "logits/chosen": -2.032559633255005, + "logits/rejected": -2.0186400413513184, + "logps/chosen": -205.0035400390625, + "logps/rejected": -285.2829284667969, + "loss": 0.2318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48124760389328003, + "rewards/margins": 2.1955108642578125, + "rewards/rejected": -2.6767587661743164, + "step": 488 + }, + { + "epoch": 0.06, + "learning_rate": 2.8724459666942244e-07, + "logits/chosen": -2.1903562545776367, + "logits/rejected": -2.337620258331299, + "logps/chosen": -309.85595703125, + "logps/rejected": -178.13546752929688, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8676897287368774, + "rewards/margins": 0.29718202352523804, + "rewards/rejected": -1.1648716926574707, + "step": 489 + }, + { + "epoch": 0.06, + "learning_rate": 2.872091649935042e-07, + "logits/chosen": -2.307166337966919, + "logits/rejected": -2.303006649017334, + "logps/chosen": -172.33628845214844, + "logps/rejected": -411.9163513183594, + "loss": 0.3838, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46171411871910095, + "rewards/margins": 1.38311767578125, + "rewards/rejected": -1.844831943511963, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 2.871737333175859e-07, + "logits/chosen": -2.1132290363311768, + "logits/rejected": -1.9482606649398804, + "logps/chosen": -172.27197265625, + "logps/rejected": -285.3385314941406, + "loss": 0.9715, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9262038469314575, + "rewards/margins": 0.0935896784067154, + "rewards/rejected": -1.0197935104370117, + "step": 491 + }, + { + "epoch": 0.06, + "learning_rate": 2.8713830164166763e-07, + "logits/chosen": -1.9774844646453857, + "logits/rejected": -1.946367859840393, + "logps/chosen": -325.24462890625, + "logps/rejected": -341.6451721191406, + "loss": 0.2237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.043151985853910446, + "rewards/margins": 1.921749472618103, + "rewards/rejected": -1.9649015665054321, + "step": 492 + }, + { + "epoch": 0.06, + "learning_rate": 2.871028699657494e-07, + "logits/chosen": -2.476759195327759, + "logits/rejected": -2.372145414352417, + "logps/chosen": -226.0302276611328, + "logps/rejected": -206.26907348632812, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4979541003704071, + "rewards/margins": 1.2120133638381958, + "rewards/rejected": -1.7099673748016357, + "step": 493 + }, + { + "epoch": 0.06, + "learning_rate": 2.870674382898311e-07, + "logits/chosen": -1.9751077890396118, + "logits/rejected": -1.6521974802017212, + "logps/chosen": -302.59716796875, + "logps/rejected": -288.4311218261719, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2044219970703125, + "rewards/margins": 1.7481882572174072, + "rewards/rejected": -1.9526102542877197, + "step": 494 + }, + { + "epoch": 0.06, + "learning_rate": 2.870320066139128e-07, + "logits/chosen": -2.711160659790039, + "logits/rejected": -2.455995559692383, + "logps/chosen": -104.32241821289062, + "logps/rejected": -253.74916076660156, + "loss": 0.4919, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34323960542678833, + "rewards/margins": 1.5447239875793457, + "rewards/rejected": -1.8879635334014893, + "step": 495 + }, + { + "epoch": 0.06, + "learning_rate": 2.869965749379945e-07, + "logits/chosen": -2.9557652473449707, + "logits/rejected": -2.897108554840088, + "logps/chosen": -173.22103881835938, + "logps/rejected": -147.4190216064453, + "loss": 0.4603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7069957852363586, + "rewards/margins": 0.7420517206192017, + "rewards/rejected": -1.449047565460205, + "step": 496 + }, + { + "epoch": 0.06, + "learning_rate": 2.869611432620763e-07, + "logits/chosen": -2.351804256439209, + "logits/rejected": -2.493929386138916, + "logps/chosen": -226.66146850585938, + "logps/rejected": -198.23118591308594, + "loss": 0.6335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6662598252296448, + "rewards/margins": 0.6029986143112183, + "rewards/rejected": -1.2692583799362183, + "step": 497 + }, + { + "epoch": 0.06, + "learning_rate": 2.86925711586158e-07, + "logits/chosen": -2.3872475624084473, + "logits/rejected": -2.4446985721588135, + "logps/chosen": -212.6195068359375, + "logps/rejected": -321.24969482421875, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14084210991859436, + "rewards/margins": 2.517458200454712, + "rewards/rejected": -2.6583003997802734, + "step": 498 + }, + { + "epoch": 0.06, + "learning_rate": 2.8689027991023977e-07, + "logits/chosen": -2.4775023460388184, + "logits/rejected": -2.4002418518066406, + "logps/chosen": -370.8385925292969, + "logps/rejected": -404.053466796875, + "loss": 1.3206, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1094766855239868, + "rewards/margins": -0.2916056513786316, + "rewards/rejected": -0.81787109375, + "step": 499 + }, + { + "epoch": 0.06, + "learning_rate": 2.8685484823432146e-07, + "logits/chosen": -2.2234346866607666, + "logits/rejected": -2.5067219734191895, + "logps/chosen": -338.90447998046875, + "logps/rejected": -209.46636962890625, + "loss": 0.7608, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6381328701972961, + "rewards/margins": 0.14408577978610992, + "rewards/rejected": -0.7822186946868896, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 2.868194165584032e-07, + "logits/chosen": -2.7447075843811035, + "logits/rejected": -2.7495927810668945, + "logps/chosen": -204.04478454589844, + "logps/rejected": -212.54946899414062, + "loss": 0.5815, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3632766604423523, + "rewards/margins": 0.6391451358795166, + "rewards/rejected": -1.0024218559265137, + "step": 501 + }, + { + "epoch": 0.06, + "learning_rate": 2.867839848824849e-07, + "logits/chosen": -2.311384677886963, + "logits/rejected": -2.520874500274658, + "logps/chosen": -294.0456237792969, + "logps/rejected": -198.28070068359375, + "loss": 0.4262, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3848944902420044, + "rewards/margins": 0.9312523603439331, + "rewards/rejected": -1.3161468505859375, + "step": 502 + }, + { + "epoch": 0.06, + "learning_rate": 2.8674855320656665e-07, + "logits/chosen": -2.4274258613586426, + "logits/rejected": -2.3611042499542236, + "logps/chosen": -190.7697296142578, + "logps/rejected": -181.71424865722656, + "loss": 0.4085, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.024285294115543365, + "rewards/margins": 1.4035873413085938, + "rewards/rejected": -1.427872657775879, + "step": 503 + }, + { + "epoch": 0.06, + "learning_rate": 2.867131215306484e-07, + "logits/chosen": -1.810881495475769, + "logits/rejected": -1.9104323387145996, + "logps/chosen": -225.77105712890625, + "logps/rejected": -205.95071411132812, + "loss": 0.775, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5099223852157593, + "rewards/margins": 0.5111232399940491, + "rewards/rejected": -1.0210456848144531, + "step": 504 + }, + { + "epoch": 0.06, + "learning_rate": 2.866776898547301e-07, + "logits/chosen": -2.82720685005188, + "logits/rejected": -2.7215538024902344, + "logps/chosen": -206.60256958007812, + "logps/rejected": -277.6413879394531, + "loss": 0.3142, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3899177610874176, + "rewards/margins": 1.6895167827606201, + "rewards/rejected": -2.079434633255005, + "step": 505 + }, + { + "epoch": 0.06, + "learning_rate": 2.8664225817881185e-07, + "logits/chosen": -2.1150169372558594, + "logits/rejected": -1.6521074771881104, + "logps/chosen": -115.73867797851562, + "logps/rejected": -199.58462524414062, + "loss": 0.3515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6508910655975342, + "rewards/margins": 1.1830346584320068, + "rewards/rejected": -1.833925724029541, + "step": 506 + }, + { + "epoch": 0.06, + "learning_rate": 2.8660682650289354e-07, + "logits/chosen": -2.2337965965270996, + "logits/rejected": -2.0642075538635254, + "logps/chosen": -217.8906707763672, + "logps/rejected": -297.22027587890625, + "loss": 0.4655, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7175643444061279, + "rewards/margins": 1.0016785860061646, + "rewards/rejected": -1.7192429304122925, + "step": 507 + }, + { + "epoch": 0.06, + "learning_rate": 2.865713948269753e-07, + "logits/chosen": -2.6315131187438965, + "logits/rejected": -2.808044910430908, + "logps/chosen": -212.70486450195312, + "logps/rejected": -201.50839233398438, + "loss": 0.5888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8553215861320496, + "rewards/margins": 0.5989441275596619, + "rewards/rejected": -1.4542657136917114, + "step": 508 + }, + { + "epoch": 0.06, + "learning_rate": 2.8653596315105704e-07, + "logits/chosen": -1.8543720245361328, + "logits/rejected": -2.0296168327331543, + "logps/chosen": -459.28076171875, + "logps/rejected": -386.000244140625, + "loss": 0.3854, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41743218898773193, + "rewards/margins": 1.1974530220031738, + "rewards/rejected": -1.6148850917816162, + "step": 509 + }, + { + "epoch": 0.06, + "learning_rate": 2.865005314751388e-07, + "logits/chosen": -2.2033531665802, + "logits/rejected": -2.0924296379089355, + "logps/chosen": -297.481689453125, + "logps/rejected": -288.87139892578125, + "loss": 0.3865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3853161334991455, + "rewards/margins": 1.689305305480957, + "rewards/rejected": -2.0746214389801025, + "step": 510 + }, + { + "epoch": 0.06, + "learning_rate": 2.864650997992205e-07, + "logits/chosen": -2.591458320617676, + "logits/rejected": -2.3992488384246826, + "logps/chosen": -301.3564147949219, + "logps/rejected": -333.9700927734375, + "loss": 0.3947, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04263585805892944, + "rewards/margins": 1.2566362619400024, + "rewards/rejected": -1.299272060394287, + "step": 511 + }, + { + "epoch": 0.06, + "learning_rate": 2.8642966812330223e-07, + "logits/chosen": -2.07089900970459, + "logits/rejected": -2.598860740661621, + "logps/chosen": -510.76593017578125, + "logps/rejected": -330.44775390625, + "loss": 1.0799, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.311828374862671, + "rewards/margins": 0.15822213888168335, + "rewards/rejected": -1.4700504541397095, + "step": 512 + }, + { + "epoch": 0.06, + "learning_rate": 2.8639423644738393e-07, + "logits/chosen": -2.004502773284912, + "logits/rejected": -2.0472323894500732, + "logps/chosen": -379.2911071777344, + "logps/rejected": -414.2760009765625, + "loss": 0.2539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.336702823638916, + "rewards/margins": 2.075023651123047, + "rewards/rejected": -2.411726474761963, + "step": 513 + }, + { + "epoch": 0.06, + "learning_rate": 2.863588047714657e-07, + "logits/chosen": -2.4964797496795654, + "logits/rejected": -2.0879085063934326, + "logps/chosen": -195.63156127929688, + "logps/rejected": -373.75909423828125, + "loss": 0.3865, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3250020742416382, + "rewards/margins": 1.4956847429275513, + "rewards/rejected": -1.8206868171691895, + "step": 514 + }, + { + "epoch": 0.06, + "learning_rate": 2.863233730955474e-07, + "logits/chosen": -1.67506742477417, + "logits/rejected": -1.9044263362884521, + "logps/chosen": -221.3343505859375, + "logps/rejected": -159.6397247314453, + "loss": 0.717, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5118857026100159, + "rewards/margins": 0.7656558752059937, + "rewards/rejected": -1.2775416374206543, + "step": 515 + }, + { + "epoch": 0.06, + "learning_rate": 2.862879414196291e-07, + "logits/chosen": -2.238819122314453, + "logits/rejected": -2.3344180583953857, + "logps/chosen": -292.8455810546875, + "logps/rejected": -201.6183319091797, + "loss": 1.1519, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1026118993759155, + "rewards/margins": -0.1640138477087021, + "rewards/rejected": -0.938598096370697, + "step": 516 + }, + { + "epoch": 0.06, + "learning_rate": 2.8625250974371087e-07, + "logits/chosen": -2.199331760406494, + "logits/rejected": -2.4519035816192627, + "logps/chosen": -499.5540771484375, + "logps/rejected": -354.43280029296875, + "loss": 0.5898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6915086507797241, + "rewards/margins": 0.5066684484481812, + "rewards/rejected": -1.1981772184371948, + "step": 517 + }, + { + "epoch": 0.06, + "learning_rate": 2.8621707806779257e-07, + "logits/chosen": -1.9242891073226929, + "logits/rejected": -1.9771735668182373, + "logps/chosen": -182.25662231445312, + "logps/rejected": -198.211181640625, + "loss": 0.5615, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4929031431674957, + "rewards/margins": 0.47970664501190186, + "rewards/rejected": -0.97260981798172, + "step": 518 + }, + { + "epoch": 0.06, + "learning_rate": 2.861816463918743e-07, + "logits/chosen": -1.5745623111724854, + "logits/rejected": -1.6949636936187744, + "logps/chosen": -452.0148620605469, + "logps/rejected": -415.39544677734375, + "loss": 0.528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8255192637443542, + "rewards/margins": 0.45034247636795044, + "rewards/rejected": -1.2758617401123047, + "step": 519 + }, + { + "epoch": 0.06, + "learning_rate": 2.86146214715956e-07, + "logits/chosen": -2.8264522552490234, + "logits/rejected": -2.7948410511016846, + "logps/chosen": -190.25497436523438, + "logps/rejected": -235.7244873046875, + "loss": 0.4328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2126975059509277, + "rewards/margins": 1.3871009349822998, + "rewards/rejected": -2.5997984409332275, + "step": 520 + }, + { + "epoch": 0.06, + "learning_rate": 2.861107830400378e-07, + "logits/chosen": -2.314972162246704, + "logits/rejected": -2.3311662673950195, + "logps/chosen": -323.1270751953125, + "logps/rejected": -323.8503723144531, + "loss": 1.5594, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.346125602722168, + "rewards/margins": 0.4829988479614258, + "rewards/rejected": -2.8291244506835938, + "step": 521 + }, + { + "epoch": 0.06, + "learning_rate": 2.860753513641195e-07, + "logits/chosen": -2.328765392303467, + "logits/rejected": -2.3360581398010254, + "logps/chosen": -109.8043212890625, + "logps/rejected": -138.8583526611328, + "loss": 0.4922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33297431468963623, + "rewards/margins": 0.5839194655418396, + "rewards/rejected": -0.25094518065452576, + "step": 522 + }, + { + "epoch": 0.06, + "learning_rate": 2.8603991968820126e-07, + "logits/chosen": -2.251887083053589, + "logits/rejected": -2.291194200515747, + "logps/chosen": -241.99241638183594, + "logps/rejected": -271.60943603515625, + "loss": 0.2842, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16754008829593658, + "rewards/margins": 1.7293349504470825, + "rewards/rejected": -1.8968749046325684, + "step": 523 + }, + { + "epoch": 0.06, + "learning_rate": 2.8600448801228295e-07, + "logits/chosen": -2.694969654083252, + "logits/rejected": -2.5892202854156494, + "logps/chosen": -335.2853698730469, + "logps/rejected": -323.2527770996094, + "loss": 0.5201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4566298723220825, + "rewards/margins": 0.8106477856636047, + "rewards/rejected": -1.2672775983810425, + "step": 524 + }, + { + "epoch": 0.06, + "learning_rate": 2.859690563363647e-07, + "logits/chosen": -2.410719394683838, + "logits/rejected": -2.3729844093322754, + "logps/chosen": -213.3101043701172, + "logps/rejected": -202.72064208984375, + "loss": 0.7818, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5485799312591553, + "rewards/margins": 0.45479100942611694, + "rewards/rejected": -1.003371000289917, + "step": 525 + }, + { + "epoch": 0.06, + "learning_rate": 2.8593362466044645e-07, + "logits/chosen": -2.5247349739074707, + "logits/rejected": -2.3739495277404785, + "logps/chosen": -98.12782287597656, + "logps/rejected": -162.59182739257812, + "loss": 0.5076, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5434640645980835, + "rewards/margins": 0.8047605156898499, + "rewards/rejected": -1.3482245206832886, + "step": 526 + }, + { + "epoch": 0.06, + "learning_rate": 2.8589819298452814e-07, + "logits/chosen": -2.1265206336975098, + "logits/rejected": -2.174743175506592, + "logps/chosen": -372.384521484375, + "logps/rejected": -355.48297119140625, + "loss": 0.4598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9017437100410461, + "rewards/margins": 1.0171101093292236, + "rewards/rejected": -1.9188538789749146, + "step": 527 + }, + { + "epoch": 0.06, + "learning_rate": 2.858627613086099e-07, + "logits/chosen": -2.403273105621338, + "logits/rejected": -2.4131946563720703, + "logps/chosen": -313.4476318359375, + "logps/rejected": -283.8671875, + "loss": 0.4587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7013782858848572, + "rewards/margins": 0.9358751177787781, + "rewards/rejected": -1.6372534036636353, + "step": 528 + }, + { + "epoch": 0.06, + "learning_rate": 2.858273296326916e-07, + "logits/chosen": -2.7461700439453125, + "logits/rejected": -2.4763565063476562, + "logps/chosen": -153.01715087890625, + "logps/rejected": -210.9293212890625, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8665658235549927, + "rewards/margins": 1.2266720533370972, + "rewards/rejected": -2.09323787689209, + "step": 529 + }, + { + "epoch": 0.06, + "learning_rate": 2.8579189795677334e-07, + "logits/chosen": -1.754564642906189, + "logits/rejected": -2.068523406982422, + "logps/chosen": -357.93121337890625, + "logps/rejected": -292.3747253417969, + "loss": 0.5568, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08435069769620895, + "rewards/margins": 0.612540602684021, + "rewards/rejected": -0.6968913674354553, + "step": 530 + }, + { + "epoch": 0.06, + "learning_rate": 2.8575646628085503e-07, + "logits/chosen": -1.759860634803772, + "logits/rejected": -2.098111152648926, + "logps/chosen": -276.138916015625, + "logps/rejected": -220.85955810546875, + "loss": 0.6222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5003688335418701, + "rewards/margins": 0.6859586238861084, + "rewards/rejected": -1.186327576637268, + "step": 531 + }, + { + "epoch": 0.06, + "learning_rate": 2.8572103460493683e-07, + "logits/chosen": -2.1432905197143555, + "logits/rejected": -2.074483871459961, + "logps/chosen": -351.5444030761719, + "logps/rejected": -338.73565673828125, + "loss": 0.4228, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31365320086479187, + "rewards/margins": 1.5232373476028442, + "rewards/rejected": -1.836890459060669, + "step": 532 + }, + { + "epoch": 0.06, + "learning_rate": 2.8568560292901853e-07, + "logits/chosen": -2.273639678955078, + "logits/rejected": -1.9727565050125122, + "logps/chosen": -250.47303771972656, + "logps/rejected": -284.10040283203125, + "loss": 0.6594, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0097332000732422, + "rewards/margins": 0.3556445837020874, + "rewards/rejected": -1.3653777837753296, + "step": 533 + }, + { + "epoch": 0.06, + "learning_rate": 2.856501712531003e-07, + "logits/chosen": -2.804464340209961, + "logits/rejected": -2.7689015865325928, + "logps/chosen": -161.2711944580078, + "logps/rejected": -200.8800048828125, + "loss": 2.0158, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.093459367752075, + "rewards/margins": -0.6570478081703186, + "rewards/rejected": -1.4364116191864014, + "step": 534 + }, + { + "epoch": 0.06, + "learning_rate": 2.85614739577182e-07, + "logits/chosen": -3.0661938190460205, + "logits/rejected": -3.0833563804626465, + "logps/chosen": -303.5691833496094, + "logps/rejected": -255.90713500976562, + "loss": 0.6971, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3442786633968353, + "rewards/margins": 0.4057345986366272, + "rewards/rejected": -0.7500132322311401, + "step": 535 + }, + { + "epoch": 0.06, + "learning_rate": 2.855793079012637e-07, + "logits/chosen": -2.6102354526519775, + "logits/rejected": -2.5817737579345703, + "logps/chosen": -173.6082000732422, + "logps/rejected": -171.6916046142578, + "loss": 1.0745, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.174984097480774, + "rewards/margins": 0.05065372586250305, + "rewards/rejected": -1.2256379127502441, + "step": 536 + }, + { + "epoch": 0.06, + "learning_rate": 2.8554387622534547e-07, + "logits/chosen": -2.1097970008850098, + "logits/rejected": -2.2835311889648438, + "logps/chosen": -323.673828125, + "logps/rejected": -237.627685546875, + "loss": 0.6171, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7752617001533508, + "rewards/margins": 0.4622430205345154, + "rewards/rejected": -1.2375047206878662, + "step": 537 + }, + { + "epoch": 0.06, + "learning_rate": 2.8550844454942717e-07, + "logits/chosen": -2.563974142074585, + "logits/rejected": -2.4530539512634277, + "logps/chosen": -265.3961486816406, + "logps/rejected": -285.02984619140625, + "loss": 0.9371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8578805923461914, + "rewards/margins": 0.1478942632675171, + "rewards/rejected": -1.005774974822998, + "step": 538 + }, + { + "epoch": 0.06, + "learning_rate": 2.854730128735089e-07, + "logits/chosen": -2.417131185531616, + "logits/rejected": -2.495516061782837, + "logps/chosen": -420.91461181640625, + "logps/rejected": -291.0054931640625, + "loss": 0.8533, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0879417657852173, + "rewards/margins": 0.6263333559036255, + "rewards/rejected": -1.7142751216888428, + "step": 539 + }, + { + "epoch": 0.06, + "learning_rate": 2.854375811975906e-07, + "logits/chosen": -2.0525481700897217, + "logits/rejected": -2.0311365127563477, + "logps/chosen": -344.7208251953125, + "logps/rejected": -397.4723815917969, + "loss": 0.5427, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5082827806472778, + "rewards/margins": 0.8180028796195984, + "rewards/rejected": -1.3262856006622314, + "step": 540 + }, + { + "epoch": 0.06, + "learning_rate": 2.8540214952167236e-07, + "logits/chosen": -2.5007472038269043, + "logits/rejected": -2.6125829219818115, + "logps/chosen": -217.9143524169922, + "logps/rejected": -293.10125732421875, + "loss": 0.377, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5075303316116333, + "rewards/margins": 1.2771257162094116, + "rewards/rejected": -1.784656047821045, + "step": 541 + }, + { + "epoch": 0.06, + "learning_rate": 2.8536671784575406e-07, + "logits/chosen": -1.8281805515289307, + "logits/rejected": -2.1173787117004395, + "logps/chosen": -285.6739501953125, + "logps/rejected": -302.1130065917969, + "loss": 0.6101, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4645143747329712, + "rewards/margins": 0.7863868474960327, + "rewards/rejected": -1.2509013414382935, + "step": 542 + }, + { + "epoch": 0.06, + "learning_rate": 2.853312861698358e-07, + "logits/chosen": -2.258251190185547, + "logits/rejected": -2.320873975753784, + "logps/chosen": -105.01898956298828, + "logps/rejected": -177.27391052246094, + "loss": 0.258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2593585252761841, + "rewards/margins": 1.6169419288635254, + "rewards/rejected": -1.87630033493042, + "step": 543 + }, + { + "epoch": 0.06, + "learning_rate": 2.8529585449391755e-07, + "logits/chosen": -2.678955078125, + "logits/rejected": -2.6826601028442383, + "logps/chosen": -291.085693359375, + "logps/rejected": -344.6938781738281, + "loss": 0.5035, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6986862421035767, + "rewards/margins": 1.35354483127594, + "rewards/rejected": -2.0522310733795166, + "step": 544 + }, + { + "epoch": 0.06, + "learning_rate": 2.852604228179993e-07, + "logits/chosen": -2.3905558586120605, + "logits/rejected": -2.27713942527771, + "logps/chosen": -109.41691589355469, + "logps/rejected": -161.7470703125, + "loss": 0.4125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6202859878540039, + "rewards/margins": 1.4117522239685059, + "rewards/rejected": -2.0320382118225098, + "step": 545 + }, + { + "epoch": 0.06, + "learning_rate": 2.85224991142081e-07, + "logits/chosen": -2.698762893676758, + "logits/rejected": -2.413696527481079, + "logps/chosen": -219.04464721679688, + "logps/rejected": -335.5111999511719, + "loss": 0.4113, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6372820734977722, + "rewards/margins": 1.124608039855957, + "rewards/rejected": -1.761889934539795, + "step": 546 + }, + { + "epoch": 0.06, + "learning_rate": 2.8518955946616275e-07, + "logits/chosen": -2.8585376739501953, + "logits/rejected": -2.8951199054718018, + "logps/chosen": -112.4608154296875, + "logps/rejected": -181.8551025390625, + "loss": 0.2764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07646259665489197, + "rewards/margins": 1.5269970893859863, + "rewards/rejected": -1.6034598350524902, + "step": 547 + }, + { + "epoch": 0.06, + "learning_rate": 2.851541277902445e-07, + "logits/chosen": -2.631103038787842, + "logits/rejected": -2.609153985977173, + "logps/chosen": -201.09202575683594, + "logps/rejected": -181.19757080078125, + "loss": 0.4601, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23847419023513794, + "rewards/margins": 1.3434467315673828, + "rewards/rejected": -1.5819209814071655, + "step": 548 + }, + { + "epoch": 0.06, + "learning_rate": 2.851186961143262e-07, + "logits/chosen": -2.270334482192993, + "logits/rejected": -2.2233192920684814, + "logps/chosen": -158.02249145507812, + "logps/rejected": -148.6134796142578, + "loss": 0.7252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4448128938674927, + "rewards/margins": 0.5420027375221252, + "rewards/rejected": -0.9868156313896179, + "step": 549 + }, + { + "epoch": 0.06, + "learning_rate": 2.8508326443840794e-07, + "logits/chosen": -2.369446039199829, + "logits/rejected": -2.135037660598755, + "logps/chosen": -254.35166931152344, + "logps/rejected": -361.304931640625, + "loss": 0.3178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1920468509197235, + "rewards/margins": 1.2199519872665405, + "rewards/rejected": -1.411998987197876, + "step": 550 + }, + { + "epoch": 0.06, + "learning_rate": 2.8504783276248963e-07, + "logits/chosen": -2.638059616088867, + "logits/rejected": -2.576690196990967, + "logps/chosen": -214.31524658203125, + "logps/rejected": -192.39529418945312, + "loss": 0.624, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44820234179496765, + "rewards/margins": 0.769474983215332, + "rewards/rejected": -1.2176772356033325, + "step": 551 + }, + { + "epoch": 0.06, + "learning_rate": 2.850124010865714e-07, + "logits/chosen": -2.2462854385375977, + "logits/rejected": -2.163957118988037, + "logps/chosen": -360.5517272949219, + "logps/rejected": -251.49844360351562, + "loss": 0.5072, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4803833961486816, + "rewards/margins": 0.9514158964157104, + "rewards/rejected": -2.4317994117736816, + "step": 552 + }, + { + "epoch": 0.06, + "learning_rate": 2.849769694106531e-07, + "logits/chosen": -2.6802244186401367, + "logits/rejected": -2.910111904144287, + "logps/chosen": -377.9355773925781, + "logps/rejected": -275.34112548828125, + "loss": 0.3389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41612130403518677, + "rewards/margins": 1.3597418069839478, + "rewards/rejected": -1.7758629322052002, + "step": 553 + }, + { + "epoch": 0.06, + "learning_rate": 2.8494153773473483e-07, + "logits/chosen": -2.739457130432129, + "logits/rejected": -2.568617820739746, + "logps/chosen": -133.44430541992188, + "logps/rejected": -148.55262756347656, + "loss": 1.2516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3007218837738037, + "rewards/margins": -0.14218169450759888, + "rewards/rejected": -1.1585402488708496, + "step": 554 + }, + { + "epoch": 0.06, + "learning_rate": 2.849061060588166e-07, + "logits/chosen": -1.8668720722198486, + "logits/rejected": -2.1898341178894043, + "logps/chosen": -535.6143188476562, + "logps/rejected": -281.7651672363281, + "loss": 0.4174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2744632661342621, + "rewards/margins": 1.174797534942627, + "rewards/rejected": -1.4492608308792114, + "step": 555 + }, + { + "epoch": 0.06, + "learning_rate": 2.848706743828983e-07, + "logits/chosen": -1.7967478036880493, + "logits/rejected": -2.06424880027771, + "logps/chosen": -442.9310302734375, + "logps/rejected": -445.7369689941406, + "loss": 0.2728, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5566263198852539, + "rewards/margins": 1.9546340703964233, + "rewards/rejected": -2.511260509490967, + "step": 556 + }, + { + "epoch": 0.06, + "learning_rate": 2.8483524270698e-07, + "logits/chosen": -2.343796968460083, + "logits/rejected": -2.4122018814086914, + "logps/chosen": -262.1907958984375, + "logps/rejected": -186.04566955566406, + "loss": 0.4684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7042860984802246, + "rewards/margins": 0.9044474363327026, + "rewards/rejected": -1.6087336540222168, + "step": 557 + }, + { + "epoch": 0.06, + "learning_rate": 2.8479981103106177e-07, + "logits/chosen": -2.3609390258789062, + "logits/rejected": -2.401257038116455, + "logps/chosen": -276.2867126464844, + "logps/rejected": -203.52835083007812, + "loss": 0.6571, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5534374713897705, + "rewards/margins": 0.649053156375885, + "rewards/rejected": -1.2024905681610107, + "step": 558 + }, + { + "epoch": 0.07, + "learning_rate": 2.847643793551435e-07, + "logits/chosen": -2.0271008014678955, + "logits/rejected": -1.9784281253814697, + "logps/chosen": -385.4505615234375, + "logps/rejected": -353.7092590332031, + "loss": 0.2353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21390002965927124, + "rewards/margins": 1.7905845642089844, + "rewards/rejected": -2.0044846534729004, + "step": 559 + }, + { + "epoch": 0.07, + "learning_rate": 2.847289476792252e-07, + "logits/chosen": -1.9069173336029053, + "logits/rejected": -2.188859224319458, + "logps/chosen": -185.70729064941406, + "logps/rejected": -245.00003051757812, + "loss": 1.0886, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3435674905776978, + "rewards/margins": 0.256305456161499, + "rewards/rejected": -1.5998730659484863, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 2.8469351600330696e-07, + "logits/chosen": -2.100656032562256, + "logits/rejected": -2.2159180641174316, + "logps/chosen": -341.1153259277344, + "logps/rejected": -372.8584899902344, + "loss": 0.3967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7885330319404602, + "rewards/margins": 0.9494342803955078, + "rewards/rejected": -1.7379672527313232, + "step": 561 + }, + { + "epoch": 0.07, + "learning_rate": 2.8465808432738866e-07, + "logits/chosen": -2.6383142471313477, + "logits/rejected": -2.6143205165863037, + "logps/chosen": -180.5325927734375, + "logps/rejected": -190.25733947753906, + "loss": 0.3534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03114059567451477, + "rewards/margins": 1.09554123878479, + "rewards/rejected": -1.1266816854476929, + "step": 562 + }, + { + "epoch": 0.07, + "learning_rate": 2.846226526514704e-07, + "logits/chosen": -2.4956278800964355, + "logits/rejected": -2.396529197692871, + "logps/chosen": -408.535400390625, + "logps/rejected": -384.07464599609375, + "loss": 0.4308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7730699181556702, + "rewards/margins": 0.9954893589019775, + "rewards/rejected": -1.768559217453003, + "step": 563 + }, + { + "epoch": 0.07, + "learning_rate": 2.845872209755521e-07, + "logits/chosen": -2.032639503479004, + "logits/rejected": -1.9179755449295044, + "logps/chosen": -430.50579833984375, + "logps/rejected": -490.73333740234375, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25208768248558044, + "rewards/margins": 1.3802275657653809, + "rewards/rejected": -1.6323151588439941, + "step": 564 + }, + { + "epoch": 0.07, + "learning_rate": 2.8455178929963385e-07, + "logits/chosen": -2.101374626159668, + "logits/rejected": -2.172497034072876, + "logps/chosen": -249.16525268554688, + "logps/rejected": -223.83316040039062, + "loss": 0.6541, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6702892780303955, + "rewards/margins": 1.3814959526062012, + "rewards/rejected": -2.051785469055176, + "step": 565 + }, + { + "epoch": 0.07, + "learning_rate": 2.845163576237156e-07, + "logits/chosen": -2.8870742321014404, + "logits/rejected": -2.889741897583008, + "logps/chosen": -93.1162109375, + "logps/rejected": -180.5037384033203, + "loss": 0.438, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13500943779945374, + "rewards/margins": 0.8672112226486206, + "rewards/rejected": -1.002220630645752, + "step": 566 + }, + { + "epoch": 0.07, + "learning_rate": 2.8448092594779735e-07, + "logits/chosen": -1.924746036529541, + "logits/rejected": -2.1833443641662598, + "logps/chosen": -393.8853454589844, + "logps/rejected": -214.32781982421875, + "loss": 0.7415, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1905595064163208, + "rewards/margins": 1.1827194690704346, + "rewards/rejected": -2.373279094696045, + "step": 567 + }, + { + "epoch": 0.07, + "learning_rate": 2.8444549427187904e-07, + "logits/chosen": -2.338766098022461, + "logits/rejected": -2.310725212097168, + "logps/chosen": -190.17820739746094, + "logps/rejected": -192.0225067138672, + "loss": 0.5433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45062631368637085, + "rewards/margins": 0.6148356199264526, + "rewards/rejected": -1.0654618740081787, + "step": 568 + }, + { + "epoch": 0.07, + "learning_rate": 2.844100625959608e-07, + "logits/chosen": -2.637502670288086, + "logits/rejected": -2.4023377895355225, + "logps/chosen": -160.08731079101562, + "logps/rejected": -260.2137451171875, + "loss": 0.6336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5600972175598145, + "rewards/margins": 0.5039157271385193, + "rewards/rejected": -1.0640130043029785, + "step": 569 + }, + { + "epoch": 0.07, + "learning_rate": 2.843746309200425e-07, + "logits/chosen": -2.236431121826172, + "logits/rejected": -1.5225576162338257, + "logps/chosen": -166.45741271972656, + "logps/rejected": -439.48797607421875, + "loss": 0.3235, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8114561438560486, + "rewards/margins": 1.4516148567199707, + "rewards/rejected": -2.263071060180664, + "step": 570 + }, + { + "epoch": 0.07, + "learning_rate": 2.8433919924412424e-07, + "logits/chosen": -2.011160373687744, + "logits/rejected": -2.2315304279327393, + "logps/chosen": -241.73687744140625, + "logps/rejected": -184.26658630371094, + "loss": 0.6331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36120763421058655, + "rewards/margins": 0.5312861800193787, + "rewards/rejected": -0.8924937844276428, + "step": 571 + }, + { + "epoch": 0.07, + "learning_rate": 2.84303767568206e-07, + "logits/chosen": -1.904073715209961, + "logits/rejected": -1.9552388191223145, + "logps/chosen": -591.3861694335938, + "logps/rejected": -640.4342041015625, + "loss": 0.2905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15657061338424683, + "rewards/margins": 2.418710231781006, + "rewards/rejected": -2.5752811431884766, + "step": 572 + }, + { + "epoch": 0.07, + "learning_rate": 2.842683358922877e-07, + "logits/chosen": -2.004777669906616, + "logits/rejected": -1.9348944425582886, + "logps/chosen": -485.394287109375, + "logps/rejected": -456.34613037109375, + "loss": 0.3506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3032083809375763, + "rewards/margins": 1.9522475004196167, + "rewards/rejected": -2.255455732345581, + "step": 573 + }, + { + "epoch": 0.07, + "learning_rate": 2.8423290421636943e-07, + "logits/chosen": -2.0323987007141113, + "logits/rejected": -2.2565672397613525, + "logps/chosen": -376.1615295410156, + "logps/rejected": -293.8371887207031, + "loss": 0.3753, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6021791696548462, + "rewards/margins": 1.4226195812225342, + "rewards/rejected": -2.02479887008667, + "step": 574 + }, + { + "epoch": 0.07, + "learning_rate": 2.841974725404511e-07, + "logits/chosen": -2.7000463008880615, + "logits/rejected": -2.7524003982543945, + "logps/chosen": -200.11676025390625, + "logps/rejected": -181.8428497314453, + "loss": 0.6657, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6449150443077087, + "rewards/margins": 1.3228996992111206, + "rewards/rejected": -1.9678149223327637, + "step": 575 + }, + { + "epoch": 0.07, + "learning_rate": 2.8416204086453287e-07, + "logits/chosen": -2.293550968170166, + "logits/rejected": -2.221982955932617, + "logps/chosen": -284.7962341308594, + "logps/rejected": -240.72853088378906, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6617366671562195, + "rewards/margins": 0.8491596579551697, + "rewards/rejected": -1.5108962059020996, + "step": 576 + }, + { + "epoch": 0.07, + "learning_rate": 2.841266091886146e-07, + "logits/chosen": -2.594348430633545, + "logits/rejected": -2.5795907974243164, + "logps/chosen": -403.5820617675781, + "logps/rejected": -441.0587463378906, + "loss": 0.678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1816064566373825, + "rewards/margins": 1.0647549629211426, + "rewards/rejected": -1.2463613748550415, + "step": 577 + }, + { + "epoch": 0.07, + "learning_rate": 2.840911775126963e-07, + "logits/chosen": -1.8871886730194092, + "logits/rejected": -2.048887252807617, + "logps/chosen": -441.9034423828125, + "logps/rejected": -282.894287109375, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0569076538085938, + "rewards/margins": 0.5572657585144043, + "rewards/rejected": -1.614173412322998, + "step": 578 + }, + { + "epoch": 0.07, + "learning_rate": 2.8405574583677807e-07, + "logits/chosen": -2.4486382007598877, + "logits/rejected": -2.6536693572998047, + "logps/chosen": -307.143798828125, + "logps/rejected": -252.06643676757812, + "loss": 0.585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7573120594024658, + "rewards/margins": 1.1727367639541626, + "rewards/rejected": -1.9300488233566284, + "step": 579 + }, + { + "epoch": 0.07, + "learning_rate": 2.840203141608598e-07, + "logits/chosen": -2.450269937515259, + "logits/rejected": -2.4061622619628906, + "logps/chosen": -399.0839538574219, + "logps/rejected": -429.9809875488281, + "loss": 0.3457, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.044712163507938385, + "rewards/margins": 1.7830820083618164, + "rewards/rejected": -1.7383698225021362, + "step": 580 + }, + { + "epoch": 0.07, + "learning_rate": 2.839848824849415e-07, + "logits/chosen": -2.6853950023651123, + "logits/rejected": -2.7101523876190186, + "logps/chosen": -225.93035888671875, + "logps/rejected": -291.9564208984375, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48067495226860046, + "rewards/margins": 2.566830635070801, + "rewards/rejected": -3.0475056171417236, + "step": 581 + }, + { + "epoch": 0.07, + "learning_rate": 2.8394945080902326e-07, + "logits/chosen": -2.6570026874542236, + "logits/rejected": -2.828355073928833, + "logps/chosen": -472.3564758300781, + "logps/rejected": -299.128173828125, + "loss": 0.3127, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.663401186466217, + "rewards/margins": 1.259992241859436, + "rewards/rejected": -1.9233934879302979, + "step": 582 + }, + { + "epoch": 0.07, + "learning_rate": 2.83914019133105e-07, + "logits/chosen": -2.654460906982422, + "logits/rejected": -2.8254292011260986, + "logps/chosen": -183.21360778808594, + "logps/rejected": -211.34127807617188, + "loss": 0.2901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3141680061817169, + "rewards/margins": 1.4108171463012695, + "rewards/rejected": -1.724985122680664, + "step": 583 + }, + { + "epoch": 0.07, + "learning_rate": 2.838785874571867e-07, + "logits/chosen": -2.8397536277770996, + "logits/rejected": -2.9731905460357666, + "logps/chosen": -286.7995910644531, + "logps/rejected": -322.51629638671875, + "loss": 0.245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5188305974006653, + "rewards/margins": 1.7199426889419556, + "rewards/rejected": -2.2387733459472656, + "step": 584 + }, + { + "epoch": 0.07, + "learning_rate": 2.8384315578126845e-07, + "logits/chosen": -2.499354124069214, + "logits/rejected": -2.6108410358428955, + "logps/chosen": -253.4418182373047, + "logps/rejected": -343.5899658203125, + "loss": 1.0835, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0491673946380615, + "rewards/margins": -0.12267941236495972, + "rewards/rejected": -0.9264879822731018, + "step": 585 + }, + { + "epoch": 0.07, + "learning_rate": 2.8380772410535015e-07, + "logits/chosen": -2.4853053092956543, + "logits/rejected": -2.6456799507141113, + "logps/chosen": -313.1239013671875, + "logps/rejected": -213.18861389160156, + "loss": 0.4806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7546948790550232, + "rewards/margins": 0.8017174005508423, + "rewards/rejected": -1.5564121007919312, + "step": 586 + }, + { + "epoch": 0.07, + "learning_rate": 2.837722924294319e-07, + "logits/chosen": -2.3221936225891113, + "logits/rejected": -2.5736799240112305, + "logps/chosen": -431.1414794921875, + "logps/rejected": -371.3270263671875, + "loss": 0.3454, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6498576402664185, + "rewards/margins": 1.3634686470031738, + "rewards/rejected": -2.0133261680603027, + "step": 587 + }, + { + "epoch": 0.07, + "learning_rate": 2.8373686075351364e-07, + "logits/chosen": -2.2068943977355957, + "logits/rejected": -2.1449995040893555, + "logps/chosen": -245.41595458984375, + "logps/rejected": -201.13003540039062, + "loss": 0.4847, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7180060148239136, + "rewards/margins": 1.0727077722549438, + "rewards/rejected": -2.7907137870788574, + "step": 588 + }, + { + "epoch": 0.07, + "learning_rate": 2.8370142907759534e-07, + "logits/chosen": -2.7657008171081543, + "logits/rejected": -2.870767116546631, + "logps/chosen": -221.01698303222656, + "logps/rejected": -213.3057861328125, + "loss": 0.3241, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.534647524356842, + "rewards/margins": 1.4820157289505005, + "rewards/rejected": -2.016663074493408, + "step": 589 + }, + { + "epoch": 0.07, + "learning_rate": 2.836659974016771e-07, + "logits/chosen": -2.561528444290161, + "logits/rejected": -2.377598762512207, + "logps/chosen": -158.01663208007812, + "logps/rejected": -260.22589111328125, + "loss": 0.4218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4788733720779419, + "rewards/margins": 1.0858718156814575, + "rewards/rejected": -1.5647451877593994, + "step": 590 + }, + { + "epoch": 0.07, + "learning_rate": 2.8363056572575884e-07, + "logits/chosen": -2.373501777648926, + "logits/rejected": -2.4982798099517822, + "logps/chosen": -215.85755920410156, + "logps/rejected": -145.58670043945312, + "loss": 0.5412, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6830252408981323, + "rewards/margins": 0.4874327480792999, + "rewards/rejected": -1.1704579591751099, + "step": 591 + }, + { + "epoch": 0.07, + "learning_rate": 2.8359513404984053e-07, + "logits/chosen": -2.644239664077759, + "logits/rejected": -2.832881450653076, + "logps/chosen": -174.84352111816406, + "logps/rejected": -259.2848205566406, + "loss": 0.238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6102717518806458, + "rewards/margins": 2.061450481414795, + "rewards/rejected": -2.671722412109375, + "step": 592 + }, + { + "epoch": 0.07, + "learning_rate": 2.835597023739223e-07, + "logits/chosen": -2.2125868797302246, + "logits/rejected": -2.1189050674438477, + "logps/chosen": -118.67438507080078, + "logps/rejected": -160.91433715820312, + "loss": 0.4665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5902976989746094, + "rewards/margins": 1.2112171649932861, + "rewards/rejected": -1.8015148639678955, + "step": 593 + }, + { + "epoch": 0.07, + "learning_rate": 2.8352427069800403e-07, + "logits/chosen": -2.72749924659729, + "logits/rejected": -2.665203332901001, + "logps/chosen": -105.28466796875, + "logps/rejected": -106.30815124511719, + "loss": 0.6002, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7433668375015259, + "rewards/margins": 0.4511909782886505, + "rewards/rejected": -1.1945579051971436, + "step": 594 + }, + { + "epoch": 0.07, + "learning_rate": 2.834888390220857e-07, + "logits/chosen": -2.207937002182007, + "logits/rejected": -2.4642491340637207, + "logps/chosen": -415.00994873046875, + "logps/rejected": -220.6689453125, + "loss": 0.7089, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9031976461410522, + "rewards/margins": 0.46310779452323914, + "rewards/rejected": -1.3663054704666138, + "step": 595 + }, + { + "epoch": 0.07, + "learning_rate": 2.834534073461675e-07, + "logits/chosen": -2.3175909519195557, + "logits/rejected": -2.2985143661499023, + "logps/chosen": -232.93284606933594, + "logps/rejected": -250.12237548828125, + "loss": 0.7043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7460999488830566, + "rewards/margins": 0.812207818031311, + "rewards/rejected": -1.5583077669143677, + "step": 596 + }, + { + "epoch": 0.07, + "learning_rate": 2.8341797567024917e-07, + "logits/chosen": -2.2121782302856445, + "logits/rejected": -1.9086616039276123, + "logps/chosen": -297.9239501953125, + "logps/rejected": -399.9457092285156, + "loss": 0.5678, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20001479983329773, + "rewards/margins": 0.5181649923324585, + "rewards/rejected": -0.7181798219680786, + "step": 597 + }, + { + "epoch": 0.07, + "learning_rate": 2.833825439943309e-07, + "logits/chosen": -2.7228288650512695, + "logits/rejected": -2.6960911750793457, + "logps/chosen": -252.58743286132812, + "logps/rejected": -181.63479614257812, + "loss": 0.8179, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.059566855430603, + "rewards/margins": 0.8404138684272766, + "rewards/rejected": -1.8999807834625244, + "step": 598 + }, + { + "epoch": 0.07, + "learning_rate": 2.833471123184126e-07, + "logits/chosen": -2.2037315368652344, + "logits/rejected": -2.185729503631592, + "logps/chosen": -310.79974365234375, + "logps/rejected": -283.07550048828125, + "loss": 0.4263, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36597710847854614, + "rewards/margins": 0.9915189743041992, + "rewards/rejected": -1.3574961423873901, + "step": 599 + }, + { + "epoch": 0.07, + "learning_rate": 2.8331168064249436e-07, + "logits/chosen": -2.351884365081787, + "logits/rejected": -2.5851497650146484, + "logps/chosen": -272.38763427734375, + "logps/rejected": -228.53138732910156, + "loss": 0.431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6801834106445312, + "rewards/margins": 1.0062822103500366, + "rewards/rejected": -1.6864656209945679, + "step": 600 + }, + { + "epoch": 0.07, + "learning_rate": 2.832762489665761e-07, + "logits/chosen": -2.4204070568084717, + "logits/rejected": -2.7393598556518555, + "logps/chosen": -299.3605651855469, + "logps/rejected": -145.13677978515625, + "loss": 0.7684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5935815572738647, + "rewards/margins": 0.34504058957099915, + "rewards/rejected": -0.938622236251831, + "step": 601 + }, + { + "epoch": 0.07, + "learning_rate": 2.8324081729065786e-07, + "logits/chosen": -2.4079909324645996, + "logits/rejected": -2.4792494773864746, + "logps/chosen": -161.25982666015625, + "logps/rejected": -155.2931671142578, + "loss": 0.5439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6917847394943237, + "rewards/margins": 0.9195724725723267, + "rewards/rejected": -1.6113572120666504, + "step": 602 + }, + { + "epoch": 0.07, + "learning_rate": 2.8320538561473956e-07, + "logits/chosen": -2.05000376701355, + "logits/rejected": -2.067012071609497, + "logps/chosen": -241.41250610351562, + "logps/rejected": -244.8616180419922, + "loss": 0.9123, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3803644180297852, + "rewards/margins": 1.09917414188385, + "rewards/rejected": -2.4795384407043457, + "step": 603 + }, + { + "epoch": 0.07, + "learning_rate": 2.831699539388213e-07, + "logits/chosen": -2.2408676147460938, + "logits/rejected": -2.6611380577087402, + "logps/chosen": -573.1158447265625, + "logps/rejected": -283.30657958984375, + "loss": 0.5248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2845990061759949, + "rewards/margins": 0.7190200090408325, + "rewards/rejected": -1.0036190748214722, + "step": 604 + }, + { + "epoch": 0.07, + "learning_rate": 2.8313452226290305e-07, + "logits/chosen": -2.2413711547851562, + "logits/rejected": -1.9701826572418213, + "logps/chosen": -384.5602111816406, + "logps/rejected": -411.017333984375, + "loss": 0.5941, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3150925636291504, + "rewards/margins": 1.1022653579711914, + "rewards/rejected": -2.417357921600342, + "step": 605 + }, + { + "epoch": 0.07, + "learning_rate": 2.8309909058698475e-07, + "logits/chosen": -1.7727774381637573, + "logits/rejected": -1.9457204341888428, + "logps/chosen": -336.6259460449219, + "logps/rejected": -318.899658203125, + "loss": 0.8065, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6014400124549866, + "rewards/margins": 0.32296398282051086, + "rewards/rejected": -0.924403965473175, + "step": 606 + }, + { + "epoch": 0.07, + "learning_rate": 2.830636589110665e-07, + "logits/chosen": -1.8818624019622803, + "logits/rejected": -2.0092155933380127, + "logps/chosen": -362.704345703125, + "logps/rejected": -311.9741516113281, + "loss": 0.4337, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2584696412086487, + "rewards/margins": 1.507845401763916, + "rewards/rejected": -1.7663151025772095, + "step": 607 + }, + { + "epoch": 0.07, + "learning_rate": 2.830282272351482e-07, + "logits/chosen": -2.657205104827881, + "logits/rejected": -2.791828155517578, + "logps/chosen": -237.78347778320312, + "logps/rejected": -172.70120239257812, + "loss": 0.729, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1202346086502075, + "rewards/margins": 0.17119766771793365, + "rewards/rejected": -1.29143226146698, + "step": 608 + }, + { + "epoch": 0.07, + "learning_rate": 2.8299279555922994e-07, + "logits/chosen": -2.8653438091278076, + "logits/rejected": -2.6305055618286133, + "logps/chosen": -181.93601989746094, + "logps/rejected": -266.245849609375, + "loss": 0.2102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4023686945438385, + "rewards/margins": 2.941915988922119, + "rewards/rejected": -3.3442845344543457, + "step": 609 + }, + { + "epoch": 0.07, + "learning_rate": 2.8295736388331164e-07, + "logits/chosen": -2.596820592880249, + "logits/rejected": -2.533358335494995, + "logps/chosen": -261.4521789550781, + "logps/rejected": -145.0147247314453, + "loss": 0.8639, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4233194589614868, + "rewards/margins": 0.36191022396087646, + "rewards/rejected": -1.7852296829223633, + "step": 610 + }, + { + "epoch": 0.07, + "learning_rate": 2.829219322073934e-07, + "logits/chosen": -1.9775631427764893, + "logits/rejected": -2.1341824531555176, + "logps/chosen": -576.28466796875, + "logps/rejected": -334.25592041015625, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0930986404418945, + "rewards/margins": 0.3086680769920349, + "rewards/rejected": -1.4017667770385742, + "step": 611 + }, + { + "epoch": 0.07, + "learning_rate": 2.8288650053147513e-07, + "logits/chosen": -2.603475332260132, + "logits/rejected": -2.3198750019073486, + "logps/chosen": -132.3052978515625, + "logps/rejected": -290.5291442871094, + "loss": 0.3141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5465155243873596, + "rewards/margins": 1.6672029495239258, + "rewards/rejected": -2.2137184143066406, + "step": 612 + }, + { + "epoch": 0.07, + "learning_rate": 2.8285106885555683e-07, + "logits/chosen": -2.4631457328796387, + "logits/rejected": -2.1344504356384277, + "logps/chosen": -259.9687194824219, + "logps/rejected": -278.46881103515625, + "loss": 0.3318, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4875625967979431, + "rewards/margins": 1.749682068824768, + "rewards/rejected": -2.2372446060180664, + "step": 613 + }, + { + "epoch": 0.07, + "learning_rate": 2.828156371796386e-07, + "logits/chosen": -2.502852439880371, + "logits/rejected": -2.4360191822052, + "logps/chosen": -219.77206420898438, + "logps/rejected": -236.79733276367188, + "loss": 0.5117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7535436153411865, + "rewards/margins": 0.6197067499160767, + "rewards/rejected": -1.3732503652572632, + "step": 614 + }, + { + "epoch": 0.07, + "learning_rate": 2.8278020550372033e-07, + "logits/chosen": -2.331800937652588, + "logits/rejected": -2.140773296356201, + "logps/chosen": -189.0993194580078, + "logps/rejected": -201.6796875, + "loss": 0.4374, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3698059916496277, + "rewards/margins": 0.84446120262146, + "rewards/rejected": -1.2142672538757324, + "step": 615 + }, + { + "epoch": 0.07, + "learning_rate": 2.827447738278021e-07, + "logits/chosen": -2.684727191925049, + "logits/rejected": -2.515939950942993, + "logps/chosen": -165.89263916015625, + "logps/rejected": -188.135986328125, + "loss": 0.5219, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8118862509727478, + "rewards/margins": 1.0984495878219604, + "rewards/rejected": -1.9103360176086426, + "step": 616 + }, + { + "epoch": 0.07, + "learning_rate": 2.8270934215188377e-07, + "logits/chosen": -2.184600830078125, + "logits/rejected": -2.1141839027404785, + "logps/chosen": -237.5020751953125, + "logps/rejected": -301.9725036621094, + "loss": 0.4071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4573739767074585, + "rewards/margins": 1.5413289070129395, + "rewards/rejected": -1.9987026453018188, + "step": 617 + }, + { + "epoch": 0.07, + "learning_rate": 2.826739104759655e-07, + "logits/chosen": -2.179051160812378, + "logits/rejected": -2.357790231704712, + "logps/chosen": -267.117431640625, + "logps/rejected": -273.4722900390625, + "loss": 0.3951, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5316153168678284, + "rewards/margins": 1.1263729333877563, + "rewards/rejected": -1.65798819065094, + "step": 618 + }, + { + "epoch": 0.07, + "learning_rate": 2.826384788000472e-07, + "logits/chosen": -2.4466304779052734, + "logits/rejected": -2.367253065109253, + "logps/chosen": -180.52737426757812, + "logps/rejected": -219.88311767578125, + "loss": 0.2799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05940570682287216, + "rewards/margins": 1.480074405670166, + "rewards/rejected": -1.4206687211990356, + "step": 619 + }, + { + "epoch": 0.07, + "learning_rate": 2.8260304712412896e-07, + "logits/chosen": -2.663456916809082, + "logits/rejected": -2.586817502975464, + "logps/chosen": -188.86251831054688, + "logps/rejected": -197.51052856445312, + "loss": 0.4668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21705666184425354, + "rewards/margins": 1.0164827108383179, + "rewards/rejected": -1.233539342880249, + "step": 620 + }, + { + "epoch": 0.07, + "learning_rate": 2.8256761544821066e-07, + "logits/chosen": -2.2668964862823486, + "logits/rejected": -2.3446195125579834, + "logps/chosen": -243.80783081054688, + "logps/rejected": -212.63470458984375, + "loss": 0.4425, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31664007902145386, + "rewards/margins": 1.5920805931091309, + "rewards/rejected": -1.9087207317352295, + "step": 621 + }, + { + "epoch": 0.07, + "learning_rate": 2.825321837722924e-07, + "logits/chosen": -2.6614184379577637, + "logits/rejected": -2.8160336017608643, + "logps/chosen": -541.4915771484375, + "logps/rejected": -400.97100830078125, + "loss": 0.1325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06345769762992859, + "rewards/margins": 2.4909658432006836, + "rewards/rejected": -2.5544238090515137, + "step": 622 + }, + { + "epoch": 0.07, + "learning_rate": 2.8249675209637416e-07, + "logits/chosen": -2.5059781074523926, + "logits/rejected": -2.671403408050537, + "logps/chosen": -303.0697326660156, + "logps/rejected": -257.12982177734375, + "loss": 0.4902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38753604888916016, + "rewards/margins": 1.1425929069519043, + "rewards/rejected": -1.530129075050354, + "step": 623 + }, + { + "epoch": 0.07, + "learning_rate": 2.8246132042045585e-07, + "logits/chosen": -2.230849266052246, + "logits/rejected": -2.5255587100982666, + "logps/chosen": -402.5303955078125, + "logps/rejected": -307.230712890625, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2931148409843445, + "rewards/margins": 1.1207062005996704, + "rewards/rejected": -1.4138211011886597, + "step": 624 + }, + { + "epoch": 0.07, + "learning_rate": 2.824258887445376e-07, + "logits/chosen": -2.224437713623047, + "logits/rejected": -2.2340521812438965, + "logps/chosen": -176.03976440429688, + "logps/rejected": -234.7279052734375, + "loss": 0.6201, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7022520303726196, + "rewards/margins": 1.3898818492889404, + "rewards/rejected": -2.0921339988708496, + "step": 625 + }, + { + "epoch": 0.07, + "learning_rate": 2.8239045706861935e-07, + "logits/chosen": -2.258146047592163, + "logits/rejected": -2.388251304626465, + "logps/chosen": -379.135009765625, + "logps/rejected": -307.49359130859375, + "loss": 0.7614, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1313681602478027, + "rewards/margins": 0.4820912182331085, + "rewards/rejected": -1.6134594678878784, + "step": 626 + }, + { + "epoch": 0.07, + "learning_rate": 2.823550253927011e-07, + "logits/chosen": -2.3732810020446777, + "logits/rejected": -2.358731508255005, + "logps/chosen": -190.1050262451172, + "logps/rejected": -221.21060180664062, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8787813186645508, + "rewards/margins": 1.8298004865646362, + "rewards/rejected": -3.7085819244384766, + "step": 627 + }, + { + "epoch": 0.07, + "learning_rate": 2.823195937167828e-07, + "logits/chosen": -2.172062397003174, + "logits/rejected": -2.0315685272216797, + "logps/chosen": -286.6888427734375, + "logps/rejected": -233.4279327392578, + "loss": 0.6726, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8270248174667358, + "rewards/margins": 0.9048182368278503, + "rewards/rejected": -1.7318429946899414, + "step": 628 + }, + { + "epoch": 0.07, + "learning_rate": 2.8228416204086454e-07, + "logits/chosen": -2.1710007190704346, + "logits/rejected": -2.284658432006836, + "logps/chosen": -223.65042114257812, + "logps/rejected": -269.5903625488281, + "loss": 0.3905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11111211776733398, + "rewards/margins": 1.2375226020812988, + "rewards/rejected": -1.3486347198486328, + "step": 629 + }, + { + "epoch": 0.07, + "learning_rate": 2.8224873036494624e-07, + "logits/chosen": -2.1916775703430176, + "logits/rejected": -1.9913333654403687, + "logps/chosen": -284.353759765625, + "logps/rejected": -263.2596740722656, + "loss": 0.2975, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23000317811965942, + "rewards/margins": 1.3193955421447754, + "rewards/rejected": -1.54939866065979, + "step": 630 + }, + { + "epoch": 0.07, + "learning_rate": 2.82213298689028e-07, + "logits/chosen": -1.9303104877471924, + "logits/rejected": -2.1487889289855957, + "logps/chosen": -332.79400634765625, + "logps/rejected": -226.94430541992188, + "loss": 0.5281, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41981393098831177, + "rewards/margins": 0.5807924866676331, + "rewards/rejected": -1.0006064176559448, + "step": 631 + }, + { + "epoch": 0.07, + "learning_rate": 2.821778670131097e-07, + "logits/chosen": -2.355780839920044, + "logits/rejected": -2.298170566558838, + "logps/chosen": -153.72251892089844, + "logps/rejected": -267.8560791015625, + "loss": 0.3884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3935573101043701, + "rewards/margins": 0.978131890296936, + "rewards/rejected": -1.3716892004013062, + "step": 632 + }, + { + "epoch": 0.07, + "learning_rate": 2.8214243533719143e-07, + "logits/chosen": -2.59535813331604, + "logits/rejected": -2.3859457969665527, + "logps/chosen": -188.00755310058594, + "logps/rejected": -224.98385620117188, + "loss": 0.5924, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3312590420246124, + "rewards/margins": 0.8680870532989502, + "rewards/rejected": -1.1993461847305298, + "step": 633 + }, + { + "epoch": 0.07, + "learning_rate": 2.821070036612732e-07, + "logits/chosen": -2.8450257778167725, + "logits/rejected": -2.8692235946655273, + "logps/chosen": -237.82232666015625, + "logps/rejected": -215.40042114257812, + "loss": 0.391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3587251305580139, + "rewards/margins": 1.4409229755401611, + "rewards/rejected": -1.7996481657028198, + "step": 634 + }, + { + "epoch": 0.07, + "learning_rate": 2.820715719853549e-07, + "logits/chosen": -2.593151330947876, + "logits/rejected": -2.5801234245300293, + "logps/chosen": -93.4423828125, + "logps/rejected": -153.20626831054688, + "loss": 0.5335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2512911856174469, + "rewards/margins": 0.5896918773651123, + "rewards/rejected": -0.8409830927848816, + "step": 635 + }, + { + "epoch": 0.07, + "learning_rate": 2.820361403094366e-07, + "logits/chosen": -1.9407367706298828, + "logits/rejected": -2.032761573791504, + "logps/chosen": -309.00830078125, + "logps/rejected": -270.1577453613281, + "loss": 0.354, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2378094494342804, + "rewards/margins": 1.2084672451019287, + "rewards/rejected": -0.9706577658653259, + "step": 636 + }, + { + "epoch": 0.07, + "learning_rate": 2.820007086335183e-07, + "logits/chosen": -1.994848608970642, + "logits/rejected": -1.6797795295715332, + "logps/chosen": -309.9917297363281, + "logps/rejected": -294.864013671875, + "loss": 0.501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5989134907722473, + "rewards/margins": 0.9195924997329712, + "rewards/rejected": -1.5185060501098633, + "step": 637 + }, + { + "epoch": 0.07, + "learning_rate": 2.819652769576001e-07, + "logits/chosen": -2.3502488136291504, + "logits/rejected": -2.3329994678497314, + "logps/chosen": -277.3572692871094, + "logps/rejected": -219.76028442382812, + "loss": 0.5654, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1290271282196045, + "rewards/margins": 0.581454873085022, + "rewards/rejected": -1.7104820013046265, + "step": 638 + }, + { + "epoch": 0.07, + "learning_rate": 2.819298452816818e-07, + "logits/chosen": -2.8418495655059814, + "logits/rejected": -2.8609778881073, + "logps/chosen": -221.52622985839844, + "logps/rejected": -243.91497802734375, + "loss": 0.3318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9092757701873779, + "rewards/margins": 2.5488829612731934, + "rewards/rejected": -3.4581589698791504, + "step": 639 + }, + { + "epoch": 0.07, + "learning_rate": 2.8189441360576357e-07, + "logits/chosen": -2.5195651054382324, + "logits/rejected": -2.632418155670166, + "logps/chosen": -273.74127197265625, + "logps/rejected": -258.5153503417969, + "loss": 0.4445, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0813536643981934, + "rewards/margins": 1.3315284252166748, + "rewards/rejected": -2.412882089614868, + "step": 640 + }, + { + "epoch": 0.07, + "learning_rate": 2.8185898192984526e-07, + "logits/chosen": -2.381563186645508, + "logits/rejected": -2.6133275032043457, + "logps/chosen": -375.8746643066406, + "logps/rejected": -323.3339538574219, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7131337523460388, + "rewards/margins": 1.5325254201889038, + "rewards/rejected": -2.245659351348877, + "step": 641 + }, + { + "epoch": 0.07, + "learning_rate": 2.81823550253927e-07, + "logits/chosen": -2.434840202331543, + "logits/rejected": -2.401885509490967, + "logps/chosen": -414.1163024902344, + "logps/rejected": -309.9866943359375, + "loss": 0.2143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.158794105052948, + "rewards/margins": 1.675749659538269, + "rewards/rejected": -1.8345437049865723, + "step": 642 + }, + { + "epoch": 0.07, + "learning_rate": 2.817881185780087e-07, + "logits/chosen": -3.029679775238037, + "logits/rejected": -2.9532246589660645, + "logps/chosen": -182.62249755859375, + "logps/rejected": -177.52967834472656, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1524386405944824, + "rewards/margins": 0.4387982189655304, + "rewards/rejected": -1.5912368297576904, + "step": 643 + }, + { + "epoch": 0.07, + "learning_rate": 2.8175268690209045e-07, + "logits/chosen": -2.324056386947632, + "logits/rejected": -2.1222987174987793, + "logps/chosen": -181.572021484375, + "logps/rejected": -289.6360168457031, + "loss": 0.2322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28997987508773804, + "rewards/margins": 2.0215511322021484, + "rewards/rejected": -2.3115310668945312, + "step": 644 + }, + { + "epoch": 0.08, + "learning_rate": 2.817172552261722e-07, + "logits/chosen": -2.059887170791626, + "logits/rejected": -2.1407175064086914, + "logps/chosen": -290.55230712890625, + "logps/rejected": -280.3440856933594, + "loss": 0.4756, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19593696296215057, + "rewards/margins": 1.9595015048980713, + "rewards/rejected": -2.1554384231567383, + "step": 645 + }, + { + "epoch": 0.08, + "learning_rate": 2.816818235502539e-07, + "logits/chosen": -2.6764943599700928, + "logits/rejected": -2.7170908451080322, + "logps/chosen": -152.4773712158203, + "logps/rejected": -200.98910522460938, + "loss": 0.7438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6851381063461304, + "rewards/margins": 0.8004188537597656, + "rewards/rejected": -1.4855568408966064, + "step": 646 + }, + { + "epoch": 0.08, + "learning_rate": 2.8164639187433565e-07, + "logits/chosen": -2.1113569736480713, + "logits/rejected": -2.3172569274902344, + "logps/chosen": -191.6326904296875, + "logps/rejected": -130.34942626953125, + "loss": 0.8519, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6206393241882324, + "rewards/margins": 0.1768942028284073, + "rewards/rejected": -0.7975335717201233, + "step": 647 + }, + { + "epoch": 0.08, + "learning_rate": 2.8161096019841734e-07, + "logits/chosen": -2.4157989025115967, + "logits/rejected": -2.4171743392944336, + "logps/chosen": -139.25582885742188, + "logps/rejected": -156.53382873535156, + "loss": 0.7906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7041699290275574, + "rewards/margins": 0.8959304094314575, + "rewards/rejected": -1.6001003980636597, + "step": 648 + }, + { + "epoch": 0.08, + "learning_rate": 2.815755285224991e-07, + "logits/chosen": -2.6967780590057373, + "logits/rejected": -2.621246814727783, + "logps/chosen": -178.10220336914062, + "logps/rejected": -317.8310241699219, + "loss": 0.4521, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0364319309592247, + "rewards/margins": 1.4366344213485718, + "rewards/rejected": -1.4002025127410889, + "step": 649 + }, + { + "epoch": 0.08, + "learning_rate": 2.8154009684658084e-07, + "logits/chosen": -2.7764599323272705, + "logits/rejected": -2.8255114555358887, + "logps/chosen": -614.005126953125, + "logps/rejected": -288.136962890625, + "loss": 0.4876, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1919710636138916, + "rewards/margins": 0.9879306554794312, + "rewards/rejected": -2.179901599884033, + "step": 650 + }, + { + "epoch": 0.08, + "learning_rate": 2.815046651706626e-07, + "logits/chosen": -2.3394343852996826, + "logits/rejected": -2.317230463027954, + "logps/chosen": -408.80438232421875, + "logps/rejected": -493.7617492675781, + "loss": 0.7363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7283014059066772, + "rewards/margins": 1.6206783056259155, + "rewards/rejected": -2.3489794731140137, + "step": 651 + }, + { + "epoch": 0.08, + "learning_rate": 2.814692334947443e-07, + "logits/chosen": -1.9347187280654907, + "logits/rejected": -2.369868516921997, + "logps/chosen": -707.7894287109375, + "logps/rejected": -472.04498291015625, + "loss": 0.5317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4498329758644104, + "rewards/margins": 1.9948474168777466, + "rewards/rejected": -2.4446804523468018, + "step": 652 + }, + { + "epoch": 0.08, + "learning_rate": 2.8143380181882603e-07, + "logits/chosen": -2.857086420059204, + "logits/rejected": -2.9411962032318115, + "logps/chosen": -227.7477264404297, + "logps/rejected": -208.58953857421875, + "loss": 0.2632, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10916734486818314, + "rewards/margins": 2.8450326919555664, + "rewards/rejected": -2.735865354537964, + "step": 653 + }, + { + "epoch": 0.08, + "learning_rate": 2.8139837014290773e-07, + "logits/chosen": -2.3495724201202393, + "logits/rejected": -2.4254541397094727, + "logps/chosen": -238.48065185546875, + "logps/rejected": -158.19439697265625, + "loss": 0.4422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19937562942504883, + "rewards/margins": 1.0859156847000122, + "rewards/rejected": -1.2852911949157715, + "step": 654 + }, + { + "epoch": 0.08, + "learning_rate": 2.813629384669895e-07, + "logits/chosen": -2.220245599746704, + "logits/rejected": -2.145049810409546, + "logps/chosen": -287.5046691894531, + "logps/rejected": -268.61468505859375, + "loss": 0.5303, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.036236047744751, + "rewards/margins": 0.9143364429473877, + "rewards/rejected": -1.9505724906921387, + "step": 655 + }, + { + "epoch": 0.08, + "learning_rate": 2.813275067910712e-07, + "logits/chosen": -2.379141330718994, + "logits/rejected": -2.4098448753356934, + "logps/chosen": -159.46533203125, + "logps/rejected": -106.53160858154297, + "loss": 0.751, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5797417163848877, + "rewards/margins": -0.02314772456884384, + "rewards/rejected": -0.5565939545631409, + "step": 656 + }, + { + "epoch": 0.08, + "learning_rate": 2.812920751151529e-07, + "logits/chosen": -2.646141529083252, + "logits/rejected": -2.3955111503601074, + "logps/chosen": -134.55548095703125, + "logps/rejected": -196.1585235595703, + "loss": 0.5017, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8190967440605164, + "rewards/margins": 0.8932404518127441, + "rewards/rejected": -1.7123371362686157, + "step": 657 + }, + { + "epoch": 0.08, + "learning_rate": 2.8125664343923467e-07, + "logits/chosen": -2.254502773284912, + "logits/rejected": -2.3867976665496826, + "logps/chosen": -225.52923583984375, + "logps/rejected": -225.027587890625, + "loss": 0.3895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2669307589530945, + "rewards/margins": 2.5856473445892334, + "rewards/rejected": -2.8525781631469727, + "step": 658 + }, + { + "epoch": 0.08, + "learning_rate": 2.8122121176331636e-07, + "logits/chosen": -2.238013505935669, + "logits/rejected": -2.355064630508423, + "logps/chosen": -184.70205688476562, + "logps/rejected": -166.54415893554688, + "loss": 0.9496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7971018552780151, + "rewards/margins": -0.13034646213054657, + "rewards/rejected": -0.6667553782463074, + "step": 659 + }, + { + "epoch": 0.08, + "learning_rate": 2.811857800873981e-07, + "logits/chosen": -2.711026906967163, + "logits/rejected": -2.477163553237915, + "logps/chosen": -322.17315673828125, + "logps/rejected": -200.6646270751953, + "loss": 0.3882, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3079068660736084, + "rewards/margins": 1.2629361152648926, + "rewards/rejected": -1.570842981338501, + "step": 660 + }, + { + "epoch": 0.08, + "learning_rate": 2.8115034841147986e-07, + "logits/chosen": -2.244220733642578, + "logits/rejected": -2.0889992713928223, + "logps/chosen": -345.2870178222656, + "logps/rejected": -403.8587646484375, + "loss": 0.9451, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9911388754844666, + "rewards/margins": 0.5271368026733398, + "rewards/rejected": -1.518275499343872, + "step": 661 + }, + { + "epoch": 0.08, + "learning_rate": 2.811149167355616e-07, + "logits/chosen": -2.150815010070801, + "logits/rejected": -2.3247618675231934, + "logps/chosen": -227.82418823242188, + "logps/rejected": -253.78684997558594, + "loss": 0.3304, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09931284189224243, + "rewards/margins": 1.9400739669799805, + "rewards/rejected": -2.039386749267578, + "step": 662 + }, + { + "epoch": 0.08, + "learning_rate": 2.810794850596433e-07, + "logits/chosen": -2.4930598735809326, + "logits/rejected": -2.3492417335510254, + "logps/chosen": -297.0482177734375, + "logps/rejected": -339.7794189453125, + "loss": 0.7551, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7139439582824707, + "rewards/margins": 0.9855427742004395, + "rewards/rejected": -1.6994867324829102, + "step": 663 + }, + { + "epoch": 0.08, + "learning_rate": 2.8104405338372506e-07, + "logits/chosen": -2.600454807281494, + "logits/rejected": -2.3083202838897705, + "logps/chosen": -161.7836456298828, + "logps/rejected": -250.01954650878906, + "loss": 0.9111, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.9141299724578857, + "rewards/margins": 0.05952081084251404, + "rewards/rejected": -0.9736508131027222, + "step": 664 + }, + { + "epoch": 0.08, + "learning_rate": 2.8100862170780675e-07, + "logits/chosen": -2.201402425765991, + "logits/rejected": -2.5956668853759766, + "logps/chosen": -447.8415832519531, + "logps/rejected": -284.311767578125, + "loss": 0.3513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7406899333000183, + "rewards/margins": 1.4940874576568604, + "rewards/rejected": -2.2347772121429443, + "step": 665 + }, + { + "epoch": 0.08, + "learning_rate": 2.809731900318885e-07, + "logits/chosen": -1.9570379257202148, + "logits/rejected": -2.1581883430480957, + "logps/chosen": -203.88983154296875, + "logps/rejected": -204.23867797851562, + "loss": 0.8223, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0374674797058105, + "rewards/margins": 1.0673367977142334, + "rewards/rejected": -2.104804277420044, + "step": 666 + }, + { + "epoch": 0.08, + "learning_rate": 2.8093775835597025e-07, + "logits/chosen": -2.271881103515625, + "logits/rejected": -2.7551355361938477, + "logps/chosen": -425.5056457519531, + "logps/rejected": -253.63589477539062, + "loss": 0.4484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18098919093608856, + "rewards/margins": 1.0286552906036377, + "rewards/rejected": -1.2096445560455322, + "step": 667 + }, + { + "epoch": 0.08, + "learning_rate": 2.8090232668005194e-07, + "logits/chosen": -1.9986205101013184, + "logits/rejected": -1.900841236114502, + "logps/chosen": -413.8116760253906, + "logps/rejected": -403.51251220703125, + "loss": 0.537, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2627365589141846, + "rewards/margins": 1.8818767070770264, + "rewards/rejected": -4.144613265991211, + "step": 668 + }, + { + "epoch": 0.08, + "learning_rate": 2.808668950041337e-07, + "logits/chosen": -2.1321682929992676, + "logits/rejected": -2.290189266204834, + "logps/chosen": -221.2316131591797, + "logps/rejected": -209.66131591796875, + "loss": 0.3719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26084786653518677, + "rewards/margins": 1.5150394439697266, + "rewards/rejected": -1.7758872509002686, + "step": 669 + }, + { + "epoch": 0.08, + "learning_rate": 2.808314633282154e-07, + "logits/chosen": -2.051877975463867, + "logits/rejected": -2.444598913192749, + "logps/chosen": -339.264404296875, + "logps/rejected": -192.53775024414062, + "loss": 0.2962, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011827416718006134, + "rewards/margins": 1.426331639289856, + "rewards/rejected": -1.4381592273712158, + "step": 670 + }, + { + "epoch": 0.08, + "learning_rate": 2.8079603165229714e-07, + "logits/chosen": -2.2946462631225586, + "logits/rejected": -2.5074288845062256, + "logps/chosen": -170.9038543701172, + "logps/rejected": -247.75242614746094, + "loss": 0.5485, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5765238404273987, + "rewards/margins": 1.430662989616394, + "rewards/rejected": -2.0071868896484375, + "step": 671 + }, + { + "epoch": 0.08, + "learning_rate": 2.8076059997637883e-07, + "logits/chosen": -1.8959805965423584, + "logits/rejected": -2.4402034282684326, + "logps/chosen": -300.51104736328125, + "logps/rejected": -140.6285400390625, + "loss": 0.5086, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3240899443626404, + "rewards/margins": 0.9766297340393066, + "rewards/rejected": -1.3007197380065918, + "step": 672 + }, + { + "epoch": 0.08, + "learning_rate": 2.8072516830046063e-07, + "logits/chosen": -2.3013556003570557, + "logits/rejected": -2.4747202396392822, + "logps/chosen": -241.64178466796875, + "logps/rejected": -203.61398315429688, + "loss": 0.7173, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6803563237190247, + "rewards/margins": 0.10033737123012543, + "rewards/rejected": -0.7806937098503113, + "step": 673 + }, + { + "epoch": 0.08, + "learning_rate": 2.8068973662454233e-07, + "logits/chosen": -2.445082187652588, + "logits/rejected": -1.9891459941864014, + "logps/chosen": -246.13426208496094, + "logps/rejected": -246.265625, + "loss": 0.4529, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3844401240348816, + "rewards/margins": 1.3719998598098755, + "rewards/rejected": -1.7564398050308228, + "step": 674 + }, + { + "epoch": 0.08, + "learning_rate": 2.806543049486241e-07, + "logits/chosen": -2.2285172939300537, + "logits/rejected": -2.497166633605957, + "logps/chosen": -260.9329528808594, + "logps/rejected": -170.54322814941406, + "loss": 1.1059, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7965224981307983, + "rewards/margins": 0.37358638644218445, + "rewards/rejected": -2.1701087951660156, + "step": 675 + }, + { + "epoch": 0.08, + "learning_rate": 2.8061887327270577e-07, + "logits/chosen": -2.3908731937408447, + "logits/rejected": -2.4542956352233887, + "logps/chosen": -181.05657958984375, + "logps/rejected": -160.290771484375, + "loss": 0.6856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9020822048187256, + "rewards/margins": 0.33048588037490845, + "rewards/rejected": -1.2325680255889893, + "step": 676 + }, + { + "epoch": 0.08, + "learning_rate": 2.805834415967875e-07, + "logits/chosen": -1.972471833229065, + "logits/rejected": -2.1061344146728516, + "logps/chosen": -469.3251953125, + "logps/rejected": -386.42938232421875, + "loss": 0.3062, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9037914872169495, + "rewards/margins": 1.7410629987716675, + "rewards/rejected": -2.6448545455932617, + "step": 677 + }, + { + "epoch": 0.08, + "learning_rate": 2.805480099208692e-07, + "logits/chosen": -2.8037707805633545, + "logits/rejected": -2.8312735557556152, + "logps/chosen": -224.88235473632812, + "logps/rejected": -197.22244262695312, + "loss": 0.511, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5849311947822571, + "rewards/margins": 2.288635492324829, + "rewards/rejected": -2.8735663890838623, + "step": 678 + }, + { + "epoch": 0.08, + "learning_rate": 2.8051257824495097e-07, + "logits/chosen": -2.513211965560913, + "logits/rejected": -2.4965219497680664, + "logps/chosen": -215.69085693359375, + "logps/rejected": -226.50442504882812, + "loss": 0.4229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7692616581916809, + "rewards/margins": 1.492958426475525, + "rewards/rejected": -2.2622201442718506, + "step": 679 + }, + { + "epoch": 0.08, + "learning_rate": 2.804771465690327e-07, + "logits/chosen": -1.805816650390625, + "logits/rejected": -1.7954068183898926, + "logps/chosen": -294.740966796875, + "logps/rejected": -285.6097412109375, + "loss": 0.603, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.715692400932312, + "rewards/margins": 0.38370200991630554, + "rewards/rejected": -1.09939444065094, + "step": 680 + }, + { + "epoch": 0.08, + "learning_rate": 2.804417148931144e-07, + "logits/chosen": -2.2543814182281494, + "logits/rejected": -2.4916348457336426, + "logps/chosen": -188.1788330078125, + "logps/rejected": -194.21267700195312, + "loss": 0.4896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4180101156234741, + "rewards/margins": 1.1600381135940552, + "rewards/rejected": -1.5780482292175293, + "step": 681 + }, + { + "epoch": 0.08, + "learning_rate": 2.8040628321719616e-07, + "logits/chosen": -2.9856488704681396, + "logits/rejected": -2.9655675888061523, + "logps/chosen": -347.2264099121094, + "logps/rejected": -204.5126953125, + "loss": 0.4166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28545743227005005, + "rewards/margins": 1.1703705787658691, + "rewards/rejected": -1.455828070640564, + "step": 682 + }, + { + "epoch": 0.08, + "learning_rate": 2.8037085154127785e-07, + "logits/chosen": -2.879774332046509, + "logits/rejected": -2.765890121459961, + "logps/chosen": -273.8333740234375, + "logps/rejected": -270.3662109375, + "loss": 0.5096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5706639885902405, + "rewards/margins": 1.928100824356079, + "rewards/rejected": -2.498764753341675, + "step": 683 + }, + { + "epoch": 0.08, + "learning_rate": 2.8033541986535966e-07, + "logits/chosen": -2.3434393405914307, + "logits/rejected": -2.2945632934570312, + "logps/chosen": -323.439453125, + "logps/rejected": -308.67352294921875, + "loss": 0.4146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8997571468353271, + "rewards/margins": 1.7598652839660645, + "rewards/rejected": -2.6596224308013916, + "step": 684 + }, + { + "epoch": 0.08, + "learning_rate": 2.8029998818944135e-07, + "logits/chosen": -2.49277925491333, + "logits/rejected": -2.3509836196899414, + "logps/chosen": -311.6294860839844, + "logps/rejected": -301.6763000488281, + "loss": 0.6562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5908518433570862, + "rewards/margins": 0.5877425074577332, + "rewards/rejected": -1.1785943508148193, + "step": 685 + }, + { + "epoch": 0.08, + "learning_rate": 2.802645565135231e-07, + "logits/chosen": -3.058283567428589, + "logits/rejected": -2.9940643310546875, + "logps/chosen": -185.73513793945312, + "logps/rejected": -170.11874389648438, + "loss": 0.2695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06979950517416, + "rewards/margins": 1.678873062133789, + "rewards/rejected": -1.748672604560852, + "step": 686 + }, + { + "epoch": 0.08, + "learning_rate": 2.802291248376048e-07, + "logits/chosen": -1.7685227394104004, + "logits/rejected": -2.010348320007324, + "logps/chosen": -365.30889892578125, + "logps/rejected": -236.9215850830078, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6521315574645996, + "rewards/margins": 0.23895293474197388, + "rewards/rejected": -0.8910845518112183, + "step": 687 + }, + { + "epoch": 0.08, + "learning_rate": 2.8019369316168654e-07, + "logits/chosen": -2.598428249359131, + "logits/rejected": -2.434915065765381, + "logps/chosen": -132.9219970703125, + "logps/rejected": -233.4220733642578, + "loss": 0.6621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7431386113166809, + "rewards/margins": 0.6722696423530579, + "rewards/rejected": -1.4154083728790283, + "step": 688 + }, + { + "epoch": 0.08, + "learning_rate": 2.8015826148576824e-07, + "logits/chosen": -2.456496238708496, + "logits/rejected": -2.3771629333496094, + "logps/chosen": -262.38531494140625, + "logps/rejected": -288.2380065917969, + "loss": 0.3292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42245495319366455, + "rewards/margins": 1.4673452377319336, + "rewards/rejected": -1.8898003101348877, + "step": 689 + }, + { + "epoch": 0.08, + "learning_rate": 2.8012282980985e-07, + "logits/chosen": -1.4079651832580566, + "logits/rejected": -2.055075168609619, + "logps/chosen": -450.5494079589844, + "logps/rejected": -198.27972412109375, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47688814997673035, + "rewards/margins": 0.5374201536178589, + "rewards/rejected": -1.014308214187622, + "step": 690 + }, + { + "epoch": 0.08, + "learning_rate": 2.8008739813393174e-07, + "logits/chosen": -2.591388702392578, + "logits/rejected": -2.5524582862854004, + "logps/chosen": -133.5125732421875, + "logps/rejected": -156.23046875, + "loss": 0.6672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6534445285797119, + "rewards/margins": 0.6154847741127014, + "rewards/rejected": -1.268929362297058, + "step": 691 + }, + { + "epoch": 0.08, + "learning_rate": 2.8005196645801343e-07, + "logits/chosen": -2.6435165405273438, + "logits/rejected": -2.7400946617126465, + "logps/chosen": -207.95704650878906, + "logps/rejected": -237.688720703125, + "loss": 0.3476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6209278702735901, + "rewards/margins": 1.43307363986969, + "rewards/rejected": -2.054001569747925, + "step": 692 + }, + { + "epoch": 0.08, + "learning_rate": 2.800165347820952e-07, + "logits/chosen": -2.5939781665802, + "logits/rejected": -2.648477077484131, + "logps/chosen": -134.758056640625, + "logps/rejected": -232.98541259765625, + "loss": 0.5362, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2669229507446289, + "rewards/margins": 1.2116219997406006, + "rewards/rejected": -1.4785449504852295, + "step": 693 + }, + { + "epoch": 0.08, + "learning_rate": 2.799811031061769e-07, + "logits/chosen": -2.6734209060668945, + "logits/rejected": -2.3314242362976074, + "logps/chosen": -145.61383056640625, + "logps/rejected": -229.9839324951172, + "loss": 0.4777, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5519739985466003, + "rewards/margins": 0.7958526611328125, + "rewards/rejected": -1.347826600074768, + "step": 694 + }, + { + "epoch": 0.08, + "learning_rate": 2.799456714302586e-07, + "logits/chosen": -2.6144814491271973, + "logits/rejected": -2.888803005218506, + "logps/chosen": -354.63031005859375, + "logps/rejected": -235.78070068359375, + "loss": 0.297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06772447377443314, + "rewards/margins": 1.6161918640136719, + "rewards/rejected": -1.6839163303375244, + "step": 695 + }, + { + "epoch": 0.08, + "learning_rate": 2.799102397543404e-07, + "logits/chosen": -2.860764741897583, + "logits/rejected": -2.8972458839416504, + "logps/chosen": -231.79254150390625, + "logps/rejected": -261.70855712890625, + "loss": 0.306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5158629417419434, + "rewards/margins": 1.3141611814498901, + "rewards/rejected": -1.8300241231918335, + "step": 696 + }, + { + "epoch": 0.08, + "learning_rate": 2.798748080784221e-07, + "logits/chosen": -2.0907487869262695, + "logits/rejected": -1.709822416305542, + "logps/chosen": -275.63763427734375, + "logps/rejected": -333.2041320800781, + "loss": 0.4658, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18836429715156555, + "rewards/margins": 1.2075763940811157, + "rewards/rejected": -1.3959407806396484, + "step": 697 + }, + { + "epoch": 0.08, + "learning_rate": 2.798393764025038e-07, + "logits/chosen": -1.857750415802002, + "logits/rejected": -1.8417351245880127, + "logps/chosen": -418.12078857421875, + "logps/rejected": -387.6116943359375, + "loss": 0.1157, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04824839159846306, + "rewards/margins": 2.574727773666382, + "rewards/rejected": -2.5264792442321777, + "step": 698 + }, + { + "epoch": 0.08, + "learning_rate": 2.7980394472658557e-07, + "logits/chosen": -2.4613399505615234, + "logits/rejected": -2.3161675930023193, + "logps/chosen": -174.2075653076172, + "logps/rejected": -184.5574951171875, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7121073603630066, + "rewards/margins": 2.3927316665649414, + "rewards/rejected": -3.1048388481140137, + "step": 699 + }, + { + "epoch": 0.08, + "learning_rate": 2.7976851305066726e-07, + "logits/chosen": -2.0053398609161377, + "logits/rejected": -2.2668185234069824, + "logps/chosen": -323.29925537109375, + "logps/rejected": -193.83499145507812, + "loss": 0.7332, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6456852555274963, + "rewards/margins": 0.4359254837036133, + "rewards/rejected": -1.0816106796264648, + "step": 700 + }, + { + "epoch": 0.08, + "learning_rate": 2.79733081374749e-07, + "logits/chosen": -2.719074010848999, + "logits/rejected": -2.7703237533569336, + "logps/chosen": -283.40362548828125, + "logps/rejected": -333.7422790527344, + "loss": 0.2756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8003613352775574, + "rewards/margins": 1.5607712268829346, + "rewards/rejected": -2.361132860183716, + "step": 701 + }, + { + "epoch": 0.08, + "learning_rate": 2.7969764969883076e-07, + "logits/chosen": -2.345130681991577, + "logits/rejected": -2.1284444332122803, + "logps/chosen": -412.54864501953125, + "logps/rejected": -377.3162841796875, + "loss": 0.3025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6909152269363403, + "rewards/margins": 1.6313015222549438, + "rewards/rejected": -2.322216749191284, + "step": 702 + }, + { + "epoch": 0.08, + "learning_rate": 2.7966221802291246e-07, + "logits/chosen": -2.021716594696045, + "logits/rejected": -2.0690855979919434, + "logps/chosen": -338.63262939453125, + "logps/rejected": -316.3762512207031, + "loss": 0.7908, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7158768773078918, + "rewards/margins": 0.5332122445106506, + "rewards/rejected": -1.2490891218185425, + "step": 703 + }, + { + "epoch": 0.08, + "learning_rate": 2.796267863469942e-07, + "logits/chosen": -1.840503454208374, + "logits/rejected": -2.06848406791687, + "logps/chosen": -565.7543334960938, + "logps/rejected": -361.3687744140625, + "loss": 0.2301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1513749063014984, + "rewards/margins": 1.6841871738433838, + "rewards/rejected": -1.835561990737915, + "step": 704 + }, + { + "epoch": 0.08, + "learning_rate": 2.795913546710759e-07, + "logits/chosen": -2.1994693279266357, + "logits/rejected": -2.34114933013916, + "logps/chosen": -464.0210876464844, + "logps/rejected": -300.4583740234375, + "loss": 1.1177, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5121523141860962, + "rewards/margins": 0.05686815083026886, + "rewards/rejected": -1.5690205097198486, + "step": 705 + }, + { + "epoch": 0.08, + "learning_rate": 2.7955592299515765e-07, + "logits/chosen": -2.586184024810791, + "logits/rejected": -2.7434518337249756, + "logps/chosen": -593.52783203125, + "logps/rejected": -334.9260559082031, + "loss": 0.4639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7823278903961182, + "rewards/margins": 1.1696197986602783, + "rewards/rejected": -1.9519476890563965, + "step": 706 + }, + { + "epoch": 0.08, + "learning_rate": 2.7952049131923934e-07, + "logits/chosen": -2.434286594390869, + "logits/rejected": -2.3348546028137207, + "logps/chosen": -190.62185668945312, + "logps/rejected": -220.13125610351562, + "loss": 0.4043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5339385271072388, + "rewards/margins": 1.9141103029251099, + "rewards/rejected": -2.4480488300323486, + "step": 707 + }, + { + "epoch": 0.08, + "learning_rate": 2.7948505964332115e-07, + "logits/chosen": -2.7167341709136963, + "logits/rejected": -2.6435136795043945, + "logps/chosen": -376.6390075683594, + "logps/rejected": -335.6934814453125, + "loss": 0.3032, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47025322914123535, + "rewards/margins": 1.9096953868865967, + "rewards/rejected": -2.379948854446411, + "step": 708 + }, + { + "epoch": 0.08, + "learning_rate": 2.7944962796740284e-07, + "logits/chosen": -2.669339656829834, + "logits/rejected": -2.7214407920837402, + "logps/chosen": -287.1677551269531, + "logps/rejected": -232.77674865722656, + "loss": 0.4921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4828144907951355, + "rewards/margins": 1.136594533920288, + "rewards/rejected": -1.6194088459014893, + "step": 709 + }, + { + "epoch": 0.08, + "learning_rate": 2.794141962914846e-07, + "logits/chosen": -2.4019737243652344, + "logits/rejected": -2.5598459243774414, + "logps/chosen": -267.4624328613281, + "logps/rejected": -260.18896484375, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46108418703079224, + "rewards/margins": 1.21381413936615, + "rewards/rejected": -1.6748985052108765, + "step": 710 + }, + { + "epoch": 0.08, + "learning_rate": 2.793787646155663e-07, + "logits/chosen": -2.3857693672180176, + "logits/rejected": -2.0118579864501953, + "logps/chosen": -236.5018310546875, + "logps/rejected": -277.8370056152344, + "loss": 0.4318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21453361213207245, + "rewards/margins": 1.2105472087860107, + "rewards/rejected": -1.4250807762145996, + "step": 711 + }, + { + "epoch": 0.08, + "learning_rate": 2.7934333293964803e-07, + "logits/chosen": -2.282078742980957, + "logits/rejected": -2.257857322692871, + "logps/chosen": -327.7319641113281, + "logps/rejected": -289.92364501953125, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.485090970993042, + "rewards/margins": 1.2860441207885742, + "rewards/rejected": -1.7711350917816162, + "step": 712 + }, + { + "epoch": 0.08, + "learning_rate": 2.793079012637298e-07, + "logits/chosen": -1.8446171283721924, + "logits/rejected": -1.83958101272583, + "logps/chosen": -301.27081298828125, + "logps/rejected": -311.137451171875, + "loss": 0.2616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5433977842330933, + "rewards/margins": 1.9595355987548828, + "rewards/rejected": -2.5029335021972656, + "step": 713 + }, + { + "epoch": 0.08, + "learning_rate": 2.792724695878115e-07, + "logits/chosen": -2.216991424560547, + "logits/rejected": -2.2228689193725586, + "logps/chosen": -183.7284393310547, + "logps/rejected": -220.22628784179688, + "loss": 1.035, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8529211282730103, + "rewards/margins": 0.05384710431098938, + "rewards/rejected": -1.9067683219909668, + "step": 714 + }, + { + "epoch": 0.08, + "learning_rate": 2.7923703791189323e-07, + "logits/chosen": -2.1335768699645996, + "logits/rejected": -2.0502963066101074, + "logps/chosen": -429.21673583984375, + "logps/rejected": -377.0115966796875, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3235827684402466, + "rewards/margins": 1.298314094543457, + "rewards/rejected": -1.621896743774414, + "step": 715 + }, + { + "epoch": 0.08, + "learning_rate": 2.792016062359749e-07, + "logits/chosen": -2.2320785522460938, + "logits/rejected": -2.269399642944336, + "logps/chosen": -235.19699096679688, + "logps/rejected": -287.6104736328125, + "loss": 0.6823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8427947759628296, + "rewards/margins": 1.0084350109100342, + "rewards/rejected": -1.8512297868728638, + "step": 716 + }, + { + "epoch": 0.08, + "learning_rate": 2.7916617456005667e-07, + "logits/chosen": -2.188983917236328, + "logits/rejected": -2.080789089202881, + "logps/chosen": -269.00555419921875, + "logps/rejected": -257.24346923828125, + "loss": 0.4682, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31422367691993713, + "rewards/margins": 1.3701045513153076, + "rewards/rejected": -1.6843281984329224, + "step": 717 + }, + { + "epoch": 0.08, + "learning_rate": 2.7913074288413837e-07, + "logits/chosen": -2.17915678024292, + "logits/rejected": -2.2116074562072754, + "logps/chosen": -291.34039306640625, + "logps/rejected": -225.12042236328125, + "loss": 0.5031, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23856277763843536, + "rewards/margins": 0.7490602731704712, + "rewards/rejected": -0.9876230955123901, + "step": 718 + }, + { + "epoch": 0.08, + "learning_rate": 2.7909531120822017e-07, + "logits/chosen": -2.093573570251465, + "logits/rejected": -2.3347115516662598, + "logps/chosen": -328.695068359375, + "logps/rejected": -300.95806884765625, + "loss": 0.5753, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9026257991790771, + "rewards/margins": 1.1759573221206665, + "rewards/rejected": -2.078583240509033, + "step": 719 + }, + { + "epoch": 0.08, + "learning_rate": 2.7905987953230186e-07, + "logits/chosen": -2.8407208919525146, + "logits/rejected": -2.7154998779296875, + "logps/chosen": -169.1520233154297, + "logps/rejected": -226.8980712890625, + "loss": 0.1991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39350438117980957, + "rewards/margins": 2.5129528045654297, + "rewards/rejected": -2.9064574241638184, + "step": 720 + }, + { + "epoch": 0.08, + "learning_rate": 2.790244478563836e-07, + "logits/chosen": -2.3318159580230713, + "logits/rejected": -2.4468765258789062, + "logps/chosen": -397.9857177734375, + "logps/rejected": -306.89410400390625, + "loss": 0.3031, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3837026357650757, + "rewards/margins": 1.9383556842803955, + "rewards/rejected": -2.3220582008361816, + "step": 721 + }, + { + "epoch": 0.08, + "learning_rate": 2.789890161804653e-07, + "logits/chosen": -1.924243688583374, + "logits/rejected": -1.798012375831604, + "logps/chosen": -187.9967498779297, + "logps/rejected": -194.39389038085938, + "loss": 0.6395, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5219980478286743, + "rewards/margins": 0.6820647716522217, + "rewards/rejected": -1.2040629386901855, + "step": 722 + }, + { + "epoch": 0.08, + "learning_rate": 2.7895358450454706e-07, + "logits/chosen": -2.528423547744751, + "logits/rejected": -2.359203338623047, + "logps/chosen": -275.095458984375, + "logps/rejected": -277.7443542480469, + "loss": 0.4522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4424862265586853, + "rewards/margins": 1.1355905532836914, + "rewards/rejected": -1.5780766010284424, + "step": 723 + }, + { + "epoch": 0.08, + "learning_rate": 2.789181528286288e-07, + "logits/chosen": -2.9008686542510986, + "logits/rejected": -2.8426506519317627, + "logps/chosen": -134.49192810058594, + "logps/rejected": -131.39321899414062, + "loss": 0.3508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07774001359939575, + "rewards/margins": 1.6229171752929688, + "rewards/rejected": -1.7006571292877197, + "step": 724 + }, + { + "epoch": 0.08, + "learning_rate": 2.788827211527105e-07, + "logits/chosen": -2.314517021179199, + "logits/rejected": -2.372591733932495, + "logps/chosen": -402.5282287597656, + "logps/rejected": -233.35751342773438, + "loss": 0.3686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03741845488548279, + "rewards/margins": 1.3896923065185547, + "rewards/rejected": -1.352273941040039, + "step": 725 + }, + { + "epoch": 0.08, + "learning_rate": 2.7884728947679225e-07, + "logits/chosen": -2.0849499702453613, + "logits/rejected": -2.025916814804077, + "logps/chosen": -344.33416748046875, + "logps/rejected": -257.15655517578125, + "loss": 0.2946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6020482778549194, + "rewards/margins": 1.996051549911499, + "rewards/rejected": -2.598099708557129, + "step": 726 + }, + { + "epoch": 0.08, + "learning_rate": 2.7881185780087395e-07, + "logits/chosen": -2.5358870029449463, + "logits/rejected": -2.5003550052642822, + "logps/chosen": -203.62911987304688, + "logps/rejected": -220.8814697265625, + "loss": 0.6079, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02344139665365219, + "rewards/margins": 0.6157550811767578, + "rewards/rejected": -0.5923135876655579, + "step": 727 + }, + { + "epoch": 0.08, + "learning_rate": 2.787764261249557e-07, + "logits/chosen": -2.0504770278930664, + "logits/rejected": -1.8686686754226685, + "logps/chosen": -363.89306640625, + "logps/rejected": -502.861328125, + "loss": 0.6617, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6800999045372009, + "rewards/margins": 0.7529337406158447, + "rewards/rejected": -1.4330337047576904, + "step": 728 + }, + { + "epoch": 0.08, + "learning_rate": 2.787409944490374e-07, + "logits/chosen": -2.374544620513916, + "logits/rejected": -2.033883571624756, + "logps/chosen": -146.85714721679688, + "logps/rejected": -300.08538818359375, + "loss": 0.3987, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5079129934310913, + "rewards/margins": 2.0036351680755615, + "rewards/rejected": -2.5115480422973633, + "step": 729 + }, + { + "epoch": 0.08, + "learning_rate": 2.7870556277311914e-07, + "logits/chosen": -2.5551064014434814, + "logits/rejected": -2.3524866104125977, + "logps/chosen": -176.80889892578125, + "logps/rejected": -254.78125, + "loss": 0.2503, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06395383179187775, + "rewards/margins": 1.668338418006897, + "rewards/rejected": -1.6043845415115356, + "step": 730 + }, + { + "epoch": 0.09, + "learning_rate": 2.786701310972009e-07, + "logits/chosen": -2.0015459060668945, + "logits/rejected": -2.36329984664917, + "logps/chosen": -489.8224182128906, + "logps/rejected": -320.53790283203125, + "loss": 0.4448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5152555108070374, + "rewards/margins": 1.4673027992248535, + "rewards/rejected": -1.982558250427246, + "step": 731 + }, + { + "epoch": 0.09, + "learning_rate": 2.7863469942128264e-07, + "logits/chosen": -2.347792625427246, + "logits/rejected": -2.2414045333862305, + "logps/chosen": -403.95318603515625, + "logps/rejected": -478.7794494628906, + "loss": 0.7786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49405935406684875, + "rewards/margins": 0.552676796913147, + "rewards/rejected": -1.046736240386963, + "step": 732 + }, + { + "epoch": 0.09, + "learning_rate": 2.7859926774536433e-07, + "logits/chosen": -2.3601300716400146, + "logits/rejected": -2.4505324363708496, + "logps/chosen": -492.1108093261719, + "logps/rejected": -448.75201416015625, + "loss": 0.5489, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5677691698074341, + "rewards/margins": 0.538233757019043, + "rewards/rejected": -1.1060028076171875, + "step": 733 + }, + { + "epoch": 0.09, + "learning_rate": 2.785638360694461e-07, + "logits/chosen": -2.652724266052246, + "logits/rejected": -2.379751443862915, + "logps/chosen": -149.8656005859375, + "logps/rejected": -295.765869140625, + "loss": 0.161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4850791096687317, + "rewards/margins": 2.219998836517334, + "rewards/rejected": -2.705078125, + "step": 734 + }, + { + "epoch": 0.09, + "learning_rate": 2.7852840439352783e-07, + "logits/chosen": -2.695128917694092, + "logits/rejected": -2.4105587005615234, + "logps/chosen": -238.45635986328125, + "logps/rejected": -201.91998291015625, + "loss": 0.4138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3905141353607178, + "rewards/margins": 0.9921962022781372, + "rewards/rejected": -1.3827104568481445, + "step": 735 + }, + { + "epoch": 0.09, + "learning_rate": 2.784929727176095e-07, + "logits/chosen": -2.409235954284668, + "logits/rejected": -2.511838912963867, + "logps/chosen": -246.9296417236328, + "logps/rejected": -258.2899169921875, + "loss": 0.3567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1772826761007309, + "rewards/margins": 1.1907989978790283, + "rewards/rejected": -1.3680816888809204, + "step": 736 + }, + { + "epoch": 0.09, + "learning_rate": 2.7845754104169127e-07, + "logits/chosen": -1.7174152135849, + "logits/rejected": -1.937434434890747, + "logps/chosen": -467.7893371582031, + "logps/rejected": -287.3309020996094, + "loss": 0.3511, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1859460175037384, + "rewards/margins": 1.3349452018737793, + "rewards/rejected": -1.5208911895751953, + "step": 737 + }, + { + "epoch": 0.09, + "learning_rate": 2.7842210936577297e-07, + "logits/chosen": -2.61128306388855, + "logits/rejected": -2.646527051925659, + "logps/chosen": -242.2703094482422, + "logps/rejected": -256.3012390136719, + "loss": 0.4762, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5834391117095947, + "rewards/margins": 0.6834787726402283, + "rewards/rejected": -1.2669178247451782, + "step": 738 + }, + { + "epoch": 0.09, + "learning_rate": 2.783866776898547e-07, + "logits/chosen": -1.547605276107788, + "logits/rejected": -1.8730759620666504, + "logps/chosen": -483.8611145019531, + "logps/rejected": -262.24493408203125, + "loss": 0.4126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30781662464141846, + "rewards/margins": 1.2256001234054565, + "rewards/rejected": -1.533416748046875, + "step": 739 + }, + { + "epoch": 0.09, + "learning_rate": 2.783512460139364e-07, + "logits/chosen": -2.767202377319336, + "logits/rejected": -2.5742387771606445, + "logps/chosen": -243.34011840820312, + "logps/rejected": -310.51641845703125, + "loss": 0.2148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1371321678161621, + "rewards/margins": 1.861680030822754, + "rewards/rejected": -1.998812198638916, + "step": 740 + }, + { + "epoch": 0.09, + "learning_rate": 2.7831581433801816e-07, + "logits/chosen": -1.9457716941833496, + "logits/rejected": -2.2742390632629395, + "logps/chosen": -501.174072265625, + "logps/rejected": -301.8258056640625, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9595270156860352, + "rewards/margins": 1.911973237991333, + "rewards/rejected": -2.871500253677368, + "step": 741 + }, + { + "epoch": 0.09, + "learning_rate": 2.782803826620999e-07, + "logits/chosen": -1.4854416847229004, + "logits/rejected": -2.0714879035949707, + "logps/chosen": -451.8368225097656, + "logps/rejected": -287.3528137207031, + "loss": 0.624, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.311759352684021, + "rewards/margins": 0.4674542546272278, + "rewards/rejected": -1.779213547706604, + "step": 742 + }, + { + "epoch": 0.09, + "learning_rate": 2.7824495098618166e-07, + "logits/chosen": -2.3233892917633057, + "logits/rejected": -2.397526741027832, + "logps/chosen": -415.1954040527344, + "logps/rejected": -445.427734375, + "loss": 0.1793, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1484445333480835, + "rewards/margins": 2.864758014678955, + "rewards/rejected": -2.716313362121582, + "step": 743 + }, + { + "epoch": 0.09, + "learning_rate": 2.7820951931026335e-07, + "logits/chosen": -2.107908010482788, + "logits/rejected": -2.0752065181732178, + "logps/chosen": -237.20498657226562, + "logps/rejected": -181.3135223388672, + "loss": 1.4602, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2329823970794678, + "rewards/margins": -0.00503915548324585, + "rewards/rejected": -1.2279433012008667, + "step": 744 + }, + { + "epoch": 0.09, + "learning_rate": 2.781740876343451e-07, + "logits/chosen": -2.3671107292175293, + "logits/rejected": -2.007723093032837, + "logps/chosen": -198.54006958007812, + "logps/rejected": -294.02581787109375, + "loss": 0.4608, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5045974850654602, + "rewards/margins": 0.9476430416107178, + "rewards/rejected": -1.4522404670715332, + "step": 745 + }, + { + "epoch": 0.09, + "learning_rate": 2.7813865595842685e-07, + "logits/chosen": -1.6849923133850098, + "logits/rejected": -1.9355881214141846, + "logps/chosen": -273.6889953613281, + "logps/rejected": -263.3406982421875, + "loss": 0.6669, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3472254276275635, + "rewards/margins": 1.2660205364227295, + "rewards/rejected": -2.613245964050293, + "step": 746 + }, + { + "epoch": 0.09, + "learning_rate": 2.7810322428250855e-07, + "logits/chosen": -2.4438881874084473, + "logits/rejected": -2.0594675540924072, + "logps/chosen": -191.34117126464844, + "logps/rejected": -248.95152282714844, + "loss": 1.7044, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3999998569488525, + "rewards/margins": -0.08021494746208191, + "rewards/rejected": -2.3197848796844482, + "step": 747 + }, + { + "epoch": 0.09, + "learning_rate": 2.780677926065903e-07, + "logits/chosen": -1.3684498071670532, + "logits/rejected": -1.8824512958526611, + "logps/chosen": -457.8465881347656, + "logps/rejected": -249.6936492919922, + "loss": 0.7575, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.932547390460968, + "rewards/margins": 0.055296219885349274, + "rewards/rejected": -0.9878436326980591, + "step": 748 + }, + { + "epoch": 0.09, + "learning_rate": 2.78032360930672e-07, + "logits/chosen": -2.6013617515563965, + "logits/rejected": -2.5035312175750732, + "logps/chosen": -218.39523315429688, + "logps/rejected": -292.25823974609375, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12753260135650635, + "rewards/margins": 3.019993543624878, + "rewards/rejected": -2.892460823059082, + "step": 749 + }, + { + "epoch": 0.09, + "learning_rate": 2.7799692925475374e-07, + "logits/chosen": -1.5520503520965576, + "logits/rejected": -2.206054210662842, + "logps/chosen": -348.1911926269531, + "logps/rejected": -161.59873962402344, + "loss": 0.6764, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5584234595298767, + "rewards/margins": 0.31963321566581726, + "rewards/rejected": -0.8780567049980164, + "step": 750 + }, + { + "epoch": 0.09, + "learning_rate": 2.7796149757883544e-07, + "logits/chosen": -2.3355801105499268, + "logits/rejected": -2.4781787395477295, + "logps/chosen": -347.6722106933594, + "logps/rejected": -282.9828186035156, + "loss": 0.5541, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31582963466644287, + "rewards/margins": 0.6623234748840332, + "rewards/rejected": -0.9781531095504761, + "step": 751 + }, + { + "epoch": 0.09, + "learning_rate": 2.779260659029172e-07, + "logits/chosen": -2.9521708488464355, + "logits/rejected": -2.682919502258301, + "logps/chosen": -173.8658447265625, + "logps/rejected": -238.75140380859375, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27960172295570374, + "rewards/margins": 2.2849879264831543, + "rewards/rejected": -2.564589500427246, + "step": 752 + }, + { + "epoch": 0.09, + "learning_rate": 2.7789063422699893e-07, + "logits/chosen": -2.027177333831787, + "logits/rejected": -2.2728569507598877, + "logps/chosen": -333.5521545410156, + "logps/rejected": -225.32772827148438, + "loss": 0.757, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.975767970085144, + "rewards/margins": 0.1552283614873886, + "rewards/rejected": -1.1309963464736938, + "step": 753 + }, + { + "epoch": 0.09, + "learning_rate": 2.778552025510807e-07, + "logits/chosen": -2.277604579925537, + "logits/rejected": -2.258746862411499, + "logps/chosen": -161.8819580078125, + "logps/rejected": -165.3174285888672, + "loss": 0.718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7111003398895264, + "rewards/margins": 0.17237190902233124, + "rewards/rejected": -0.883472204208374, + "step": 754 + }, + { + "epoch": 0.09, + "learning_rate": 2.778197708751624e-07, + "logits/chosen": -1.8956842422485352, + "logits/rejected": -1.7877538204193115, + "logps/chosen": -289.3042907714844, + "logps/rejected": -367.7824401855469, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.821601152420044, + "rewards/margins": 1.4629815816879272, + "rewards/rejected": -2.2845826148986816, + "step": 755 + }, + { + "epoch": 0.09, + "learning_rate": 2.777843391992441e-07, + "logits/chosen": -2.3617255687713623, + "logits/rejected": -2.436927556991577, + "logps/chosen": -268.787353515625, + "logps/rejected": -225.15150451660156, + "loss": 0.6219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7100657224655151, + "rewards/margins": 0.6736667156219482, + "rewards/rejected": -1.3837324380874634, + "step": 756 + }, + { + "epoch": 0.09, + "learning_rate": 2.777489075233259e-07, + "logits/chosen": -2.4909229278564453, + "logits/rejected": -2.2607004642486572, + "logps/chosen": -84.35908508300781, + "logps/rejected": -181.00213623046875, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6459870934486389, + "rewards/margins": 1.2605112791061401, + "rewards/rejected": -1.9064984321594238, + "step": 757 + }, + { + "epoch": 0.09, + "learning_rate": 2.7771347584740757e-07, + "logits/chosen": -2.337822198867798, + "logits/rejected": -2.4811174869537354, + "logps/chosen": -336.9784240722656, + "logps/rejected": -266.4892883300781, + "loss": 0.5226, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7801718711853027, + "rewards/margins": 0.5696690678596497, + "rewards/rejected": -1.3498408794403076, + "step": 758 + }, + { + "epoch": 0.09, + "learning_rate": 2.776780441714893e-07, + "logits/chosen": -2.4789628982543945, + "logits/rejected": -2.607809543609619, + "logps/chosen": -115.53268432617188, + "logps/rejected": -136.46316528320312, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11838234961032867, + "rewards/margins": 1.2281792163848877, + "rewards/rejected": -1.1097968816757202, + "step": 759 + }, + { + "epoch": 0.09, + "learning_rate": 2.77642612495571e-07, + "logits/chosen": -1.9995639324188232, + "logits/rejected": -2.273143768310547, + "logps/chosen": -286.893798828125, + "logps/rejected": -187.37335205078125, + "loss": 0.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40587806701660156, + "rewards/margins": 1.0974023342132568, + "rewards/rejected": -1.5032804012298584, + "step": 760 + }, + { + "epoch": 0.09, + "learning_rate": 2.7760718081965276e-07, + "logits/chosen": -1.9202815294265747, + "logits/rejected": -1.9412760734558105, + "logps/chosen": -381.6779479980469, + "logps/rejected": -389.0871276855469, + "loss": 0.267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.576455295085907, + "rewards/margins": 1.6500946283340454, + "rewards/rejected": -2.2265498638153076, + "step": 761 + }, + { + "epoch": 0.09, + "learning_rate": 2.7757174914373446e-07, + "logits/chosen": -2.048334836959839, + "logits/rejected": -2.059481382369995, + "logps/chosen": -163.4219512939453, + "logps/rejected": -160.32611083984375, + "loss": 0.7496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7007333636283875, + "rewards/margins": 0.2021094560623169, + "rewards/rejected": -0.9028427600860596, + "step": 762 + }, + { + "epoch": 0.09, + "learning_rate": 2.775363174678162e-07, + "logits/chosen": -2.343045473098755, + "logits/rejected": -1.9673285484313965, + "logps/chosen": -300.00970458984375, + "logps/rejected": -433.0460205078125, + "loss": 0.299, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3984956443309784, + "rewards/margins": 2.5965304374694824, + "rewards/rejected": -2.995026111602783, + "step": 763 + }, + { + "epoch": 0.09, + "learning_rate": 2.7750088579189796e-07, + "logits/chosen": -2.638866424560547, + "logits/rejected": -2.860140085220337, + "logps/chosen": -325.6002197265625, + "logps/rejected": -262.08892822265625, + "loss": 0.3829, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1484016180038452, + "rewards/margins": 1.3936734199523926, + "rewards/rejected": -2.5420749187469482, + "step": 764 + }, + { + "epoch": 0.09, + "learning_rate": 2.7746545411597965e-07, + "logits/chosen": -2.7840490341186523, + "logits/rejected": -2.6404807567596436, + "logps/chosen": -100.87081909179688, + "logps/rejected": -264.884765625, + "loss": 0.4451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7375617027282715, + "rewards/margins": 1.909604549407959, + "rewards/rejected": -2.6471662521362305, + "step": 765 + }, + { + "epoch": 0.09, + "learning_rate": 2.774300224400614e-07, + "logits/chosen": -2.1366701126098633, + "logits/rejected": -2.3100757598876953, + "logps/chosen": -411.67822265625, + "logps/rejected": -304.1395263671875, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.563908576965332, + "rewards/margins": 1.5846513509750366, + "rewards/rejected": -2.148560047149658, + "step": 766 + }, + { + "epoch": 0.09, + "learning_rate": 2.7739459076414315e-07, + "logits/chosen": -2.471198558807373, + "logits/rejected": -2.5612523555755615, + "logps/chosen": -307.8586730957031, + "logps/rejected": -304.1485595703125, + "loss": 0.3437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26797568798065186, + "rewards/margins": 5.004037857055664, + "rewards/rejected": -5.272013187408447, + "step": 767 + }, + { + "epoch": 0.09, + "learning_rate": 2.7735915908822484e-07, + "logits/chosen": -2.29203462600708, + "logits/rejected": -2.4890825748443604, + "logps/chosen": -374.6341857910156, + "logps/rejected": -294.659423828125, + "loss": 0.3593, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.005583435297012329, + "rewards/margins": 1.8434807062149048, + "rewards/rejected": -1.8490641117095947, + "step": 768 + }, + { + "epoch": 0.09, + "learning_rate": 2.773237274123066e-07, + "logits/chosen": -1.830995798110962, + "logits/rejected": -1.9686992168426514, + "logps/chosen": -439.869140625, + "logps/rejected": -289.37225341796875, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5556020736694336, + "rewards/margins": 1.1723510026931763, + "rewards/rejected": -1.7279530763626099, + "step": 769 + }, + { + "epoch": 0.09, + "learning_rate": 2.7728829573638834e-07, + "logits/chosen": -2.4538049697875977, + "logits/rejected": -2.726793050765991, + "logps/chosen": -261.624755859375, + "logps/rejected": -196.87759399414062, + "loss": 0.2587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03366359323263168, + "rewards/margins": 1.9900729656219482, + "rewards/rejected": -1.956409215927124, + "step": 770 + }, + { + "epoch": 0.09, + "learning_rate": 2.7725286406047004e-07, + "logits/chosen": -2.227140426635742, + "logits/rejected": -2.5036261081695557, + "logps/chosen": -346.79925537109375, + "logps/rejected": -195.57916259765625, + "loss": 0.4527, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42519277334213257, + "rewards/margins": 0.9860503673553467, + "rewards/rejected": -1.411243200302124, + "step": 771 + }, + { + "epoch": 0.09, + "learning_rate": 2.772174323845518e-07, + "logits/chosen": -2.4367258548736572, + "logits/rejected": -2.472235918045044, + "logps/chosen": -252.1932373046875, + "logps/rejected": -319.0783386230469, + "loss": 0.7871, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5913159847259521, + "rewards/margins": 0.7784263491630554, + "rewards/rejected": -2.3697423934936523, + "step": 772 + }, + { + "epoch": 0.09, + "learning_rate": 2.771820007086335e-07, + "logits/chosen": -2.182321310043335, + "logits/rejected": -2.109429121017456, + "logps/chosen": -230.63787841796875, + "logps/rejected": -213.37655639648438, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2099878340959549, + "rewards/margins": 3.0668489933013916, + "rewards/rejected": -3.27683687210083, + "step": 773 + }, + { + "epoch": 0.09, + "learning_rate": 2.7714656903271523e-07, + "logits/chosen": -2.0221927165985107, + "logits/rejected": -2.1753525733947754, + "logps/chosen": -232.27548217773438, + "logps/rejected": -201.60400390625, + "loss": 0.484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48170843720436096, + "rewards/margins": 1.1055957078933716, + "rewards/rejected": -1.5873041152954102, + "step": 774 + }, + { + "epoch": 0.09, + "learning_rate": 2.77111137356797e-07, + "logits/chosen": -2.184272527694702, + "logits/rejected": -2.142824172973633, + "logps/chosen": -158.35577392578125, + "logps/rejected": -155.5572509765625, + "loss": 0.7086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9336909651756287, + "rewards/margins": 0.2074240744113922, + "rewards/rejected": -1.1411150693893433, + "step": 775 + }, + { + "epoch": 0.09, + "learning_rate": 2.770757056808787e-07, + "logits/chosen": -2.5685808658599854, + "logits/rejected": -2.6487743854522705, + "logps/chosen": -544.484619140625, + "logps/rejected": -298.53662109375, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4809766709804535, + "rewards/margins": 1.0506956577301025, + "rewards/rejected": -1.5316723585128784, + "step": 776 + }, + { + "epoch": 0.09, + "learning_rate": 2.770402740049604e-07, + "logits/chosen": -2.2997710704803467, + "logits/rejected": -2.443389654159546, + "logps/chosen": -234.91281127929688, + "logps/rejected": -327.29913330078125, + "loss": 0.4065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6117860078811646, + "rewards/margins": 1.0197553634643555, + "rewards/rejected": -1.6315414905548096, + "step": 777 + }, + { + "epoch": 0.09, + "learning_rate": 2.7700484232904217e-07, + "logits/chosen": -2.995149850845337, + "logits/rejected": -3.0570287704467773, + "logps/chosen": -168.66322326660156, + "logps/rejected": -243.2884979248047, + "loss": 0.6478, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9907075762748718, + "rewards/margins": 1.7721009254455566, + "rewards/rejected": -2.7628087997436523, + "step": 778 + }, + { + "epoch": 0.09, + "learning_rate": 2.7696941065312387e-07, + "logits/chosen": -2.3316986560821533, + "logits/rejected": -2.5488295555114746, + "logps/chosen": -252.421142578125, + "logps/rejected": -265.6931457519531, + "loss": 0.4779, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2507890462875366, + "rewards/margins": 1.328528642654419, + "rewards/rejected": -1.579317569732666, + "step": 779 + }, + { + "epoch": 0.09, + "learning_rate": 2.769339789772056e-07, + "logits/chosen": -2.075249195098877, + "logits/rejected": -2.1930127143859863, + "logps/chosen": -263.2025146484375, + "logps/rejected": -127.05046844482422, + "loss": 0.6815, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3834928870201111, + "rewards/margins": 0.12096354365348816, + "rewards/rejected": -0.5044564008712769, + "step": 780 + }, + { + "epoch": 0.09, + "learning_rate": 2.7689854730128736e-07, + "logits/chosen": -1.9515782594680786, + "logits/rejected": -2.0021324157714844, + "logps/chosen": -267.23907470703125, + "logps/rejected": -254.88040161132812, + "loss": 1.2877, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5350732803344727, + "rewards/margins": 0.027923956513404846, + "rewards/rejected": -1.5629972219467163, + "step": 781 + }, + { + "epoch": 0.09, + "learning_rate": 2.7686311562536906e-07, + "logits/chosen": -2.95241641998291, + "logits/rejected": -2.967081308364868, + "logps/chosen": -248.35562133789062, + "logps/rejected": -201.17227172851562, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7876554131507874, + "rewards/margins": 0.456155002117157, + "rewards/rejected": -1.2438104152679443, + "step": 782 + }, + { + "epoch": 0.09, + "learning_rate": 2.768276839494508e-07, + "logits/chosen": -2.4732584953308105, + "logits/rejected": -2.427335262298584, + "logps/chosen": -364.05792236328125, + "logps/rejected": -264.27178955078125, + "loss": 0.3338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3528885245323181, + "rewards/margins": 1.496238112449646, + "rewards/rejected": -1.8491265773773193, + "step": 783 + }, + { + "epoch": 0.09, + "learning_rate": 2.767922522735325e-07, + "logits/chosen": -2.363307476043701, + "logits/rejected": -2.545637845993042, + "logps/chosen": -347.4714660644531, + "logps/rejected": -372.4090576171875, + "loss": 1.3909, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3513449430465698, + "rewards/margins": -0.5820550918579102, + "rewards/rejected": -0.7692897319793701, + "step": 784 + }, + { + "epoch": 0.09, + "learning_rate": 2.7675682059761425e-07, + "logits/chosen": -2.2637083530426025, + "logits/rejected": -2.5251107215881348, + "logps/chosen": -331.9575500488281, + "logps/rejected": -218.31024169921875, + "loss": 0.7148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7790206670761108, + "rewards/margins": 0.4762803316116333, + "rewards/rejected": -1.2553009986877441, + "step": 785 + }, + { + "epoch": 0.09, + "learning_rate": 2.76721388921696e-07, + "logits/chosen": -2.2073974609375, + "logits/rejected": -2.0144035816192627, + "logps/chosen": -146.69845581054688, + "logps/rejected": -170.0240478515625, + "loss": 0.8367, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7718556523323059, + "rewards/margins": 0.18709015846252441, + "rewards/rejected": -0.9589458107948303, + "step": 786 + }, + { + "epoch": 0.09, + "learning_rate": 2.766859572457777e-07, + "logits/chosen": -2.3679733276367188, + "logits/rejected": -2.3844170570373535, + "logps/chosen": -225.81332397460938, + "logps/rejected": -206.98191833496094, + "loss": 0.5774, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6836738586425781, + "rewards/margins": 0.5918539762496948, + "rewards/rejected": -1.2755277156829834, + "step": 787 + }, + { + "epoch": 0.09, + "learning_rate": 2.7665052556985945e-07, + "logits/chosen": -2.55051326751709, + "logits/rejected": -2.6735756397247314, + "logps/chosen": -209.61671447753906, + "logps/rejected": -279.142578125, + "loss": 0.2528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45962581038475037, + "rewards/margins": 2.245039463043213, + "rewards/rejected": -2.704665184020996, + "step": 788 + }, + { + "epoch": 0.09, + "learning_rate": 2.766150938939412e-07, + "logits/chosen": -2.55784273147583, + "logits/rejected": -2.3539445400238037, + "logps/chosen": -117.7677230834961, + "logps/rejected": -215.36190795898438, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1559140384197235, + "rewards/margins": 1.5776423215866089, + "rewards/rejected": -1.4217283725738525, + "step": 789 + }, + { + "epoch": 0.09, + "learning_rate": 2.765796622180229e-07, + "logits/chosen": -2.49919056892395, + "logits/rejected": -2.1292238235473633, + "logps/chosen": -186.37905883789062, + "logps/rejected": -212.44821166992188, + "loss": 0.4155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6736366748809814, + "rewards/margins": 1.3675307035446167, + "rewards/rejected": -2.0411672592163086, + "step": 790 + }, + { + "epoch": 0.09, + "learning_rate": 2.7654423054210464e-07, + "logits/chosen": -2.3219194412231445, + "logits/rejected": -2.026634693145752, + "logps/chosen": -202.41001892089844, + "logps/rejected": -380.6002502441406, + "loss": 0.4469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47527679800987244, + "rewards/margins": 0.9600129127502441, + "rewards/rejected": -1.435289740562439, + "step": 791 + }, + { + "epoch": 0.09, + "learning_rate": 2.765087988661864e-07, + "logits/chosen": -2.4608097076416016, + "logits/rejected": -2.4931468963623047, + "logps/chosen": -262.5205993652344, + "logps/rejected": -265.9735412597656, + "loss": 0.3976, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3391968011856079, + "rewards/margins": 1.0428065061569214, + "rewards/rejected": -1.3820033073425293, + "step": 792 + }, + { + "epoch": 0.09, + "learning_rate": 2.764733671902681e-07, + "logits/chosen": -1.8869818449020386, + "logits/rejected": -2.116821527481079, + "logps/chosen": -379.5481262207031, + "logps/rejected": -283.2991027832031, + "loss": 0.3781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28465360403060913, + "rewards/margins": 1.3219412565231323, + "rewards/rejected": -1.6065948009490967, + "step": 793 + }, + { + "epoch": 0.09, + "learning_rate": 2.7643793551434983e-07, + "logits/chosen": -2.203321933746338, + "logits/rejected": -2.240670680999756, + "logps/chosen": -323.3741149902344, + "logps/rejected": -466.890625, + "loss": 0.2431, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1400532722473145, + "rewards/margins": 1.7018001079559326, + "rewards/rejected": -2.841853141784668, + "step": 794 + }, + { + "epoch": 0.09, + "learning_rate": 2.7640250383843153e-07, + "logits/chosen": -1.8497653007507324, + "logits/rejected": -1.9135180711746216, + "logps/chosen": -158.06689453125, + "logps/rejected": -240.0703125, + "loss": 0.573, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4657929539680481, + "rewards/margins": 0.5899807810783386, + "rewards/rejected": -1.0557737350463867, + "step": 795 + }, + { + "epoch": 0.09, + "learning_rate": 2.763670721625133e-07, + "logits/chosen": -2.804980754852295, + "logits/rejected": -2.5880014896392822, + "logps/chosen": -217.15866088867188, + "logps/rejected": -249.5355987548828, + "loss": 0.5109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5490109324455261, + "rewards/margins": 0.7773458957672119, + "rewards/rejected": -1.3263567686080933, + "step": 796 + }, + { + "epoch": 0.09, + "learning_rate": 2.7633164048659497e-07, + "logits/chosen": -2.3626606464385986, + "logits/rejected": -2.200685977935791, + "logps/chosen": -120.37093353271484, + "logps/rejected": -232.69943237304688, + "loss": 0.5993, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3119090795516968, + "rewards/margins": 1.0987520217895508, + "rewards/rejected": -2.410661220550537, + "step": 797 + }, + { + "epoch": 0.09, + "learning_rate": 2.762962088106767e-07, + "logits/chosen": -2.2215070724487305, + "logits/rejected": -2.170691728591919, + "logps/chosen": -225.33628845214844, + "logps/rejected": -256.47784423828125, + "loss": 0.8262, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.21883225440979, + "rewards/margins": 0.43528297543525696, + "rewards/rejected": -1.6541152000427246, + "step": 798 + }, + { + "epoch": 0.09, + "learning_rate": 2.7626077713475847e-07, + "logits/chosen": -2.135340690612793, + "logits/rejected": -2.3261606693267822, + "logps/chosen": -344.9084167480469, + "logps/rejected": -310.552001953125, + "loss": 0.3069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006478369235992432, + "rewards/margins": 1.3188191652297974, + "rewards/rejected": -1.3123407363891602, + "step": 799 + }, + { + "epoch": 0.09, + "learning_rate": 2.7622534545884016e-07, + "logits/chosen": -1.9407858848571777, + "logits/rejected": -1.762431025505066, + "logps/chosen": -232.25344848632812, + "logps/rejected": -340.0291748046875, + "loss": 0.4721, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2694483995437622, + "rewards/margins": 0.9252721071243286, + "rewards/rejected": -1.1947205066680908, + "step": 800 + }, + { + "epoch": 0.09, + "learning_rate": 2.761899137829219e-07, + "logits/chosen": -2.562016248703003, + "logits/rejected": -2.43929123878479, + "logps/chosen": -203.78733825683594, + "logps/rejected": -274.46759033203125, + "loss": 0.3273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5996531844139099, + "rewards/margins": 1.1518138647079468, + "rewards/rejected": -1.7514671087265015, + "step": 801 + }, + { + "epoch": 0.09, + "learning_rate": 2.7615448210700366e-07, + "logits/chosen": -2.294565200805664, + "logits/rejected": -2.561570167541504, + "logps/chosen": -348.9151611328125, + "logps/rejected": -220.5670166015625, + "loss": 1.037, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6222509145736694, + "rewards/margins": 0.014660343527793884, + "rewards/rejected": -1.636911392211914, + "step": 802 + }, + { + "epoch": 0.09, + "learning_rate": 2.761190504310854e-07, + "logits/chosen": -1.995142936706543, + "logits/rejected": -2.0590567588806152, + "logps/chosen": -217.62197875976562, + "logps/rejected": -251.7452392578125, + "loss": 0.3422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1148129478096962, + "rewards/margins": 1.4329867362976074, + "rewards/rejected": -1.5477995872497559, + "step": 803 + }, + { + "epoch": 0.09, + "learning_rate": 2.760836187551671e-07, + "logits/chosen": -2.3252084255218506, + "logits/rejected": -2.1150801181793213, + "logps/chosen": -445.9859313964844, + "logps/rejected": -413.94976806640625, + "loss": 0.3354, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7510362267494202, + "rewards/margins": 1.5921887159347534, + "rewards/rejected": -2.3432250022888184, + "step": 804 + }, + { + "epoch": 0.09, + "learning_rate": 2.7604818707924885e-07, + "logits/chosen": -2.165978193283081, + "logits/rejected": -1.8848586082458496, + "logps/chosen": -402.4139709472656, + "logps/rejected": -357.16595458984375, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23427307605743408, + "rewards/margins": 2.4704904556274414, + "rewards/rejected": -2.704763412475586, + "step": 805 + }, + { + "epoch": 0.09, + "learning_rate": 2.7601275540333055e-07, + "logits/chosen": -2.431500196456909, + "logits/rejected": -2.123365879058838, + "logps/chosen": -253.3337860107422, + "logps/rejected": -286.802001953125, + "loss": 0.7088, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5514963865280151, + "rewards/margins": 0.2381865382194519, + "rewards/rejected": -0.789682924747467, + "step": 806 + }, + { + "epoch": 0.09, + "learning_rate": 2.759773237274123e-07, + "logits/chosen": -1.5364643335342407, + "logits/rejected": -1.8320200443267822, + "logps/chosen": -453.3089904785156, + "logps/rejected": -368.9715576171875, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31580618023872375, + "rewards/margins": 1.396337866783142, + "rewards/rejected": -1.7121440172195435, + "step": 807 + }, + { + "epoch": 0.09, + "learning_rate": 2.75941892051494e-07, + "logits/chosen": -2.7579281330108643, + "logits/rejected": -2.680347442626953, + "logps/chosen": -333.203125, + "logps/rejected": -458.37664794921875, + "loss": 0.4691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6152685284614563, + "rewards/margins": 1.2705790996551514, + "rewards/rejected": -1.8858474493026733, + "step": 808 + }, + { + "epoch": 0.09, + "learning_rate": 2.7590646037557574e-07, + "logits/chosen": -2.128244400024414, + "logits/rejected": -1.9988393783569336, + "logps/chosen": -540.3739013671875, + "logps/rejected": -458.36773681640625, + "loss": 0.438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6936752200126648, + "rewards/margins": 0.8882251977920532, + "rewards/rejected": -1.5819003582000732, + "step": 809 + }, + { + "epoch": 0.09, + "learning_rate": 2.758710286996575e-07, + "logits/chosen": -2.770843505859375, + "logits/rejected": -2.86104154586792, + "logps/chosen": -116.48832702636719, + "logps/rejected": -141.7459716796875, + "loss": 0.5111, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.625515878200531, + "rewards/margins": 1.3086469173431396, + "rewards/rejected": -1.9341628551483154, + "step": 810 + }, + { + "epoch": 0.09, + "learning_rate": 2.758355970237392e-07, + "logits/chosen": -2.7642717361450195, + "logits/rejected": -2.7046964168548584, + "logps/chosen": -295.29949951171875, + "logps/rejected": -351.4145812988281, + "loss": 0.3549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6863878965377808, + "rewards/margins": 1.9236444234848022, + "rewards/rejected": -2.610032558441162, + "step": 811 + }, + { + "epoch": 0.09, + "learning_rate": 2.7580016534782094e-07, + "logits/chosen": -2.656972646713257, + "logits/rejected": -2.689328670501709, + "logps/chosen": -144.65496826171875, + "logps/rejected": -242.3056640625, + "loss": 0.3494, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5340954661369324, + "rewards/margins": 1.5823067426681519, + "rewards/rejected": -2.1164021492004395, + "step": 812 + }, + { + "epoch": 0.09, + "learning_rate": 2.757647336719027e-07, + "logits/chosen": -2.014843225479126, + "logits/rejected": -2.2872374057769775, + "logps/chosen": -323.4866027832031, + "logps/rejected": -271.4276428222656, + "loss": 1.1907, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0805068016052246, + "rewards/margins": -0.24969954788684845, + "rewards/rejected": -0.8308073282241821, + "step": 813 + }, + { + "epoch": 0.09, + "learning_rate": 2.7572930199598443e-07, + "logits/chosen": -2.2908191680908203, + "logits/rejected": -2.3647541999816895, + "logps/chosen": -348.09942626953125, + "logps/rejected": -304.20098876953125, + "loss": 0.4112, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6252138614654541, + "rewards/margins": 2.6163878440856934, + "rewards/rejected": -3.2416019439697266, + "step": 814 + }, + { + "epoch": 0.09, + "learning_rate": 2.7569387032006613e-07, + "logits/chosen": -2.087074041366577, + "logits/rejected": -2.454625129699707, + "logps/chosen": -548.4599609375, + "logps/rejected": -283.82672119140625, + "loss": 0.379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17108744382858276, + "rewards/margins": 1.8601222038269043, + "rewards/rejected": -2.031209707260132, + "step": 815 + }, + { + "epoch": 0.09, + "learning_rate": 2.756584386441479e-07, + "logits/chosen": -2.2962212562561035, + "logits/rejected": -2.016892910003662, + "logps/chosen": -529.9929809570312, + "logps/rejected": -318.89251708984375, + "loss": 0.9321, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.376366138458252, + "rewards/margins": 0.29294195771217346, + "rewards/rejected": -1.6693079471588135, + "step": 816 + }, + { + "epoch": 0.1, + "learning_rate": 2.7562300696822957e-07, + "logits/chosen": -2.312739133834839, + "logits/rejected": -2.5307068824768066, + "logps/chosen": -254.0530242919922, + "logps/rejected": -207.68264770507812, + "loss": 0.4818, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7174158096313477, + "rewards/margins": 0.9763070940971375, + "rewards/rejected": -1.6937229633331299, + "step": 817 + }, + { + "epoch": 0.1, + "learning_rate": 2.755875752923113e-07, + "logits/chosen": -2.681137800216675, + "logits/rejected": -2.8213984966278076, + "logps/chosen": -418.1108703613281, + "logps/rejected": -314.2308654785156, + "loss": 0.5405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6917369365692139, + "rewards/margins": 0.6524041295051575, + "rewards/rejected": -1.3441410064697266, + "step": 818 + }, + { + "epoch": 0.1, + "learning_rate": 2.75552143616393e-07, + "logits/chosen": -2.7520992755889893, + "logits/rejected": -2.8722448348999023, + "logps/chosen": -472.7545166015625, + "logps/rejected": -353.07562255859375, + "loss": 0.268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4939306974411011, + "rewards/margins": 2.4535770416259766, + "rewards/rejected": -2.947507858276367, + "step": 819 + }, + { + "epoch": 0.1, + "learning_rate": 2.7551671194047477e-07, + "logits/chosen": -1.8440983295440674, + "logits/rejected": -2.1140377521514893, + "logps/chosen": -400.38946533203125, + "logps/rejected": -260.4787292480469, + "loss": 0.4221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.81428462266922, + "rewards/margins": 1.009574055671692, + "rewards/rejected": -1.8238587379455566, + "step": 820 + }, + { + "epoch": 0.1, + "learning_rate": 2.754812802645565e-07, + "logits/chosen": -2.354236364364624, + "logits/rejected": -2.3987433910369873, + "logps/chosen": -113.60345458984375, + "logps/rejected": -190.42843627929688, + "loss": 0.588, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2221328020095825, + "rewards/margins": 1.5217992067337036, + "rewards/rejected": -2.7439322471618652, + "step": 821 + }, + { + "epoch": 0.1, + "learning_rate": 2.754458485886382e-07, + "logits/chosen": -2.6065571308135986, + "logits/rejected": -2.5549871921539307, + "logps/chosen": -380.747802734375, + "logps/rejected": -299.6596984863281, + "loss": 0.436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6584911346435547, + "rewards/margins": 0.7793545722961426, + "rewards/rejected": -1.4378457069396973, + "step": 822 + }, + { + "epoch": 0.1, + "learning_rate": 2.7541041691271996e-07, + "logits/chosen": -2.1508445739746094, + "logits/rejected": -2.6069607734680176, + "logps/chosen": -387.8492431640625, + "logps/rejected": -151.61810302734375, + "loss": 0.5937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28590136766433716, + "rewards/margins": 1.1521022319793701, + "rewards/rejected": -1.4380037784576416, + "step": 823 + }, + { + "epoch": 0.1, + "learning_rate": 2.753749852368017e-07, + "logits/chosen": -2.051076889038086, + "logits/rejected": -2.016786575317383, + "logps/chosen": -266.8189392089844, + "logps/rejected": -381.87945556640625, + "loss": 0.4265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27443787455558777, + "rewards/margins": 0.8248253464698792, + "rewards/rejected": -1.0992631912231445, + "step": 824 + }, + { + "epoch": 0.1, + "learning_rate": 2.7533955356088346e-07, + "logits/chosen": -2.5787770748138428, + "logits/rejected": -2.460585594177246, + "logps/chosen": -335.9285583496094, + "logps/rejected": -282.3567810058594, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3766709268093109, + "rewards/margins": 2.8563716411590576, + "rewards/rejected": -3.2330424785614014, + "step": 825 + }, + { + "epoch": 0.1, + "learning_rate": 2.7530412188496515e-07, + "logits/chosen": -2.5509676933288574, + "logits/rejected": -2.343088150024414, + "logps/chosen": -263.2168884277344, + "logps/rejected": -388.8572998046875, + "loss": 0.2012, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.836493194103241, + "rewards/margins": 2.912576675415039, + "rewards/rejected": -3.7490696907043457, + "step": 826 + }, + { + "epoch": 0.1, + "learning_rate": 2.752686902090469e-07, + "logits/chosen": -2.0626823902130127, + "logits/rejected": -2.183810234069824, + "logps/chosen": -329.9076232910156, + "logps/rejected": -288.15325927734375, + "loss": 0.4588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7152891159057617, + "rewards/margins": 1.0756343603134155, + "rewards/rejected": -1.7909233570098877, + "step": 827 + }, + { + "epoch": 0.1, + "learning_rate": 2.752332585331286e-07, + "logits/chosen": -2.2275002002716064, + "logits/rejected": -2.2359678745269775, + "logps/chosen": -280.36541748046875, + "logps/rejected": -317.4617004394531, + "loss": 2.1092, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.762106418609619, + "rewards/margins": -1.0440540313720703, + "rewards/rejected": -1.718052625656128, + "step": 828 + }, + { + "epoch": 0.1, + "learning_rate": 2.7519782685721034e-07, + "logits/chosen": -2.3201491832733154, + "logits/rejected": -2.424807548522949, + "logps/chosen": -338.68682861328125, + "logps/rejected": -359.15985107421875, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25985994935035706, + "rewards/margins": 2.596632719039917, + "rewards/rejected": -2.8564929962158203, + "step": 829 + }, + { + "epoch": 0.1, + "learning_rate": 2.7516239518129204e-07, + "logits/chosen": -2.8091440200805664, + "logits/rejected": -2.9383184909820557, + "logps/chosen": -238.34129333496094, + "logps/rejected": -284.0589599609375, + "loss": 0.3886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5881557464599609, + "rewards/margins": 1.5575106143951416, + "rewards/rejected": -2.1456663608551025, + "step": 830 + }, + { + "epoch": 0.1, + "learning_rate": 2.751269635053738e-07, + "logits/chosen": -2.321010112762451, + "logits/rejected": -2.2876906394958496, + "logps/chosen": -296.724365234375, + "logps/rejected": -382.295166015625, + "loss": 0.3211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7009415030479431, + "rewards/margins": 1.2440650463104248, + "rewards/rejected": -1.9450066089630127, + "step": 831 + }, + { + "epoch": 0.1, + "learning_rate": 2.7509153182945554e-07, + "logits/chosen": -2.815554618835449, + "logits/rejected": -2.699995756149292, + "logps/chosen": -278.7213134765625, + "logps/rejected": -290.8868408203125, + "loss": 0.4161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29484686255455017, + "rewards/margins": 1.5645017623901367, + "rewards/rejected": -1.8593485355377197, + "step": 832 + }, + { + "epoch": 0.1, + "learning_rate": 2.7505610015353723e-07, + "logits/chosen": -2.4364542961120605, + "logits/rejected": -2.422778367996216, + "logps/chosen": -194.6625213623047, + "logps/rejected": -200.44354248046875, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5941398739814758, + "rewards/margins": 1.494678258895874, + "rewards/rejected": -2.088818311691284, + "step": 833 + }, + { + "epoch": 0.1, + "learning_rate": 2.75020668477619e-07, + "logits/chosen": -2.4262027740478516, + "logits/rejected": -2.6542892456054688, + "logps/chosen": -171.06784057617188, + "logps/rejected": -175.1933135986328, + "loss": 0.6241, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.725385069847107, + "rewards/margins": 0.5255303382873535, + "rewards/rejected": -2.250915288925171, + "step": 834 + }, + { + "epoch": 0.1, + "learning_rate": 2.749852368017007e-07, + "logits/chosen": -2.219679355621338, + "logits/rejected": -2.5764517784118652, + "logps/chosen": -329.9806213378906, + "logps/rejected": -246.863525390625, + "loss": 0.3792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7566499710083008, + "rewards/margins": 1.4804521799087524, + "rewards/rejected": -2.2371022701263428, + "step": 835 + }, + { + "epoch": 0.1, + "learning_rate": 2.749498051257825e-07, + "logits/chosen": -2.0636887550354004, + "logits/rejected": -2.3678150177001953, + "logps/chosen": -270.44635009765625, + "logps/rejected": -139.17648315429688, + "loss": 0.4403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1750926971435547, + "rewards/margins": 1.2872021198272705, + "rewards/rejected": -1.4622948169708252, + "step": 836 + }, + { + "epoch": 0.1, + "learning_rate": 2.749143734498642e-07, + "logits/chosen": -1.9599565267562866, + "logits/rejected": -2.2607314586639404, + "logps/chosen": -384.07073974609375, + "logps/rejected": -308.3196105957031, + "loss": 0.7774, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.087849736213684, + "rewards/margins": 0.8594973087310791, + "rewards/rejected": -1.9473469257354736, + "step": 837 + }, + { + "epoch": 0.1, + "learning_rate": 2.748789417739459e-07, + "logits/chosen": -2.095362663269043, + "logits/rejected": -2.6405675411224365, + "logps/chosen": -378.84808349609375, + "logps/rejected": -232.74066162109375, + "loss": 0.4293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6868550777435303, + "rewards/margins": 1.2646958827972412, + "rewards/rejected": -1.951551079750061, + "step": 838 + }, + { + "epoch": 0.1, + "learning_rate": 2.748435100980276e-07, + "logits/chosen": -2.1725120544433594, + "logits/rejected": -2.5240583419799805, + "logps/chosen": -375.93560791015625, + "logps/rejected": -292.49542236328125, + "loss": 0.424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5310734510421753, + "rewards/margins": 1.3675363063812256, + "rewards/rejected": -1.8986096382141113, + "step": 839 + }, + { + "epoch": 0.1, + "learning_rate": 2.7480807842210937e-07, + "logits/chosen": -2.0917673110961914, + "logits/rejected": -2.2914419174194336, + "logps/chosen": -272.8973388671875, + "logps/rejected": -261.6075744628906, + "loss": 0.5795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8168570399284363, + "rewards/margins": 0.7078297734260559, + "rewards/rejected": -1.5246866941452026, + "step": 840 + }, + { + "epoch": 0.1, + "learning_rate": 2.7477264674619106e-07, + "logits/chosen": -2.3124282360076904, + "logits/rejected": -2.3117291927337646, + "logps/chosen": -269.9890441894531, + "logps/rejected": -234.67852783203125, + "loss": 0.7279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8774211406707764, + "rewards/margins": 0.31102246046066284, + "rewards/rejected": -1.188443660736084, + "step": 841 + }, + { + "epoch": 0.1, + "learning_rate": 2.747372150702728e-07, + "logits/chosen": -2.4581098556518555, + "logits/rejected": -2.333850145339966, + "logps/chosen": -276.5857849121094, + "logps/rejected": -269.3536682128906, + "loss": 0.3724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18652315437793732, + "rewards/margins": 1.307881474494934, + "rewards/rejected": -1.494404673576355, + "step": 842 + }, + { + "epoch": 0.1, + "learning_rate": 2.7470178339435456e-07, + "logits/chosen": -2.123569965362549, + "logits/rejected": -2.2368123531341553, + "logps/chosen": -245.21453857421875, + "logps/rejected": -237.89602661132812, + "loss": 0.6574, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0193591117858887, + "rewards/margins": 0.42842838168144226, + "rewards/rejected": -1.4477875232696533, + "step": 843 + }, + { + "epoch": 0.1, + "learning_rate": 2.7466635171843625e-07, + "logits/chosen": -2.3953006267547607, + "logits/rejected": -2.4411137104034424, + "logps/chosen": -314.9009094238281, + "logps/rejected": -248.25347900390625, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9203782081604004, + "rewards/margins": 0.7418824434280396, + "rewards/rejected": -1.66226065158844, + "step": 844 + }, + { + "epoch": 0.1, + "learning_rate": 2.74630920042518e-07, + "logits/chosen": -2.8249924182891846, + "logits/rejected": -2.7767493724823, + "logps/chosen": -227.067138671875, + "logps/rejected": -235.408935546875, + "loss": 0.3778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6507272124290466, + "rewards/margins": 2.040569543838501, + "rewards/rejected": -2.6912968158721924, + "step": 845 + }, + { + "epoch": 0.1, + "learning_rate": 2.745954883665997e-07, + "logits/chosen": -2.493678569793701, + "logits/rejected": -2.1503067016601562, + "logps/chosen": -352.4228515625, + "logps/rejected": -269.718994140625, + "loss": 0.6205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5583662986755371, + "rewards/margins": 0.9630569815635681, + "rewards/rejected": -1.521423101425171, + "step": 846 + }, + { + "epoch": 0.1, + "learning_rate": 2.7456005669068145e-07, + "logits/chosen": -1.944936990737915, + "logits/rejected": -2.2964425086975098, + "logps/chosen": -341.35345458984375, + "logps/rejected": -255.51739501953125, + "loss": 0.4922, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24993163347244263, + "rewards/margins": 0.9776173830032349, + "rewards/rejected": -1.2275490760803223, + "step": 847 + }, + { + "epoch": 0.1, + "learning_rate": 2.745246250147632e-07, + "logits/chosen": -2.4579014778137207, + "logits/rejected": -2.5890052318573, + "logps/chosen": -228.53457641601562, + "logps/rejected": -188.947021484375, + "loss": 0.3411, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11777147650718689, + "rewards/margins": 1.3392691612243652, + "rewards/rejected": -1.2214977741241455, + "step": 848 + }, + { + "epoch": 0.1, + "learning_rate": 2.7448919333884494e-07, + "logits/chosen": -2.4853193759918213, + "logits/rejected": -2.653001308441162, + "logps/chosen": -432.8856201171875, + "logps/rejected": -284.4534912109375, + "loss": 0.5759, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.009524941444397, + "rewards/margins": 0.7271549105644226, + "rewards/rejected": -1.7366799116134644, + "step": 849 + }, + { + "epoch": 0.1, + "learning_rate": 2.7445376166292664e-07, + "logits/chosen": -2.7064807415008545, + "logits/rejected": -2.914205551147461, + "logps/chosen": -274.72332763671875, + "logps/rejected": -232.59280395507812, + "loss": 0.7594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39183905720710754, + "rewards/margins": 0.6476144790649414, + "rewards/rejected": -1.0394536256790161, + "step": 850 + }, + { + "epoch": 0.1, + "learning_rate": 2.744183299870084e-07, + "logits/chosen": -2.561817169189453, + "logits/rejected": -2.5045077800750732, + "logps/chosen": -256.9534606933594, + "logps/rejected": -239.01370239257812, + "loss": 0.3352, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0992029905319214, + "rewards/margins": 1.8558465242385864, + "rewards/rejected": -2.9550492763519287, + "step": 851 + }, + { + "epoch": 0.1, + "learning_rate": 2.743828983110901e-07, + "logits/chosen": -1.9374275207519531, + "logits/rejected": -2.3309454917907715, + "logps/chosen": -372.3678894042969, + "logps/rejected": -276.0098876953125, + "loss": 0.2581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5369650721549988, + "rewards/margins": 1.8163683414459229, + "rewards/rejected": -2.353332996368408, + "step": 852 + }, + { + "epoch": 0.1, + "learning_rate": 2.7434746663517183e-07, + "logits/chosen": -2.095811367034912, + "logits/rejected": -2.2285728454589844, + "logps/chosen": -198.23341369628906, + "logps/rejected": -226.40940856933594, + "loss": 0.4082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6585460305213928, + "rewards/margins": 1.7346041202545166, + "rewards/rejected": -2.3931500911712646, + "step": 853 + }, + { + "epoch": 0.1, + "learning_rate": 2.743120349592536e-07, + "logits/chosen": -2.4165713787078857, + "logits/rejected": -2.41168212890625, + "logps/chosen": -357.34576416015625, + "logps/rejected": -437.22637939453125, + "loss": 0.3846, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.044335730373859406, + "rewards/margins": 1.661699891090393, + "rewards/rejected": -1.6173641681671143, + "step": 854 + }, + { + "epoch": 0.1, + "learning_rate": 2.742766032833353e-07, + "logits/chosen": -2.3180534839630127, + "logits/rejected": -2.0205044746398926, + "logps/chosen": -235.99032592773438, + "logps/rejected": -224.90594482421875, + "loss": 0.3298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49009013175964355, + "rewards/margins": 1.4078333377838135, + "rewards/rejected": -1.897923469543457, + "step": 855 + }, + { + "epoch": 0.1, + "learning_rate": 2.74241171607417e-07, + "logits/chosen": -2.4835808277130127, + "logits/rejected": -2.466080665588379, + "logps/chosen": -206.12327575683594, + "logps/rejected": -224.22584533691406, + "loss": 0.6719, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7109304070472717, + "rewards/margins": 0.41522058844566345, + "rewards/rejected": -1.1261508464813232, + "step": 856 + }, + { + "epoch": 0.1, + "learning_rate": 2.742057399314987e-07, + "logits/chosen": -2.671407461166382, + "logits/rejected": -2.879507064819336, + "logps/chosen": -447.0558776855469, + "logps/rejected": -243.20494079589844, + "loss": 0.5974, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0094728469848633, + "rewards/margins": 0.8336586356163025, + "rewards/rejected": -1.8431315422058105, + "step": 857 + }, + { + "epoch": 0.1, + "learning_rate": 2.7417030825558047e-07, + "logits/chosen": -2.5173215866088867, + "logits/rejected": -2.3970425128936768, + "logps/chosen": -183.6345977783203, + "logps/rejected": -295.270263671875, + "loss": 0.297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7753420472145081, + "rewards/margins": 1.450514793395996, + "rewards/rejected": -2.2258567810058594, + "step": 858 + }, + { + "epoch": 0.1, + "learning_rate": 2.741348765796622e-07, + "logits/chosen": -2.1595349311828613, + "logits/rejected": -1.8072996139526367, + "logps/chosen": -382.2784729003906, + "logps/rejected": -504.5029602050781, + "loss": 0.6275, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0388683080673218, + "rewards/margins": 0.9335626363754272, + "rewards/rejected": -1.9724310636520386, + "step": 859 + }, + { + "epoch": 0.1, + "learning_rate": 2.7409944490374397e-07, + "logits/chosen": -1.8402773141860962, + "logits/rejected": -1.7585285902023315, + "logps/chosen": -352.027099609375, + "logps/rejected": -330.027099609375, + "loss": 1.1536, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1935362815856934, + "rewards/margins": -0.0043143630027771, + "rewards/rejected": -1.1892218589782715, + "step": 860 + }, + { + "epoch": 0.1, + "learning_rate": 2.7406401322782566e-07, + "logits/chosen": -2.498201847076416, + "logits/rejected": -2.469534397125244, + "logps/chosen": -201.34637451171875, + "logps/rejected": -206.03701782226562, + "loss": 0.3057, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38887739181518555, + "rewards/margins": 1.4462579488754272, + "rewards/rejected": -1.8351353406906128, + "step": 861 + }, + { + "epoch": 0.1, + "learning_rate": 2.740285815519074e-07, + "logits/chosen": -2.607278823852539, + "logits/rejected": -2.6182618141174316, + "logps/chosen": -306.421875, + "logps/rejected": -161.61061096191406, + "loss": 0.6008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7242245078086853, + "rewards/margins": 0.9896548986434937, + "rewards/rejected": -1.7138793468475342, + "step": 862 + }, + { + "epoch": 0.1, + "learning_rate": 2.739931498759891e-07, + "logits/chosen": -2.9798707962036133, + "logits/rejected": -2.9436299800872803, + "logps/chosen": -153.89254760742188, + "logps/rejected": -195.43585205078125, + "loss": 0.5373, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4530704915523529, + "rewards/margins": 2.262968063354492, + "rewards/rejected": -2.716038465499878, + "step": 863 + }, + { + "epoch": 0.1, + "learning_rate": 2.7395771820007086e-07, + "logits/chosen": -2.4028046131134033, + "logits/rejected": -2.621156692504883, + "logps/chosen": -217.90699768066406, + "logps/rejected": -198.68719482421875, + "loss": 0.3619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18249444663524628, + "rewards/margins": 1.246960997581482, + "rewards/rejected": -1.4294555187225342, + "step": 864 + }, + { + "epoch": 0.1, + "learning_rate": 2.739222865241526e-07, + "logits/chosen": -2.384629249572754, + "logits/rejected": -2.1730425357818604, + "logps/chosen": -172.04638671875, + "logps/rejected": -214.42144775390625, + "loss": 0.4042, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7669409513473511, + "rewards/margins": 1.453324317932129, + "rewards/rejected": -2.2202653884887695, + "step": 865 + }, + { + "epoch": 0.1, + "learning_rate": 2.738868548482343e-07, + "logits/chosen": -2.648963212966919, + "logits/rejected": -2.7522172927856445, + "logps/chosen": -158.70213317871094, + "logps/rejected": -228.22970581054688, + "loss": 0.2112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24649617075920105, + "rewards/margins": 2.320326566696167, + "rewards/rejected": -2.5668227672576904, + "step": 866 + }, + { + "epoch": 0.1, + "learning_rate": 2.7385142317231605e-07, + "logits/chosen": -2.6052682399749756, + "logits/rejected": -2.755232810974121, + "logps/chosen": -244.13059997558594, + "logps/rejected": -248.38491821289062, + "loss": 1.0656, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9004136323928833, + "rewards/margins": 0.17074266076087952, + "rewards/rejected": -2.0711562633514404, + "step": 867 + }, + { + "epoch": 0.1, + "learning_rate": 2.7381599149639774e-07, + "logits/chosen": -2.1230006217956543, + "logits/rejected": -2.153796672821045, + "logps/chosen": -206.5810546875, + "logps/rejected": -218.51907348632812, + "loss": 0.7607, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0023295879364014, + "rewards/margins": 0.769444465637207, + "rewards/rejected": -1.7717740535736084, + "step": 868 + }, + { + "epoch": 0.1, + "learning_rate": 2.737805598204795e-07, + "logits/chosen": -2.394606590270996, + "logits/rejected": -2.255336284637451, + "logps/chosen": -252.6491241455078, + "logps/rejected": -407.2364196777344, + "loss": 0.8131, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.094109296798706, + "rewards/margins": 1.452845811843872, + "rewards/rejected": -3.546955108642578, + "step": 869 + }, + { + "epoch": 0.1, + "learning_rate": 2.737451281445612e-07, + "logits/chosen": -2.2578048706054688, + "logits/rejected": -2.39266037940979, + "logps/chosen": -307.3096008300781, + "logps/rejected": -381.4688720703125, + "loss": 0.5199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40601804852485657, + "rewards/margins": 1.038485050201416, + "rewards/rejected": -1.4445029497146606, + "step": 870 + }, + { + "epoch": 0.1, + "learning_rate": 2.73709696468643e-07, + "logits/chosen": -2.3872618675231934, + "logits/rejected": -2.7459304332733154, + "logps/chosen": -266.2658996582031, + "logps/rejected": -232.3358154296875, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4222407341003418, + "rewards/margins": 1.456223964691162, + "rewards/rejected": -1.878464698791504, + "step": 871 + }, + { + "epoch": 0.1, + "learning_rate": 2.736742647927247e-07, + "logits/chosen": -2.322699546813965, + "logits/rejected": -2.590506076812744, + "logps/chosen": -254.16798400878906, + "logps/rejected": -327.3200988769531, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.679387092590332, + "rewards/margins": 2.2622036933898926, + "rewards/rejected": -3.9415905475616455, + "step": 872 + }, + { + "epoch": 0.1, + "learning_rate": 2.7363883311680643e-07, + "logits/chosen": -2.5678744316101074, + "logits/rejected": -2.3240013122558594, + "logps/chosen": -161.8018341064453, + "logps/rejected": -184.2908477783203, + "loss": 0.2974, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5455507040023804, + "rewards/margins": 1.9467980861663818, + "rewards/rejected": -2.4923489093780518, + "step": 873 + }, + { + "epoch": 0.1, + "learning_rate": 2.7360340144088813e-07, + "logits/chosen": -2.3069074153900146, + "logits/rejected": -2.7034010887145996, + "logps/chosen": -366.5351257324219, + "logps/rejected": -145.95130920410156, + "loss": 0.7808, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1422231197357178, + "rewards/margins": 0.17090915143489838, + "rewards/rejected": -1.3131322860717773, + "step": 874 + }, + { + "epoch": 0.1, + "learning_rate": 2.735679697649699e-07, + "logits/chosen": -2.0309371948242188, + "logits/rejected": -2.013925075531006, + "logps/chosen": -203.388427734375, + "logps/rejected": -168.79827880859375, + "loss": 1.3891, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0217442512512207, + "rewards/margins": -0.3231983780860901, + "rewards/rejected": -1.6985459327697754, + "step": 875 + }, + { + "epoch": 0.1, + "learning_rate": 2.735325380890516e-07, + "logits/chosen": -2.37561297416687, + "logits/rejected": -2.5144829750061035, + "logps/chosen": -267.6269226074219, + "logps/rejected": -221.1397705078125, + "loss": 0.4809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08708879351615906, + "rewards/margins": 0.9561891555786133, + "rewards/rejected": -1.0432779788970947, + "step": 876 + }, + { + "epoch": 0.1, + "learning_rate": 2.734971064131333e-07, + "logits/chosen": -2.50106143951416, + "logits/rejected": -2.5077974796295166, + "logps/chosen": -400.8428955078125, + "logps/rejected": -249.34075927734375, + "loss": 0.6773, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5207757949829102, + "rewards/margins": 0.4289727509021759, + "rewards/rejected": -1.9497485160827637, + "step": 877 + }, + { + "epoch": 0.1, + "learning_rate": 2.7346167473721507e-07, + "logits/chosen": -2.448944091796875, + "logits/rejected": -2.3112964630126953, + "logps/chosen": -248.82896423339844, + "logps/rejected": -189.35926818847656, + "loss": 0.5305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5540949106216431, + "rewards/margins": 0.8031787872314453, + "rewards/rejected": -1.3572736978530884, + "step": 878 + }, + { + "epoch": 0.1, + "learning_rate": 2.7342624306129677e-07, + "logits/chosen": -2.4445419311523438, + "logits/rejected": -2.591122627258301, + "logps/chosen": -211.64498901367188, + "logps/rejected": -202.17271423339844, + "loss": 0.7751, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6425549983978271, + "rewards/margins": 0.9915646314620972, + "rewards/rejected": -1.6341195106506348, + "step": 879 + }, + { + "epoch": 0.1, + "learning_rate": 2.733908113853785e-07, + "logits/chosen": -2.38649320602417, + "logits/rejected": -2.575735330581665, + "logps/chosen": -409.8760681152344, + "logps/rejected": -291.2311706542969, + "loss": 0.1687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1492311954498291, + "rewards/margins": 1.9618580341339111, + "rewards/rejected": -2.1110892295837402, + "step": 880 + }, + { + "epoch": 0.1, + "learning_rate": 2.733553797094602e-07, + "logits/chosen": -2.7239973545074463, + "logits/rejected": -2.6517820358276367, + "logps/chosen": -386.863037109375, + "logps/rejected": -382.99322509765625, + "loss": 0.4186, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.746008038520813, + "rewards/margins": 1.208742380142212, + "rewards/rejected": -1.954750418663025, + "step": 881 + }, + { + "epoch": 0.1, + "learning_rate": 2.7331994803354196e-07, + "logits/chosen": -2.5964696407318115, + "logits/rejected": -2.650857448577881, + "logps/chosen": -297.2395935058594, + "logps/rejected": -243.79217529296875, + "loss": 0.3043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.707123875617981, + "rewards/margins": 1.2238788604736328, + "rewards/rejected": -1.9310027360916138, + "step": 882 + }, + { + "epoch": 0.1, + "learning_rate": 2.732845163576237e-07, + "logits/chosen": -2.3817601203918457, + "logits/rejected": -2.4090349674224854, + "logps/chosen": -182.64474487304688, + "logps/rejected": -280.52496337890625, + "loss": 0.4095, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1690198183059692, + "rewards/margins": 1.5565727949142456, + "rewards/rejected": -2.725592613220215, + "step": 883 + }, + { + "epoch": 0.1, + "learning_rate": 2.7324908468170546e-07, + "logits/chosen": -2.886898994445801, + "logits/rejected": -3.025939464569092, + "logps/chosen": -207.57025146484375, + "logps/rejected": -332.8024597167969, + "loss": 0.2393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3404068946838379, + "rewards/margins": 2.942467451095581, + "rewards/rejected": -3.282874345779419, + "step": 884 + }, + { + "epoch": 0.1, + "learning_rate": 2.7321365300578715e-07, + "logits/chosen": -2.549455165863037, + "logits/rejected": -2.3082275390625, + "logps/chosen": -196.8700408935547, + "logps/rejected": -308.0885009765625, + "loss": 0.2574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8000203967094421, + "rewards/margins": 2.0495731830596924, + "rewards/rejected": -2.8495936393737793, + "step": 885 + }, + { + "epoch": 0.1, + "learning_rate": 2.731782213298689e-07, + "logits/chosen": -1.8205680847167969, + "logits/rejected": -1.972083330154419, + "logps/chosen": -260.3123779296875, + "logps/rejected": -295.5809020996094, + "loss": 0.4628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6785041093826294, + "rewards/margins": 0.8022993803024292, + "rewards/rejected": -1.4808034896850586, + "step": 886 + }, + { + "epoch": 0.1, + "learning_rate": 2.731427896539506e-07, + "logits/chosen": -2.3177781105041504, + "logits/rejected": -2.51983642578125, + "logps/chosen": -404.57147216796875, + "logps/rejected": -334.4754638671875, + "loss": 0.3444, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4713652729988098, + "rewards/margins": 1.2826460599899292, + "rewards/rejected": -1.7540113925933838, + "step": 887 + }, + { + "epoch": 0.1, + "learning_rate": 2.7310735797803235e-07, + "logits/chosen": -2.1799988746643066, + "logits/rejected": -2.3793368339538574, + "logps/chosen": -255.95895385742188, + "logps/rejected": -378.0698547363281, + "loss": 0.6832, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7940413951873779, + "rewards/margins": 1.2387806177139282, + "rewards/rejected": -2.0328221321105957, + "step": 888 + }, + { + "epoch": 0.1, + "learning_rate": 2.730719263021141e-07, + "logits/chosen": -2.2570106983184814, + "logits/rejected": -2.0357398986816406, + "logps/chosen": -266.73779296875, + "logps/rejected": -374.1632080078125, + "loss": 0.2, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3968670964241028, + "rewards/margins": 2.2420458793640137, + "rewards/rejected": -2.6389129161834717, + "step": 889 + }, + { + "epoch": 0.1, + "learning_rate": 2.730364946261958e-07, + "logits/chosen": -2.657562255859375, + "logits/rejected": -2.475883960723877, + "logps/chosen": -131.65756225585938, + "logps/rejected": -232.9833526611328, + "loss": 0.4919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2719879746437073, + "rewards/margins": 1.4077529907226562, + "rewards/rejected": -1.6797407865524292, + "step": 890 + }, + { + "epoch": 0.1, + "learning_rate": 2.7300106295027754e-07, + "logits/chosen": -2.6084165573120117, + "logits/rejected": -2.811049222946167, + "logps/chosen": -279.0136413574219, + "logps/rejected": -116.66950225830078, + "loss": 0.8331, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0136877298355103, + "rewards/margins": 0.30267006158828735, + "rewards/rejected": -1.3163578510284424, + "step": 891 + }, + { + "epoch": 0.1, + "learning_rate": 2.7296563127435923e-07, + "logits/chosen": -2.4550797939300537, + "logits/rejected": -2.4692957401275635, + "logps/chosen": -303.04541015625, + "logps/rejected": -240.83773803710938, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7586424946784973, + "rewards/margins": 1.103433609008789, + "rewards/rejected": -1.8620760440826416, + "step": 892 + }, + { + "epoch": 0.1, + "learning_rate": 2.72930199598441e-07, + "logits/chosen": -2.421027660369873, + "logits/rejected": -2.2862043380737305, + "logps/chosen": -139.3295135498047, + "logps/rejected": -113.74028778076172, + "loss": 0.6243, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29174280166625977, + "rewards/margins": 0.7601765394210815, + "rewards/rejected": -1.0519193410873413, + "step": 893 + }, + { + "epoch": 0.1, + "learning_rate": 2.7289476792252273e-07, + "logits/chosen": -1.9997868537902832, + "logits/rejected": -2.1521716117858887, + "logps/chosen": -353.5522766113281, + "logps/rejected": -255.4780731201172, + "loss": 0.3388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6226689219474792, + "rewards/margins": 1.3320801258087158, + "rewards/rejected": -1.9547489881515503, + "step": 894 + }, + { + "epoch": 0.1, + "learning_rate": 2.728593362466045e-07, + "logits/chosen": -2.382061243057251, + "logits/rejected": -2.3215227127075195, + "logps/chosen": -338.754150390625, + "logps/rejected": -352.0447998046875, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5773947834968567, + "rewards/margins": 2.4783215522766113, + "rewards/rejected": -3.0557162761688232, + "step": 895 + }, + { + "epoch": 0.1, + "learning_rate": 2.728239045706862e-07, + "logits/chosen": -1.8443820476531982, + "logits/rejected": -1.7422621250152588, + "logps/chosen": -278.4216003417969, + "logps/rejected": -270.6524658203125, + "loss": 0.2563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19742241501808167, + "rewards/margins": 2.5799741744995117, + "rewards/rejected": -2.7773964405059814, + "step": 896 + }, + { + "epoch": 0.1, + "learning_rate": 2.727884728947679e-07, + "logits/chosen": -2.600281238555908, + "logits/rejected": -2.753805637359619, + "logps/chosen": -288.9471435546875, + "logps/rejected": -200.90052795410156, + "loss": 0.6112, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.955651044845581, + "rewards/margins": 1.0883605480194092, + "rewards/rejected": -2.0440115928649902, + "step": 897 + }, + { + "epoch": 0.1, + "learning_rate": 2.727530412188496e-07, + "logits/chosen": -2.183803081512451, + "logits/rejected": -2.5019640922546387, + "logps/chosen": -326.1049499511719, + "logps/rejected": -244.45465087890625, + "loss": 0.8303, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7634547352790833, + "rewards/margins": 0.5251582860946655, + "rewards/rejected": -1.288612961769104, + "step": 898 + }, + { + "epoch": 0.1, + "learning_rate": 2.7271760954293137e-07, + "logits/chosen": -2.256326198577881, + "logits/rejected": -2.7890090942382812, + "logps/chosen": -265.24517822265625, + "logps/rejected": -215.26483154296875, + "loss": 0.8532, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8901289701461792, + "rewards/margins": 0.5248301029205322, + "rewards/rejected": -1.4149590730667114, + "step": 899 + }, + { + "epoch": 0.1, + "learning_rate": 2.726821778670131e-07, + "logits/chosen": -2.1389808654785156, + "logits/rejected": -2.256173849105835, + "logps/chosen": -293.9353942871094, + "logps/rejected": -305.8691711425781, + "loss": 0.4095, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19598381221294403, + "rewards/margins": 1.8666284084320068, + "rewards/rejected": -2.062612295150757, + "step": 900 + }, + { + "epoch": 0.1, + "learning_rate": 2.726467461910948e-07, + "logits/chosen": -2.2993905544281006, + "logits/rejected": -2.367767572402954, + "logps/chosen": -192.41049194335938, + "logps/rejected": -130.84104919433594, + "loss": 0.8098, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0114803314208984, + "rewards/margins": 0.6559673547744751, + "rewards/rejected": -1.667447805404663, + "step": 901 + }, + { + "epoch": 0.1, + "learning_rate": 2.7261131451517656e-07, + "logits/chosen": -2.2684309482574463, + "logits/rejected": -2.3545074462890625, + "logps/chosen": -211.46060180664062, + "logps/rejected": -216.25440979003906, + "loss": 0.3949, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2944456934928894, + "rewards/margins": 0.9296656847000122, + "rewards/rejected": -1.2241114377975464, + "step": 902 + }, + { + "epoch": 0.11, + "learning_rate": 2.7257588283925826e-07, + "logits/chosen": -1.9035773277282715, + "logits/rejected": -2.1768484115600586, + "logps/chosen": -426.9383544921875, + "logps/rejected": -333.42236328125, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6385980844497681, + "rewards/margins": 1.7906768321990967, + "rewards/rejected": -2.4292750358581543, + "step": 903 + }, + { + "epoch": 0.11, + "learning_rate": 2.7254045116334e-07, + "logits/chosen": -2.4897422790527344, + "logits/rejected": -2.7430386543273926, + "logps/chosen": -281.6868591308594, + "logps/rejected": -220.74661254882812, + "loss": 0.5002, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6289548277854919, + "rewards/margins": 0.8401647210121155, + "rewards/rejected": -1.4691195487976074, + "step": 904 + }, + { + "epoch": 0.11, + "learning_rate": 2.725050194874217e-07, + "logits/chosen": -2.539670467376709, + "logits/rejected": -2.6542413234710693, + "logps/chosen": -280.54193115234375, + "logps/rejected": -235.5479736328125, + "loss": 0.6285, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2756689786911011, + "rewards/margins": 0.8308883905410767, + "rewards/rejected": -1.1065573692321777, + "step": 905 + }, + { + "epoch": 0.11, + "learning_rate": 2.724695878115035e-07, + "logits/chosen": -2.2356343269348145, + "logits/rejected": -2.4383907318115234, + "logps/chosen": -259.6686096191406, + "logps/rejected": -326.6102294921875, + "loss": 0.3928, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.917183518409729, + "rewards/margins": 1.1936628818511963, + "rewards/rejected": -2.110846519470215, + "step": 906 + }, + { + "epoch": 0.11, + "learning_rate": 2.724341561355852e-07, + "logits/chosen": -1.8490135669708252, + "logits/rejected": -1.9987668991088867, + "logps/chosen": -403.26812744140625, + "logps/rejected": -316.74371337890625, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0296697616577148, + "rewards/margins": 0.22748082876205444, + "rewards/rejected": -1.257150650024414, + "step": 907 + }, + { + "epoch": 0.11, + "learning_rate": 2.7239872445966695e-07, + "logits/chosen": -2.0095643997192383, + "logits/rejected": -1.7829467058181763, + "logps/chosen": -204.13865661621094, + "logps/rejected": -521.1851806640625, + "loss": 0.3158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31645703315734863, + "rewards/margins": 2.4370474815368652, + "rewards/rejected": -2.753504514694214, + "step": 908 + }, + { + "epoch": 0.11, + "learning_rate": 2.7236329278374864e-07, + "logits/chosen": -2.5511474609375, + "logits/rejected": -2.6521260738372803, + "logps/chosen": -329.65252685546875, + "logps/rejected": -244.117919921875, + "loss": 0.9573, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6139683723449707, + "rewards/margins": 0.01573871076107025, + "rewards/rejected": -1.6297069787979126, + "step": 909 + }, + { + "epoch": 0.11, + "learning_rate": 2.723278611078304e-07, + "logits/chosen": -2.7357118129730225, + "logits/rejected": -2.6527299880981445, + "logps/chosen": -329.1422424316406, + "logps/rejected": -334.13330078125, + "loss": 0.418, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2626457214355469, + "rewards/margins": 0.8937562108039856, + "rewards/rejected": -2.1564018726348877, + "step": 910 + }, + { + "epoch": 0.11, + "learning_rate": 2.7229242943191214e-07, + "logits/chosen": -2.4607765674591064, + "logits/rejected": -2.361212730407715, + "logps/chosen": -276.8341064453125, + "logps/rejected": -279.7308349609375, + "loss": 0.2715, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.844237208366394, + "rewards/margins": 2.1750621795654297, + "rewards/rejected": -3.019299268722534, + "step": 911 + }, + { + "epoch": 0.11, + "learning_rate": 2.7225699775599384e-07, + "logits/chosen": -2.513491630554199, + "logits/rejected": -2.7593019008636475, + "logps/chosen": -206.03125, + "logps/rejected": -206.5583953857422, + "loss": 0.5679, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8312869071960449, + "rewards/margins": 1.5916966199874878, + "rewards/rejected": -2.422983407974243, + "step": 912 + }, + { + "epoch": 0.11, + "learning_rate": 2.722215660800756e-07, + "logits/chosen": -2.7785422801971436, + "logits/rejected": -2.6310136318206787, + "logps/chosen": -87.30070495605469, + "logps/rejected": -151.00238037109375, + "loss": 0.4385, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6325681805610657, + "rewards/margins": 1.2288638353347778, + "rewards/rejected": -1.8614320755004883, + "step": 913 + }, + { + "epoch": 0.11, + "learning_rate": 2.721861344041573e-07, + "logits/chosen": -2.4290454387664795, + "logits/rejected": -2.3605217933654785, + "logps/chosen": -288.1297607421875, + "logps/rejected": -211.72531127929688, + "loss": 0.4898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6617876291275024, + "rewards/margins": 1.375885009765625, + "rewards/rejected": -2.037672519683838, + "step": 914 + }, + { + "epoch": 0.11, + "learning_rate": 2.7215070272823903e-07, + "logits/chosen": -2.1356759071350098, + "logits/rejected": -2.027170419692993, + "logps/chosen": -127.61013793945312, + "logps/rejected": -265.3847351074219, + "loss": 0.323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19862046837806702, + "rewards/margins": 2.511618137359619, + "rewards/rejected": -2.7102389335632324, + "step": 915 + }, + { + "epoch": 0.11, + "learning_rate": 2.721152710523207e-07, + "logits/chosen": -2.080789804458618, + "logits/rejected": -2.166923761367798, + "logps/chosen": -276.890625, + "logps/rejected": -347.9039611816406, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17628367245197296, + "rewards/margins": 0.9203159213066101, + "rewards/rejected": -1.0965995788574219, + "step": 916 + }, + { + "epoch": 0.11, + "learning_rate": 2.7207983937640247e-07, + "logits/chosen": -2.4532315731048584, + "logits/rejected": -2.565042734146118, + "logps/chosen": -324.721923828125, + "logps/rejected": -255.77439880371094, + "loss": 0.4855, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2365282773971558, + "rewards/margins": 0.9416915774345398, + "rewards/rejected": -2.178219795227051, + "step": 917 + }, + { + "epoch": 0.11, + "learning_rate": 2.720444077004842e-07, + "logits/chosen": -2.923097610473633, + "logits/rejected": -2.6730334758758545, + "logps/chosen": -222.44705200195312, + "logps/rejected": -276.507080078125, + "loss": 0.3765, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4248666763305664, + "rewards/margins": 1.8859057426452637, + "rewards/rejected": -2.31077241897583, + "step": 918 + }, + { + "epoch": 0.11, + "learning_rate": 2.7200897602456597e-07, + "logits/chosen": -2.006549835205078, + "logits/rejected": -2.166224718093872, + "logps/chosen": -361.4042663574219, + "logps/rejected": -331.1242980957031, + "loss": 0.5611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0590423345565796, + "rewards/margins": 1.1534174680709839, + "rewards/rejected": -2.2124595642089844, + "step": 919 + }, + { + "epoch": 0.11, + "learning_rate": 2.7197354434864767e-07, + "logits/chosen": -2.330815315246582, + "logits/rejected": -2.3359081745147705, + "logps/chosen": -220.25315856933594, + "logps/rejected": -266.927001953125, + "loss": 0.3108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.495333731174469, + "rewards/margins": 1.6684662103652954, + "rewards/rejected": -2.16379976272583, + "step": 920 + }, + { + "epoch": 0.11, + "learning_rate": 2.719381126727294e-07, + "logits/chosen": -2.5380189418792725, + "logits/rejected": -2.4181535243988037, + "logps/chosen": -374.0831298828125, + "logps/rejected": -326.0532531738281, + "loss": 0.7044, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.318453311920166, + "rewards/margins": 0.9853094816207886, + "rewards/rejected": -2.303762674331665, + "step": 921 + }, + { + "epoch": 0.11, + "learning_rate": 2.7190268099681116e-07, + "logits/chosen": -2.4655890464782715, + "logits/rejected": -2.752384901046753, + "logps/chosen": -269.3939208984375, + "logps/rejected": -189.465576171875, + "loss": 1.3645, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.01847505569458, + "rewards/margins": 0.623408854007721, + "rewards/rejected": -2.6418838500976562, + "step": 922 + }, + { + "epoch": 0.11, + "learning_rate": 2.7186724932089286e-07, + "logits/chosen": -2.142364263534546, + "logits/rejected": -2.142500400543213, + "logps/chosen": -298.6378479003906, + "logps/rejected": -289.3003845214844, + "loss": 0.6167, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1336969137191772, + "rewards/margins": 1.4006930589675903, + "rewards/rejected": -2.5343899726867676, + "step": 923 + }, + { + "epoch": 0.11, + "learning_rate": 2.718318176449746e-07, + "logits/chosen": -2.345696449279785, + "logits/rejected": -2.5615217685699463, + "logps/chosen": -390.5589599609375, + "logps/rejected": -261.41229248046875, + "loss": 0.4671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32437652349472046, + "rewards/margins": 0.9223254919052124, + "rewards/rejected": -1.2467020750045776, + "step": 924 + }, + { + "epoch": 0.11, + "learning_rate": 2.717963859690563e-07, + "logits/chosen": -1.8304967880249023, + "logits/rejected": -2.089359998703003, + "logps/chosen": -293.8280944824219, + "logps/rejected": -199.13397216796875, + "loss": 0.8512, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7305306196212769, + "rewards/margins": 0.2128722369670868, + "rewards/rejected": -0.943402886390686, + "step": 925 + }, + { + "epoch": 0.11, + "learning_rate": 2.7176095429313805e-07, + "logits/chosen": -1.956066608428955, + "logits/rejected": -1.9802474975585938, + "logps/chosen": -384.04412841796875, + "logps/rejected": -392.45703125, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4696843922138214, + "rewards/margins": 1.5475095510482788, + "rewards/rejected": -2.0171940326690674, + "step": 926 + }, + { + "epoch": 0.11, + "learning_rate": 2.7172552261721975e-07, + "logits/chosen": -2.0599639415740967, + "logits/rejected": -2.404447078704834, + "logps/chosen": -416.64996337890625, + "logps/rejected": -207.01539611816406, + "loss": 0.4634, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3243686556816101, + "rewards/margins": 0.890299916267395, + "rewards/rejected": -1.21466863155365, + "step": 927 + }, + { + "epoch": 0.11, + "learning_rate": 2.716900909413015e-07, + "logits/chosen": -2.380183696746826, + "logits/rejected": -2.4667577743530273, + "logps/chosen": -189.44952392578125, + "logps/rejected": -177.78472900390625, + "loss": 0.5511, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4223308265209198, + "rewards/margins": 1.0884935855865479, + "rewards/rejected": -1.5108243227005005, + "step": 928 + }, + { + "epoch": 0.11, + "learning_rate": 2.7165465926538324e-07, + "logits/chosen": -2.347574234008789, + "logits/rejected": -2.580345392227173, + "logps/chosen": -634.8778686523438, + "logps/rejected": -458.26422119140625, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12248080968856812, + "rewards/margins": 1.4319700002670288, + "rewards/rejected": -1.3094892501831055, + "step": 929 + }, + { + "epoch": 0.11, + "learning_rate": 2.71619227589465e-07, + "logits/chosen": -2.146653175354004, + "logits/rejected": -2.1664810180664062, + "logps/chosen": -228.718505859375, + "logps/rejected": -201.86090087890625, + "loss": 0.4006, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27546143531799316, + "rewards/margins": 0.8638690114021301, + "rewards/rejected": -1.1393303871154785, + "step": 930 + }, + { + "epoch": 0.11, + "learning_rate": 2.715837959135467e-07, + "logits/chosen": -2.17645001411438, + "logits/rejected": -2.2205986976623535, + "logps/chosen": -381.80706787109375, + "logps/rejected": -393.1808776855469, + "loss": 0.5834, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.38829800486564636, + "rewards/margins": 0.7978224754333496, + "rewards/rejected": -1.1861203908920288, + "step": 931 + }, + { + "epoch": 0.11, + "learning_rate": 2.7154836423762844e-07, + "logits/chosen": -1.8894619941711426, + "logits/rejected": -2.2668843269348145, + "logps/chosen": -337.0008239746094, + "logps/rejected": -232.4938201904297, + "loss": 0.4556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6895352005958557, + "rewards/margins": 1.438370704650879, + "rewards/rejected": -2.127906084060669, + "step": 932 + }, + { + "epoch": 0.11, + "learning_rate": 2.715129325617102e-07, + "logits/chosen": -1.7128411531448364, + "logits/rejected": -1.767407774925232, + "logps/chosen": -292.68756103515625, + "logps/rejected": -299.2854919433594, + "loss": 0.2923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.484789103269577, + "rewards/margins": 1.4878734350204468, + "rewards/rejected": -1.9726624488830566, + "step": 933 + }, + { + "epoch": 0.11, + "learning_rate": 2.714775008857919e-07, + "logits/chosen": -2.296245813369751, + "logits/rejected": -2.5920512676239014, + "logps/chosen": -267.03961181640625, + "logps/rejected": -178.40972900390625, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3996336460113525, + "rewards/margins": 0.6982619762420654, + "rewards/rejected": -2.097895622253418, + "step": 934 + }, + { + "epoch": 0.11, + "learning_rate": 2.7144206920987363e-07, + "logits/chosen": -2.285618305206299, + "logits/rejected": -2.1237363815307617, + "logps/chosen": -216.46176147460938, + "logps/rejected": -332.8358459472656, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8936880826950073, + "rewards/margins": 1.8163669109344482, + "rewards/rejected": -2.710054874420166, + "step": 935 + }, + { + "epoch": 0.11, + "learning_rate": 2.714066375339553e-07, + "logits/chosen": -2.8034615516662598, + "logits/rejected": -2.819854974746704, + "logps/chosen": -253.01705932617188, + "logps/rejected": -274.55865478515625, + "loss": 0.3811, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0268718004226685, + "rewards/margins": 1.480276107788086, + "rewards/rejected": -2.507148027420044, + "step": 936 + }, + { + "epoch": 0.11, + "learning_rate": 2.713712058580371e-07, + "logits/chosen": -2.437837600708008, + "logits/rejected": -2.415449380874634, + "logps/chosen": -219.99838256835938, + "logps/rejected": -153.76742553710938, + "loss": 1.5873, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.936269998550415, + "rewards/margins": 0.11902236938476562, + "rewards/rejected": -2.0552923679351807, + "step": 937 + }, + { + "epoch": 0.11, + "learning_rate": 2.7133577418211877e-07, + "logits/chosen": -2.644559621810913, + "logits/rejected": -2.476400375366211, + "logps/chosen": -291.0596923828125, + "logps/rejected": -208.57508850097656, + "loss": 0.316, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03176143765449524, + "rewards/margins": 1.3453729152679443, + "rewards/rejected": -1.3771344423294067, + "step": 938 + }, + { + "epoch": 0.11, + "learning_rate": 2.713003425062005e-07, + "logits/chosen": -2.1710925102233887, + "logits/rejected": -2.2746121883392334, + "logps/chosen": -306.44354248046875, + "logps/rejected": -310.90045166015625, + "loss": 0.4897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9752366542816162, + "rewards/margins": 1.3312605619430542, + "rewards/rejected": -2.306497097015381, + "step": 939 + }, + { + "epoch": 0.11, + "learning_rate": 2.7126491083028227e-07, + "logits/chosen": -1.9625437259674072, + "logits/rejected": -1.939466118812561, + "logps/chosen": -254.60308837890625, + "logps/rejected": -293.1266174316406, + "loss": 0.7012, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6357609033584595, + "rewards/margins": 0.7906701564788818, + "rewards/rejected": -1.4264310598373413, + "step": 940 + }, + { + "epoch": 0.11, + "learning_rate": 2.71229479154364e-07, + "logits/chosen": -2.2355852127075195, + "logits/rejected": -2.006657600402832, + "logps/chosen": -484.8934631347656, + "logps/rejected": -449.8263854980469, + "loss": 0.4179, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08952587842941284, + "rewards/margins": 1.2084903717041016, + "rewards/rejected": -1.2980163097381592, + "step": 941 + }, + { + "epoch": 0.11, + "learning_rate": 2.711940474784457e-07, + "logits/chosen": -2.3699445724487305, + "logits/rejected": -2.009874105453491, + "logps/chosen": -201.49783325195312, + "logps/rejected": -425.10455322265625, + "loss": 0.4446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6293970346450806, + "rewards/margins": 2.5021114349365234, + "rewards/rejected": -3.1315083503723145, + "step": 942 + }, + { + "epoch": 0.11, + "learning_rate": 2.7115861580252746e-07, + "logits/chosen": -1.9901020526885986, + "logits/rejected": -2.2917094230651855, + "logps/chosen": -305.6833801269531, + "logps/rejected": -242.10855102539062, + "loss": 0.2968, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5023388266563416, + "rewards/margins": 2.0069451332092285, + "rewards/rejected": -2.5092835426330566, + "step": 943 + }, + { + "epoch": 0.11, + "learning_rate": 2.711231841266092e-07, + "logits/chosen": -2.687199115753174, + "logits/rejected": -2.5252292156219482, + "logps/chosen": -141.91226196289062, + "logps/rejected": -162.43707275390625, + "loss": 0.2374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5231041312217712, + "rewards/margins": 1.6486130952835083, + "rewards/rejected": -2.1717171669006348, + "step": 944 + }, + { + "epoch": 0.11, + "learning_rate": 2.710877524506909e-07, + "logits/chosen": -2.9363417625427246, + "logits/rejected": -2.9411022663116455, + "logps/chosen": -120.79934692382812, + "logps/rejected": -147.89247131347656, + "loss": 0.5199, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.899508535861969, + "rewards/margins": 1.201124906539917, + "rewards/rejected": -2.1006336212158203, + "step": 945 + }, + { + "epoch": 0.11, + "learning_rate": 2.7105232077477265e-07, + "logits/chosen": -2.233623504638672, + "logits/rejected": -2.2427022457122803, + "logps/chosen": -499.53350830078125, + "logps/rejected": -490.29248046875, + "loss": 0.6722, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.325993150472641, + "rewards/margins": 1.074951410293579, + "rewards/rejected": -1.400944709777832, + "step": 946 + }, + { + "epoch": 0.11, + "learning_rate": 2.7101688909885435e-07, + "logits/chosen": -2.242744207382202, + "logits/rejected": -2.5705549716949463, + "logps/chosen": -234.45559692382812, + "logps/rejected": -181.55722045898438, + "loss": 0.7848, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1378593444824219, + "rewards/margins": 0.6810678839683533, + "rewards/rejected": -1.81892728805542, + "step": 947 + }, + { + "epoch": 0.11, + "learning_rate": 2.709814574229361e-07, + "logits/chosen": -2.67470121383667, + "logits/rejected": -2.6241462230682373, + "logps/chosen": -463.63232421875, + "logps/rejected": -439.31268310546875, + "loss": 1.0878, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0071697235107422, + "rewards/margins": 0.46164369583129883, + "rewards/rejected": -1.4688133001327515, + "step": 948 + }, + { + "epoch": 0.11, + "learning_rate": 2.709460257470178e-07, + "logits/chosen": -2.4138174057006836, + "logits/rejected": -2.3026328086853027, + "logps/chosen": -229.15750122070312, + "logps/rejected": -237.95738220214844, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7025728225708008, + "rewards/margins": 1.0323164463043213, + "rewards/rejected": -1.734889268875122, + "step": 949 + }, + { + "epoch": 0.11, + "learning_rate": 2.7091059407109954e-07, + "logits/chosen": -2.178194284439087, + "logits/rejected": -2.4063143730163574, + "logps/chosen": -236.43006896972656, + "logps/rejected": -213.5054473876953, + "loss": 0.7791, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8494874238967896, + "rewards/margins": 1.5621049404144287, + "rewards/rejected": -2.411592483520508, + "step": 950 + }, + { + "epoch": 0.11, + "learning_rate": 2.708751623951813e-07, + "logits/chosen": -2.0526390075683594, + "logits/rejected": -1.946610927581787, + "logps/chosen": -358.041015625, + "logps/rejected": -354.4357604980469, + "loss": 0.3057, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1416274905204773, + "rewards/margins": 1.560659408569336, + "rewards/rejected": -1.702286958694458, + "step": 951 + }, + { + "epoch": 0.11, + "learning_rate": 2.70839730719263e-07, + "logits/chosen": -2.665771722793579, + "logits/rejected": -2.5515379905700684, + "logps/chosen": -162.84962463378906, + "logps/rejected": -346.3074645996094, + "loss": 0.2483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45774391293525696, + "rewards/margins": 2.006432056427002, + "rewards/rejected": -2.4641759395599365, + "step": 952 + }, + { + "epoch": 0.11, + "learning_rate": 2.7080429904334473e-07, + "logits/chosen": -2.61605167388916, + "logits/rejected": -2.34519100189209, + "logps/chosen": -188.31442260742188, + "logps/rejected": -218.47964477539062, + "loss": 0.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6505228281021118, + "rewards/margins": 1.6718947887420654, + "rewards/rejected": -2.322417736053467, + "step": 953 + }, + { + "epoch": 0.11, + "learning_rate": 2.707688673674265e-07, + "logits/chosen": -2.5794734954833984, + "logits/rejected": -2.619561195373535, + "logps/chosen": -204.60955810546875, + "logps/rejected": -218.64749145507812, + "loss": 0.388, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6478289365768433, + "rewards/margins": 2.849297523498535, + "rewards/rejected": -4.497126579284668, + "step": 954 + }, + { + "epoch": 0.11, + "learning_rate": 2.7073343569150823e-07, + "logits/chosen": -2.30175518989563, + "logits/rejected": -2.4560694694519043, + "logps/chosen": -232.8940887451172, + "logps/rejected": -266.4120788574219, + "loss": 0.7766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.102486252784729, + "rewards/margins": 0.4204743802547455, + "rewards/rejected": -1.5229606628417969, + "step": 955 + }, + { + "epoch": 0.11, + "learning_rate": 2.7069800401558993e-07, + "logits/chosen": -2.271836280822754, + "logits/rejected": -2.19091534614563, + "logps/chosen": -247.7337646484375, + "logps/rejected": -225.79930114746094, + "loss": 0.5345, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7941255569458008, + "rewards/margins": 1.5109639167785645, + "rewards/rejected": -2.3050894737243652, + "step": 956 + }, + { + "epoch": 0.11, + "learning_rate": 2.706625723396717e-07, + "logits/chosen": -2.5408670902252197, + "logits/rejected": -2.4554221630096436, + "logps/chosen": -219.6632080078125, + "logps/rejected": -307.2583923339844, + "loss": 0.5009, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2068538665771484, + "rewards/margins": 0.8313860893249512, + "rewards/rejected": -2.0382399559020996, + "step": 957 + }, + { + "epoch": 0.11, + "learning_rate": 2.7062714066375337e-07, + "logits/chosen": -2.969264507293701, + "logits/rejected": -2.9796438217163086, + "logps/chosen": -209.68667602539062, + "logps/rejected": -164.04110717773438, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14447641372680664, + "rewards/margins": 1.472083330154419, + "rewards/rejected": -1.6165597438812256, + "step": 958 + }, + { + "epoch": 0.11, + "learning_rate": 2.705917089878351e-07, + "logits/chosen": -2.173837423324585, + "logits/rejected": -2.0854241847991943, + "logps/chosen": -242.50711059570312, + "logps/rejected": -284.5107421875, + "loss": 0.5917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029210835695266724, + "rewards/margins": 0.5560863614082336, + "rewards/rejected": -0.5852972269058228, + "step": 959 + }, + { + "epoch": 0.11, + "learning_rate": 2.705562773119168e-07, + "logits/chosen": -1.9699152708053589, + "logits/rejected": -2.205106258392334, + "logps/chosen": -260.61328125, + "logps/rejected": -171.63856506347656, + "loss": 0.3393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49491944909095764, + "rewards/margins": 1.6287928819656372, + "rewards/rejected": -2.1237123012542725, + "step": 960 + }, + { + "epoch": 0.11, + "learning_rate": 2.7052084563599856e-07, + "logits/chosen": -2.141350030899048, + "logits/rejected": -2.1041855812072754, + "logps/chosen": -121.7099609375, + "logps/rejected": -119.96987915039062, + "loss": 0.6142, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31868740916252136, + "rewards/margins": 0.376986563205719, + "rewards/rejected": -0.6956740617752075, + "step": 961 + }, + { + "epoch": 0.11, + "learning_rate": 2.704854139600803e-07, + "logits/chosen": -2.2591631412506104, + "logits/rejected": -2.1635005474090576, + "logps/chosen": -190.437744140625, + "logps/rejected": -407.61016845703125, + "loss": 0.1809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3189304769039154, + "rewards/margins": 2.40791392326355, + "rewards/rejected": -2.726844549179077, + "step": 962 + }, + { + "epoch": 0.11, + "learning_rate": 2.70449982284162e-07, + "logits/chosen": -2.6043293476104736, + "logits/rejected": -2.440664052963257, + "logps/chosen": -274.945556640625, + "logps/rejected": -217.32041931152344, + "loss": 0.57, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012274503707885742, + "rewards/margins": 1.777383804321289, + "rewards/rejected": -1.7896581888198853, + "step": 963 + }, + { + "epoch": 0.11, + "learning_rate": 2.7041455060824376e-07, + "logits/chosen": -2.8405799865722656, + "logits/rejected": -2.9440767765045166, + "logps/chosen": -299.4005432128906, + "logps/rejected": -284.3367919921875, + "loss": 0.6998, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0614229440689087, + "rewards/margins": 1.359285831451416, + "rewards/rejected": -2.4207088947296143, + "step": 964 + }, + { + "epoch": 0.11, + "learning_rate": 2.703791189323255e-07, + "logits/chosen": -1.6438281536102295, + "logits/rejected": -2.4547836780548096, + "logps/chosen": -456.79559326171875, + "logps/rejected": -259.07952880859375, + "loss": 1.0071, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.0017545223236084, + "rewards/margins": -0.24269232153892517, + "rewards/rejected": -1.7590621709823608, + "step": 965 + }, + { + "epoch": 0.11, + "learning_rate": 2.703436872564072e-07, + "logits/chosen": -2.45007586479187, + "logits/rejected": -2.6021673679351807, + "logps/chosen": -432.3465881347656, + "logps/rejected": -266.842529296875, + "loss": 0.3269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18648281693458557, + "rewards/margins": 1.6305298805236816, + "rewards/rejected": -1.8170127868652344, + "step": 966 + }, + { + "epoch": 0.11, + "learning_rate": 2.7030825558048895e-07, + "logits/chosen": -1.9774234294891357, + "logits/rejected": -1.9750330448150635, + "logps/chosen": -271.7263488769531, + "logps/rejected": -239.47695922851562, + "loss": 0.6459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.275549978017807, + "rewards/margins": 0.46789300441741943, + "rewards/rejected": -0.7434430122375488, + "step": 967 + }, + { + "epoch": 0.11, + "learning_rate": 2.702728239045707e-07, + "logits/chosen": -2.7427351474761963, + "logits/rejected": -2.6258456707000732, + "logps/chosen": -211.6383056640625, + "logps/rejected": -285.5758056640625, + "loss": 0.534, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0184810161590576, + "rewards/margins": 1.6231517791748047, + "rewards/rejected": -2.641632556915283, + "step": 968 + }, + { + "epoch": 0.11, + "learning_rate": 2.702373922286524e-07, + "logits/chosen": -2.216651678085327, + "logits/rejected": -1.9870820045471191, + "logps/chosen": -199.74261474609375, + "logps/rejected": -292.4718017578125, + "loss": 0.3559, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20429731905460358, + "rewards/margins": 1.5516011714935303, + "rewards/rejected": -1.7558984756469727, + "step": 969 + }, + { + "epoch": 0.11, + "learning_rate": 2.7020196055273414e-07, + "logits/chosen": -2.504601001739502, + "logits/rejected": -2.307323932647705, + "logps/chosen": -358.5874938964844, + "logps/rejected": -243.50167846679688, + "loss": 0.654, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0002554655075073, + "rewards/margins": 0.807239294052124, + "rewards/rejected": -1.8074947595596313, + "step": 970 + }, + { + "epoch": 0.11, + "learning_rate": 2.7016652887681584e-07, + "logits/chosen": -1.9807053804397583, + "logits/rejected": -2.2712152004241943, + "logps/chosen": -267.39324951171875, + "logps/rejected": -222.39932250976562, + "loss": 0.3321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7652286291122437, + "rewards/margins": 1.6488046646118164, + "rewards/rejected": -2.4140334129333496, + "step": 971 + }, + { + "epoch": 0.11, + "learning_rate": 2.701310972008976e-07, + "logits/chosen": -2.443753242492676, + "logits/rejected": -2.135472059249878, + "logps/chosen": -163.60165405273438, + "logps/rejected": -246.2024688720703, + "loss": 0.558, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2948487997055054, + "rewards/margins": 1.2444394826889038, + "rewards/rejected": -2.53928804397583, + "step": 972 + }, + { + "epoch": 0.11, + "learning_rate": 2.7009566552497934e-07, + "logits/chosen": -1.8400704860687256, + "logits/rejected": -2.1991758346557617, + "logps/chosen": -358.5091552734375, + "logps/rejected": -187.8582763671875, + "loss": 0.4122, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.047920823097229, + "rewards/margins": 1.0896859169006348, + "rewards/rejected": -2.137606620788574, + "step": 973 + }, + { + "epoch": 0.11, + "learning_rate": 2.7006023384906103e-07, + "logits/chosen": -2.9017271995544434, + "logits/rejected": -2.67814564704895, + "logps/chosen": -204.1541748046875, + "logps/rejected": -173.2073516845703, + "loss": 0.307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09796231985092163, + "rewards/margins": 1.3070521354675293, + "rewards/rejected": -1.4050145149230957, + "step": 974 + }, + { + "epoch": 0.11, + "learning_rate": 2.700248021731428e-07, + "logits/chosen": -2.377932548522949, + "logits/rejected": -2.4133005142211914, + "logps/chosen": -431.3186340332031, + "logps/rejected": -349.1793518066406, + "loss": 0.3346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6560257077217102, + "rewards/margins": 1.3788739442825317, + "rewards/rejected": -2.0348997116088867, + "step": 975 + }, + { + "epoch": 0.11, + "learning_rate": 2.6998937049722453e-07, + "logits/chosen": -1.9924702644348145, + "logits/rejected": -2.17643666267395, + "logps/chosen": -231.34219360351562, + "logps/rejected": -207.58334350585938, + "loss": 0.788, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9338446855545044, + "rewards/margins": 0.06547223776578903, + "rewards/rejected": -0.999316930770874, + "step": 976 + }, + { + "epoch": 0.11, + "learning_rate": 2.699539388213062e-07, + "logits/chosen": -2.583069086074829, + "logits/rejected": -2.2863717079162598, + "logps/chosen": -109.478515625, + "logps/rejected": -222.06886291503906, + "loss": 0.4769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13123570382595062, + "rewards/margins": 1.020374059677124, + "rewards/rejected": -1.1516097784042358, + "step": 977 + }, + { + "epoch": 0.11, + "learning_rate": 2.6991850714538797e-07, + "logits/chosen": -2.516909599304199, + "logits/rejected": -2.1172170639038086, + "logps/chosen": -280.1630859375, + "logps/rejected": -259.759033203125, + "loss": 0.2998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5964169502258301, + "rewards/margins": 2.2446861267089844, + "rewards/rejected": -2.8411030769348145, + "step": 978 + }, + { + "epoch": 0.11, + "learning_rate": 2.698830754694697e-07, + "logits/chosen": -2.264089584350586, + "logits/rejected": -2.491731643676758, + "logps/chosen": -151.08004760742188, + "logps/rejected": -157.851806640625, + "loss": 0.8215, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1391971111297607, + "rewards/margins": 0.8416120409965515, + "rewards/rejected": -1.980809211730957, + "step": 979 + }, + { + "epoch": 0.11, + "learning_rate": 2.698476437935514e-07, + "logits/chosen": -2.520704984664917, + "logits/rejected": -2.6834716796875, + "logps/chosen": -140.41580200195312, + "logps/rejected": -296.4446105957031, + "loss": 0.4611, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32723724842071533, + "rewards/margins": 1.9960947036743164, + "rewards/rejected": -2.3233320713043213, + "step": 980 + }, + { + "epoch": 0.11, + "learning_rate": 2.6981221211763317e-07, + "logits/chosen": -2.4870009422302246, + "logits/rejected": -2.4610238075256348, + "logps/chosen": -150.78016662597656, + "logps/rejected": -143.23858642578125, + "loss": 0.5831, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6107122898101807, + "rewards/margins": 0.4325818121433258, + "rewards/rejected": -1.043294072151184, + "step": 981 + }, + { + "epoch": 0.11, + "learning_rate": 2.6977678044171486e-07, + "logits/chosen": -2.203237295150757, + "logits/rejected": -2.2119123935699463, + "logps/chosen": -209.34475708007812, + "logps/rejected": -263.0777282714844, + "loss": 0.6946, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.53007173538208, + "rewards/margins": 0.9476156234741211, + "rewards/rejected": -3.477687358856201, + "step": 982 + }, + { + "epoch": 0.11, + "learning_rate": 2.697413487657966e-07, + "logits/chosen": -2.299835681915283, + "logits/rejected": -2.275193929672241, + "logps/chosen": -279.0255126953125, + "logps/rejected": -283.16473388671875, + "loss": 0.2407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01750333607196808, + "rewards/margins": 1.699542760848999, + "rewards/rejected": -1.7170461416244507, + "step": 983 + }, + { + "epoch": 0.11, + "learning_rate": 2.6970591708987836e-07, + "logits/chosen": -2.0187652111053467, + "logits/rejected": -1.912190318107605, + "logps/chosen": -181.69741821289062, + "logps/rejected": -294.7813720703125, + "loss": 0.6848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6780277490615845, + "rewards/margins": 1.2629801034927368, + "rewards/rejected": -1.9410078525543213, + "step": 984 + }, + { + "epoch": 0.11, + "learning_rate": 2.6967048541396005e-07, + "logits/chosen": -2.339277982711792, + "logits/rejected": -2.1288444995880127, + "logps/chosen": -295.7228088378906, + "logps/rejected": -318.90911865234375, + "loss": 0.4986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5251777172088623, + "rewards/margins": 0.8384890556335449, + "rewards/rejected": -1.3636667728424072, + "step": 985 + }, + { + "epoch": 0.11, + "learning_rate": 2.696350537380418e-07, + "logits/chosen": -2.2033209800720215, + "logits/rejected": -2.2501397132873535, + "logps/chosen": -239.3297119140625, + "logps/rejected": -297.31036376953125, + "loss": 0.2424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27753037214279175, + "rewards/margins": 2.3806142807006836, + "rewards/rejected": -2.65814471244812, + "step": 986 + }, + { + "epoch": 0.11, + "learning_rate": 2.695996220621235e-07, + "logits/chosen": -2.5718889236450195, + "logits/rejected": -2.5732922554016113, + "logps/chosen": -230.68995666503906, + "logps/rejected": -224.53616333007812, + "loss": 0.4789, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7810390591621399, + "rewards/margins": 1.1940538883209229, + "rewards/rejected": -1.975092887878418, + "step": 987 + }, + { + "epoch": 0.11, + "learning_rate": 2.6956419038620525e-07, + "logits/chosen": -2.2265970706939697, + "logits/rejected": -2.504373550415039, + "logps/chosen": -196.70303344726562, + "logps/rejected": -146.44589233398438, + "loss": 2.0652, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.9207096099853516, + "rewards/margins": -1.3861323595046997, + "rewards/rejected": -1.5345770120620728, + "step": 988 + }, + { + "epoch": 0.12, + "learning_rate": 2.69528758710287e-07, + "logits/chosen": -1.5918678045272827, + "logits/rejected": -2.1438658237457275, + "logps/chosen": -523.1455078125, + "logps/rejected": -281.2669372558594, + "loss": 0.3936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9514124393463135, + "rewards/margins": 1.2430256605148315, + "rewards/rejected": -2.1944379806518555, + "step": 989 + }, + { + "epoch": 0.12, + "learning_rate": 2.6949332703436874e-07, + "logits/chosen": -1.6903914213180542, + "logits/rejected": -1.9921557903289795, + "logps/chosen": -478.1346740722656, + "logps/rejected": -284.3717956542969, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05689529329538345, + "rewards/margins": 1.129807472229004, + "rewards/rejected": -1.1867027282714844, + "step": 990 + }, + { + "epoch": 0.12, + "learning_rate": 2.6945789535845044e-07, + "logits/chosen": -2.220155954360962, + "logits/rejected": -2.5502142906188965, + "logps/chosen": -272.29620361328125, + "logps/rejected": -194.90628051757812, + "loss": 0.8725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4292720556259155, + "rewards/margins": 0.4773487448692322, + "rewards/rejected": -1.906620740890503, + "step": 991 + }, + { + "epoch": 0.12, + "learning_rate": 2.694224636825322e-07, + "logits/chosen": -2.482943058013916, + "logits/rejected": -2.5690479278564453, + "logps/chosen": -251.05987548828125, + "logps/rejected": -261.6587829589844, + "loss": 0.3977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8658809661865234, + "rewards/margins": 2.8298652172088623, + "rewards/rejected": -3.695746421813965, + "step": 992 + }, + { + "epoch": 0.12, + "learning_rate": 2.693870320066139e-07, + "logits/chosen": -1.9928791522979736, + "logits/rejected": -2.0464701652526855, + "logps/chosen": -347.8475341796875, + "logps/rejected": -310.4609680175781, + "loss": 0.7428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.81305992603302, + "rewards/margins": 1.2728923559188843, + "rewards/rejected": -2.0859522819519043, + "step": 993 + }, + { + "epoch": 0.12, + "learning_rate": 2.6935160033069563e-07, + "logits/chosen": -2.233989953994751, + "logits/rejected": -2.1681997776031494, + "logps/chosen": -211.832275390625, + "logps/rejected": -230.78082275390625, + "loss": 0.6408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4697113633155823, + "rewards/margins": 0.8247967958450317, + "rewards/rejected": -1.2945082187652588, + "step": 994 + }, + { + "epoch": 0.12, + "learning_rate": 2.6931616865477733e-07, + "logits/chosen": -2.1802260875701904, + "logits/rejected": -2.216493606567383, + "logps/chosen": -331.4581298828125, + "logps/rejected": -282.35430908203125, + "loss": 0.7249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9356716275215149, + "rewards/margins": 1.0254273414611816, + "rewards/rejected": -1.9610990285873413, + "step": 995 + }, + { + "epoch": 0.12, + "learning_rate": 2.692807369788591e-07, + "logits/chosen": -2.2380433082580566, + "logits/rejected": -2.4743010997772217, + "logps/chosen": -450.3731384277344, + "logps/rejected": -299.612060546875, + "loss": 0.5604, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2313624918460846, + "rewards/margins": 1.202096939086914, + "rewards/rejected": -1.4334595203399658, + "step": 996 + }, + { + "epoch": 0.12, + "learning_rate": 2.692453053029408e-07, + "logits/chosen": -2.113428831100464, + "logits/rejected": -2.360644578933716, + "logps/chosen": -424.91766357421875, + "logps/rejected": -318.7708740234375, + "loss": 0.2216, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5906178951263428, + "rewards/margins": 2.0988833904266357, + "rewards/rejected": -2.6895012855529785, + "step": 997 + }, + { + "epoch": 0.12, + "learning_rate": 2.692098736270225e-07, + "logits/chosen": -2.3517794609069824, + "logits/rejected": -2.319622278213501, + "logps/chosen": -251.69647216796875, + "logps/rejected": -279.42230224609375, + "loss": 0.5412, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7958537340164185, + "rewards/margins": 1.0292994976043701, + "rewards/rejected": -1.8251532316207886, + "step": 998 + }, + { + "epoch": 0.12, + "learning_rate": 2.6917444195110427e-07, + "logits/chosen": -2.1526498794555664, + "logits/rejected": -2.3987255096435547, + "logps/chosen": -369.7178649902344, + "logps/rejected": -444.3445129394531, + "loss": 0.3053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5917199850082397, + "rewards/margins": 2.2089836597442627, + "rewards/rejected": -2.800703525543213, + "step": 999 + }, + { + "epoch": 0.12, + "learning_rate": 2.69139010275186e-07, + "logits/chosen": -2.566873550415039, + "logits/rejected": -2.574442148208618, + "logps/chosen": -281.74371337890625, + "logps/rejected": -290.25799560546875, + "loss": 1.1318, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5471998453140259, + "rewards/margins": 1.3452056646347046, + "rewards/rejected": -2.8924055099487305, + "step": 1000 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -1.7510888576507568, + "eval_logits/rejected": -1.749880075454712, + "eval_logps/chosen": -275.8467712402344, + "eval_logps/rejected": -268.0012512207031, + "eval_loss": 0.4248879849910736, + "eval_rewards/accuracies": 0.806034505367279, + "eval_rewards/chosen": -0.36813417077064514, + "eval_rewards/margins": 1.331338882446289, + "eval_rewards/rejected": -1.6994729042053223, + "eval_runtime": 237.7213, + "eval_samples_per_second": 2.924, + "eval_steps_per_second": 1.464, + "step": 1000 + }, + { + "epoch": 0.12, + "learning_rate": 2.6910357859926777e-07, + "logits/chosen": -2.8470873832702637, + "logits/rejected": -2.9010086059570312, + "logps/chosen": -251.39373779296875, + "logps/rejected": -336.2052917480469, + "loss": 0.2039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.050964951515197754, + "rewards/margins": 2.9765784740448, + "rewards/rejected": -3.027543306350708, + "step": 1001 + }, + { + "epoch": 0.12, + "learning_rate": 2.6906814692334946e-07, + "logits/chosen": -2.1684656143188477, + "logits/rejected": -2.3903608322143555, + "logps/chosen": -372.3252258300781, + "logps/rejected": -221.97869873046875, + "loss": 0.4121, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.589804470539093, + "rewards/margins": 0.8823971748352051, + "rewards/rejected": -1.4722017049789429, + "step": 1002 + }, + { + "epoch": 0.12, + "learning_rate": 2.690327152474312e-07, + "logits/chosen": -1.3848621845245361, + "logits/rejected": -1.6030499935150146, + "logps/chosen": -482.38446044921875, + "logps/rejected": -601.138916015625, + "loss": 0.1853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5383899807929993, + "rewards/margins": 3.0925827026367188, + "rewards/rejected": -3.6309728622436523, + "step": 1003 + }, + { + "epoch": 0.12, + "learning_rate": 2.689972835715129e-07, + "logits/chosen": -2.1628260612487793, + "logits/rejected": -2.359450101852417, + "logps/chosen": -231.13623046875, + "logps/rejected": -262.7522277832031, + "loss": 0.4516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.704407274723053, + "rewards/margins": 2.1793715953826904, + "rewards/rejected": -2.8837790489196777, + "step": 1004 + }, + { + "epoch": 0.12, + "learning_rate": 2.6896185189559466e-07, + "logits/chosen": -2.8480520248413086, + "logits/rejected": -2.732140064239502, + "logps/chosen": -137.153564453125, + "logps/rejected": -176.18075561523438, + "loss": 0.54, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.045426845550537, + "rewards/margins": 1.170914888381958, + "rewards/rejected": -2.216341972351074, + "step": 1005 + }, + { + "epoch": 0.12, + "learning_rate": 2.6892642021967635e-07, + "logits/chosen": -2.7391953468322754, + "logits/rejected": -2.872476577758789, + "logps/chosen": -199.84474182128906, + "logps/rejected": -214.40982055664062, + "loss": 0.7541, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40230244398117065, + "rewards/margins": 0.21437223255634308, + "rewards/rejected": -0.6166746616363525, + "step": 1006 + }, + { + "epoch": 0.12, + "learning_rate": 2.688909885437581e-07, + "logits/chosen": -2.429762840270996, + "logits/rejected": -2.5749754905700684, + "logps/chosen": -204.0440673828125, + "logps/rejected": -200.61854553222656, + "loss": 0.7508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7839182615280151, + "rewards/margins": 0.5106223821640015, + "rewards/rejected": -1.2945406436920166, + "step": 1007 + }, + { + "epoch": 0.12, + "learning_rate": 2.6885555686783985e-07, + "logits/chosen": -2.6658272743225098, + "logits/rejected": -2.7179317474365234, + "logps/chosen": -259.44232177734375, + "logps/rejected": -241.65744018554688, + "loss": 0.1792, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.025270909070968628, + "rewards/margins": 1.8805454969406128, + "rewards/rejected": -1.8552747964859009, + "step": 1008 + }, + { + "epoch": 0.12, + "learning_rate": 2.6882012519192154e-07, + "logits/chosen": -2.2947378158569336, + "logits/rejected": -2.145418643951416, + "logps/chosen": -282.08453369140625, + "logps/rejected": -368.7288818359375, + "loss": 0.3944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4319809079170227, + "rewards/margins": 1.8353404998779297, + "rewards/rejected": -2.2673215866088867, + "step": 1009 + }, + { + "epoch": 0.12, + "learning_rate": 2.687846935160033e-07, + "logits/chosen": -2.488420248031616, + "logits/rejected": -2.3148677349090576, + "logps/chosen": -183.70462036132812, + "logps/rejected": -190.0177001953125, + "loss": 0.949, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1047427654266357, + "rewards/margins": 0.6015448570251465, + "rewards/rejected": -1.7062877416610718, + "step": 1010 + }, + { + "epoch": 0.12, + "learning_rate": 2.6874926184008504e-07, + "logits/chosen": -2.5630640983581543, + "logits/rejected": -2.443657159805298, + "logps/chosen": -260.3871154785156, + "logps/rejected": -161.87973022460938, + "loss": 0.4285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8149291276931763, + "rewards/margins": 1.055453896522522, + "rewards/rejected": -1.8703830242156982, + "step": 1011 + }, + { + "epoch": 0.12, + "learning_rate": 2.687138301641668e-07, + "logits/chosen": -2.4237070083618164, + "logits/rejected": -2.3634684085845947, + "logps/chosen": -303.64410400390625, + "logps/rejected": -199.79078674316406, + "loss": 1.2535, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3026437759399414, + "rewards/margins": 0.2994152009487152, + "rewards/rejected": -1.6020588874816895, + "step": 1012 + }, + { + "epoch": 0.12, + "learning_rate": 2.686783984882485e-07, + "logits/chosen": -2.263988494873047, + "logits/rejected": -2.643456220626831, + "logps/chosen": -582.6007080078125, + "logps/rejected": -341.6959228515625, + "loss": 0.6402, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.199205994606018, + "rewards/margins": 1.4099606275558472, + "rewards/rejected": -2.609166383743286, + "step": 1013 + }, + { + "epoch": 0.12, + "learning_rate": 2.6864296681233023e-07, + "logits/chosen": -2.2881956100463867, + "logits/rejected": -2.4538326263427734, + "logps/chosen": -316.73895263671875, + "logps/rejected": -320.487060546875, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7972468137741089, + "rewards/margins": 2.2829596996307373, + "rewards/rejected": -3.0802063941955566, + "step": 1014 + }, + { + "epoch": 0.12, + "learning_rate": 2.6860753513641193e-07, + "logits/chosen": -2.637885332107544, + "logits/rejected": -2.8041915893554688, + "logps/chosen": -248.788330078125, + "logps/rejected": -320.8568420410156, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7933220267295837, + "rewards/margins": 2.0839321613311768, + "rewards/rejected": -2.8772542476654053, + "step": 1015 + }, + { + "epoch": 0.12, + "learning_rate": 2.685721034604937e-07, + "logits/chosen": -1.9706429243087769, + "logits/rejected": -2.210228681564331, + "logps/chosen": -321.562255859375, + "logps/rejected": -189.73077392578125, + "loss": 0.6467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5760781764984131, + "rewards/margins": 0.7637758851051331, + "rewards/rejected": -1.3398540019989014, + "step": 1016 + }, + { + "epoch": 0.12, + "learning_rate": 2.685366717845754e-07, + "logits/chosen": -2.612414836883545, + "logits/rejected": -2.4439995288848877, + "logps/chosen": -333.61468505859375, + "logps/rejected": -375.1265869140625, + "loss": 0.9089, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.840308427810669, + "rewards/margins": 0.40208178758621216, + "rewards/rejected": -2.2423901557922363, + "step": 1017 + }, + { + "epoch": 0.12, + "learning_rate": 2.685012401086571e-07, + "logits/chosen": -2.449991226196289, + "logits/rejected": -2.4547030925750732, + "logps/chosen": -268.7228088378906, + "logps/rejected": -258.485107421875, + "loss": 0.3502, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3177988529205322, + "rewards/margins": 1.7583271265029907, + "rewards/rejected": -2.0761260986328125, + "step": 1018 + }, + { + "epoch": 0.12, + "learning_rate": 2.6846580843273887e-07, + "logits/chosen": -2.499650478363037, + "logits/rejected": -2.3996729850769043, + "logps/chosen": -439.25872802734375, + "logps/rejected": -342.2508544921875, + "loss": 0.6307, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0192344188690186, + "rewards/margins": 0.9353653788566589, + "rewards/rejected": -1.9545998573303223, + "step": 1019 + }, + { + "epoch": 0.12, + "learning_rate": 2.6843037675682057e-07, + "logits/chosen": -1.8308236598968506, + "logits/rejected": -2.215582847595215, + "logps/chosen": -378.8659973144531, + "logps/rejected": -320.2466735839844, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7679028511047363, + "rewards/margins": 1.972186803817749, + "rewards/rejected": -2.7400894165039062, + "step": 1020 + }, + { + "epoch": 0.12, + "learning_rate": 2.683949450809023e-07, + "logits/chosen": -2.5429532527923584, + "logits/rejected": -2.707886219024658, + "logps/chosen": -203.77883911132812, + "logps/rejected": -276.91156005859375, + "loss": 0.4668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0387959480285645, + "rewards/margins": 2.4250433444976807, + "rewards/rejected": -3.463839054107666, + "step": 1021 + }, + { + "epoch": 0.12, + "learning_rate": 2.68359513404984e-07, + "logits/chosen": -2.2999420166015625, + "logits/rejected": -1.9869012832641602, + "logps/chosen": -219.1651611328125, + "logps/rejected": -359.604736328125, + "loss": 0.4938, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.085327386856079, + "rewards/margins": 1.3165650367736816, + "rewards/rejected": -2.4018921852111816, + "step": 1022 + }, + { + "epoch": 0.12, + "learning_rate": 2.683240817290658e-07, + "logits/chosen": -2.741697311401367, + "logits/rejected": -2.594670057296753, + "logps/chosen": -383.4318542480469, + "logps/rejected": -237.98895263671875, + "loss": 0.2533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8507862687110901, + "rewards/margins": 1.8031563758850098, + "rewards/rejected": -2.653942584991455, + "step": 1023 + }, + { + "epoch": 0.12, + "learning_rate": 2.682886500531475e-07, + "logits/chosen": -1.5400416851043701, + "logits/rejected": -1.4896056652069092, + "logps/chosen": -357.1156311035156, + "logps/rejected": -436.74041748046875, + "loss": 0.7384, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8909907341003418, + "rewards/margins": 0.24667218327522278, + "rewards/rejected": -1.1376628875732422, + "step": 1024 + }, + { + "epoch": 0.12, + "learning_rate": 2.6825321837722926e-07, + "logits/chosen": -2.8661704063415527, + "logits/rejected": -2.8446669578552246, + "logps/chosen": -318.3084411621094, + "logps/rejected": -144.72373962402344, + "loss": 1.0597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8996989130973816, + "rewards/margins": 0.8577672243118286, + "rewards/rejected": -1.7574660778045654, + "step": 1025 + }, + { + "epoch": 0.12, + "learning_rate": 2.6821778670131095e-07, + "logits/chosen": -2.320706367492676, + "logits/rejected": -2.2479805946350098, + "logps/chosen": -362.08465576171875, + "logps/rejected": -291.94635009765625, + "loss": 0.6712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.065910816192627, + "rewards/margins": 0.6854327917098999, + "rewards/rejected": -1.7513437271118164, + "step": 1026 + }, + { + "epoch": 0.12, + "learning_rate": 2.681823550253927e-07, + "logits/chosen": -2.347588300704956, + "logits/rejected": -2.508213996887207, + "logps/chosen": -438.076904296875, + "logps/rejected": -318.760498046875, + "loss": 0.1472, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0752687007188797, + "rewards/margins": 2.4860312938690186, + "rewards/rejected": -2.4107627868652344, + "step": 1027 + }, + { + "epoch": 0.12, + "learning_rate": 2.681469233494744e-07, + "logits/chosen": -2.4436614513397217, + "logits/rejected": -2.191115617752075, + "logps/chosen": -169.04754638671875, + "logps/rejected": -317.03387451171875, + "loss": 0.3172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3345242738723755, + "rewards/margins": 1.572483777999878, + "rewards/rejected": -1.9070080518722534, + "step": 1028 + }, + { + "epoch": 0.12, + "learning_rate": 2.6811149167355614e-07, + "logits/chosen": -1.8599722385406494, + "logits/rejected": -2.0045199394226074, + "logps/chosen": -295.9490661621094, + "logps/rejected": -218.71075439453125, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37321341037750244, + "rewards/margins": 1.003804326057434, + "rewards/rejected": -1.3770177364349365, + "step": 1029 + }, + { + "epoch": 0.12, + "learning_rate": 2.680760599976379e-07, + "logits/chosen": -2.6005265712738037, + "logits/rejected": -2.4518320560455322, + "logps/chosen": -118.36698150634766, + "logps/rejected": -168.40753173828125, + "loss": 0.4732, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31334924697875977, + "rewards/margins": 0.9789475798606873, + "rewards/rejected": -1.2922968864440918, + "step": 1030 + }, + { + "epoch": 0.12, + "learning_rate": 2.680406283217196e-07, + "logits/chosen": -2.7847235202789307, + "logits/rejected": -2.4587996006011963, + "logps/chosen": -162.13720703125, + "logps/rejected": -193.2193603515625, + "loss": 0.6723, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5871151685714722, + "rewards/margins": 0.37776851654052734, + "rewards/rejected": -0.9648836851119995, + "step": 1031 + }, + { + "epoch": 0.12, + "learning_rate": 2.6800519664580134e-07, + "logits/chosen": -2.4462485313415527, + "logits/rejected": -2.3342058658599854, + "logps/chosen": -320.865966796875, + "logps/rejected": -336.8431396484375, + "loss": 0.4604, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.057936348021030426, + "rewards/margins": 1.15945565700531, + "rewards/rejected": -1.1015193462371826, + "step": 1032 + }, + { + "epoch": 0.12, + "learning_rate": 2.6796976496988303e-07, + "logits/chosen": -2.7538814544677734, + "logits/rejected": -2.6490368843078613, + "logps/chosen": -221.382568359375, + "logps/rejected": -213.8437042236328, + "loss": 0.6717, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2699287533760071, + "rewards/margins": 0.51250821352005, + "rewards/rejected": -0.7824369668960571, + "step": 1033 + }, + { + "epoch": 0.12, + "learning_rate": 2.679343332939648e-07, + "logits/chosen": -1.8708608150482178, + "logits/rejected": -2.0062992572784424, + "logps/chosen": -420.2269592285156, + "logps/rejected": -411.08538818359375, + "loss": 0.6734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8716568350791931, + "rewards/margins": 0.5073645114898682, + "rewards/rejected": -1.379021406173706, + "step": 1034 + }, + { + "epoch": 0.12, + "learning_rate": 2.6789890161804653e-07, + "logits/chosen": -2.680546522140503, + "logits/rejected": -2.8294050693511963, + "logps/chosen": -385.24090576171875, + "logps/rejected": -415.5498046875, + "loss": 0.2318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05524454265832901, + "rewards/margins": 1.9907004833221436, + "rewards/rejected": -2.045945167541504, + "step": 1035 + }, + { + "epoch": 0.12, + "learning_rate": 2.678634699421283e-07, + "logits/chosen": -1.9042677879333496, + "logits/rejected": -1.7738113403320312, + "logps/chosen": -311.94989013671875, + "logps/rejected": -328.2655944824219, + "loss": 0.6011, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0442520380020142, + "rewards/margins": 0.5859363079071045, + "rewards/rejected": -1.6301883459091187, + "step": 1036 + }, + { + "epoch": 0.12, + "learning_rate": 2.6782803826621e-07, + "logits/chosen": -2.7975544929504395, + "logits/rejected": -2.9573593139648438, + "logps/chosen": -231.49923706054688, + "logps/rejected": -259.990478515625, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9792052507400513, + "rewards/margins": 2.034585475921631, + "rewards/rejected": -3.0137906074523926, + "step": 1037 + }, + { + "epoch": 0.12, + "learning_rate": 2.677926065902917e-07, + "logits/chosen": -2.600123405456543, + "logits/rejected": -2.729048490524292, + "logps/chosen": -285.822021484375, + "logps/rejected": -156.70590209960938, + "loss": 0.4391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4918278455734253, + "rewards/margins": 1.4295551776885986, + "rewards/rejected": -2.9213831424713135, + "step": 1038 + }, + { + "epoch": 0.12, + "learning_rate": 2.677571749143734e-07, + "logits/chosen": -2.539416790008545, + "logits/rejected": -2.422215700149536, + "logps/chosen": -299.0378723144531, + "logps/rejected": -319.9189453125, + "loss": 0.3588, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06421977281570435, + "rewards/margins": 2.081080436706543, + "rewards/rejected": -2.0168607234954834, + "step": 1039 + }, + { + "epoch": 0.12, + "learning_rate": 2.6772174323845517e-07, + "logits/chosen": -2.2689976692199707, + "logits/rejected": -2.600353240966797, + "logps/chosen": -349.33258056640625, + "logps/rejected": -260.7159729003906, + "loss": 0.6346, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7608904242515564, + "rewards/margins": 0.7425460815429688, + "rewards/rejected": -1.50343656539917, + "step": 1040 + }, + { + "epoch": 0.12, + "learning_rate": 2.676863115625369e-07, + "logits/chosen": -2.373967170715332, + "logits/rejected": -2.2361016273498535, + "logps/chosen": -336.94317626953125, + "logps/rejected": -358.672119140625, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5066670179367065, + "rewards/margins": 1.9474403858184814, + "rewards/rejected": -2.4541072845458984, + "step": 1041 + }, + { + "epoch": 0.12, + "learning_rate": 2.676508798866186e-07, + "logits/chosen": -2.100290298461914, + "logits/rejected": -1.957705020904541, + "logps/chosen": -341.8646240234375, + "logps/rejected": -343.6378173828125, + "loss": 0.1893, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5427795648574829, + "rewards/margins": 2.7412524223327637, + "rewards/rejected": -3.284031867980957, + "step": 1042 + }, + { + "epoch": 0.12, + "learning_rate": 2.6761544821070036e-07, + "logits/chosen": -2.1402664184570312, + "logits/rejected": -2.432070255279541, + "logps/chosen": -398.04815673828125, + "logps/rejected": -218.48489379882812, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5323309898376465, + "rewards/margins": 1.032625675201416, + "rewards/rejected": -1.564956784248352, + "step": 1043 + }, + { + "epoch": 0.12, + "learning_rate": 2.6758001653478206e-07, + "logits/chosen": -1.7007567882537842, + "logits/rejected": -1.9585708379745483, + "logps/chosen": -431.3917236328125, + "logps/rejected": -395.8056640625, + "loss": 0.3277, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7914546728134155, + "rewards/margins": 1.623566746711731, + "rewards/rejected": -2.4150211811065674, + "step": 1044 + }, + { + "epoch": 0.12, + "learning_rate": 2.675445848588638e-07, + "logits/chosen": -2.5013341903686523, + "logits/rejected": -2.2819950580596924, + "logps/chosen": -155.30113220214844, + "logps/rejected": -162.28001403808594, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9755802154541016, + "rewards/margins": 0.9947835206985474, + "rewards/rejected": -1.9703636169433594, + "step": 1045 + }, + { + "epoch": 0.12, + "learning_rate": 2.6750915318294555e-07, + "logits/chosen": -2.3592047691345215, + "logits/rejected": -2.3650314807891846, + "logps/chosen": -312.146728515625, + "logps/rejected": -354.27667236328125, + "loss": 0.4743, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13659609854221344, + "rewards/margins": 1.3142424821853638, + "rewards/rejected": -1.450838565826416, + "step": 1046 + }, + { + "epoch": 0.12, + "learning_rate": 2.674737215070273e-07, + "logits/chosen": -2.448552131652832, + "logits/rejected": -2.7879810333251953, + "logps/chosen": -431.3397216796875, + "logps/rejected": -270.61578369140625, + "loss": 0.4333, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.619140625, + "rewards/margins": 2.046046733856201, + "rewards/rejected": -2.665187358856201, + "step": 1047 + }, + { + "epoch": 0.12, + "learning_rate": 2.67438289831109e-07, + "logits/chosen": -2.2619872093200684, + "logits/rejected": -2.1790249347686768, + "logps/chosen": -285.6820983886719, + "logps/rejected": -324.9290771484375, + "loss": 0.4521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13921624422073364, + "rewards/margins": 1.1460033655166626, + "rewards/rejected": -1.2852195501327515, + "step": 1048 + }, + { + "epoch": 0.12, + "learning_rate": 2.6740285815519075e-07, + "logits/chosen": -2.2631313800811768, + "logits/rejected": -2.3668324947357178, + "logps/chosen": -288.9767761230469, + "logps/rejected": -269.7618713378906, + "loss": 0.2444, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2361612766981125, + "rewards/margins": 2.0142362117767334, + "rewards/rejected": -1.778074860572815, + "step": 1049 + }, + { + "epoch": 0.12, + "learning_rate": 2.6736742647927244e-07, + "logits/chosen": -2.451120615005493, + "logits/rejected": -2.4749855995178223, + "logps/chosen": -418.6518859863281, + "logps/rejected": -321.02978515625, + "loss": 0.6123, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2040013074874878, + "rewards/margins": 1.1053097248077393, + "rewards/rejected": -2.3093109130859375, + "step": 1050 + }, + { + "epoch": 0.12, + "learning_rate": 2.673319948033542e-07, + "logits/chosen": -2.3888111114501953, + "logits/rejected": -2.379237174987793, + "logps/chosen": -271.69091796875, + "logps/rejected": -232.19171142578125, + "loss": 0.7941, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7459869384765625, + "rewards/margins": 0.46350181102752686, + "rewards/rejected": -1.2094886302947998, + "step": 1051 + }, + { + "epoch": 0.12, + "learning_rate": 2.6729656312743594e-07, + "logits/chosen": -2.1089117527008057, + "logits/rejected": -2.0546164512634277, + "logps/chosen": -275.7714538574219, + "logps/rejected": -291.5171203613281, + "loss": 0.2901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5756916403770447, + "rewards/margins": 1.736299753189087, + "rewards/rejected": -2.3119914531707764, + "step": 1052 + }, + { + "epoch": 0.12, + "learning_rate": 2.6726113145151763e-07, + "logits/chosen": -2.112663745880127, + "logits/rejected": -2.066485643386841, + "logps/chosen": -129.0723876953125, + "logps/rejected": -210.68544006347656, + "loss": 0.5336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9590966105461121, + "rewards/margins": 1.7200921773910522, + "rewards/rejected": -2.6791887283325195, + "step": 1053 + }, + { + "epoch": 0.12, + "learning_rate": 2.672256997755994e-07, + "logits/chosen": -2.0749671459198, + "logits/rejected": -2.2923312187194824, + "logps/chosen": -438.4505615234375, + "logps/rejected": -365.7938537597656, + "loss": 0.3381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5590173602104187, + "rewards/margins": 1.7726699113845825, + "rewards/rejected": -2.3316872119903564, + "step": 1054 + }, + { + "epoch": 0.12, + "learning_rate": 2.671902680996811e-07, + "logits/chosen": -2.6246416568756104, + "logits/rejected": -2.4869894981384277, + "logps/chosen": -194.2587127685547, + "logps/rejected": -286.2919921875, + "loss": 0.4861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7361899614334106, + "rewards/margins": 1.0961368083953857, + "rewards/rejected": -1.832326889038086, + "step": 1055 + }, + { + "epoch": 0.12, + "learning_rate": 2.6715483642376283e-07, + "logits/chosen": -2.2494685649871826, + "logits/rejected": -2.169074058532715, + "logps/chosen": -239.6207275390625, + "logps/rejected": -222.606689453125, + "loss": 0.4798, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3303607702255249, + "rewards/margins": 0.931678056716919, + "rewards/rejected": -1.2620388269424438, + "step": 1056 + }, + { + "epoch": 0.12, + "learning_rate": 2.671194047478445e-07, + "logits/chosen": -2.022153615951538, + "logits/rejected": -2.2176949977874756, + "logps/chosen": -398.5050354003906, + "logps/rejected": -317.08270263671875, + "loss": 0.3072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46833130717277527, + "rewards/margins": 1.2909793853759766, + "rewards/rejected": -1.7593106031417847, + "step": 1057 + }, + { + "epoch": 0.12, + "learning_rate": 2.670839730719263e-07, + "logits/chosen": -2.6683053970336914, + "logits/rejected": -2.702651023864746, + "logps/chosen": -155.39759826660156, + "logps/rejected": -196.35496520996094, + "loss": 0.6205, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26053130626678467, + "rewards/margins": 0.3053209185600281, + "rewards/rejected": -0.5658522248268127, + "step": 1058 + }, + { + "epoch": 0.12, + "learning_rate": 2.67048541396008e-07, + "logits/chosen": -2.400468111038208, + "logits/rejected": -2.214951992034912, + "logps/chosen": -269.39361572265625, + "logps/rejected": -289.14898681640625, + "loss": 0.4452, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4557674527168274, + "rewards/margins": 1.1904085874557495, + "rewards/rejected": -1.6461760997772217, + "step": 1059 + }, + { + "epoch": 0.12, + "learning_rate": 2.6701310972008977e-07, + "logits/chosen": -2.0544698238372803, + "logits/rejected": -2.095590829849243, + "logps/chosen": -377.2997131347656, + "logps/rejected": -337.1888732910156, + "loss": 0.3652, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8498482704162598, + "rewards/margins": 1.130028247833252, + "rewards/rejected": -2.9798765182495117, + "step": 1060 + }, + { + "epoch": 0.12, + "learning_rate": 2.6697767804417146e-07, + "logits/chosen": -2.3078534603118896, + "logits/rejected": -2.4021599292755127, + "logps/chosen": -424.0606689453125, + "logps/rejected": -335.88702392578125, + "loss": 0.2092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.027568520978093147, + "rewards/margins": 2.34454607963562, + "rewards/rejected": -2.372114658355713, + "step": 1061 + }, + { + "epoch": 0.12, + "learning_rate": 2.669422463682532e-07, + "logits/chosen": -2.1604154109954834, + "logits/rejected": -1.9535936117172241, + "logps/chosen": -247.72769165039062, + "logps/rejected": -342.96441650390625, + "loss": 0.4757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45916077494621277, + "rewards/margins": 0.8364137411117554, + "rewards/rejected": -1.295574426651001, + "step": 1062 + }, + { + "epoch": 0.12, + "learning_rate": 2.6690681469233496e-07, + "logits/chosen": -2.328622817993164, + "logits/rejected": -2.3256759643554688, + "logps/chosen": -311.0596923828125, + "logps/rejected": -326.9505615234375, + "loss": 0.2971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5495116710662842, + "rewards/margins": 2.840487480163574, + "rewards/rejected": -3.3899993896484375, + "step": 1063 + }, + { + "epoch": 0.12, + "learning_rate": 2.6687138301641666e-07, + "logits/chosen": -2.2215142250061035, + "logits/rejected": -2.021106719970703, + "logps/chosen": -349.39068603515625, + "logps/rejected": -427.28857421875, + "loss": 0.1654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13576972484588623, + "rewards/margins": 2.2788681983947754, + "rewards/rejected": -2.414638042449951, + "step": 1064 + }, + { + "epoch": 0.12, + "learning_rate": 2.668359513404984e-07, + "logits/chosen": -2.242788791656494, + "logits/rejected": -2.4979422092437744, + "logps/chosen": -285.5115051269531, + "logps/rejected": -151.16806030273438, + "loss": 0.4754, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7337660789489746, + "rewards/margins": 1.0450332164764404, + "rewards/rejected": -1.778799295425415, + "step": 1065 + }, + { + "epoch": 0.12, + "learning_rate": 2.668005196645801e-07, + "logits/chosen": -2.6846566200256348, + "logits/rejected": -2.3883676528930664, + "logps/chosen": -217.34634399414062, + "logps/rejected": -293.59197998046875, + "loss": 0.6862, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6707612872123718, + "rewards/margins": 1.7070541381835938, + "rewards/rejected": -2.3778157234191895, + "step": 1066 + }, + { + "epoch": 0.12, + "learning_rate": 2.6676508798866185e-07, + "logits/chosen": -1.967170000076294, + "logits/rejected": -2.39233660697937, + "logps/chosen": -344.6943359375, + "logps/rejected": -191.7589111328125, + "loss": 0.3268, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33706381916999817, + "rewards/margins": 1.3534246683120728, + "rewards/rejected": -1.690488576889038, + "step": 1067 + }, + { + "epoch": 0.12, + "learning_rate": 2.6672965631274355e-07, + "logits/chosen": -2.408377170562744, + "logits/rejected": -2.4218435287475586, + "logps/chosen": -258.0489501953125, + "logps/rejected": -309.2147216796875, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4693720042705536, + "rewards/margins": 2.582160472869873, + "rewards/rejected": -3.051532506942749, + "step": 1068 + }, + { + "epoch": 0.12, + "learning_rate": 2.666942246368253e-07, + "logits/chosen": -2.5162513256073, + "logits/rejected": -2.636547803878784, + "logps/chosen": -314.39862060546875, + "logps/rejected": -177.69070434570312, + "loss": 0.5505, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6609842777252197, + "rewards/margins": 0.594925045967102, + "rewards/rejected": -1.2559092044830322, + "step": 1069 + }, + { + "epoch": 0.12, + "learning_rate": 2.6665879296090704e-07, + "logits/chosen": -2.0990500450134277, + "logits/rejected": -2.2031667232513428, + "logps/chosen": -382.60821533203125, + "logps/rejected": -249.86444091796875, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5362615585327148, + "rewards/margins": 0.42597728967666626, + "rewards/rejected": -0.9622387886047363, + "step": 1070 + }, + { + "epoch": 0.12, + "learning_rate": 2.666233612849888e-07, + "logits/chosen": -3.0147786140441895, + "logits/rejected": -3.030062675476074, + "logps/chosen": -302.000732421875, + "logps/rejected": -319.54034423828125, + "loss": 0.5529, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0089507102966309, + "rewards/margins": 0.9571303129196167, + "rewards/rejected": -1.966080904006958, + "step": 1071 + }, + { + "epoch": 0.12, + "learning_rate": 2.665879296090705e-07, + "logits/chosen": -2.7148094177246094, + "logits/rejected": -2.6293106079101562, + "logps/chosen": -210.87208557128906, + "logps/rejected": -290.826904296875, + "loss": 0.629, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7553330063819885, + "rewards/margins": 1.174296498298645, + "rewards/rejected": -1.9296294450759888, + "step": 1072 + }, + { + "epoch": 0.12, + "learning_rate": 2.6655249793315224e-07, + "logits/chosen": -2.3252639770507812, + "logits/rejected": -2.387633800506592, + "logps/chosen": -204.40345764160156, + "logps/rejected": -276.6667785644531, + "loss": 0.4621, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.045179013162851334, + "rewards/margins": 1.420233964920044, + "rewards/rejected": -1.3750548362731934, + "step": 1073 + }, + { + "epoch": 0.12, + "learning_rate": 2.6651706625723393e-07, + "logits/chosen": -2.3053317070007324, + "logits/rejected": -2.6894307136535645, + "logps/chosen": -247.8425750732422, + "logps/rejected": -222.68019104003906, + "loss": 0.3709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8157211542129517, + "rewards/margins": 1.6436864137649536, + "rewards/rejected": -2.4594078063964844, + "step": 1074 + }, + { + "epoch": 0.13, + "learning_rate": 2.664816345813157e-07, + "logits/chosen": -2.504063367843628, + "logits/rejected": -2.294929265975952, + "logps/chosen": -201.06796264648438, + "logps/rejected": -220.40338134765625, + "loss": 0.8078, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9903233051300049, + "rewards/margins": 0.35713207721710205, + "rewards/rejected": -2.3474552631378174, + "step": 1075 + }, + { + "epoch": 0.13, + "learning_rate": 2.6644620290539743e-07, + "logits/chosen": -2.418185234069824, + "logits/rejected": -2.3862133026123047, + "logps/chosen": -311.5484619140625, + "logps/rejected": -432.5367431640625, + "loss": 0.2949, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5764299035072327, + "rewards/margins": 1.8395686149597168, + "rewards/rejected": -2.4159984588623047, + "step": 1076 + }, + { + "epoch": 0.13, + "learning_rate": 2.664107712294791e-07, + "logits/chosen": -2.168757438659668, + "logits/rejected": -2.131538152694702, + "logps/chosen": -261.13519287109375, + "logps/rejected": -289.5082092285156, + "loss": 0.6945, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.723724126815796, + "rewards/margins": 0.9001120328903198, + "rewards/rejected": -2.623836040496826, + "step": 1077 + }, + { + "epoch": 0.13, + "learning_rate": 2.6637533955356087e-07, + "logits/chosen": -2.464836359024048, + "logits/rejected": -2.612760305404663, + "logps/chosen": -278.18328857421875, + "logps/rejected": -285.47003173828125, + "loss": 0.2856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5230402946472168, + "rewards/margins": 2.4169726371765137, + "rewards/rejected": -2.9400129318237305, + "step": 1078 + }, + { + "epoch": 0.13, + "learning_rate": 2.6633990787764257e-07, + "logits/chosen": -2.3580260276794434, + "logits/rejected": -2.4667322635650635, + "logps/chosen": -205.69796752929688, + "logps/rejected": -205.69241333007812, + "loss": 0.3664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3595908284187317, + "rewards/margins": 1.5037263631820679, + "rewards/rejected": -1.8633171319961548, + "step": 1079 + }, + { + "epoch": 0.13, + "learning_rate": 2.663044762017243e-07, + "logits/chosen": -2.302065849304199, + "logits/rejected": -2.739877223968506, + "logps/chosen": -484.3369140625, + "logps/rejected": -358.4576110839844, + "loss": 0.3192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11011756956577301, + "rewards/margins": 1.4185330867767334, + "rewards/rejected": -1.5286506414413452, + "step": 1080 + }, + { + "epoch": 0.13, + "learning_rate": 2.6626904452580607e-07, + "logits/chosen": -2.418699026107788, + "logits/rejected": -2.093188762664795, + "logps/chosen": -136.64671325683594, + "logps/rejected": -221.22161865234375, + "loss": 0.3133, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5187832117080688, + "rewards/margins": 1.5653152465820312, + "rewards/rejected": -2.0840985774993896, + "step": 1081 + }, + { + "epoch": 0.13, + "learning_rate": 2.662336128498878e-07, + "logits/chosen": -1.652097463607788, + "logits/rejected": -2.124221086502075, + "logps/chosen": -378.49139404296875, + "logps/rejected": -193.49049377441406, + "loss": 1.7457, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.7611725330352783, + "rewards/margins": -0.3386959135532379, + "rewards/rejected": -2.4224765300750732, + "step": 1082 + }, + { + "epoch": 0.13, + "learning_rate": 2.661981811739695e-07, + "logits/chosen": -2.2756805419921875, + "logits/rejected": -2.5502874851226807, + "logps/chosen": -191.4690704345703, + "logps/rejected": -157.41888427734375, + "loss": 0.3008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5995219945907593, + "rewards/margins": 1.74029541015625, + "rewards/rejected": -2.3398172855377197, + "step": 1083 + }, + { + "epoch": 0.13, + "learning_rate": 2.6616274949805126e-07, + "logits/chosen": -2.98826265335083, + "logits/rejected": -2.957146167755127, + "logps/chosen": -302.45941162109375, + "logps/rejected": -239.65228271484375, + "loss": 0.4662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5962383151054382, + "rewards/margins": 1.1420583724975586, + "rewards/rejected": -1.7382967472076416, + "step": 1084 + }, + { + "epoch": 0.13, + "learning_rate": 2.6612731782213295e-07, + "logits/chosen": -2.2169206142425537, + "logits/rejected": -2.3974270820617676, + "logps/chosen": -212.6136474609375, + "logps/rejected": -240.02340698242188, + "loss": 0.8872, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.573089361190796, + "rewards/margins": 1.3270184993743896, + "rewards/rejected": -2.9001078605651855, + "step": 1085 + }, + { + "epoch": 0.13, + "learning_rate": 2.660918861462147e-07, + "logits/chosen": -2.68259334564209, + "logits/rejected": -2.5670857429504395, + "logps/chosen": -400.1985168457031, + "logps/rejected": -377.7809143066406, + "loss": 0.3017, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7853826284408569, + "rewards/margins": 2.0247440338134766, + "rewards/rejected": -2.810126781463623, + "step": 1086 + }, + { + "epoch": 0.13, + "learning_rate": 2.6605645447029645e-07, + "logits/chosen": -2.4805777072906494, + "logits/rejected": -2.3111910820007324, + "logps/chosen": -177.0059051513672, + "logps/rejected": -282.23992919921875, + "loss": 0.1999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6601979732513428, + "rewards/margins": 3.146947145462036, + "rewards/rejected": -3.807145118713379, + "step": 1087 + }, + { + "epoch": 0.13, + "learning_rate": 2.6602102279437815e-07, + "logits/chosen": -1.7882392406463623, + "logits/rejected": -2.0531795024871826, + "logps/chosen": -545.419921875, + "logps/rejected": -408.034912109375, + "loss": 0.6133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.942764163017273, + "rewards/margins": 0.8641189932823181, + "rewards/rejected": -1.8068830966949463, + "step": 1088 + }, + { + "epoch": 0.13, + "learning_rate": 2.659855911184599e-07, + "logits/chosen": -2.2806711196899414, + "logits/rejected": -1.8763352632522583, + "logps/chosen": -163.35858154296875, + "logps/rejected": -327.4326171875, + "loss": 0.2895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32234013080596924, + "rewards/margins": 2.028449535369873, + "rewards/rejected": -2.3507895469665527, + "step": 1089 + }, + { + "epoch": 0.13, + "learning_rate": 2.659501594425416e-07, + "logits/chosen": -2.550938129425049, + "logits/rejected": -2.4827687740325928, + "logps/chosen": -163.6614990234375, + "logps/rejected": -177.9490203857422, + "loss": 0.4412, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6325076818466187, + "rewards/margins": 1.0199638605117798, + "rewards/rejected": -2.6524715423583984, + "step": 1090 + }, + { + "epoch": 0.13, + "learning_rate": 2.6591472776662334e-07, + "logits/chosen": -2.247694969177246, + "logits/rejected": -2.2467408180236816, + "logps/chosen": -180.94143676757812, + "logps/rejected": -234.705810546875, + "loss": 0.2663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5827271342277527, + "rewards/margins": 2.0683765411376953, + "rewards/rejected": -2.6511034965515137, + "step": 1091 + }, + { + "epoch": 0.13, + "learning_rate": 2.658792960907051e-07, + "logits/chosen": -2.0928311347961426, + "logits/rejected": -2.2146756649017334, + "logps/chosen": -260.311279296875, + "logps/rejected": -222.48345947265625, + "loss": 0.817, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2820231914520264, + "rewards/margins": 0.635793149471283, + "rewards/rejected": -1.917816400527954, + "step": 1092 + }, + { + "epoch": 0.13, + "learning_rate": 2.6584386441478684e-07, + "logits/chosen": -2.783607006072998, + "logits/rejected": -2.773996591567993, + "logps/chosen": -235.49896240234375, + "logps/rejected": -243.24166870117188, + "loss": 0.8725, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2005155086517334, + "rewards/margins": 0.8133785724639893, + "rewards/rejected": -2.0138938426971436, + "step": 1093 + }, + { + "epoch": 0.13, + "learning_rate": 2.6580843273886853e-07, + "logits/chosen": -2.3221139907836914, + "logits/rejected": -2.1175389289855957, + "logps/chosen": -290.72161865234375, + "logps/rejected": -289.4841613769531, + "loss": 0.3888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15285608172416687, + "rewards/margins": 1.6266357898712158, + "rewards/rejected": -1.779491901397705, + "step": 1094 + }, + { + "epoch": 0.13, + "learning_rate": 2.657730010629503e-07, + "logits/chosen": -2.4182841777801514, + "logits/rejected": -2.357902765274048, + "logps/chosen": -222.34373474121094, + "logps/rejected": -359.35150146484375, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6775766611099243, + "rewards/margins": 1.882901668548584, + "rewards/rejected": -2.5604782104492188, + "step": 1095 + }, + { + "epoch": 0.13, + "learning_rate": 2.65737569387032e-07, + "logits/chosen": -2.413295269012451, + "logits/rejected": -2.527707099914551, + "logps/chosen": -321.5057373046875, + "logps/rejected": -233.25619506835938, + "loss": 0.5285, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1913217306137085, + "rewards/margins": 0.5805375576019287, + "rewards/rejected": -1.7718591690063477, + "step": 1096 + }, + { + "epoch": 0.13, + "learning_rate": 2.657021377111137e-07, + "logits/chosen": -1.4416167736053467, + "logits/rejected": -2.289273500442505, + "logps/chosen": -598.5987548828125, + "logps/rejected": -286.2430725097656, + "loss": 0.7164, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2400336265563965, + "rewards/margins": 0.589780867099762, + "rewards/rejected": -1.8298144340515137, + "step": 1097 + }, + { + "epoch": 0.13, + "learning_rate": 2.656667060351955e-07, + "logits/chosen": -2.509615898132324, + "logits/rejected": -2.2834386825561523, + "logps/chosen": -157.9860382080078, + "logps/rejected": -253.32806396484375, + "loss": 0.3401, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14680764079093933, + "rewards/margins": 1.544450283050537, + "rewards/rejected": -1.3976426124572754, + "step": 1098 + }, + { + "epoch": 0.13, + "learning_rate": 2.6563127435927717e-07, + "logits/chosen": -2.926828145980835, + "logits/rejected": -2.830982208251953, + "logps/chosen": -217.80709838867188, + "logps/rejected": -326.3794250488281, + "loss": 0.7466, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6181491017341614, + "rewards/margins": 0.8377421498298645, + "rewards/rejected": -1.4558912515640259, + "step": 1099 + }, + { + "epoch": 0.13, + "learning_rate": 2.655958426833589e-07, + "logits/chosen": -2.3615589141845703, + "logits/rejected": -2.2544095516204834, + "logps/chosen": -415.98504638671875, + "logps/rejected": -414.3159484863281, + "loss": 0.869, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9406054019927979, + "rewards/margins": 0.16447995603084564, + "rewards/rejected": -1.1050854921340942, + "step": 1100 + }, + { + "epoch": 0.13, + "learning_rate": 2.655604110074406e-07, + "logits/chosen": -2.0426700115203857, + "logits/rejected": -1.7218594551086426, + "logps/chosen": -389.1139831542969, + "logps/rejected": -369.3939208984375, + "loss": 0.9614, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.770494282245636, + "rewards/margins": 0.3803935945034027, + "rewards/rejected": -1.1508879661560059, + "step": 1101 + }, + { + "epoch": 0.13, + "learning_rate": 2.6552497933152236e-07, + "logits/chosen": -2.2631208896636963, + "logits/rejected": -2.1580097675323486, + "logps/chosen": -358.5208740234375, + "logps/rejected": -318.9033203125, + "loss": 0.2931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8357605338096619, + "rewards/margins": 1.4440300464630127, + "rewards/rejected": -2.2797906398773193, + "step": 1102 + }, + { + "epoch": 0.13, + "learning_rate": 2.6548954765560406e-07, + "logits/chosen": -2.308929204940796, + "logits/rejected": -2.5453543663024902, + "logps/chosen": -591.123046875, + "logps/rejected": -351.06622314453125, + "loss": 0.3023, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.282114028930664, + "rewards/margins": 1.4868993759155273, + "rewards/rejected": -2.7690136432647705, + "step": 1103 + }, + { + "epoch": 0.13, + "learning_rate": 2.654541159796858e-07, + "logits/chosen": -2.7831220626831055, + "logits/rejected": -2.698190927505493, + "logps/chosen": -190.55572509765625, + "logps/rejected": -407.96051025390625, + "loss": 0.2952, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1295367181301117, + "rewards/margins": 2.2018814086914062, + "rewards/rejected": -2.33141827583313, + "step": 1104 + }, + { + "epoch": 0.13, + "learning_rate": 2.6541868430376756e-07, + "logits/chosen": -2.6207308769226074, + "logits/rejected": -2.2476484775543213, + "logps/chosen": -129.5454864501953, + "logps/rejected": -194.4342498779297, + "loss": 0.4873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6489426493644714, + "rewards/margins": 2.1855711936950684, + "rewards/rejected": -2.8345139026641846, + "step": 1105 + }, + { + "epoch": 0.13, + "learning_rate": 2.653832526278493e-07, + "logits/chosen": -1.9586948156356812, + "logits/rejected": -2.546908378601074, + "logps/chosen": -476.88177490234375, + "logps/rejected": -208.41064453125, + "loss": 0.5031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0982667207717896, + "rewards/margins": 0.824949324131012, + "rewards/rejected": -1.9232158660888672, + "step": 1106 + }, + { + "epoch": 0.13, + "learning_rate": 2.65347820951931e-07, + "logits/chosen": -1.5147030353546143, + "logits/rejected": -1.755483865737915, + "logps/chosen": -356.2819519042969, + "logps/rejected": -265.4173278808594, + "loss": 0.7831, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0069767236709595, + "rewards/margins": 0.11011778563261032, + "rewards/rejected": -1.1170945167541504, + "step": 1107 + }, + { + "epoch": 0.13, + "learning_rate": 2.6531238927601275e-07, + "logits/chosen": -2.457608699798584, + "logits/rejected": -2.6591076850891113, + "logps/chosen": -340.73577880859375, + "logps/rejected": -279.0849914550781, + "loss": 2.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.239335298538208, + "rewards/margins": -1.6881706714630127, + "rewards/rejected": -1.5511645078659058, + "step": 1108 + }, + { + "epoch": 0.13, + "learning_rate": 2.652769576000945e-07, + "logits/chosen": -2.078481912612915, + "logits/rejected": -2.4076108932495117, + "logps/chosen": -330.88232421875, + "logps/rejected": -244.72325134277344, + "loss": 0.5204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7113993167877197, + "rewards/margins": 1.6688363552093506, + "rewards/rejected": -2.380235433578491, + "step": 1109 + }, + { + "epoch": 0.13, + "learning_rate": 2.652415259241762e-07, + "logits/chosen": -2.51499342918396, + "logits/rejected": -2.336141586303711, + "logps/chosen": -103.07061004638672, + "logps/rejected": -165.7377166748047, + "loss": 0.8417, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8785473704338074, + "rewards/margins": -0.0045035481452941895, + "rewards/rejected": -0.8740438222885132, + "step": 1110 + }, + { + "epoch": 0.13, + "learning_rate": 2.6520609424825794e-07, + "logits/chosen": -2.9689080715179443, + "logits/rejected": -2.8139913082122803, + "logps/chosen": -296.6859436035156, + "logps/rejected": -149.15565490722656, + "loss": 0.7821, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8711401224136353, + "rewards/margins": 0.423603892326355, + "rewards/rejected": -1.2947440147399902, + "step": 1111 + }, + { + "epoch": 0.13, + "learning_rate": 2.6517066257233964e-07, + "logits/chosen": -1.3513450622558594, + "logits/rejected": -1.8739275932312012, + "logps/chosen": -547.1605224609375, + "logps/rejected": -282.3814697265625, + "loss": 1.0077, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.064497709274292, + "rewards/margins": 0.17994588613510132, + "rewards/rejected": -1.244443416595459, + "step": 1112 + }, + { + "epoch": 0.13, + "learning_rate": 2.651352308964214e-07, + "logits/chosen": -2.0606064796447754, + "logits/rejected": -2.0770702362060547, + "logps/chosen": -437.1154479980469, + "logps/rejected": -370.8592529296875, + "loss": 0.3114, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8396235108375549, + "rewards/margins": 1.8326356410980225, + "rewards/rejected": -2.6722593307495117, + "step": 1113 + }, + { + "epoch": 0.13, + "learning_rate": 2.650997992205031e-07, + "logits/chosen": -2.340580940246582, + "logits/rejected": -2.4258506298065186, + "logps/chosen": -297.9563903808594, + "logps/rejected": -197.63385009765625, + "loss": 0.5661, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2167377471923828, + "rewards/margins": 0.4782801866531372, + "rewards/rejected": -1.6950178146362305, + "step": 1114 + }, + { + "epoch": 0.13, + "learning_rate": 2.6506436754458483e-07, + "logits/chosen": -2.303903341293335, + "logits/rejected": -2.414968967437744, + "logps/chosen": -165.5876922607422, + "logps/rejected": -220.72561645507812, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0871793031692505, + "rewards/margins": 0.6749810576438904, + "rewards/rejected": -1.762160301208496, + "step": 1115 + }, + { + "epoch": 0.13, + "learning_rate": 2.650289358686666e-07, + "logits/chosen": -2.4756009578704834, + "logits/rejected": -2.348827838897705, + "logps/chosen": -193.2213897705078, + "logps/rejected": -254.95155334472656, + "loss": 0.3764, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8099127411842346, + "rewards/margins": 1.7865036725997925, + "rewards/rejected": -2.596416473388672, + "step": 1116 + }, + { + "epoch": 0.13, + "learning_rate": 2.6499350419274833e-07, + "logits/chosen": -2.6768245697021484, + "logits/rejected": -2.703096866607666, + "logps/chosen": -227.60227966308594, + "logps/rejected": -261.08349609375, + "loss": 0.3892, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9357527494430542, + "rewards/margins": 1.187957525253296, + "rewards/rejected": -2.1237101554870605, + "step": 1117 + }, + { + "epoch": 0.13, + "learning_rate": 2.6495807251683e-07, + "logits/chosen": -2.486966133117676, + "logits/rejected": -2.2645680904388428, + "logps/chosen": -156.65118408203125, + "logps/rejected": -322.9341125488281, + "loss": 0.4817, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01912519335746765, + "rewards/margins": 1.8248274326324463, + "rewards/rejected": -1.8439527750015259, + "step": 1118 + }, + { + "epoch": 0.13, + "learning_rate": 2.6492264084091177e-07, + "logits/chosen": -2.4739532470703125, + "logits/rejected": -2.3989529609680176, + "logps/chosen": -256.1156005859375, + "logps/rejected": -376.54736328125, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.00928959995508194, + "rewards/margins": 3.7912755012512207, + "rewards/rejected": -3.800564765930176, + "step": 1119 + }, + { + "epoch": 0.13, + "learning_rate": 2.648872091649935e-07, + "logits/chosen": -2.5460989475250244, + "logits/rejected": -2.5991601943969727, + "logps/chosen": -312.82537841796875, + "logps/rejected": -141.22872924804688, + "loss": 0.4532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5341533422470093, + "rewards/margins": 0.8455988764762878, + "rewards/rejected": -1.3797521591186523, + "step": 1120 + }, + { + "epoch": 0.13, + "learning_rate": 2.648517774890752e-07, + "logits/chosen": -1.8204810619354248, + "logits/rejected": -1.9572088718414307, + "logps/chosen": -262.59674072265625, + "logps/rejected": -257.204833984375, + "loss": 0.6376, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.321199655532837, + "rewards/margins": 0.4694935083389282, + "rewards/rejected": -1.7906930446624756, + "step": 1121 + }, + { + "epoch": 0.13, + "learning_rate": 2.6481634581315696e-07, + "logits/chosen": -2.5142767429351807, + "logits/rejected": -2.7048089504241943, + "logps/chosen": -331.5746154785156, + "logps/rejected": -279.2624206542969, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3908864259719849, + "rewards/margins": 0.8765456676483154, + "rewards/rejected": -2.26743221282959, + "step": 1122 + }, + { + "epoch": 0.13, + "learning_rate": 2.6478091413723866e-07, + "logits/chosen": -2.2930855751037598, + "logits/rejected": -1.8471429347991943, + "logps/chosen": -452.04217529296875, + "logps/rejected": -402.63690185546875, + "loss": 0.7316, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3275957107543945, + "rewards/margins": 0.5119313597679138, + "rewards/rejected": -1.8395270109176636, + "step": 1123 + }, + { + "epoch": 0.13, + "learning_rate": 2.647454824613204e-07, + "logits/chosen": -2.7456159591674805, + "logits/rejected": -2.6556918621063232, + "logps/chosen": -362.9642639160156, + "logps/rejected": -250.8321533203125, + "loss": 0.3985, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6638115048408508, + "rewards/margins": 1.9315266609191895, + "rewards/rejected": -2.5953383445739746, + "step": 1124 + }, + { + "epoch": 0.13, + "learning_rate": 2.647100507854021e-07, + "logits/chosen": -1.888339877128601, + "logits/rejected": -1.832153558731079, + "logps/chosen": -227.54742431640625, + "logps/rejected": -290.60870361328125, + "loss": 0.4358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40289634466171265, + "rewards/margins": 0.8044824600219727, + "rewards/rejected": -1.20737886428833, + "step": 1125 + }, + { + "epoch": 0.13, + "learning_rate": 2.6467461910948385e-07, + "logits/chosen": -2.500401735305786, + "logits/rejected": -2.471601963043213, + "logps/chosen": -172.76699829101562, + "logps/rejected": -253.16342163085938, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4813537001609802, + "rewards/margins": 1.4733651876449585, + "rewards/rejected": -1.954718828201294, + "step": 1126 + }, + { + "epoch": 0.13, + "learning_rate": 2.646391874335656e-07, + "logits/chosen": -2.450838327407837, + "logits/rejected": -2.178544044494629, + "logps/chosen": -430.1990966796875, + "logps/rejected": -673.9619750976562, + "loss": 0.6437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4758790135383606, + "rewards/margins": 1.9519634246826172, + "rewards/rejected": -2.427842617034912, + "step": 1127 + }, + { + "epoch": 0.13, + "learning_rate": 2.6460375575764735e-07, + "logits/chosen": -2.049823522567749, + "logits/rejected": -2.4346795082092285, + "logps/chosen": -316.6060791015625, + "logps/rejected": -138.97998046875, + "loss": 0.6029, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.782996416091919, + "rewards/margins": 0.7283990979194641, + "rewards/rejected": -1.5113954544067383, + "step": 1128 + }, + { + "epoch": 0.13, + "learning_rate": 2.6456832408172905e-07, + "logits/chosen": -2.3951737880706787, + "logits/rejected": -2.4126908779144287, + "logps/chosen": -318.27471923828125, + "logps/rejected": -388.6593933105469, + "loss": 0.328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8942172527313232, + "rewards/margins": 1.593430757522583, + "rewards/rejected": -2.4876480102539062, + "step": 1129 + }, + { + "epoch": 0.13, + "learning_rate": 2.645328924058108e-07, + "logits/chosen": -2.577788829803467, + "logits/rejected": -2.449037551879883, + "logps/chosen": -159.98333740234375, + "logps/rejected": -198.4295654296875, + "loss": 0.3265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3254249095916748, + "rewards/margins": 2.2527410984039307, + "rewards/rejected": -2.5781660079956055, + "step": 1130 + }, + { + "epoch": 0.13, + "learning_rate": 2.6449746072989254e-07, + "logits/chosen": -1.8045743703842163, + "logits/rejected": -1.8222558498382568, + "logps/chosen": -157.07113647460938, + "logps/rejected": -314.88970947265625, + "loss": 0.4468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2372838854789734, + "rewards/margins": 0.6673785448074341, + "rewards/rejected": -0.904662549495697, + "step": 1131 + }, + { + "epoch": 0.13, + "learning_rate": 2.6446202905397424e-07, + "logits/chosen": -1.8360787630081177, + "logits/rejected": -2.179034471511841, + "logps/chosen": -498.54241943359375, + "logps/rejected": -295.2343444824219, + "loss": 0.4774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5115102529525757, + "rewards/margins": 1.0113028287887573, + "rewards/rejected": -1.522813081741333, + "step": 1132 + }, + { + "epoch": 0.13, + "learning_rate": 2.64426597378056e-07, + "logits/chosen": -2.666804313659668, + "logits/rejected": -2.6620712280273438, + "logps/chosen": -210.9462432861328, + "logps/rejected": -299.9907531738281, + "loss": 0.2188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19027571380138397, + "rewards/margins": 1.890649437904358, + "rewards/rejected": -2.080925226211548, + "step": 1133 + }, + { + "epoch": 0.13, + "learning_rate": 2.643911657021377e-07, + "logits/chosen": -2.0493626594543457, + "logits/rejected": -2.429739475250244, + "logps/chosen": -323.2890319824219, + "logps/rejected": -242.64871215820312, + "loss": 0.711, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6791374683380127, + "rewards/margins": 0.3479766249656677, + "rewards/rejected": -1.0271141529083252, + "step": 1134 + }, + { + "epoch": 0.13, + "learning_rate": 2.6435573402621943e-07, + "logits/chosen": -2.422412395477295, + "logits/rejected": -2.2253293991088867, + "logps/chosen": -218.44216918945312, + "logps/rejected": -229.6834716796875, + "loss": 0.3579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29916635155677795, + "rewards/margins": 1.7493116855621338, + "rewards/rejected": -2.048478126525879, + "step": 1135 + }, + { + "epoch": 0.13, + "learning_rate": 2.6432030235030113e-07, + "logits/chosen": -1.953818440437317, + "logits/rejected": -1.7557413578033447, + "logps/chosen": -263.484375, + "logps/rejected": -354.6983947753906, + "loss": 0.4716, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6189831495285034, + "rewards/margins": 1.167496919631958, + "rewards/rejected": -2.786479949951172, + "step": 1136 + }, + { + "epoch": 0.13, + "learning_rate": 2.642848706743829e-07, + "logits/chosen": -2.321093797683716, + "logits/rejected": -2.606283664703369, + "logps/chosen": -575.6839599609375, + "logps/rejected": -295.5623474121094, + "loss": 0.5115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6694839000701904, + "rewards/margins": 0.7393667101860046, + "rewards/rejected": -1.4088505506515503, + "step": 1137 + }, + { + "epoch": 0.13, + "learning_rate": 2.642494389984646e-07, + "logits/chosen": -2.1147918701171875, + "logits/rejected": -2.159086227416992, + "logps/chosen": -305.5387878417969, + "logps/rejected": -240.6249237060547, + "loss": 0.828, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8223506212234497, + "rewards/margins": 0.2699509859085083, + "rewards/rejected": -1.092301607131958, + "step": 1138 + }, + { + "epoch": 0.13, + "learning_rate": 2.642140073225463e-07, + "logits/chosen": -2.278658866882324, + "logits/rejected": -2.263537645339966, + "logps/chosen": -261.3817443847656, + "logps/rejected": -258.72625732421875, + "loss": 0.772, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7387564182281494, + "rewards/margins": 0.6883573532104492, + "rewards/rejected": -2.4271137714385986, + "step": 1139 + }, + { + "epoch": 0.13, + "learning_rate": 2.6417857564662807e-07, + "logits/chosen": -2.410578966140747, + "logits/rejected": -2.2240686416625977, + "logps/chosen": -219.3800506591797, + "logps/rejected": -255.1181640625, + "loss": 0.4691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43520426750183105, + "rewards/margins": 0.8628276586532593, + "rewards/rejected": -1.2980319261550903, + "step": 1140 + }, + { + "epoch": 0.13, + "learning_rate": 2.641431439707098e-07, + "logits/chosen": -2.3582286834716797, + "logits/rejected": -2.6278676986694336, + "logps/chosen": -243.3148651123047, + "logps/rejected": -131.18405151367188, + "loss": 0.483, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3725321590900421, + "rewards/margins": 1.076711654663086, + "rewards/rejected": -1.4492437839508057, + "step": 1141 + }, + { + "epoch": 0.13, + "learning_rate": 2.6410771229479157e-07, + "logits/chosen": -2.3114359378814697, + "logits/rejected": -2.2087669372558594, + "logps/chosen": -226.905029296875, + "logps/rejected": -233.50100708007812, + "loss": 1.1821, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2513518333435059, + "rewards/margins": -0.03444682061672211, + "rewards/rejected": -1.216904878616333, + "step": 1142 + }, + { + "epoch": 0.13, + "learning_rate": 2.6407228061887326e-07, + "logits/chosen": -2.3138952255249023, + "logits/rejected": -1.9072593450546265, + "logps/chosen": -716.3370971679688, + "logps/rejected": -401.35479736328125, + "loss": 0.3242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8961679935455322, + "rewards/margins": 1.8221282958984375, + "rewards/rejected": -2.718296527862549, + "step": 1143 + }, + { + "epoch": 0.13, + "learning_rate": 2.64036848942955e-07, + "logits/chosen": -2.4234542846679688, + "logits/rejected": -2.2874369621276855, + "logps/chosen": -137.1421356201172, + "logps/rejected": -193.217529296875, + "loss": 0.3462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09147003293037415, + "rewards/margins": 1.089120626449585, + "rewards/rejected": -1.1805906295776367, + "step": 1144 + }, + { + "epoch": 0.13, + "learning_rate": 2.640014172670367e-07, + "logits/chosen": -2.5103533267974854, + "logits/rejected": -2.282189130783081, + "logps/chosen": -195.56314086914062, + "logps/rejected": -291.3131103515625, + "loss": 0.2306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4826018214225769, + "rewards/margins": 2.0470077991485596, + "rewards/rejected": -2.5296096801757812, + "step": 1145 + }, + { + "epoch": 0.13, + "learning_rate": 2.6396598559111845e-07, + "logits/chosen": -2.4132399559020996, + "logits/rejected": -2.4032554626464844, + "logps/chosen": -259.71466064453125, + "logps/rejected": -336.08624267578125, + "loss": 0.2779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26261571049690247, + "rewards/margins": 1.910552978515625, + "rewards/rejected": -2.173168897628784, + "step": 1146 + }, + { + "epoch": 0.13, + "learning_rate": 2.6393055391520015e-07, + "logits/chosen": -2.321951150894165, + "logits/rejected": -2.4085428714752197, + "logps/chosen": -141.59036254882812, + "logps/rejected": -222.86492919921875, + "loss": 0.5874, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24035170674324036, + "rewards/margins": 0.6349031329154968, + "rewards/rejected": -0.8752548694610596, + "step": 1147 + }, + { + "epoch": 0.13, + "learning_rate": 2.638951222392819e-07, + "logits/chosen": -2.500476837158203, + "logits/rejected": -2.2185559272766113, + "logps/chosen": -282.4196472167969, + "logps/rejected": -346.120361328125, + "loss": 0.2628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48218774795532227, + "rewards/margins": 3.124493360519409, + "rewards/rejected": -3.6066813468933105, + "step": 1148 + }, + { + "epoch": 0.13, + "learning_rate": 2.6385969056336365e-07, + "logits/chosen": -2.470456123352051, + "logits/rejected": -2.6245312690734863, + "logps/chosen": -379.9046325683594, + "logps/rejected": -315.7315673828125, + "loss": 0.4635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8521103262901306, + "rewards/margins": 2.3367209434509277, + "rewards/rejected": -3.188831329345703, + "step": 1149 + }, + { + "epoch": 0.13, + "learning_rate": 2.6382425888744534e-07, + "logits/chosen": -2.362119197845459, + "logits/rejected": -2.438901901245117, + "logps/chosen": -240.6522216796875, + "logps/rejected": -267.0792236328125, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37331151962280273, + "rewards/margins": 2.65958571434021, + "rewards/rejected": -3.0328972339630127, + "step": 1150 + }, + { + "epoch": 0.13, + "learning_rate": 2.637888272115271e-07, + "logits/chosen": -1.946824312210083, + "logits/rejected": -2.186746835708618, + "logps/chosen": -447.8275451660156, + "logps/rejected": -336.83599853515625, + "loss": 0.6399, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8495250940322876, + "rewards/margins": 0.7241283655166626, + "rewards/rejected": -1.5736534595489502, + "step": 1151 + }, + { + "epoch": 0.13, + "learning_rate": 2.6375339553560884e-07, + "logits/chosen": -2.306161880493164, + "logits/rejected": -2.333437442779541, + "logps/chosen": -251.08688354492188, + "logps/rejected": -255.38507080078125, + "loss": 0.6567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0192550420761108, + "rewards/margins": 1.5989925861358643, + "rewards/rejected": -2.6182477474212646, + "step": 1152 + }, + { + "epoch": 0.13, + "learning_rate": 2.637179638596906e-07, + "logits/chosen": -1.8576173782348633, + "logits/rejected": -1.7964882850646973, + "logps/chosen": -343.2393798828125, + "logps/rejected": -339.1413269042969, + "loss": 0.6276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9936886429786682, + "rewards/margins": 0.9223480820655823, + "rewards/rejected": -1.916036605834961, + "step": 1153 + }, + { + "epoch": 0.13, + "learning_rate": 2.636825321837723e-07, + "logits/chosen": -2.435818672180176, + "logits/rejected": -2.4917922019958496, + "logps/chosen": -289.1380920410156, + "logps/rejected": -265.42437744140625, + "loss": 0.5782, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2091882228851318, + "rewards/margins": 1.4961382150650024, + "rewards/rejected": -2.7053263187408447, + "step": 1154 + }, + { + "epoch": 0.13, + "learning_rate": 2.6364710050785403e-07, + "logits/chosen": -2.1539082527160645, + "logits/rejected": -2.3165366649627686, + "logps/chosen": -214.49935913085938, + "logps/rejected": -208.94674682617188, + "loss": 0.2486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13195329904556274, + "rewards/margins": 1.7579033374786377, + "rewards/rejected": -1.8898565769195557, + "step": 1155 + }, + { + "epoch": 0.13, + "learning_rate": 2.6361166883193573e-07, + "logits/chosen": -2.00500226020813, + "logits/rejected": -2.322610855102539, + "logps/chosen": -470.88458251953125, + "logps/rejected": -292.46820068359375, + "loss": 0.4336, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.770811915397644, + "rewards/margins": 1.118998408317566, + "rewards/rejected": -1.88981032371521, + "step": 1156 + }, + { + "epoch": 0.13, + "learning_rate": 2.635762371560175e-07, + "logits/chosen": -2.3267693519592285, + "logits/rejected": -2.507692575454712, + "logps/chosen": -278.3951416015625, + "logps/rejected": -159.1973114013672, + "loss": 0.4496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13042141497135162, + "rewards/margins": 1.0755350589752197, + "rewards/rejected": -1.2059565782546997, + "step": 1157 + }, + { + "epoch": 0.13, + "learning_rate": 2.6354080548009917e-07, + "logits/chosen": -2.3902435302734375, + "logits/rejected": -2.7036831378936768, + "logps/chosen": -286.00567626953125, + "logps/rejected": -270.46026611328125, + "loss": 0.3364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9938353300094604, + "rewards/margins": 1.4550251960754395, + "rewards/rejected": -2.4488604068756104, + "step": 1158 + }, + { + "epoch": 0.13, + "learning_rate": 2.635053738041809e-07, + "logits/chosen": -2.45825457572937, + "logits/rejected": -2.304996967315674, + "logps/chosen": -66.20259094238281, + "logps/rejected": -214.89642333984375, + "loss": 0.5714, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5067706108093262, + "rewards/margins": 1.611901044845581, + "rewards/rejected": -2.1186716556549072, + "step": 1159 + }, + { + "epoch": 0.13, + "learning_rate": 2.6346994212826267e-07, + "logits/chosen": -2.286484956741333, + "logits/rejected": -2.21768856048584, + "logps/chosen": -173.68682861328125, + "logps/rejected": -174.7557830810547, + "loss": 1.4401, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.524367332458496, + "rewards/margins": 0.017395317554473877, + "rewards/rejected": -1.5417625904083252, + "step": 1160 + }, + { + "epoch": 0.14, + "learning_rate": 2.6343451045234437e-07, + "logits/chosen": -2.4704878330230713, + "logits/rejected": -2.1469638347625732, + "logps/chosen": -250.3804931640625, + "logps/rejected": -317.4695129394531, + "loss": 0.4295, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8876146674156189, + "rewards/margins": 1.1253702640533447, + "rewards/rejected": -2.0129847526550293, + "step": 1161 + }, + { + "epoch": 0.14, + "learning_rate": 2.633990787764261e-07, + "logits/chosen": -3.063331127166748, + "logits/rejected": -3.0439505577087402, + "logps/chosen": -282.2261657714844, + "logps/rejected": -238.1290740966797, + "loss": 0.3045, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6587254405021667, + "rewards/margins": 2.3743972778320312, + "rewards/rejected": -3.0331225395202637, + "step": 1162 + }, + { + "epoch": 0.14, + "learning_rate": 2.6336364710050786e-07, + "logits/chosen": -2.143299102783203, + "logits/rejected": -2.0063767433166504, + "logps/chosen": -202.81326293945312, + "logps/rejected": -202.39772033691406, + "loss": 0.4905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8678746223449707, + "rewards/margins": 0.9972575902938843, + "rewards/rejected": -1.8651323318481445, + "step": 1163 + }, + { + "epoch": 0.14, + "learning_rate": 2.6332821542458956e-07, + "logits/chosen": -2.108022689819336, + "logits/rejected": -2.1015584468841553, + "logps/chosen": -340.32159423828125, + "logps/rejected": -305.5045166015625, + "loss": 0.2573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3668273985385895, + "rewards/margins": 2.319349765777588, + "rewards/rejected": -2.6861772537231445, + "step": 1164 + }, + { + "epoch": 0.14, + "learning_rate": 2.632927837486713e-07, + "logits/chosen": -2.429131269454956, + "logits/rejected": -2.4190282821655273, + "logps/chosen": -208.83245849609375, + "logps/rejected": -396.3777160644531, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.239047572016716, + "rewards/margins": 2.181807041168213, + "rewards/rejected": -2.4208548069000244, + "step": 1165 + }, + { + "epoch": 0.14, + "learning_rate": 2.6325735207275306e-07, + "logits/chosen": -1.760108232498169, + "logits/rejected": -1.770267367362976, + "logps/chosen": -235.20016479492188, + "logps/rejected": -316.287353515625, + "loss": 0.5786, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2826373875141144, + "rewards/margins": 1.1674845218658447, + "rewards/rejected": -1.4501218795776367, + "step": 1166 + }, + { + "epoch": 0.14, + "learning_rate": 2.6322192039683475e-07, + "logits/chosen": -2.501248836517334, + "logits/rejected": -2.513336420059204, + "logps/chosen": -318.531982421875, + "logps/rejected": -179.18014526367188, + "loss": 0.4329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3961593508720398, + "rewards/margins": 1.1899309158325195, + "rewards/rejected": -1.586090326309204, + "step": 1167 + }, + { + "epoch": 0.14, + "learning_rate": 2.631864887209165e-07, + "logits/chosen": -2.183765172958374, + "logits/rejected": -2.099086046218872, + "logps/chosen": -409.15118408203125, + "logps/rejected": -308.73162841796875, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021406937390565872, + "rewards/margins": 1.1926398277282715, + "rewards/rejected": -1.1712329387664795, + "step": 1168 + }, + { + "epoch": 0.14, + "learning_rate": 2.631510570449982e-07, + "logits/chosen": -1.9123578071594238, + "logits/rejected": -2.0035784244537354, + "logps/chosen": -298.87176513671875, + "logps/rejected": -341.4882507324219, + "loss": 0.5553, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6749172806739807, + "rewards/margins": 0.7260206341743469, + "rewards/rejected": -1.400937795639038, + "step": 1169 + }, + { + "epoch": 0.14, + "learning_rate": 2.6311562536907994e-07, + "logits/chosen": -2.946542739868164, + "logits/rejected": -3.0337436199188232, + "logps/chosen": -292.3608703613281, + "logps/rejected": -364.19183349609375, + "loss": 0.3069, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04323352873325348, + "rewards/margins": 2.130321979522705, + "rewards/rejected": -2.173555612564087, + "step": 1170 + }, + { + "epoch": 0.14, + "learning_rate": 2.630801936931617e-07, + "logits/chosen": -2.239610195159912, + "logits/rejected": -2.7027440071105957, + "logps/chosen": -261.6641540527344, + "logps/rejected": -176.2165985107422, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48191219568252563, + "rewards/margins": 2.5597918033599854, + "rewards/rejected": -3.0417041778564453, + "step": 1171 + }, + { + "epoch": 0.14, + "learning_rate": 2.630447620172434e-07, + "logits/chosen": -2.0850253105163574, + "logits/rejected": -2.200005531311035, + "logps/chosen": -384.02728271484375, + "logps/rejected": -309.0516052246094, + "loss": 0.4421, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8015917539596558, + "rewards/margins": 1.5241868495941162, + "rewards/rejected": -2.3257784843444824, + "step": 1172 + }, + { + "epoch": 0.14, + "learning_rate": 2.6300933034132514e-07, + "logits/chosen": -1.9583872556686401, + "logits/rejected": -2.246471881866455, + "logps/chosen": -282.86395263671875, + "logps/rejected": -247.89865112304688, + "loss": 0.7712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6944634914398193, + "rewards/margins": 0.166303351521492, + "rewards/rejected": -1.8607667684555054, + "step": 1173 + }, + { + "epoch": 0.14, + "learning_rate": 2.6297389866540683e-07, + "logits/chosen": -2.5510568618774414, + "logits/rejected": -2.576976776123047, + "logps/chosen": -145.6563720703125, + "logps/rejected": -133.72323608398438, + "loss": 0.4499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5828529596328735, + "rewards/margins": 1.038272500038147, + "rewards/rejected": -1.6211254596710205, + "step": 1174 + }, + { + "epoch": 0.14, + "learning_rate": 2.629384669894886e-07, + "logits/chosen": -2.595310926437378, + "logits/rejected": -2.4826741218566895, + "logps/chosen": -353.17718505859375, + "logps/rejected": -329.4533386230469, + "loss": 0.2795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6807783842086792, + "rewards/margins": 2.031381130218506, + "rewards/rejected": -2.7121593952178955, + "step": 1175 + }, + { + "epoch": 0.14, + "learning_rate": 2.6290303531357033e-07, + "logits/chosen": -2.4998817443847656, + "logits/rejected": -2.2793662548065186, + "logps/chosen": -265.1583557128906, + "logps/rejected": -347.1614685058594, + "loss": 0.299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4692988693714142, + "rewards/margins": 1.5370488166809082, + "rewards/rejected": -2.00634765625, + "step": 1176 + }, + { + "epoch": 0.14, + "learning_rate": 2.628676036376521e-07, + "logits/chosen": -2.4551334381103516, + "logits/rejected": -2.7493672370910645, + "logps/chosen": -419.4507751464844, + "logps/rejected": -396.3478088378906, + "loss": 0.3919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3036234974861145, + "rewards/margins": 1.1721701622009277, + "rewards/rejected": -1.4757936000823975, + "step": 1177 + }, + { + "epoch": 0.14, + "learning_rate": 2.628321719617338e-07, + "logits/chosen": -2.5716795921325684, + "logits/rejected": -2.5402305126190186, + "logps/chosen": -254.3050994873047, + "logps/rejected": -291.22003173828125, + "loss": 0.4109, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3384926319122314, + "rewards/margins": 1.2213716506958008, + "rewards/rejected": -2.5598642826080322, + "step": 1178 + }, + { + "epoch": 0.14, + "learning_rate": 2.627967402858155e-07, + "logits/chosen": -1.796653151512146, + "logits/rejected": -1.9004765748977661, + "logps/chosen": -420.7557067871094, + "logps/rejected": -398.71917724609375, + "loss": 0.4922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7709764242172241, + "rewards/margins": 1.1081091165542603, + "rewards/rejected": -1.879085659980774, + "step": 1179 + }, + { + "epoch": 0.14, + "learning_rate": 2.627613086098972e-07, + "logits/chosen": -2.485393524169922, + "logits/rejected": -2.565720558166504, + "logps/chosen": -392.0726623535156, + "logps/rejected": -187.7865753173828, + "loss": 0.5966, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5854492783546448, + "rewards/margins": 0.5064091682434082, + "rewards/rejected": -1.0918583869934082, + "step": 1180 + }, + { + "epoch": 0.14, + "learning_rate": 2.6272587693397897e-07, + "logits/chosen": -2.1885361671447754, + "logits/rejected": -2.459245204925537, + "logps/chosen": -360.46527099609375, + "logps/rejected": -204.71995544433594, + "loss": 0.3446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2684474587440491, + "rewards/margins": 1.7257689237594604, + "rewards/rejected": -1.9942164421081543, + "step": 1181 + }, + { + "epoch": 0.14, + "learning_rate": 2.626904452580607e-07, + "logits/chosen": -1.7976491451263428, + "logits/rejected": -1.7719624042510986, + "logps/chosen": -334.6031494140625, + "logps/rejected": -409.6170654296875, + "loss": 0.4979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05143106356263161, + "rewards/margins": 1.2966761589050293, + "rewards/rejected": -1.3481073379516602, + "step": 1182 + }, + { + "epoch": 0.14, + "learning_rate": 2.626550135821424e-07, + "logits/chosen": -2.431842803955078, + "logits/rejected": -2.246018409729004, + "logps/chosen": -230.97503662109375, + "logps/rejected": -289.959228515625, + "loss": 0.5661, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1663565635681152, + "rewards/margins": 0.8083146810531616, + "rewards/rejected": -1.9746712446212769, + "step": 1183 + }, + { + "epoch": 0.14, + "learning_rate": 2.6261958190622416e-07, + "logits/chosen": -1.9843817949295044, + "logits/rejected": -2.199049472808838, + "logps/chosen": -368.1208801269531, + "logps/rejected": -300.5089416503906, + "loss": 0.4808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.420304536819458, + "rewards/margins": 0.9655351042747498, + "rewards/rejected": -1.3858397006988525, + "step": 1184 + }, + { + "epoch": 0.14, + "learning_rate": 2.6258415023030586e-07, + "logits/chosen": -2.9077420234680176, + "logits/rejected": -2.9521031379699707, + "logps/chosen": -252.1707763671875, + "logps/rejected": -295.3094482421875, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7913833260536194, + "rewards/margins": 2.14546799659729, + "rewards/rejected": -2.9368512630462646, + "step": 1185 + }, + { + "epoch": 0.14, + "learning_rate": 2.625487185543876e-07, + "logits/chosen": -2.6385560035705566, + "logits/rejected": -2.499655246734619, + "logps/chosen": -170.9019317626953, + "logps/rejected": -203.0529022216797, + "loss": 0.986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8377872705459595, + "rewards/margins": -0.10899387300014496, + "rewards/rejected": -0.7287933826446533, + "step": 1186 + }, + { + "epoch": 0.14, + "learning_rate": 2.6251328687846935e-07, + "logits/chosen": -2.0109195709228516, + "logits/rejected": -2.114600658416748, + "logps/chosen": -203.72732543945312, + "logps/rejected": -210.8975830078125, + "loss": 0.3907, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01627233624458313, + "rewards/margins": 1.3074917793273926, + "rewards/rejected": -1.2912194728851318, + "step": 1187 + }, + { + "epoch": 0.14, + "learning_rate": 2.624778552025511e-07, + "logits/chosen": -2.3000502586364746, + "logits/rejected": -2.3431100845336914, + "logps/chosen": -304.0159912109375, + "logps/rejected": -357.82757568359375, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04214784502983093, + "rewards/margins": 1.8568241596221924, + "rewards/rejected": -1.814676284790039, + "step": 1188 + }, + { + "epoch": 0.14, + "learning_rate": 2.624424235266328e-07, + "logits/chosen": -2.133063793182373, + "logits/rejected": -1.6254559755325317, + "logps/chosen": -483.1354064941406, + "logps/rejected": -451.7200622558594, + "loss": 0.3453, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1215705871582031, + "rewards/margins": 1.9926273822784424, + "rewards/rejected": -3.1141977310180664, + "step": 1189 + }, + { + "epoch": 0.14, + "learning_rate": 2.6240699185071455e-07, + "logits/chosen": -2.5405287742614746, + "logits/rejected": -2.7738232612609863, + "logps/chosen": -314.0227355957031, + "logps/rejected": -289.7928466796875, + "loss": 0.2003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22033053636550903, + "rewards/margins": 2.578366756439209, + "rewards/rejected": -2.798696994781494, + "step": 1190 + }, + { + "epoch": 0.14, + "learning_rate": 2.6237156017479624e-07, + "logits/chosen": -2.0926194190979004, + "logits/rejected": -2.184406042098999, + "logps/chosen": -343.1962890625, + "logps/rejected": -179.75657653808594, + "loss": 0.8271, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8657636046409607, + "rewards/margins": 0.31198519468307495, + "rewards/rejected": -1.1777487993240356, + "step": 1191 + }, + { + "epoch": 0.14, + "learning_rate": 2.62336128498878e-07, + "logits/chosen": -2.6737442016601562, + "logits/rejected": -2.7173068523406982, + "logps/chosen": -236.8052978515625, + "logps/rejected": -184.764404296875, + "loss": 0.2962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02981293946504593, + "rewards/margins": 2.2216544151306152, + "rewards/rejected": -2.251467227935791, + "step": 1192 + }, + { + "epoch": 0.14, + "learning_rate": 2.623006968229597e-07, + "logits/chosen": -2.276961326599121, + "logits/rejected": -2.25769305229187, + "logps/chosen": -292.73095703125, + "logps/rejected": -371.23980712890625, + "loss": 0.4385, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0152618885040283, + "rewards/margins": 1.9563926458358765, + "rewards/rejected": -2.9716544151306152, + "step": 1193 + }, + { + "epoch": 0.14, + "learning_rate": 2.6226526514704143e-07, + "logits/chosen": -2.0135021209716797, + "logits/rejected": -2.1968777179718018, + "logps/chosen": -354.910400390625, + "logps/rejected": -254.7936248779297, + "loss": 0.5928, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2552728652954102, + "rewards/margins": 0.5037976503372192, + "rewards/rejected": -1.7590703964233398, + "step": 1194 + }, + { + "epoch": 0.14, + "learning_rate": 2.622298334711232e-07, + "logits/chosen": -2.6499695777893066, + "logits/rejected": -2.869297981262207, + "logps/chosen": -367.01806640625, + "logps/rejected": -354.322021484375, + "loss": 0.4736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38934648036956787, + "rewards/margins": 1.0461153984069824, + "rewards/rejected": -1.4354617595672607, + "step": 1195 + }, + { + "epoch": 0.14, + "learning_rate": 2.621944017952049e-07, + "logits/chosen": -1.9062325954437256, + "logits/rejected": -2.2725536823272705, + "logps/chosen": -402.7008361816406, + "logps/rejected": -261.5166015625, + "loss": 0.5861, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0627286434173584, + "rewards/margins": 0.7243897318840027, + "rewards/rejected": -1.7871183156967163, + "step": 1196 + }, + { + "epoch": 0.14, + "learning_rate": 2.621589701192866e-07, + "logits/chosen": -1.7307496070861816, + "logits/rejected": -2.127225399017334, + "logps/chosen": -543.2939453125, + "logps/rejected": -495.5115966796875, + "loss": 0.9426, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.229633092880249, + "rewards/margins": 0.22584637999534607, + "rewards/rejected": -1.455479383468628, + "step": 1197 + }, + { + "epoch": 0.14, + "learning_rate": 2.621235384433684e-07, + "logits/chosen": -2.2439160346984863, + "logits/rejected": -2.332536220550537, + "logps/chosen": -243.64869689941406, + "logps/rejected": -142.17454528808594, + "loss": 0.4622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4734541177749634, + "rewards/margins": 0.986446738243103, + "rewards/rejected": -1.459900975227356, + "step": 1198 + }, + { + "epoch": 0.14, + "learning_rate": 2.620881067674501e-07, + "logits/chosen": -1.8006635904312134, + "logits/rejected": -1.9461729526519775, + "logps/chosen": -200.7579803466797, + "logps/rejected": -188.3125457763672, + "loss": 0.8033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9019895195960999, + "rewards/margins": 0.6709003448486328, + "rewards/rejected": -1.572889804840088, + "step": 1199 + }, + { + "epoch": 0.14, + "learning_rate": 2.620526750915318e-07, + "logits/chosen": -2.531041145324707, + "logits/rejected": -2.4935429096221924, + "logps/chosen": -180.9168243408203, + "logps/rejected": -223.1610107421875, + "loss": 0.1341, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2965756952762604, + "rewards/margins": 2.6207337379455566, + "rewards/rejected": -2.9173097610473633, + "step": 1200 + }, + { + "epoch": 0.14, + "learning_rate": 2.6201724341561357e-07, + "logits/chosen": -2.7763967514038086, + "logits/rejected": -2.9098730087280273, + "logps/chosen": -270.83380126953125, + "logps/rejected": -226.41517639160156, + "loss": 0.1937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5548675060272217, + "rewards/margins": 2.5873751640319824, + "rewards/rejected": -3.142242908477783, + "step": 1201 + }, + { + "epoch": 0.14, + "learning_rate": 2.6198181173969526e-07, + "logits/chosen": -2.3674473762512207, + "logits/rejected": -2.3587379455566406, + "logps/chosen": -459.4141845703125, + "logps/rejected": -279.41668701171875, + "loss": 0.297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6752339005470276, + "rewards/margins": 2.298826217651367, + "rewards/rejected": -2.974060297012329, + "step": 1202 + }, + { + "epoch": 0.14, + "learning_rate": 2.61946380063777e-07, + "logits/chosen": -1.9262473583221436, + "logits/rejected": -2.0069022178649902, + "logps/chosen": -385.65643310546875, + "logps/rejected": -297.6188049316406, + "loss": 0.8983, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7674019932746887, + "rewards/margins": 0.3854081630706787, + "rewards/rejected": -1.1528100967407227, + "step": 1203 + }, + { + "epoch": 0.14, + "learning_rate": 2.619109483878587e-07, + "logits/chosen": -2.362590789794922, + "logits/rejected": -2.553309202194214, + "logps/chosen": -393.9818420410156, + "logps/rejected": -386.4381103515625, + "loss": 1.2557, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4655123949050903, + "rewards/margins": 1.1759333610534668, + "rewards/rejected": -2.6414456367492676, + "step": 1204 + }, + { + "epoch": 0.14, + "learning_rate": 2.6187551671194046e-07, + "logits/chosen": -2.242875337600708, + "logits/rejected": -1.9882506132125854, + "logps/chosen": -323.76617431640625, + "logps/rejected": -330.654052734375, + "loss": 0.302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39922797679901123, + "rewards/margins": 1.7042144536972046, + "rewards/rejected": -2.103442430496216, + "step": 1205 + }, + { + "epoch": 0.14, + "learning_rate": 2.618400850360222e-07, + "logits/chosen": -2.1490612030029297, + "logits/rejected": -2.0353264808654785, + "logps/chosen": -229.81558227539062, + "logps/rejected": -325.930419921875, + "loss": 0.5084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6471105813980103, + "rewards/margins": 1.5092400312423706, + "rewards/rejected": -2.156350612640381, + "step": 1206 + }, + { + "epoch": 0.14, + "learning_rate": 2.618046533601039e-07, + "logits/chosen": -2.101057529449463, + "logits/rejected": -2.0508899688720703, + "logps/chosen": -290.75604248046875, + "logps/rejected": -298.0745544433594, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5231325626373291, + "rewards/margins": 1.4701275825500488, + "rewards/rejected": -1.993260145187378, + "step": 1207 + }, + { + "epoch": 0.14, + "learning_rate": 2.6176922168418565e-07, + "logits/chosen": -2.351505756378174, + "logits/rejected": -2.3583590984344482, + "logps/chosen": -232.89732360839844, + "logps/rejected": -262.2727966308594, + "loss": 0.6907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7882159352302551, + "rewards/margins": 1.2391718626022339, + "rewards/rejected": -2.0273876190185547, + "step": 1208 + }, + { + "epoch": 0.14, + "learning_rate": 2.6173379000826734e-07, + "logits/chosen": -2.822025775909424, + "logits/rejected": -2.6406497955322266, + "logps/chosen": -168.3657684326172, + "logps/rejected": -187.40231323242188, + "loss": 0.4297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7623965740203857, + "rewards/margins": 1.1170769929885864, + "rewards/rejected": -1.8794736862182617, + "step": 1209 + }, + { + "epoch": 0.14, + "learning_rate": 2.6169835833234915e-07, + "logits/chosen": -2.6718530654907227, + "logits/rejected": -2.711927890777588, + "logps/chosen": -431.6191711425781, + "logps/rejected": -244.91815185546875, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8967389464378357, + "rewards/margins": 0.5498989224433899, + "rewards/rejected": -1.4466378688812256, + "step": 1210 + }, + { + "epoch": 0.14, + "learning_rate": 2.6166292665643084e-07, + "logits/chosen": -1.991060733795166, + "logits/rejected": -2.0923988819122314, + "logps/chosen": -362.5167236328125, + "logps/rejected": -289.84393310546875, + "loss": 0.2355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6435012221336365, + "rewards/margins": 2.5302891731262207, + "rewards/rejected": -3.173790454864502, + "step": 1211 + }, + { + "epoch": 0.14, + "learning_rate": 2.616274949805126e-07, + "logits/chosen": -2.430065393447876, + "logits/rejected": -2.1822991371154785, + "logps/chosen": -250.44174194335938, + "logps/rejected": -302.0910339355469, + "loss": 0.3632, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1955638825893402, + "rewards/margins": 0.9921808242797852, + "rewards/rejected": -1.1877448558807373, + "step": 1212 + }, + { + "epoch": 0.14, + "learning_rate": 2.615920633045943e-07, + "logits/chosen": -2.2868592739105225, + "logits/rejected": -2.3052589893341064, + "logps/chosen": -316.4286193847656, + "logps/rejected": -315.93817138671875, + "loss": 0.3579, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6329947710037231, + "rewards/margins": 1.6671862602233887, + "rewards/rejected": -2.3001809120178223, + "step": 1213 + }, + { + "epoch": 0.14, + "learning_rate": 2.6155663162867603e-07, + "logits/chosen": -2.4589529037475586, + "logits/rejected": -2.1748476028442383, + "logps/chosen": -361.7392578125, + "logps/rejected": -335.0190124511719, + "loss": 0.8517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9949109554290771, + "rewards/margins": 0.9016218185424805, + "rewards/rejected": -1.8965328931808472, + "step": 1214 + }, + { + "epoch": 0.14, + "learning_rate": 2.6152119995275773e-07, + "logits/chosen": -2.241535186767578, + "logits/rejected": -2.297055244445801, + "logps/chosen": -243.60638427734375, + "logps/rejected": -133.24386596679688, + "loss": 0.6223, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0300573110580444, + "rewards/margins": 0.9724359512329102, + "rewards/rejected": -2.002493143081665, + "step": 1215 + }, + { + "epoch": 0.14, + "learning_rate": 2.614857682768395e-07, + "logits/chosen": -2.250706434249878, + "logits/rejected": -2.513474941253662, + "logps/chosen": -362.91754150390625, + "logps/rejected": -271.84552001953125, + "loss": 0.5401, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1390385627746582, + "rewards/margins": 1.4305357933044434, + "rewards/rejected": -2.5695741176605225, + "step": 1216 + }, + { + "epoch": 0.14, + "learning_rate": 2.6145033660092123e-07, + "logits/chosen": -2.137702226638794, + "logits/rejected": -2.1692233085632324, + "logps/chosen": -211.78146362304688, + "logps/rejected": -131.8563690185547, + "loss": 0.994, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1482348442077637, + "rewards/margins": 0.5150662660598755, + "rewards/rejected": -1.6633012294769287, + "step": 1217 + }, + { + "epoch": 0.14, + "learning_rate": 2.614149049250029e-07, + "logits/chosen": -2.6752381324768066, + "logits/rejected": -2.4664547443389893, + "logps/chosen": -141.66978454589844, + "logps/rejected": -225.0955352783203, + "loss": 0.7433, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20321819186210632, + "rewards/margins": 0.17518356442451477, + "rewards/rejected": -0.3784017860889435, + "step": 1218 + }, + { + "epoch": 0.14, + "learning_rate": 2.6137947324908467e-07, + "logits/chosen": -2.2906172275543213, + "logits/rejected": -2.5458648204803467, + "logps/chosen": -281.67572021484375, + "logps/rejected": -286.2783203125, + "loss": 0.3528, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0083242654800415, + "rewards/margins": 1.3587497472763062, + "rewards/rejected": -2.3670740127563477, + "step": 1219 + }, + { + "epoch": 0.14, + "learning_rate": 2.6134404157316637e-07, + "logits/chosen": -2.37449312210083, + "logits/rejected": -2.358083963394165, + "logps/chosen": -157.3820343017578, + "logps/rejected": -220.6537628173828, + "loss": 0.4983, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5086790323257446, + "rewards/margins": 1.149797797203064, + "rewards/rejected": -1.6584768295288086, + "step": 1220 + }, + { + "epoch": 0.14, + "learning_rate": 2.6130860989724817e-07, + "logits/chosen": -2.7063205242156982, + "logits/rejected": -2.554245948791504, + "logps/chosen": -198.2356414794922, + "logps/rejected": -259.17889404296875, + "loss": 0.3145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3179255723953247, + "rewards/margins": 1.4808061122894287, + "rewards/rejected": -1.7987315654754639, + "step": 1221 + }, + { + "epoch": 0.14, + "learning_rate": 2.6127317822132986e-07, + "logits/chosen": -2.154592514038086, + "logits/rejected": -2.034823179244995, + "logps/chosen": -325.4721374511719, + "logps/rejected": -308.1068115234375, + "loss": 0.5773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7155643105506897, + "rewards/margins": 1.0639994144439697, + "rewards/rejected": -1.7795637845993042, + "step": 1222 + }, + { + "epoch": 0.14, + "learning_rate": 2.612377465454116e-07, + "logits/chosen": -1.8830881118774414, + "logits/rejected": -1.857619047164917, + "logps/chosen": -451.446533203125, + "logps/rejected": -442.03125, + "loss": 0.3536, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20267298817634583, + "rewards/margins": 1.9951961040496826, + "rewards/rejected": -2.197868824005127, + "step": 1223 + }, + { + "epoch": 0.14, + "learning_rate": 2.612023148694933e-07, + "logits/chosen": -2.2011451721191406, + "logits/rejected": -1.870465874671936, + "logps/chosen": -332.98443603515625, + "logps/rejected": -241.1824493408203, + "loss": 0.44, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8765718936920166, + "rewards/margins": 0.9335952997207642, + "rewards/rejected": -1.8101671934127808, + "step": 1224 + }, + { + "epoch": 0.14, + "learning_rate": 2.6116688319357506e-07, + "logits/chosen": -2.053405523300171, + "logits/rejected": -1.9158811569213867, + "logps/chosen": -299.52105712890625, + "logps/rejected": -421.9799499511719, + "loss": 0.272, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5662707090377808, + "rewards/margins": 1.7271157503128052, + "rewards/rejected": -2.293386459350586, + "step": 1225 + }, + { + "epoch": 0.14, + "learning_rate": 2.6113145151765675e-07, + "logits/chosen": -2.336432456970215, + "logits/rejected": -2.6305625438690186, + "logps/chosen": -249.67605590820312, + "logps/rejected": -223.44863891601562, + "loss": 0.3659, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3940158486366272, + "rewards/margins": 1.4299249649047852, + "rewards/rejected": -1.8239408731460571, + "step": 1226 + }, + { + "epoch": 0.14, + "learning_rate": 2.610960198417385e-07, + "logits/chosen": -2.632175922393799, + "logits/rejected": -2.365797758102417, + "logps/chosen": -137.59799194335938, + "logps/rejected": -276.6572570800781, + "loss": 0.6939, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.031806468963623, + "rewards/margins": 1.4824001789093018, + "rewards/rejected": -2.514206886291504, + "step": 1227 + }, + { + "epoch": 0.14, + "learning_rate": 2.6106058816582025e-07, + "logits/chosen": -1.9882938861846924, + "logits/rejected": -2.1020126342773438, + "logps/chosen": -418.59149169921875, + "logps/rejected": -370.8414306640625, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11219042539596558, + "rewards/margins": 1.2009856700897217, + "rewards/rejected": -1.313176155090332, + "step": 1228 + }, + { + "epoch": 0.14, + "learning_rate": 2.6102515648990195e-07, + "logits/chosen": -1.9346270561218262, + "logits/rejected": -1.883954405784607, + "logps/chosen": -296.3021240234375, + "logps/rejected": -224.70205688476562, + "loss": 0.3362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07661155611276627, + "rewards/margins": 1.8448667526245117, + "rewards/rejected": -1.921478271484375, + "step": 1229 + }, + { + "epoch": 0.14, + "learning_rate": 2.609897248139837e-07, + "logits/chosen": -2.0371227264404297, + "logits/rejected": -2.2506399154663086, + "logps/chosen": -355.3033752441406, + "logps/rejected": -297.62603759765625, + "loss": 0.2657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46851930022239685, + "rewards/margins": 1.7664119005203247, + "rewards/rejected": -2.234931230545044, + "step": 1230 + }, + { + "epoch": 0.14, + "learning_rate": 2.609542931380654e-07, + "logits/chosen": -2.5912065505981445, + "logits/rejected": -2.7296056747436523, + "logps/chosen": -178.59237670898438, + "logps/rejected": -212.0275421142578, + "loss": 0.4854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9331275820732117, + "rewards/margins": 2.161269187927246, + "rewards/rejected": -3.0943965911865234, + "step": 1231 + }, + { + "epoch": 0.14, + "learning_rate": 2.6091886146214714e-07, + "logits/chosen": -2.214390516281128, + "logits/rejected": -2.0811753273010254, + "logps/chosen": -259.7727966308594, + "logps/rejected": -337.0928039550781, + "loss": 0.6228, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.073153018951416, + "rewards/margins": 1.5224473476409912, + "rewards/rejected": -2.5956006050109863, + "step": 1232 + }, + { + "epoch": 0.14, + "learning_rate": 2.608834297862289e-07, + "logits/chosen": -2.130297899246216, + "logits/rejected": -2.202854871749878, + "logps/chosen": -282.682861328125, + "logps/rejected": -306.7928161621094, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5135722756385803, + "rewards/margins": 2.5404725074768066, + "rewards/rejected": -3.054044723510742, + "step": 1233 + }, + { + "epoch": 0.14, + "learning_rate": 2.6084799811031064e-07, + "logits/chosen": -2.2196273803710938, + "logits/rejected": -2.0463874340057373, + "logps/chosen": -244.0476531982422, + "logps/rejected": -290.8463134765625, + "loss": 0.3742, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22885075211524963, + "rewards/margins": 1.4578667879104614, + "rewards/rejected": -1.6867173910140991, + "step": 1234 + }, + { + "epoch": 0.14, + "learning_rate": 2.6081256643439233e-07, + "logits/chosen": -1.8478654623031616, + "logits/rejected": -2.281740188598633, + "logps/chosen": -440.760498046875, + "logps/rejected": -316.838623046875, + "loss": 0.484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.553683340549469, + "rewards/margins": 1.7635353803634644, + "rewards/rejected": -2.317218780517578, + "step": 1235 + }, + { + "epoch": 0.14, + "learning_rate": 2.607771347584741e-07, + "logits/chosen": -1.8633878231048584, + "logits/rejected": -2.240372657775879, + "logps/chosen": -276.9737548828125, + "logps/rejected": -246.07305908203125, + "loss": 0.4326, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7352249026298523, + "rewards/margins": 1.6205451488494873, + "rewards/rejected": -2.3557701110839844, + "step": 1236 + }, + { + "epoch": 0.14, + "learning_rate": 2.607417030825558e-07, + "logits/chosen": -2.397573947906494, + "logits/rejected": -1.9563982486724854, + "logps/chosen": -297.1358642578125, + "logps/rejected": -246.6844482421875, + "loss": 0.2937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7042604088783264, + "rewards/margins": 1.9653129577636719, + "rewards/rejected": -2.6695733070373535, + "step": 1237 + }, + { + "epoch": 0.14, + "learning_rate": 2.607062714066375e-07, + "logits/chosen": -2.0208499431610107, + "logits/rejected": -2.1037914752960205, + "logps/chosen": -504.9766845703125, + "logps/rejected": -365.8271789550781, + "loss": 0.3582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7214065790176392, + "rewards/margins": 1.671844482421875, + "rewards/rejected": -2.3932509422302246, + "step": 1238 + }, + { + "epoch": 0.14, + "learning_rate": 2.6067083973071927e-07, + "logits/chosen": -1.9319088459014893, + "logits/rejected": -2.1614065170288086, + "logps/chosen": -423.293701171875, + "logps/rejected": -276.984130859375, + "loss": 0.6646, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.01180100440979, + "rewards/margins": 0.7641793489456177, + "rewards/rejected": -1.7759804725646973, + "step": 1239 + }, + { + "epoch": 0.14, + "learning_rate": 2.6063540805480097e-07, + "logits/chosen": -2.349661111831665, + "logits/rejected": -2.53029727935791, + "logps/chosen": -521.2225341796875, + "logps/rejected": -370.9586181640625, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7600740194320679, + "rewards/margins": 1.2158392667770386, + "rewards/rejected": -1.9759132862091064, + "step": 1240 + }, + { + "epoch": 0.14, + "learning_rate": 2.605999763788827e-07, + "logits/chosen": -2.6423542499542236, + "logits/rejected": -2.8583455085754395, + "logps/chosen": -138.44232177734375, + "logps/rejected": -290.8701171875, + "loss": 0.4664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5801395177841187, + "rewards/margins": 1.0942535400390625, + "rewards/rejected": -1.6743930578231812, + "step": 1241 + }, + { + "epoch": 0.14, + "learning_rate": 2.605645447029644e-07, + "logits/chosen": -1.8430917263031006, + "logits/rejected": -2.3714470863342285, + "logps/chosen": -305.1477966308594, + "logps/rejected": -223.27792358398438, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22728799283504486, + "rewards/margins": 1.3993114233016968, + "rewards/rejected": -1.6265994310379028, + "step": 1242 + }, + { + "epoch": 0.14, + "learning_rate": 2.6052911302704616e-07, + "logits/chosen": -2.206584930419922, + "logits/rejected": -2.243499279022217, + "logps/chosen": -458.9821472167969, + "logps/rejected": -413.85784912109375, + "loss": 0.5828, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3924663066864014, + "rewards/margins": 1.4343593120574951, + "rewards/rejected": -2.8268258571624756, + "step": 1243 + }, + { + "epoch": 0.14, + "learning_rate": 2.6049368135112786e-07, + "logits/chosen": -2.993178129196167, + "logits/rejected": -2.8497660160064697, + "logps/chosen": -215.64817810058594, + "logps/rejected": -273.05218505859375, + "loss": 0.3837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8832869529724121, + "rewards/margins": 1.5810787677764893, + "rewards/rejected": -2.4643657207489014, + "step": 1244 + }, + { + "epoch": 0.14, + "learning_rate": 2.6045824967520966e-07, + "logits/chosen": -2.456869125366211, + "logits/rejected": -2.540454864501953, + "logps/chosen": -149.39254760742188, + "logps/rejected": -237.100341796875, + "loss": 0.3548, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22591577470302582, + "rewards/margins": 1.687772512435913, + "rewards/rejected": -1.4618569612503052, + "step": 1245 + }, + { + "epoch": 0.14, + "learning_rate": 2.6042281799929135e-07, + "logits/chosen": -2.252779245376587, + "logits/rejected": -2.7211036682128906, + "logps/chosen": -302.27105712890625, + "logps/rejected": -147.2112579345703, + "loss": 0.6604, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0700712203979492, + "rewards/margins": 0.6435593366622925, + "rewards/rejected": -1.7136304378509521, + "step": 1246 + }, + { + "epoch": 0.15, + "learning_rate": 2.603873863233731e-07, + "logits/chosen": -3.0307254791259766, + "logits/rejected": -2.97756290435791, + "logps/chosen": -169.63092041015625, + "logps/rejected": -113.05150604248047, + "loss": 0.7544, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7444263100624084, + "rewards/margins": 0.39856547117233276, + "rewards/rejected": -1.1429919004440308, + "step": 1247 + }, + { + "epoch": 0.15, + "learning_rate": 2.603519546474548e-07, + "logits/chosen": -2.8420052528381348, + "logits/rejected": -2.6869869232177734, + "logps/chosen": -222.10028076171875, + "logps/rejected": -218.86453247070312, + "loss": 0.2293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07362756133079529, + "rewards/margins": 2.1067304611206055, + "rewards/rejected": -2.1803579330444336, + "step": 1248 + }, + { + "epoch": 0.15, + "learning_rate": 2.6031652297153655e-07, + "logits/chosen": -1.9170403480529785, + "logits/rejected": -2.0209596157073975, + "logps/chosen": -400.3038635253906, + "logps/rejected": -319.9889221191406, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3565017879009247, + "rewards/margins": 1.1411786079406738, + "rewards/rejected": -1.4976803064346313, + "step": 1249 + }, + { + "epoch": 0.15, + "learning_rate": 2.602810912956183e-07, + "logits/chosen": -2.292919158935547, + "logits/rejected": -2.4757347106933594, + "logps/chosen": -317.056640625, + "logps/rejected": -305.7164001464844, + "loss": 0.1871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7052890062332153, + "rewards/margins": 2.174309015274048, + "rewards/rejected": -2.8795979022979736, + "step": 1250 + }, + { + "epoch": 0.15, + "learning_rate": 2.602456596197e-07, + "logits/chosen": -2.516244888305664, + "logits/rejected": -2.4459445476531982, + "logps/chosen": -235.16522216796875, + "logps/rejected": -211.745361328125, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38409173488616943, + "rewards/margins": 2.1268701553344727, + "rewards/rejected": -2.5109620094299316, + "step": 1251 + }, + { + "epoch": 0.15, + "learning_rate": 2.6021022794378174e-07, + "logits/chosen": -2.236812114715576, + "logits/rejected": -2.0823521614074707, + "logps/chosen": -212.92654418945312, + "logps/rejected": -213.06077575683594, + "loss": 0.2804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6693189144134521, + "rewards/margins": 2.098865032196045, + "rewards/rejected": -2.768183708190918, + "step": 1252 + }, + { + "epoch": 0.15, + "learning_rate": 2.6017479626786344e-07, + "logits/chosen": -2.3237688541412354, + "logits/rejected": -2.413252353668213, + "logps/chosen": -237.13018798828125, + "logps/rejected": -133.48849487304688, + "loss": 0.9022, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1386258602142334, + "rewards/margins": 0.2657497525215149, + "rewards/rejected": -1.4043755531311035, + "step": 1253 + }, + { + "epoch": 0.15, + "learning_rate": 2.601393645919452e-07, + "logits/chosen": -2.266758680343628, + "logits/rejected": -2.3881494998931885, + "logps/chosen": -260.87646484375, + "logps/rejected": -471.16302490234375, + "loss": 0.8151, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1986429691314697, + "rewards/margins": 0.25457435846328735, + "rewards/rejected": -1.4532173871994019, + "step": 1254 + }, + { + "epoch": 0.15, + "learning_rate": 2.601039329160269e-07, + "logits/chosen": -2.8981451988220215, + "logits/rejected": -2.8271474838256836, + "logps/chosen": -141.50965881347656, + "logps/rejected": -157.44015502929688, + "loss": 0.2766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11638477444648743, + "rewards/margins": 1.9493556022644043, + "rewards/rejected": -2.0657403469085693, + "step": 1255 + }, + { + "epoch": 0.15, + "learning_rate": 2.600685012401087e-07, + "logits/chosen": -2.693871259689331, + "logits/rejected": -2.4761247634887695, + "logps/chosen": -125.81049346923828, + "logps/rejected": -251.10403442382812, + "loss": 0.4626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5399969220161438, + "rewards/margins": 1.9713658094406128, + "rewards/rejected": -2.5113627910614014, + "step": 1256 + }, + { + "epoch": 0.15, + "learning_rate": 2.600330695641904e-07, + "logits/chosen": -2.442492723464966, + "logits/rejected": -2.28092885017395, + "logps/chosen": -165.7157440185547, + "logps/rejected": -261.3372497558594, + "loss": 0.5349, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3454078435897827, + "rewards/margins": 0.5472221374511719, + "rewards/rejected": -1.892629861831665, + "step": 1257 + }, + { + "epoch": 0.15, + "learning_rate": 2.599976378882721e-07, + "logits/chosen": -2.115691661834717, + "logits/rejected": -1.9491918087005615, + "logps/chosen": -317.12054443359375, + "logps/rejected": -340.6336364746094, + "loss": 0.4501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18840591609477997, + "rewards/margins": 1.293190360069275, + "rewards/rejected": -1.4815962314605713, + "step": 1258 + }, + { + "epoch": 0.15, + "learning_rate": 2.599622062123538e-07, + "logits/chosen": -2.2306504249572754, + "logits/rejected": -2.14399790763855, + "logps/chosen": -184.25833129882812, + "logps/rejected": -207.76370239257812, + "loss": 0.5517, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2603466510772705, + "rewards/margins": 0.9439060688018799, + "rewards/rejected": -1.2042527198791504, + "step": 1259 + }, + { + "epoch": 0.15, + "learning_rate": 2.5992677453643557e-07, + "logits/chosen": -2.549180507659912, + "logits/rejected": -2.6407978534698486, + "logps/chosen": -213.83932495117188, + "logps/rejected": -207.66494750976562, + "loss": 0.4595, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.711537778377533, + "rewards/margins": 1.0398741960525513, + "rewards/rejected": -1.75141179561615, + "step": 1260 + }, + { + "epoch": 0.15, + "learning_rate": 2.598913428605173e-07, + "logits/chosen": -2.313573122024536, + "logits/rejected": -2.205308675765991, + "logps/chosen": -180.78720092773438, + "logps/rejected": -199.6486358642578, + "loss": 0.2204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1279129981994629, + "rewards/margins": 1.8264344930648804, + "rewards/rejected": -1.9543473720550537, + "step": 1261 + }, + { + "epoch": 0.15, + "learning_rate": 2.59855911184599e-07, + "logits/chosen": -2.2265775203704834, + "logits/rejected": -2.345930337905884, + "logps/chosen": -316.6727294921875, + "logps/rejected": -272.6983642578125, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7236360311508179, + "rewards/margins": 1.0970818996429443, + "rewards/rejected": -1.8207180500030518, + "step": 1262 + }, + { + "epoch": 0.15, + "learning_rate": 2.5982047950868076e-07, + "logits/chosen": -2.5143589973449707, + "logits/rejected": -2.669133186340332, + "logps/chosen": -222.92259216308594, + "logps/rejected": -343.65167236328125, + "loss": 0.2482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8658496141433716, + "rewards/margins": 1.8789976835250854, + "rewards/rejected": -2.744847297668457, + "step": 1263 + }, + { + "epoch": 0.15, + "learning_rate": 2.5978504783276246e-07, + "logits/chosen": -1.77280855178833, + "logits/rejected": -1.9820737838745117, + "logps/chosen": -437.0660095214844, + "logps/rejected": -421.7515869140625, + "loss": 0.2053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5205976963043213, + "rewards/margins": 2.0472793579101562, + "rewards/rejected": -2.5678770542144775, + "step": 1264 + }, + { + "epoch": 0.15, + "learning_rate": 2.597496161568442e-07, + "logits/chosen": -2.181020736694336, + "logits/rejected": -2.156886577606201, + "logps/chosen": -361.36712646484375, + "logps/rejected": -278.8703308105469, + "loss": 0.4018, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7749343514442444, + "rewards/margins": 1.345133900642395, + "rewards/rejected": -2.120068073272705, + "step": 1265 + }, + { + "epoch": 0.15, + "learning_rate": 2.597141844809259e-07, + "logits/chosen": -2.119680643081665, + "logits/rejected": -2.056746006011963, + "logps/chosen": -277.7623291015625, + "logps/rejected": -273.0042724609375, + "loss": 0.2764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7121495604515076, + "rewards/margins": 1.2374653816223145, + "rewards/rejected": -1.9496151208877563, + "step": 1266 + }, + { + "epoch": 0.15, + "learning_rate": 2.5967875280500765e-07, + "logits/chosen": -2.1201016902923584, + "logits/rejected": -2.235252618789673, + "logps/chosen": -326.6087951660156, + "logps/rejected": -325.4222412109375, + "loss": 0.2567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22725319862365723, + "rewards/margins": 1.6974499225616455, + "rewards/rejected": -1.9247030019760132, + "step": 1267 + }, + { + "epoch": 0.15, + "learning_rate": 2.596433211290894e-07, + "logits/chosen": -2.818470001220703, + "logits/rejected": -2.705915927886963, + "logps/chosen": -280.83734130859375, + "logps/rejected": -216.49269104003906, + "loss": 0.3247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7131451964378357, + "rewards/margins": 1.7519458532333374, + "rewards/rejected": -2.4650912284851074, + "step": 1268 + }, + { + "epoch": 0.15, + "learning_rate": 2.5960788945317115e-07, + "logits/chosen": -2.4428930282592773, + "logits/rejected": -2.331605911254883, + "logps/chosen": -295.9519348144531, + "logps/rejected": -289.4832458496094, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.192646861076355, + "rewards/margins": 1.6220908164978027, + "rewards/rejected": -2.814737558364868, + "step": 1269 + }, + { + "epoch": 0.15, + "learning_rate": 2.5957245777725284e-07, + "logits/chosen": -2.6269357204437256, + "logits/rejected": -2.687150716781616, + "logps/chosen": -169.66976928710938, + "logps/rejected": -177.86288452148438, + "loss": 0.6397, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7958784103393555, + "rewards/margins": 0.6250782012939453, + "rewards/rejected": -1.4209566116333008, + "step": 1270 + }, + { + "epoch": 0.15, + "learning_rate": 2.595370261013346e-07, + "logits/chosen": -2.6670022010803223, + "logits/rejected": -2.565812587738037, + "logps/chosen": -311.00115966796875, + "logps/rejected": -217.16049194335938, + "loss": 0.4264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8078153729438782, + "rewards/margins": 1.1430182456970215, + "rewards/rejected": -1.950833797454834, + "step": 1271 + }, + { + "epoch": 0.15, + "learning_rate": 2.595015944254163e-07, + "logits/chosen": -1.3872480392456055, + "logits/rejected": -1.5931191444396973, + "logps/chosen": -735.7073974609375, + "logps/rejected": -551.5750732421875, + "loss": 0.695, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3547881841659546, + "rewards/margins": 1.506098747253418, + "rewards/rejected": -2.860887050628662, + "step": 1272 + }, + { + "epoch": 0.15, + "learning_rate": 2.5946616274949804e-07, + "logits/chosen": -2.882721185684204, + "logits/rejected": -2.767637014389038, + "logps/chosen": -125.76010131835938, + "logps/rejected": -193.9713592529297, + "loss": 0.3611, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3950304687023163, + "rewards/margins": 1.374147891998291, + "rewards/rejected": -1.7691782712936401, + "step": 1273 + }, + { + "epoch": 0.15, + "learning_rate": 2.594307310735798e-07, + "logits/chosen": -1.8718408346176147, + "logits/rejected": -2.2689945697784424, + "logps/chosen": -510.5018005371094, + "logps/rejected": -342.74365234375, + "loss": 0.2101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6649178862571716, + "rewards/margins": 2.3887991905212402, + "rewards/rejected": -3.0537166595458984, + "step": 1274 + }, + { + "epoch": 0.15, + "learning_rate": 2.593952993976615e-07, + "logits/chosen": -2.3788914680480957, + "logits/rejected": -2.28865385055542, + "logps/chosen": -241.61886596679688, + "logps/rejected": -230.29275512695312, + "loss": 0.2552, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6105338335037231, + "rewards/margins": 2.803457021713257, + "rewards/rejected": -3.4139909744262695, + "step": 1275 + }, + { + "epoch": 0.15, + "learning_rate": 2.5935986772174323e-07, + "logits/chosen": -2.2635085582733154, + "logits/rejected": -2.330038070678711, + "logps/chosen": -523.0123291015625, + "logps/rejected": -468.148193359375, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.740538477897644, + "rewards/margins": 1.3421372175216675, + "rewards/rejected": -2.0826756954193115, + "step": 1276 + }, + { + "epoch": 0.15, + "learning_rate": 2.593244360458249e-07, + "logits/chosen": -2.5202741622924805, + "logits/rejected": -2.2912259101867676, + "logps/chosen": -297.2366943359375, + "logps/rejected": -251.6663818359375, + "loss": 0.4441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31972527503967285, + "rewards/margins": 1.4612170457839966, + "rewards/rejected": -1.7809423208236694, + "step": 1277 + }, + { + "epoch": 0.15, + "learning_rate": 2.592890043699067e-07, + "logits/chosen": -1.3900766372680664, + "logits/rejected": -2.0586559772491455, + "logps/chosen": -529.3613891601562, + "logps/rejected": -280.0226745605469, + "loss": 0.285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.645638108253479, + "rewards/margins": 1.191230297088623, + "rewards/rejected": -1.8368682861328125, + "step": 1278 + }, + { + "epoch": 0.15, + "learning_rate": 2.592535726939884e-07, + "logits/chosen": -2.0372562408447266, + "logits/rejected": -2.2701494693756104, + "logps/chosen": -329.7569580078125, + "logps/rejected": -306.30706787109375, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44235673546791077, + "rewards/margins": 0.9151731729507446, + "rewards/rejected": -1.3575299978256226, + "step": 1279 + }, + { + "epoch": 0.15, + "learning_rate": 2.5921814101807017e-07, + "logits/chosen": -2.6823337078094482, + "logits/rejected": -2.837756872177124, + "logps/chosen": -297.3595275878906, + "logps/rejected": -250.33102416992188, + "loss": 0.2531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11365848779678345, + "rewards/margins": 2.0839953422546387, + "rewards/rejected": -2.1976537704467773, + "step": 1280 + }, + { + "epoch": 0.15, + "learning_rate": 2.5918270934215187e-07, + "logits/chosen": -2.8172483444213867, + "logits/rejected": -2.738985300064087, + "logps/chosen": -280.9693603515625, + "logps/rejected": -155.98281860351562, + "loss": 0.5415, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7478078603744507, + "rewards/margins": 0.9187616109848022, + "rewards/rejected": -1.666569471359253, + "step": 1281 + }, + { + "epoch": 0.15, + "learning_rate": 2.591472776662336e-07, + "logits/chosen": -2.4865734577178955, + "logits/rejected": -2.6910340785980225, + "logps/chosen": -153.28834533691406, + "logps/rejected": -170.6597900390625, + "loss": 0.3194, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6262564659118652, + "rewards/margins": 1.2164052724838257, + "rewards/rejected": -1.842661738395691, + "step": 1282 + }, + { + "epoch": 0.15, + "learning_rate": 2.591118459903153e-07, + "logits/chosen": -2.588737964630127, + "logits/rejected": -2.7611048221588135, + "logps/chosen": -254.65057373046875, + "logps/rejected": -225.90975952148438, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24436911940574646, + "rewards/margins": 2.8849329948425293, + "rewards/rejected": -3.1293022632598877, + "step": 1283 + }, + { + "epoch": 0.15, + "learning_rate": 2.5907641431439706e-07, + "logits/chosen": -1.9059133529663086, + "logits/rejected": -2.0604407787323, + "logps/chosen": -330.07269287109375, + "logps/rejected": -272.6793518066406, + "loss": 0.5477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1716097593307495, + "rewards/margins": 1.5171852111816406, + "rewards/rejected": -1.6887949705123901, + "step": 1284 + }, + { + "epoch": 0.15, + "learning_rate": 2.590409826384788e-07, + "logits/chosen": -2.7010927200317383, + "logits/rejected": -2.336820602416992, + "logps/chosen": -393.81097412109375, + "logps/rejected": -324.70880126953125, + "loss": 1.1451, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.014214038848877, + "rewards/margins": 0.9166098237037659, + "rewards/rejected": -1.9308240413665771, + "step": 1285 + }, + { + "epoch": 0.15, + "learning_rate": 2.590055509625605e-07, + "logits/chosen": -2.9210479259490967, + "logits/rejected": -2.9542672634124756, + "logps/chosen": -150.58636474609375, + "logps/rejected": -185.82281494140625, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4505285620689392, + "rewards/margins": 1.8772376775741577, + "rewards/rejected": -2.3277664184570312, + "step": 1286 + }, + { + "epoch": 0.15, + "learning_rate": 2.5897011928664225e-07, + "logits/chosen": -1.6713709831237793, + "logits/rejected": -1.8420507907867432, + "logps/chosen": -336.9733581542969, + "logps/rejected": -304.183837890625, + "loss": 0.7613, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2765471935272217, + "rewards/margins": 0.2487141191959381, + "rewards/rejected": -1.5252611637115479, + "step": 1287 + }, + { + "epoch": 0.15, + "learning_rate": 2.5893468761072395e-07, + "logits/chosen": -3.0366063117980957, + "logits/rejected": -3.0322883129119873, + "logps/chosen": -266.9208984375, + "logps/rejected": -182.05624389648438, + "loss": 0.6769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8897483348846436, + "rewards/margins": 1.1671676635742188, + "rewards/rejected": -2.0569159984588623, + "step": 1288 + }, + { + "epoch": 0.15, + "learning_rate": 2.588992559348057e-07, + "logits/chosen": -2.128631114959717, + "logits/rejected": -2.6411750316619873, + "logps/chosen": -339.3894958496094, + "logps/rejected": -236.3607177734375, + "loss": 0.327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41288837790489197, + "rewards/margins": 2.578314781188965, + "rewards/rejected": -2.9912033081054688, + "step": 1289 + }, + { + "epoch": 0.15, + "learning_rate": 2.5886382425888745e-07, + "logits/chosen": -1.7747128009796143, + "logits/rejected": -1.798820972442627, + "logps/chosen": -419.06622314453125, + "logps/rejected": -370.3547058105469, + "loss": 0.5943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6487704515457153, + "rewards/margins": 1.4400386810302734, + "rewards/rejected": -2.088809013366699, + "step": 1290 + }, + { + "epoch": 0.15, + "learning_rate": 2.5882839258296914e-07, + "logits/chosen": -2.159764289855957, + "logits/rejected": -2.5085325241088867, + "logps/chosen": -292.51318359375, + "logps/rejected": -194.88055419921875, + "loss": 0.9448, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1800484657287598, + "rewards/margins": 1.1285276412963867, + "rewards/rejected": -2.3085761070251465, + "step": 1291 + }, + { + "epoch": 0.15, + "learning_rate": 2.587929609070509e-07, + "logits/chosen": -1.8661205768585205, + "logits/rejected": -1.8758271932601929, + "logps/chosen": -315.619873046875, + "logps/rejected": -403.6959228515625, + "loss": 0.3323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20793503522872925, + "rewards/margins": 2.5689449310302734, + "rewards/rejected": -2.7768802642822266, + "step": 1292 + }, + { + "epoch": 0.15, + "learning_rate": 2.5875752923113264e-07, + "logits/chosen": -2.6157755851745605, + "logits/rejected": -2.588261127471924, + "logps/chosen": -208.9537353515625, + "logps/rejected": -253.08358764648438, + "loss": 0.3741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7660631537437439, + "rewards/margins": 1.7738869190216064, + "rewards/rejected": -2.539950132369995, + "step": 1293 + }, + { + "epoch": 0.15, + "learning_rate": 2.5872209755521433e-07, + "logits/chosen": -2.117156744003296, + "logits/rejected": -2.127070188522339, + "logps/chosen": -216.6184539794922, + "logps/rejected": -235.77740478515625, + "loss": 0.5052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6661102771759033, + "rewards/margins": 1.7357144355773926, + "rewards/rejected": -2.401824712753296, + "step": 1294 + }, + { + "epoch": 0.15, + "learning_rate": 2.586866658792961e-07, + "logits/chosen": -2.4313063621520996, + "logits/rejected": -2.4967944622039795, + "logps/chosen": -264.153564453125, + "logps/rejected": -257.64544677734375, + "loss": 1.432, + "rewards/accuracies": 0.375, + "rewards/chosen": -3.849565267562866, + "rewards/margins": -0.31178268790245056, + "rewards/rejected": -3.5377824306488037, + "step": 1295 + }, + { + "epoch": 0.15, + "learning_rate": 2.5865123420337783e-07, + "logits/chosen": -2.7848920822143555, + "logits/rejected": -2.9087843894958496, + "logps/chosen": -386.3840637207031, + "logps/rejected": -272.14630126953125, + "loss": 0.6555, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0581109523773193, + "rewards/margins": 0.4849759042263031, + "rewards/rejected": -1.5430867671966553, + "step": 1296 + }, + { + "epoch": 0.15, + "learning_rate": 2.5861580252745953e-07, + "logits/chosen": -2.2209672927856445, + "logits/rejected": -2.2732667922973633, + "logps/chosen": -309.7784118652344, + "logps/rejected": -227.14389038085938, + "loss": 0.3946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8161208629608154, + "rewards/margins": 0.9809682369232178, + "rewards/rejected": -1.7970889806747437, + "step": 1297 + }, + { + "epoch": 0.15, + "learning_rate": 2.585803708515413e-07, + "logits/chosen": -2.4885759353637695, + "logits/rejected": -2.8750669956207275, + "logps/chosen": -380.5934753417969, + "logps/rejected": -159.957763671875, + "loss": 0.4369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20302613079547882, + "rewards/margins": 1.425229549407959, + "rewards/rejected": -1.6282557249069214, + "step": 1298 + }, + { + "epoch": 0.15, + "learning_rate": 2.5854493917562297e-07, + "logits/chosen": -2.6753623485565186, + "logits/rejected": -2.6322524547576904, + "logps/chosen": -206.34133911132812, + "logps/rejected": -177.70205688476562, + "loss": 0.4729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8837290406227112, + "rewards/margins": 1.03102707862854, + "rewards/rejected": -1.9147560596466064, + "step": 1299 + }, + { + "epoch": 0.15, + "learning_rate": 2.585095074997047e-07, + "logits/chosen": -2.6090126037597656, + "logits/rejected": -2.4821982383728027, + "logps/chosen": -237.47515869140625, + "logps/rejected": -259.5172119140625, + "loss": 0.2584, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19561664760112762, + "rewards/margins": 1.6479816436767578, + "rewards/rejected": -1.4523649215698242, + "step": 1300 + }, + { + "epoch": 0.15, + "learning_rate": 2.584740758237864e-07, + "logits/chosen": -2.201249122619629, + "logits/rejected": -2.2727041244506836, + "logps/chosen": -239.57505798339844, + "logps/rejected": -172.9882049560547, + "loss": 0.7372, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3061528205871582, + "rewards/margins": 0.45246267318725586, + "rewards/rejected": -1.758615493774414, + "step": 1301 + }, + { + "epoch": 0.15, + "learning_rate": 2.5843864414786816e-07, + "logits/chosen": -2.3956263065338135, + "logits/rejected": -2.4774601459503174, + "logps/chosen": -251.79852294921875, + "logps/rejected": -280.4782409667969, + "loss": 0.5548, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9851399660110474, + "rewards/margins": 2.0133299827575684, + "rewards/rejected": -2.998469829559326, + "step": 1302 + }, + { + "epoch": 0.15, + "learning_rate": 2.584032124719499e-07, + "logits/chosen": -1.9091551303863525, + "logits/rejected": -1.9462592601776123, + "logps/chosen": -382.4305419921875, + "logps/rejected": -323.70904541015625, + "loss": 0.5732, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4826089143753052, + "rewards/margins": 0.8115018606185913, + "rewards/rejected": -1.294110655784607, + "step": 1303 + }, + { + "epoch": 0.15, + "learning_rate": 2.5836778079603166e-07, + "logits/chosen": -2.7944390773773193, + "logits/rejected": -2.7513723373413086, + "logps/chosen": -136.1783447265625, + "logps/rejected": -163.79800415039062, + "loss": 0.8062, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4117655754089355, + "rewards/margins": 0.8798112273216248, + "rewards/rejected": -2.291576862335205, + "step": 1304 + }, + { + "epoch": 0.15, + "learning_rate": 2.5833234912011336e-07, + "logits/chosen": -2.2910573482513428, + "logits/rejected": -2.236543655395508, + "logps/chosen": -266.98468017578125, + "logps/rejected": -240.9828338623047, + "loss": 0.6776, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0236725807189941, + "rewards/margins": 0.5636706948280334, + "rewards/rejected": -1.5873432159423828, + "step": 1305 + }, + { + "epoch": 0.15, + "learning_rate": 2.582969174441951e-07, + "logits/chosen": -2.1433401107788086, + "logits/rejected": -2.179068088531494, + "logps/chosen": -265.03179931640625, + "logps/rejected": -440.089111328125, + "loss": 0.2582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39379751682281494, + "rewards/margins": 1.749212384223938, + "rewards/rejected": -2.143009901046753, + "step": 1306 + }, + { + "epoch": 0.15, + "learning_rate": 2.5826148576827685e-07, + "logits/chosen": -2.9952545166015625, + "logits/rejected": -2.9684958457946777, + "logps/chosen": -278.02020263671875, + "logps/rejected": -241.33447265625, + "loss": 0.3047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06552602350711823, + "rewards/margins": 1.7741624116897583, + "rewards/rejected": -1.8396886587142944, + "step": 1307 + }, + { + "epoch": 0.15, + "learning_rate": 2.5822605409235855e-07, + "logits/chosen": -2.938021183013916, + "logits/rejected": -2.882143259048462, + "logps/chosen": -262.88726806640625, + "logps/rejected": -257.85260009765625, + "loss": 0.2466, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04030492156744003, + "rewards/margins": 2.654791831970215, + "rewards/rejected": -2.695096969604492, + "step": 1308 + }, + { + "epoch": 0.15, + "learning_rate": 2.581906224164403e-07, + "logits/chosen": -2.0528976917266846, + "logits/rejected": -2.0843005180358887, + "logps/chosen": -349.78253173828125, + "logps/rejected": -385.2113037109375, + "loss": 0.3894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4173968434333801, + "rewards/margins": 1.789353370666504, + "rewards/rejected": -2.2067501544952393, + "step": 1309 + }, + { + "epoch": 0.15, + "learning_rate": 2.58155190740522e-07, + "logits/chosen": -2.4269938468933105, + "logits/rejected": -2.6440858840942383, + "logps/chosen": -269.74468994140625, + "logps/rejected": -230.79983520507812, + "loss": 0.2703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7360973358154297, + "rewards/margins": 1.325857400894165, + "rewards/rejected": -2.0619544982910156, + "step": 1310 + }, + { + "epoch": 0.15, + "learning_rate": 2.5811975906460374e-07, + "logits/chosen": -2.4051551818847656, + "logits/rejected": -2.556385040283203, + "logps/chosen": -170.62881469726562, + "logps/rejected": -211.15036010742188, + "loss": 1.2095, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3140572309494019, + "rewards/margins": -0.5253822803497314, + "rewards/rejected": -0.7886749505996704, + "step": 1311 + }, + { + "epoch": 0.15, + "learning_rate": 2.5808432738868544e-07, + "logits/chosen": -2.218942642211914, + "logits/rejected": -2.1835391521453857, + "logps/chosen": -357.28985595703125, + "logps/rejected": -313.3472900390625, + "loss": 0.3957, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.170084834098816, + "rewards/margins": 1.0231269598007202, + "rewards/rejected": -2.193211555480957, + "step": 1312 + }, + { + "epoch": 0.15, + "learning_rate": 2.580488957127672e-07, + "logits/chosen": -2.1596434116363525, + "logits/rejected": -2.027926445007324, + "logps/chosen": -374.4466247558594, + "logps/rejected": -249.26625061035156, + "loss": 0.2326, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.023939654231071472, + "rewards/margins": 2.0093207359313965, + "rewards/rejected": -1.9853808879852295, + "step": 1313 + }, + { + "epoch": 0.15, + "learning_rate": 2.5801346403684894e-07, + "logits/chosen": -2.1539344787597656, + "logits/rejected": -2.3923866748809814, + "logps/chosen": -523.3306884765625, + "logps/rejected": -216.06979370117188, + "loss": 0.5128, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9596171975135803, + "rewards/margins": 1.064602255821228, + "rewards/rejected": -2.024219274520874, + "step": 1314 + }, + { + "epoch": 0.15, + "learning_rate": 2.579780323609307e-07, + "logits/chosen": -2.6117448806762695, + "logits/rejected": -2.838325023651123, + "logps/chosen": -271.9878845214844, + "logps/rejected": -177.68978881835938, + "loss": 0.3914, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6180356740951538, + "rewards/margins": 1.081275224685669, + "rewards/rejected": -1.6993108987808228, + "step": 1315 + }, + { + "epoch": 0.15, + "learning_rate": 2.579426006850124e-07, + "logits/chosen": -3.0081565380096436, + "logits/rejected": -2.921945095062256, + "logps/chosen": -300.8717346191406, + "logps/rejected": -232.50546264648438, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6478986740112305, + "rewards/margins": 2.2287042140960693, + "rewards/rejected": -2.8766026496887207, + "step": 1316 + }, + { + "epoch": 0.15, + "learning_rate": 2.5790716900909413e-07, + "logits/chosen": -2.403447389602661, + "logits/rejected": -2.7603302001953125, + "logps/chosen": -178.54513549804688, + "logps/rejected": -237.83572387695312, + "loss": 1.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9448628425598145, + "rewards/margins": 0.6794756054878235, + "rewards/rejected": -2.6243386268615723, + "step": 1317 + }, + { + "epoch": 0.15, + "learning_rate": 2.578717373331759e-07, + "logits/chosen": -2.6839637756347656, + "logits/rejected": -2.8709754943847656, + "logps/chosen": -195.9561767578125, + "logps/rejected": -149.57472229003906, + "loss": 0.5216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6218922138214111, + "rewards/margins": 1.4257190227508545, + "rewards/rejected": -2.0476112365722656, + "step": 1318 + }, + { + "epoch": 0.15, + "learning_rate": 2.5783630565725757e-07, + "logits/chosen": -2.5710055828094482, + "logits/rejected": -2.729818820953369, + "logps/chosen": -243.94366455078125, + "logps/rejected": -198.39141845703125, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.286311388015747, + "rewards/margins": 1.6263314485549927, + "rewards/rejected": -2.9126429557800293, + "step": 1319 + }, + { + "epoch": 0.15, + "learning_rate": 2.578008739813393e-07, + "logits/chosen": -2.606369733810425, + "logits/rejected": -2.4206910133361816, + "logps/chosen": -150.28768920898438, + "logps/rejected": -226.32090759277344, + "loss": 0.302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9405965209007263, + "rewards/margins": 1.6854833364486694, + "rewards/rejected": -2.626079797744751, + "step": 1320 + }, + { + "epoch": 0.15, + "learning_rate": 2.57765442305421e-07, + "logits/chosen": -2.5065817832946777, + "logits/rejected": -2.4328176975250244, + "logps/chosen": -270.79266357421875, + "logps/rejected": -174.7091827392578, + "loss": 0.6692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8658044338226318, + "rewards/margins": 0.844093382358551, + "rewards/rejected": -1.7098979949951172, + "step": 1321 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773001062950277e-07, + "logits/chosen": -2.623093605041504, + "logits/rejected": -2.3319602012634277, + "logps/chosen": -260.4769287109375, + "logps/rejected": -421.634033203125, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5434201955795288, + "rewards/margins": 3.3658337593078613, + "rewards/rejected": -3.9092538356781006, + "step": 1322 + }, + { + "epoch": 0.15, + "learning_rate": 2.5769457895358446e-07, + "logits/chosen": -2.72861909866333, + "logits/rejected": -2.633969306945801, + "logps/chosen": -252.8408660888672, + "logps/rejected": -325.76739501953125, + "loss": 0.1924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48445379734039307, + "rewards/margins": 2.6676597595214844, + "rewards/rejected": -3.152113437652588, + "step": 1323 + }, + { + "epoch": 0.15, + "learning_rate": 2.576591472776662e-07, + "logits/chosen": -2.271425485610962, + "logits/rejected": -2.6053483486175537, + "logps/chosen": -382.7127685546875, + "logps/rejected": -262.7862548828125, + "loss": 0.5149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6939293742179871, + "rewards/margins": 1.2890352010726929, + "rewards/rejected": -1.9829645156860352, + "step": 1324 + }, + { + "epoch": 0.15, + "learning_rate": 2.5762371560174796e-07, + "logits/chosen": -2.2549819946289062, + "logits/rejected": -2.463052988052368, + "logps/chosen": -190.62564086914062, + "logps/rejected": -226.77798461914062, + "loss": 0.3452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30566123127937317, + "rewards/margins": 1.878478765487671, + "rewards/rejected": -2.1841399669647217, + "step": 1325 + }, + { + "epoch": 0.15, + "learning_rate": 2.5758828392582965e-07, + "logits/chosen": -2.437272548675537, + "logits/rejected": -2.2199971675872803, + "logps/chosen": -190.73500061035156, + "logps/rejected": -217.64028930664062, + "loss": 0.4331, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4925930500030518, + "rewards/margins": 1.1181799173355103, + "rewards/rejected": -2.6107730865478516, + "step": 1326 + }, + { + "epoch": 0.15, + "learning_rate": 2.575528522499114e-07, + "logits/chosen": -2.4711978435516357, + "logits/rejected": -2.6265907287597656, + "logps/chosen": -220.8153533935547, + "logps/rejected": -142.74517822265625, + "loss": 0.3583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5246231555938721, + "rewards/margins": 1.7034332752227783, + "rewards/rejected": -2.2280566692352295, + "step": 1327 + }, + { + "epoch": 0.15, + "learning_rate": 2.5751742057399315e-07, + "logits/chosen": -2.591740846633911, + "logits/rejected": -2.3444879055023193, + "logps/chosen": -288.9967346191406, + "logps/rejected": -199.04701232910156, + "loss": 0.3303, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0520128011703491, + "rewards/margins": 1.585219383239746, + "rewards/rejected": -2.6372323036193848, + "step": 1328 + }, + { + "epoch": 0.15, + "learning_rate": 2.574819888980749e-07, + "logits/chosen": -1.963822364807129, + "logits/rejected": -1.9915214776992798, + "logps/chosen": -360.9922180175781, + "logps/rejected": -268.242919921875, + "loss": 0.2897, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21962544322013855, + "rewards/margins": 1.5154588222503662, + "rewards/rejected": -1.2958333492279053, + "step": 1329 + }, + { + "epoch": 0.15, + "learning_rate": 2.574465572221566e-07, + "logits/chosen": -2.7419259548187256, + "logits/rejected": -2.723878860473633, + "logps/chosen": -220.52293395996094, + "logps/rejected": -303.1905822753906, + "loss": 0.4512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19251510500907898, + "rewards/margins": 2.7715210914611816, + "rewards/rejected": -2.964036464691162, + "step": 1330 + }, + { + "epoch": 0.15, + "learning_rate": 2.5741112554623834e-07, + "logits/chosen": -2.5507712364196777, + "logits/rejected": -2.513840675354004, + "logps/chosen": -123.3845443725586, + "logps/rejected": -243.87869262695312, + "loss": 0.3827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6389557123184204, + "rewards/margins": 1.8937108516693115, + "rewards/rejected": -2.5326666831970215, + "step": 1331 + }, + { + "epoch": 0.15, + "learning_rate": 2.5737569387032004e-07, + "logits/chosen": -2.1146020889282227, + "logits/rejected": -2.1643099784851074, + "logps/chosen": -392.3270568847656, + "logps/rejected": -263.0843811035156, + "loss": 0.4183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5789353847503662, + "rewards/margins": 1.3677681684494019, + "rewards/rejected": -1.946703553199768, + "step": 1332 + }, + { + "epoch": 0.16, + "learning_rate": 2.573402621944018e-07, + "logits/chosen": -2.618570327758789, + "logits/rejected": -2.6255764961242676, + "logps/chosen": -384.5768737792969, + "logps/rejected": -272.1915283203125, + "loss": 0.2364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2640588879585266, + "rewards/margins": 2.0706539154052734, + "rewards/rejected": -2.3347127437591553, + "step": 1333 + }, + { + "epoch": 0.16, + "learning_rate": 2.573048305184835e-07, + "logits/chosen": -2.331184148788452, + "logits/rejected": -2.4042835235595703, + "logps/chosen": -130.33792114257812, + "logps/rejected": -156.94134521484375, + "loss": 0.6061, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.297194242477417, + "rewards/margins": 1.6097898483276367, + "rewards/rejected": -2.9069840908050537, + "step": 1334 + }, + { + "epoch": 0.16, + "learning_rate": 2.5726939884256523e-07, + "logits/chosen": -2.280982732772827, + "logits/rejected": -2.605081558227539, + "logps/chosen": -256.3509826660156, + "logps/rejected": -113.83618927001953, + "loss": 1.7714, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7609096765518188, + "rewards/margins": -1.1171070337295532, + "rewards/rejected": -0.6438026428222656, + "step": 1335 + }, + { + "epoch": 0.16, + "learning_rate": 2.57233967166647e-07, + "logits/chosen": -3.0859014987945557, + "logits/rejected": -3.128732919692993, + "logps/chosen": -152.41734313964844, + "logps/rejected": -270.3587646484375, + "loss": 0.311, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10739638656377792, + "rewards/margins": 2.7830190658569336, + "rewards/rejected": -2.6756224632263184, + "step": 1336 + }, + { + "epoch": 0.16, + "learning_rate": 2.571985354907287e-07, + "logits/chosen": -2.878413200378418, + "logits/rejected": -2.913888454437256, + "logps/chosen": -148.760009765625, + "logps/rejected": -181.4589385986328, + "loss": 0.5932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9090589284896851, + "rewards/margins": 1.4192866086959839, + "rewards/rejected": -2.328345537185669, + "step": 1337 + }, + { + "epoch": 0.16, + "learning_rate": 2.571631038148104e-07, + "logits/chosen": -2.5668108463287354, + "logits/rejected": -2.4721970558166504, + "logps/chosen": -313.0288391113281, + "logps/rejected": -310.969970703125, + "loss": 0.3041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5441226959228516, + "rewards/margins": 1.5696065425872803, + "rewards/rejected": -2.113729238510132, + "step": 1338 + }, + { + "epoch": 0.16, + "learning_rate": 2.571276721388922e-07, + "logits/chosen": -2.1148698329925537, + "logits/rejected": -2.149202346801758, + "logps/chosen": -262.81231689453125, + "logps/rejected": -295.19342041015625, + "loss": 0.4222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39852988719940186, + "rewards/margins": 2.0969290733337402, + "rewards/rejected": -2.4954588413238525, + "step": 1339 + }, + { + "epoch": 0.16, + "learning_rate": 2.570922404629739e-07, + "logits/chosen": -2.5729949474334717, + "logits/rejected": -2.4704928398132324, + "logps/chosen": -156.22219848632812, + "logps/rejected": -180.4857177734375, + "loss": 0.5178, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6211029291152954, + "rewards/margins": 1.0970302820205688, + "rewards/rejected": -1.7181332111358643, + "step": 1340 + }, + { + "epoch": 0.16, + "learning_rate": 2.570568087870556e-07, + "logits/chosen": -2.2308919429779053, + "logits/rejected": -2.254133462905884, + "logps/chosen": -496.53070068359375, + "logps/rejected": -366.00787353515625, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09063857793807983, + "rewards/margins": 1.8412785530090332, + "rewards/rejected": -1.9319171905517578, + "step": 1341 + }, + { + "epoch": 0.16, + "learning_rate": 2.5702137711113737e-07, + "logits/chosen": -1.6075242757797241, + "logits/rejected": -1.7619737386703491, + "logps/chosen": -291.399658203125, + "logps/rejected": -225.1926727294922, + "loss": 0.9244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7909039258956909, + "rewards/margins": 0.2895992696285248, + "rewards/rejected": -1.080503225326538, + "step": 1342 + }, + { + "epoch": 0.16, + "learning_rate": 2.5698594543521906e-07, + "logits/chosen": -1.9972341060638428, + "logits/rejected": -1.772900938987732, + "logps/chosen": -270.8129577636719, + "logps/rejected": -346.89361572265625, + "loss": 0.2179, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2643498182296753, + "rewards/margins": 2.4869964122772217, + "rewards/rejected": -3.7513463497161865, + "step": 1343 + }, + { + "epoch": 0.16, + "learning_rate": 2.569505137593008e-07, + "logits/chosen": -1.8463990688323975, + "logits/rejected": -1.9862339496612549, + "logps/chosen": -443.3493957519531, + "logps/rejected": -355.87371826171875, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15197624266147614, + "rewards/margins": 1.3466036319732666, + "rewards/rejected": -1.498579978942871, + "step": 1344 + }, + { + "epoch": 0.16, + "learning_rate": 2.569150820833825e-07, + "logits/chosen": -2.4043538570404053, + "logits/rejected": -2.223766803741455, + "logps/chosen": -198.34397888183594, + "logps/rejected": -279.2518310546875, + "loss": 0.4058, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48655498027801514, + "rewards/margins": 1.6732103824615479, + "rewards/rejected": -2.1597654819488525, + "step": 1345 + }, + { + "epoch": 0.16, + "learning_rate": 2.5687965040746426e-07, + "logits/chosen": -2.5667433738708496, + "logits/rejected": -2.280487060546875, + "logps/chosen": -142.64883422851562, + "logps/rejected": -205.41995239257812, + "loss": 0.2952, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28380006551742554, + "rewards/margins": 2.515974521636963, + "rewards/rejected": -2.799774408340454, + "step": 1346 + }, + { + "epoch": 0.16, + "learning_rate": 2.56844218731546e-07, + "logits/chosen": -2.4165611267089844, + "logits/rejected": -2.3871965408325195, + "logps/chosen": -193.16836547851562, + "logps/rejected": -202.58477783203125, + "loss": 0.7506, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1418840885162354, + "rewards/margins": 1.4332711696624756, + "rewards/rejected": -2.57515549659729, + "step": 1347 + }, + { + "epoch": 0.16, + "learning_rate": 2.568087870556277e-07, + "logits/chosen": -2.6768314838409424, + "logits/rejected": -2.7175498008728027, + "logps/chosen": -254.9682159423828, + "logps/rejected": -220.0244903564453, + "loss": 0.4285, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03170609474182129, + "rewards/margins": 1.2717669010162354, + "rewards/rejected": -1.3034729957580566, + "step": 1348 + }, + { + "epoch": 0.16, + "learning_rate": 2.5677335537970945e-07, + "logits/chosen": -2.4211204051971436, + "logits/rejected": -2.2839698791503906, + "logps/chosen": -142.7660675048828, + "logps/rejected": -234.14300537109375, + "loss": 0.6832, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0588620901107788, + "rewards/margins": 1.2528676986694336, + "rewards/rejected": -2.311729669570923, + "step": 1349 + }, + { + "epoch": 0.16, + "learning_rate": 2.567379237037912e-07, + "logits/chosen": -2.4518966674804688, + "logits/rejected": -2.2803406715393066, + "logps/chosen": -114.3258056640625, + "logps/rejected": -185.38467407226562, + "loss": 0.5015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4380975663661957, + "rewards/margins": 2.8100509643554688, + "rewards/rejected": -3.2481489181518555, + "step": 1350 + }, + { + "epoch": 0.16, + "learning_rate": 2.5670249202787295e-07, + "logits/chosen": -2.5462796688079834, + "logits/rejected": -2.256603956222534, + "logps/chosen": -180.09469604492188, + "logps/rejected": -230.59991455078125, + "loss": 0.615, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2468372881412506, + "rewards/margins": 0.9785282015800476, + "rewards/rejected": -1.2253655195236206, + "step": 1351 + }, + { + "epoch": 0.16, + "learning_rate": 2.5666706035195464e-07, + "logits/chosen": -2.766403913497925, + "logits/rejected": -2.698084592819214, + "logps/chosen": -286.5577697753906, + "logps/rejected": -217.39964294433594, + "loss": 0.3568, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6951740384101868, + "rewards/margins": 1.902270793914795, + "rewards/rejected": -2.597445011138916, + "step": 1352 + }, + { + "epoch": 0.16, + "learning_rate": 2.566316286760364e-07, + "logits/chosen": -2.8344321250915527, + "logits/rejected": -2.8785691261291504, + "logps/chosen": -302.2632141113281, + "logps/rejected": -224.1578826904297, + "loss": 0.5536, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9144960641860962, + "rewards/margins": 1.2336583137512207, + "rewards/rejected": -2.1481542587280273, + "step": 1353 + }, + { + "epoch": 0.16, + "learning_rate": 2.565961970001181e-07, + "logits/chosen": -2.5811986923217773, + "logits/rejected": -2.459149122238159, + "logps/chosen": -111.94300079345703, + "logps/rejected": -245.4078369140625, + "loss": 0.3764, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3571208715438843, + "rewards/margins": 2.2172932624816895, + "rewards/rejected": -2.574413776397705, + "step": 1354 + }, + { + "epoch": 0.16, + "learning_rate": 2.5656076532419983e-07, + "logits/chosen": -1.9355041980743408, + "logits/rejected": -1.6608161926269531, + "logps/chosen": -307.44268798828125, + "logps/rejected": -341.14312744140625, + "loss": 0.5796, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5280605554580688, + "rewards/margins": 1.0630884170532227, + "rewards/rejected": -1.591149091720581, + "step": 1355 + }, + { + "epoch": 0.16, + "learning_rate": 2.5652533364828153e-07, + "logits/chosen": -2.4808425903320312, + "logits/rejected": -2.803968906402588, + "logps/chosen": -120.89959716796875, + "logps/rejected": -171.71539306640625, + "loss": 0.6546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8607321977615356, + "rewards/margins": 1.451585292816162, + "rewards/rejected": -2.3123176097869873, + "step": 1356 + }, + { + "epoch": 0.16, + "learning_rate": 2.564899019723633e-07, + "logits/chosen": -2.4859161376953125, + "logits/rejected": -2.514075756072998, + "logps/chosen": -244.73773193359375, + "logps/rejected": -359.1509704589844, + "loss": 0.2803, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4761694073677063, + "rewards/margins": 3.351926565170288, + "rewards/rejected": -3.8280959129333496, + "step": 1357 + }, + { + "epoch": 0.16, + "learning_rate": 2.5645447029644503e-07, + "logits/chosen": -2.4028098583221436, + "logits/rejected": -2.524381637573242, + "logps/chosen": -125.20744323730469, + "logps/rejected": -190.55715942382812, + "loss": 1.0148, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1335519552230835, + "rewards/margins": 1.6602842807769775, + "rewards/rejected": -2.7938363552093506, + "step": 1358 + }, + { + "epoch": 0.16, + "learning_rate": 2.564190386205267e-07, + "logits/chosen": -2.175311326980591, + "logits/rejected": -2.1788289546966553, + "logps/chosen": -295.306640625, + "logps/rejected": -220.8590087890625, + "loss": 0.8267, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1686832904815674, + "rewards/margins": 0.896936297416687, + "rewards/rejected": -2.065619707107544, + "step": 1359 + }, + { + "epoch": 0.16, + "learning_rate": 2.5638360694460847e-07, + "logits/chosen": -2.4242844581604004, + "logits/rejected": -2.4405691623687744, + "logps/chosen": -336.92950439453125, + "logps/rejected": -355.7394714355469, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1461772918701172, + "rewards/margins": 2.3708856105804443, + "rewards/rejected": -2.5170631408691406, + "step": 1360 + }, + { + "epoch": 0.16, + "learning_rate": 2.5634817526869017e-07, + "logits/chosen": -2.347684860229492, + "logits/rejected": -2.2065742015838623, + "logps/chosen": -234.31336975097656, + "logps/rejected": -455.9063720703125, + "loss": 0.3685, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42973679304122925, + "rewards/margins": 1.9545629024505615, + "rewards/rejected": -2.3842997550964355, + "step": 1361 + }, + { + "epoch": 0.16, + "learning_rate": 2.563127435927719e-07, + "logits/chosen": -2.2957115173339844, + "logits/rejected": -2.381695032119751, + "logps/chosen": -228.0542755126953, + "logps/rejected": -222.90185546875, + "loss": 0.8831, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.084787130355835, + "rewards/margins": 0.5302080512046814, + "rewards/rejected": -1.6149951219558716, + "step": 1362 + }, + { + "epoch": 0.16, + "learning_rate": 2.5627731191685366e-07, + "logits/chosen": -2.4535489082336426, + "logits/rejected": -2.6556990146636963, + "logps/chosen": -322.32257080078125, + "logps/rejected": -631.0873413085938, + "loss": 0.1926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15538160502910614, + "rewards/margins": 2.9914615154266357, + "rewards/rejected": -3.1468429565429688, + "step": 1363 + }, + { + "epoch": 0.16, + "learning_rate": 2.562418802409354e-07, + "logits/chosen": -1.9608724117279053, + "logits/rejected": -2.2341926097869873, + "logps/chosen": -288.7054748535156, + "logps/rejected": -326.9286804199219, + "loss": 0.5128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8637473583221436, + "rewards/margins": 1.4533658027648926, + "rewards/rejected": -2.317113161087036, + "step": 1364 + }, + { + "epoch": 0.16, + "learning_rate": 2.562064485650171e-07, + "logits/chosen": -2.2859160900115967, + "logits/rejected": -2.3826234340667725, + "logps/chosen": -401.4815368652344, + "logps/rejected": -371.4460144042969, + "loss": 0.4785, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10463443398475647, + "rewards/margins": 1.0209720134735107, + "rewards/rejected": -1.1256064176559448, + "step": 1365 + }, + { + "epoch": 0.16, + "learning_rate": 2.5617101688909886e-07, + "logits/chosen": -1.9445691108703613, + "logits/rejected": -2.1344969272613525, + "logps/chosen": -462.28594970703125, + "logps/rejected": -227.2145538330078, + "loss": 0.6172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6244882345199585, + "rewards/margins": 0.7379720211029053, + "rewards/rejected": -1.3624602556228638, + "step": 1366 + }, + { + "epoch": 0.16, + "learning_rate": 2.5613558521318055e-07, + "logits/chosen": -2.6682169437408447, + "logits/rejected": -2.3720436096191406, + "logps/chosen": -174.49417114257812, + "logps/rejected": -295.71661376953125, + "loss": 0.3073, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7137288451194763, + "rewards/margins": 1.3852548599243164, + "rewards/rejected": -2.0989837646484375, + "step": 1367 + }, + { + "epoch": 0.16, + "learning_rate": 2.561001535372623e-07, + "logits/chosen": -2.4442567825317383, + "logits/rejected": -2.2579214572906494, + "logps/chosen": -228.07669067382812, + "logps/rejected": -257.4449462890625, + "loss": 0.6508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5556291937828064, + "rewards/margins": 0.6714709997177124, + "rewards/rejected": -1.2271002531051636, + "step": 1368 + }, + { + "epoch": 0.16, + "learning_rate": 2.5606472186134405e-07, + "logits/chosen": -2.4356842041015625, + "logits/rejected": -2.390456438064575, + "logps/chosen": -157.12460327148438, + "logps/rejected": -259.62945556640625, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5258910655975342, + "rewards/margins": 2.343991994857788, + "rewards/rejected": -2.869882822036743, + "step": 1369 + }, + { + "epoch": 0.16, + "learning_rate": 2.5602929018542575e-07, + "logits/chosen": -2.737581729888916, + "logits/rejected": -2.7239394187927246, + "logps/chosen": -265.8567810058594, + "logps/rejected": -222.8844451904297, + "loss": 0.423, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5852282047271729, + "rewards/margins": 1.593953251838684, + "rewards/rejected": -2.1791815757751465, + "step": 1370 + }, + { + "epoch": 0.16, + "learning_rate": 2.559938585095075e-07, + "logits/chosen": -1.7767667770385742, + "logits/rejected": -1.64446222782135, + "logps/chosen": -462.2033996582031, + "logps/rejected": -507.33270263671875, + "loss": 0.6732, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0352072715759277, + "rewards/margins": 1.3890979290008545, + "rewards/rejected": -2.4243052005767822, + "step": 1371 + }, + { + "epoch": 0.16, + "learning_rate": 2.559584268335892e-07, + "logits/chosen": -2.5318613052368164, + "logits/rejected": -2.5394906997680664, + "logps/chosen": -81.87847900390625, + "logps/rejected": -237.6739501953125, + "loss": 0.2442, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32076162099838257, + "rewards/margins": 2.4318761825561523, + "rewards/rejected": -2.7526376247406006, + "step": 1372 + }, + { + "epoch": 0.16, + "learning_rate": 2.5592299515767094e-07, + "logits/chosen": -2.315117359161377, + "logits/rejected": -2.050536870956421, + "logps/chosen": -223.43609619140625, + "logps/rejected": -340.232177734375, + "loss": 0.2637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6744475364685059, + "rewards/margins": 3.0697433948516846, + "rewards/rejected": -3.7441906929016113, + "step": 1373 + }, + { + "epoch": 0.16, + "learning_rate": 2.558875634817527e-07, + "logits/chosen": -2.433814525604248, + "logits/rejected": -2.3043675422668457, + "logps/chosen": -524.0728149414062, + "logps/rejected": -313.24139404296875, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.876115083694458, + "rewards/margins": 0.9196082949638367, + "rewards/rejected": -1.79572331905365, + "step": 1374 + }, + { + "epoch": 0.16, + "learning_rate": 2.5585213180583444e-07, + "logits/chosen": -2.665214776992798, + "logits/rejected": -2.723144769668579, + "logps/chosen": -253.61248779296875, + "logps/rejected": -307.5253601074219, + "loss": 0.1694, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1106758117675781, + "rewards/margins": 3.1171908378601074, + "rewards/rejected": -4.2278666496276855, + "step": 1375 + }, + { + "epoch": 0.16, + "learning_rate": 2.5581670012991613e-07, + "logits/chosen": -2.1885123252868652, + "logits/rejected": -2.2223191261291504, + "logps/chosen": -265.1954650878906, + "logps/rejected": -218.85366821289062, + "loss": 0.3875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7820048332214355, + "rewards/margins": 1.2820836305618286, + "rewards/rejected": -2.0640883445739746, + "step": 1376 + }, + { + "epoch": 0.16, + "learning_rate": 2.557812684539979e-07, + "logits/chosen": -2.271019697189331, + "logits/rejected": -2.5251927375793457, + "logps/chosen": -300.3597106933594, + "logps/rejected": -215.67977905273438, + "loss": 0.4245, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6886817216873169, + "rewards/margins": 1.2243361473083496, + "rewards/rejected": -1.913017988204956, + "step": 1377 + }, + { + "epoch": 0.16, + "learning_rate": 2.557458367780796e-07, + "logits/chosen": -2.1268584728240967, + "logits/rejected": -2.065530776977539, + "logps/chosen": -338.9162292480469, + "logps/rejected": -451.73406982421875, + "loss": 0.2847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.807189404964447, + "rewards/margins": 2.9132394790649414, + "rewards/rejected": -3.720428943634033, + "step": 1378 + }, + { + "epoch": 0.16, + "learning_rate": 2.557104051021613e-07, + "logits/chosen": -2.2316670417785645, + "logits/rejected": -2.183450222015381, + "logps/chosen": -292.0419616699219, + "logps/rejected": -225.8133544921875, + "loss": 0.8372, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0222045183181763, + "rewards/margins": 0.6569688320159912, + "rewards/rejected": -1.679173231124878, + "step": 1379 + }, + { + "epoch": 0.16, + "learning_rate": 2.5567497342624307e-07, + "logits/chosen": -2.0154285430908203, + "logits/rejected": -2.4534904956817627, + "logps/chosen": -242.72067260742188, + "logps/rejected": -173.418212890625, + "loss": 0.6977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.788324236869812, + "rewards/margins": 0.409254252910614, + "rewards/rejected": -1.1975784301757812, + "step": 1380 + }, + { + "epoch": 0.16, + "learning_rate": 2.5563954175032477e-07, + "logits/chosen": -2.3626084327697754, + "logits/rejected": -2.285541534423828, + "logps/chosen": -314.45977783203125, + "logps/rejected": -246.85671997070312, + "loss": 0.3774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3344404101371765, + "rewards/margins": 1.343726396560669, + "rewards/rejected": -1.6781668663024902, + "step": 1381 + }, + { + "epoch": 0.16, + "learning_rate": 2.556041100744065e-07, + "logits/chosen": -2.135277509689331, + "logits/rejected": -1.9412298202514648, + "logps/chosen": -162.67222595214844, + "logps/rejected": -309.967529296875, + "loss": 0.2542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32310378551483154, + "rewards/margins": 1.8537085056304932, + "rewards/rejected": -2.176812171936035, + "step": 1382 + }, + { + "epoch": 0.16, + "learning_rate": 2.555686783984882e-07, + "logits/chosen": -1.503592610359192, + "logits/rejected": -2.3268470764160156, + "logps/chosen": -652.925537109375, + "logps/rejected": -299.5897216796875, + "loss": 0.8479, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2690744400024414, + "rewards/margins": 0.31684812903404236, + "rewards/rejected": -1.5859225988388062, + "step": 1383 + }, + { + "epoch": 0.16, + "learning_rate": 2.5553324672256996e-07, + "logits/chosen": -2.298218250274658, + "logits/rejected": -2.4614973068237305, + "logps/chosen": -410.8437805175781, + "logps/rejected": -306.4954528808594, + "loss": 0.4452, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4800405502319336, + "rewards/margins": 1.0028564929962158, + "rewards/rejected": -1.4828970432281494, + "step": 1384 + }, + { + "epoch": 0.16, + "learning_rate": 2.554978150466517e-07, + "logits/chosen": -2.329585552215576, + "logits/rejected": -2.800426483154297, + "logps/chosen": -309.0374755859375, + "logps/rejected": -173.8949737548828, + "loss": 0.2383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.263191819190979, + "rewards/margins": 1.628694772720337, + "rewards/rejected": -1.8918864727020264, + "step": 1385 + }, + { + "epoch": 0.16, + "learning_rate": 2.5546238337073346e-07, + "logits/chosen": -2.497685432434082, + "logits/rejected": -2.3363919258117676, + "logps/chosen": -319.3771667480469, + "logps/rejected": -323.34844970703125, + "loss": 0.2772, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31427067518234253, + "rewards/margins": 1.4474875926971436, + "rewards/rejected": -1.7617582082748413, + "step": 1386 + }, + { + "epoch": 0.16, + "learning_rate": 2.5542695169481515e-07, + "logits/chosen": -2.2611162662506104, + "logits/rejected": -2.261871814727783, + "logps/chosen": -208.4417266845703, + "logps/rejected": -241.4300994873047, + "loss": 0.5067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8845191597938538, + "rewards/margins": 1.5092263221740723, + "rewards/rejected": -2.3937456607818604, + "step": 1387 + }, + { + "epoch": 0.16, + "learning_rate": 2.553915200188969e-07, + "logits/chosen": -2.4850268363952637, + "logits/rejected": -2.5756609439849854, + "logps/chosen": -372.37286376953125, + "logps/rejected": -369.28857421875, + "loss": 0.4674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35601887106895447, + "rewards/margins": 2.714913845062256, + "rewards/rejected": -3.0709328651428223, + "step": 1388 + }, + { + "epoch": 0.16, + "learning_rate": 2.553560883429786e-07, + "logits/chosen": -2.078963279724121, + "logits/rejected": -2.195012092590332, + "logps/chosen": -173.0458221435547, + "logps/rejected": -174.69119262695312, + "loss": 0.9262, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8130710124969482, + "rewards/margins": 0.16199657320976257, + "rewards/rejected": -1.9750676155090332, + "step": 1389 + }, + { + "epoch": 0.16, + "learning_rate": 2.5532065666706035e-07, + "logits/chosen": -2.1735665798187256, + "logits/rejected": -2.3815298080444336, + "logps/chosen": -209.03826904296875, + "logps/rejected": -237.97760009765625, + "loss": 0.46, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9716988801956177, + "rewards/margins": 1.7051472663879395, + "rewards/rejected": -2.6768462657928467, + "step": 1390 + }, + { + "epoch": 0.16, + "learning_rate": 2.5528522499114204e-07, + "logits/chosen": -2.5698699951171875, + "logits/rejected": -2.5621001720428467, + "logps/chosen": -380.8415832519531, + "logps/rejected": -294.79547119140625, + "loss": 0.1941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4688035845756531, + "rewards/margins": 2.826749563217163, + "rewards/rejected": -3.29555344581604, + "step": 1391 + }, + { + "epoch": 0.16, + "learning_rate": 2.552497933152238e-07, + "logits/chosen": -2.318331718444824, + "logits/rejected": -2.4113733768463135, + "logps/chosen": -393.7342529296875, + "logps/rejected": -246.3109130859375, + "loss": 0.6815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8531315922737122, + "rewards/margins": 0.5590286254882812, + "rewards/rejected": -1.4121601581573486, + "step": 1392 + }, + { + "epoch": 0.16, + "learning_rate": 2.5521436163930554e-07, + "logits/chosen": -2.1233081817626953, + "logits/rejected": -2.1238319873809814, + "logps/chosen": -241.42254638671875, + "logps/rejected": -193.67308044433594, + "loss": 0.4516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4255526661872864, + "rewards/margins": 1.3450242280960083, + "rewards/rejected": -1.7705767154693604, + "step": 1393 + }, + { + "epoch": 0.16, + "learning_rate": 2.5517892996338723e-07, + "logits/chosen": -2.2593624591827393, + "logits/rejected": -1.9347436428070068, + "logps/chosen": -197.30935668945312, + "logps/rejected": -351.3586730957031, + "loss": 0.092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7607911825180054, + "rewards/margins": 3.3266098499298096, + "rewards/rejected": -4.087401390075684, + "step": 1394 + }, + { + "epoch": 0.16, + "learning_rate": 2.55143498287469e-07, + "logits/chosen": -2.233491897583008, + "logits/rejected": -2.3034064769744873, + "logps/chosen": -281.9127502441406, + "logps/rejected": -178.6834259033203, + "loss": 0.1675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8219869136810303, + "rewards/margins": 2.246628999710083, + "rewards/rejected": -3.0686159133911133, + "step": 1395 + }, + { + "epoch": 0.16, + "learning_rate": 2.551080666115507e-07, + "logits/chosen": -2.4315638542175293, + "logits/rejected": -2.3997724056243896, + "logps/chosen": -328.5556945800781, + "logps/rejected": -253.26925659179688, + "loss": 0.3248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6127278208732605, + "rewards/margins": 1.7955207824707031, + "rewards/rejected": -2.4082484245300293, + "step": 1396 + }, + { + "epoch": 0.16, + "learning_rate": 2.550726349356325e-07, + "logits/chosen": -2.0050408840179443, + "logits/rejected": -2.0629847049713135, + "logps/chosen": -300.78106689453125, + "logps/rejected": -335.582763671875, + "loss": 0.8176, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9264078140258789, + "rewards/margins": 0.7214505672454834, + "rewards/rejected": -1.6478583812713623, + "step": 1397 + }, + { + "epoch": 0.16, + "learning_rate": 2.550372032597142e-07, + "logits/chosen": -2.550112009048462, + "logits/rejected": -2.543943405151367, + "logps/chosen": -160.225341796875, + "logps/rejected": -161.807861328125, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8151851892471313, + "rewards/margins": 0.847303032875061, + "rewards/rejected": -1.662488341331482, + "step": 1398 + }, + { + "epoch": 0.16, + "learning_rate": 2.550017715837959e-07, + "logits/chosen": -2.269137382507324, + "logits/rejected": -2.0593338012695312, + "logps/chosen": -243.57907104492188, + "logps/rejected": -275.1543884277344, + "loss": 0.3182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2068558782339096, + "rewards/margins": 1.350659728050232, + "rewards/rejected": -1.5575157403945923, + "step": 1399 + }, + { + "epoch": 0.16, + "learning_rate": 2.549663399078776e-07, + "logits/chosen": -2.265998363494873, + "logits/rejected": -1.954296350479126, + "logps/chosen": -226.85997009277344, + "logps/rejected": -283.2669372558594, + "loss": 0.5612, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4435381591320038, + "rewards/margins": 1.1067390441894531, + "rewards/rejected": -1.5502773523330688, + "step": 1400 + }, + { + "epoch": 0.16, + "learning_rate": 2.5493090823195937e-07, + "logits/chosen": -2.5113468170166016, + "logits/rejected": -2.5954315662384033, + "logps/chosen": -163.50094604492188, + "logps/rejected": -206.63861083984375, + "loss": 0.3264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5241639614105225, + "rewards/margins": 1.8429217338562012, + "rewards/rejected": -2.3670856952667236, + "step": 1401 + }, + { + "epoch": 0.16, + "learning_rate": 2.5489547655604106e-07, + "logits/chosen": -2.233236312866211, + "logits/rejected": -2.124704122543335, + "logps/chosen": -240.0445556640625, + "logps/rejected": -384.36895751953125, + "loss": 0.9748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6617593169212341, + "rewards/margins": 0.6771878004074097, + "rewards/rejected": -1.338947057723999, + "step": 1402 + }, + { + "epoch": 0.16, + "learning_rate": 2.548600448801228e-07, + "logits/chosen": -2.21439266204834, + "logits/rejected": -2.3331658840179443, + "logps/chosen": -161.24575805664062, + "logps/rejected": -209.04949951171875, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021859414875507355, + "rewards/margins": 2.559685468673706, + "rewards/rejected": -2.581544876098633, + "step": 1403 + }, + { + "epoch": 0.16, + "learning_rate": 2.5482461320420456e-07, + "logits/chosen": -2.147665023803711, + "logits/rejected": -2.1107535362243652, + "logps/chosen": -279.0518493652344, + "logps/rejected": -419.6611022949219, + "loss": 0.3079, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8689256906509399, + "rewards/margins": 3.273087978363037, + "rewards/rejected": -4.1420135498046875, + "step": 1404 + }, + { + "epoch": 0.16, + "learning_rate": 2.5478918152828626e-07, + "logits/chosen": -2.8913490772247314, + "logits/rejected": -3.001032829284668, + "logps/chosen": -290.12628173828125, + "logps/rejected": -221.062255859375, + "loss": 0.5532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.955291748046875, + "rewards/margins": 0.7878057956695557, + "rewards/rejected": -1.7430974245071411, + "step": 1405 + }, + { + "epoch": 0.16, + "learning_rate": 2.54753749852368e-07, + "logits/chosen": -2.6902358531951904, + "logits/rejected": -2.475578784942627, + "logps/chosen": -186.46896362304688, + "logps/rejected": -219.74148559570312, + "loss": 0.4485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6496744751930237, + "rewards/margins": 1.7185355424880981, + "rewards/rejected": -2.3682098388671875, + "step": 1406 + }, + { + "epoch": 0.16, + "learning_rate": 2.547183181764497e-07, + "logits/chosen": -1.928343653678894, + "logits/rejected": -2.0593793392181396, + "logps/chosen": -121.61155700683594, + "logps/rejected": -112.32550048828125, + "loss": 0.5566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.062213234603405, + "rewards/margins": 0.7572500109672546, + "rewards/rejected": -0.819463312625885, + "step": 1407 + }, + { + "epoch": 0.16, + "learning_rate": 2.546828865005315e-07, + "logits/chosen": -2.183595895767212, + "logits/rejected": -2.0249199867248535, + "logps/chosen": -250.4213104248047, + "logps/rejected": -244.65008544921875, + "loss": 0.5559, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.117912769317627, + "rewards/margins": 1.2221598625183105, + "rewards/rejected": -2.3400726318359375, + "step": 1408 + }, + { + "epoch": 0.16, + "learning_rate": 2.546474548246132e-07, + "logits/chosen": -2.457787036895752, + "logits/rejected": -2.091348648071289, + "logps/chosen": -158.55740356445312, + "logps/rejected": -228.960205078125, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8764073252677917, + "rewards/margins": 1.4063568115234375, + "rewards/rejected": -2.282764196395874, + "step": 1409 + }, + { + "epoch": 0.16, + "learning_rate": 2.5461202314869495e-07, + "logits/chosen": -2.66800856590271, + "logits/rejected": -2.6126551628112793, + "logps/chosen": -503.172119140625, + "logps/rejected": -299.84820556640625, + "loss": 0.2342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5421231985092163, + "rewards/margins": 1.9351019859313965, + "rewards/rejected": -2.4772253036499023, + "step": 1410 + }, + { + "epoch": 0.16, + "learning_rate": 2.5457659147277664e-07, + "logits/chosen": -2.4586262702941895, + "logits/rejected": -2.377124309539795, + "logps/chosen": -335.5270690917969, + "logps/rejected": -225.5157470703125, + "loss": 0.401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12433779239654541, + "rewards/margins": 1.3591395616531372, + "rewards/rejected": -1.483477234840393, + "step": 1411 + }, + { + "epoch": 0.16, + "learning_rate": 2.545411597968584e-07, + "logits/chosen": -2.5692687034606934, + "logits/rejected": -2.4680094718933105, + "logps/chosen": -211.6266632080078, + "logps/rejected": -270.94122314453125, + "loss": 0.2269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47314298152923584, + "rewards/margins": 2.4017884731292725, + "rewards/rejected": -2.874931573867798, + "step": 1412 + }, + { + "epoch": 0.16, + "learning_rate": 2.545057281209401e-07, + "logits/chosen": -1.3163652420043945, + "logits/rejected": -1.7163945436477661, + "logps/chosen": -356.2242431640625, + "logps/rejected": -352.26458740234375, + "loss": 0.532, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4944991171360016, + "rewards/margins": 1.0069080591201782, + "rewards/rejected": -1.501407265663147, + "step": 1413 + }, + { + "epoch": 0.16, + "learning_rate": 2.5447029644502184e-07, + "logits/chosen": -2.384418487548828, + "logits/rejected": -2.6034252643585205, + "logps/chosen": -452.16510009765625, + "logps/rejected": -348.62579345703125, + "loss": 0.5851, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4056226015090942, + "rewards/margins": 1.7280495166778564, + "rewards/rejected": -3.1336722373962402, + "step": 1414 + }, + { + "epoch": 0.16, + "learning_rate": 2.544348647691036e-07, + "logits/chosen": -2.114053726196289, + "logits/rejected": -2.2562427520751953, + "logps/chosen": -383.5710144042969, + "logps/rejected": -190.02723693847656, + "loss": 0.6396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8270288109779358, + "rewards/margins": 0.8645678162574768, + "rewards/rejected": -1.6915966272354126, + "step": 1415 + }, + { + "epoch": 0.16, + "learning_rate": 2.543994330931853e-07, + "logits/chosen": -2.4630913734436035, + "logits/rejected": -2.5685012340545654, + "logps/chosen": -163.06808471679688, + "logps/rejected": -181.20407104492188, + "loss": 0.3612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3045794665813446, + "rewards/margins": 1.7313039302825928, + "rewards/rejected": -2.0358831882476807, + "step": 1416 + }, + { + "epoch": 0.16, + "learning_rate": 2.5436400141726703e-07, + "logits/chosen": -1.9290547370910645, + "logits/rejected": -2.1793529987335205, + "logps/chosen": -268.989501953125, + "logps/rejected": -308.44696044921875, + "loss": 0.2541, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0143487453460693, + "rewards/margins": 2.9500749111175537, + "rewards/rejected": -3.964423656463623, + "step": 1417 + }, + { + "epoch": 0.16, + "learning_rate": 2.543285697413487e-07, + "logits/chosen": -2.6333277225494385, + "logits/rejected": -2.6915454864501953, + "logps/chosen": -435.1848449707031, + "logps/rejected": -287.0091857910156, + "loss": 0.5823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8731905221939087, + "rewards/margins": 1.3949429988861084, + "rewards/rejected": -2.2681336402893066, + "step": 1418 + }, + { + "epoch": 0.17, + "learning_rate": 2.5429313806543047e-07, + "logits/chosen": -2.6009397506713867, + "logits/rejected": -2.6713075637817383, + "logps/chosen": -233.719482421875, + "logps/rejected": -265.51361083984375, + "loss": 0.4733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19008395075798035, + "rewards/margins": 1.0172450542449951, + "rewards/rejected": -1.2073290348052979, + "step": 1419 + }, + { + "epoch": 0.17, + "learning_rate": 2.542577063895122e-07, + "logits/chosen": -2.3597967624664307, + "logits/rejected": -2.5746517181396484, + "logps/chosen": -368.05743408203125, + "logps/rejected": -260.9460754394531, + "loss": 0.5387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7789199352264404, + "rewards/margins": 1.4355340003967285, + "rewards/rejected": -2.214453935623169, + "step": 1420 + }, + { + "epoch": 0.17, + "learning_rate": 2.5422227471359397e-07, + "logits/chosen": -2.4228570461273193, + "logits/rejected": -2.019545555114746, + "logps/chosen": -246.39735412597656, + "logps/rejected": -329.96337890625, + "loss": 1.0397, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6318446397781372, + "rewards/margins": 0.029366105794906616, + "rewards/rejected": -0.6612107753753662, + "step": 1421 + }, + { + "epoch": 0.17, + "learning_rate": 2.5418684303767567e-07, + "logits/chosen": -2.503176689147949, + "logits/rejected": -2.5368566513061523, + "logps/chosen": -247.5775604248047, + "logps/rejected": -186.21414184570312, + "loss": 0.8579, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9218241572380066, + "rewards/margins": 0.4099642336368561, + "rewards/rejected": -1.3317883014678955, + "step": 1422 + }, + { + "epoch": 0.17, + "learning_rate": 2.541514113617574e-07, + "logits/chosen": -2.4616856575012207, + "logits/rejected": -2.6306047439575195, + "logps/chosen": -217.22451782226562, + "logps/rejected": -239.80418395996094, + "loss": 0.6277, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1337361335754395, + "rewards/margins": 2.218123435974121, + "rewards/rejected": -3.3518595695495605, + "step": 1423 + }, + { + "epoch": 0.17, + "learning_rate": 2.541159796858391e-07, + "logits/chosen": -2.4855356216430664, + "logits/rejected": -2.711442232131958, + "logps/chosen": -219.52381896972656, + "logps/rejected": -214.70159912109375, + "loss": 0.7115, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8022868037223816, + "rewards/margins": 0.6162475347518921, + "rewards/rejected": -1.4185343980789185, + "step": 1424 + }, + { + "epoch": 0.17, + "learning_rate": 2.5408054800992086e-07, + "logits/chosen": -2.5001630783081055, + "logits/rejected": -2.267730712890625, + "logps/chosen": -158.97799682617188, + "logps/rejected": -205.411376953125, + "loss": 0.4727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6244603395462036, + "rewards/margins": 0.6764624118804932, + "rewards/rejected": -1.3009227514266968, + "step": 1425 + }, + { + "epoch": 0.17, + "learning_rate": 2.540451163340026e-07, + "logits/chosen": -1.657132863998413, + "logits/rejected": -1.9204819202423096, + "logps/chosen": -443.85302734375, + "logps/rejected": -338.977783203125, + "loss": 1.0436, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3679190874099731, + "rewards/margins": -0.36289939284324646, + "rewards/rejected": -1.0050196647644043, + "step": 1426 + }, + { + "epoch": 0.17, + "learning_rate": 2.540096846580843e-07, + "logits/chosen": -2.427781105041504, + "logits/rejected": -2.5696349143981934, + "logps/chosen": -234.84890747070312, + "logps/rejected": -251.01605224609375, + "loss": 0.626, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1094026565551758, + "rewards/margins": 0.5414320230484009, + "rewards/rejected": -1.6508347988128662, + "step": 1427 + }, + { + "epoch": 0.17, + "learning_rate": 2.5397425298216605e-07, + "logits/chosen": -2.6352429389953613, + "logits/rejected": -2.749058485031128, + "logps/chosen": -361.50592041015625, + "logps/rejected": -295.8741455078125, + "loss": 0.255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5807444453239441, + "rewards/margins": 1.774131417274475, + "rewards/rejected": -2.3548758029937744, + "step": 1428 + }, + { + "epoch": 0.17, + "learning_rate": 2.5393882130624775e-07, + "logits/chosen": -1.5335426330566406, + "logits/rejected": -1.8775932788848877, + "logps/chosen": -304.6595458984375, + "logps/rejected": -194.12515258789062, + "loss": 0.5354, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6694098711013794, + "rewards/margins": 0.6565424799919128, + "rewards/rejected": -1.3259522914886475, + "step": 1429 + }, + { + "epoch": 0.17, + "learning_rate": 2.539033896303295e-07, + "logits/chosen": -1.9526619911193848, + "logits/rejected": -2.251145362854004, + "logps/chosen": -351.6664123535156, + "logps/rejected": -272.32220458984375, + "loss": 0.7102, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2508903741836548, + "rewards/margins": 0.6008719801902771, + "rewards/rejected": -1.8517624139785767, + "step": 1430 + }, + { + "epoch": 0.17, + "learning_rate": 2.538679579544112e-07, + "logits/chosen": -2.1874873638153076, + "logits/rejected": -2.4208993911743164, + "logps/chosen": -213.03195190429688, + "logps/rejected": -164.6880645751953, + "loss": 0.5484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3417195677757263, + "rewards/margins": 0.7480848431587219, + "rewards/rejected": -1.0898044109344482, + "step": 1431 + }, + { + "epoch": 0.17, + "learning_rate": 2.53832526278493e-07, + "logits/chosen": -1.9024730920791626, + "logits/rejected": -1.9694797992706299, + "logps/chosen": -274.88433837890625, + "logps/rejected": -332.10333251953125, + "loss": 0.373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4101518392562866, + "rewards/margins": 1.4350073337554932, + "rewards/rejected": -1.8451590538024902, + "step": 1432 + }, + { + "epoch": 0.17, + "learning_rate": 2.537970946025747e-07, + "logits/chosen": -2.4270613193511963, + "logits/rejected": -2.2756803035736084, + "logps/chosen": -220.02890014648438, + "logps/rejected": -306.5950622558594, + "loss": 0.4172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4129566550254822, + "rewards/margins": 1.4227460622787476, + "rewards/rejected": -1.8357027769088745, + "step": 1433 + }, + { + "epoch": 0.17, + "learning_rate": 2.5376166292665644e-07, + "logits/chosen": -2.533468723297119, + "logits/rejected": -2.691944122314453, + "logps/chosen": -374.1702575683594, + "logps/rejected": -174.51168823242188, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19594290852546692, + "rewards/margins": 2.793304443359375, + "rewards/rejected": -2.5973610877990723, + "step": 1434 + }, + { + "epoch": 0.17, + "learning_rate": 2.5372623125073813e-07, + "logits/chosen": -1.8424973487854004, + "logits/rejected": -1.99131178855896, + "logps/chosen": -225.1488800048828, + "logps/rejected": -230.59222412109375, + "loss": 1.1071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6483787298202515, + "rewards/margins": -0.12687551975250244, + "rewards/rejected": -1.5215030908584595, + "step": 1435 + }, + { + "epoch": 0.17, + "learning_rate": 2.536907995748199e-07, + "logits/chosen": -2.661696195602417, + "logits/rejected": -2.8085148334503174, + "logps/chosen": -667.4241333007812, + "logps/rejected": -493.30181884765625, + "loss": 0.4306, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.160178303718567, + "rewards/margins": 2.886575222015381, + "rewards/rejected": -4.046753406524658, + "step": 1436 + }, + { + "epoch": 0.17, + "learning_rate": 2.5365536789890163e-07, + "logits/chosen": -2.760230302810669, + "logits/rejected": -2.5395493507385254, + "logps/chosen": -199.6373291015625, + "logps/rejected": -176.92825317382812, + "loss": 0.5366, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0865223407745361, + "rewards/margins": 2.866060256958008, + "rewards/rejected": -3.952582359313965, + "step": 1437 + }, + { + "epoch": 0.17, + "learning_rate": 2.536199362229833e-07, + "logits/chosen": -1.9585072994232178, + "logits/rejected": -1.838178038597107, + "logps/chosen": -330.27825927734375, + "logps/rejected": -347.9726867675781, + "loss": 0.4702, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7503769397735596, + "rewards/margins": 1.6042174100875854, + "rewards/rejected": -2.3545942306518555, + "step": 1438 + }, + { + "epoch": 0.17, + "learning_rate": 2.535845045470651e-07, + "logits/chosen": -2.203791856765747, + "logits/rejected": -2.369785785675049, + "logps/chosen": -383.0072021484375, + "logps/rejected": -364.023681640625, + "loss": 0.2097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5691184401512146, + "rewards/margins": 2.0655503273010254, + "rewards/rejected": -2.634669065475464, + "step": 1439 + }, + { + "epoch": 0.17, + "learning_rate": 2.5354907287114677e-07, + "logits/chosen": -2.1995997428894043, + "logits/rejected": -2.2223525047302246, + "logps/chosen": -219.06793212890625, + "logps/rejected": -272.837158203125, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.564666748046875, + "rewards/margins": 3.3644495010375977, + "rewards/rejected": -3.9291162490844727, + "step": 1440 + }, + { + "epoch": 0.17, + "learning_rate": 2.535136411952285e-07, + "logits/chosen": -2.4065871238708496, + "logits/rejected": -2.5182597637176514, + "logps/chosen": -404.03265380859375, + "logps/rejected": -325.8576354980469, + "loss": 0.4643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3538506031036377, + "rewards/margins": 0.7207666039466858, + "rewards/rejected": -1.0746171474456787, + "step": 1441 + }, + { + "epoch": 0.17, + "learning_rate": 2.534782095193102e-07, + "logits/chosen": -2.5117228031158447, + "logits/rejected": -2.1108460426330566, + "logps/chosen": -147.0909881591797, + "logps/rejected": -186.81729125976562, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9482824802398682, + "rewards/margins": 1.3818496465682983, + "rewards/rejected": -2.330132007598877, + "step": 1442 + }, + { + "epoch": 0.17, + "learning_rate": 2.53442777843392e-07, + "logits/chosen": -2.4570987224578857, + "logits/rejected": -2.3983781337738037, + "logps/chosen": -242.34097290039062, + "logps/rejected": -274.68206787109375, + "loss": 0.4763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.610363781452179, + "rewards/margins": 1.764404058456421, + "rewards/rejected": -2.374767780303955, + "step": 1443 + }, + { + "epoch": 0.17, + "learning_rate": 2.534073461674737e-07, + "logits/chosen": -2.4009642601013184, + "logits/rejected": -2.6849422454833984, + "logps/chosen": -567.9620361328125, + "logps/rejected": -240.4154510498047, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.547966480255127, + "rewards/margins": 2.2115731239318848, + "rewards/rejected": -2.7595396041870117, + "step": 1444 + }, + { + "epoch": 0.17, + "learning_rate": 2.5337191449155546e-07, + "logits/chosen": -2.2850019931793213, + "logits/rejected": -2.1390790939331055, + "logps/chosen": -220.06048583984375, + "logps/rejected": -336.79296875, + "loss": 0.3418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5401608347892761, + "rewards/margins": 1.63780677318573, + "rewards/rejected": -2.1779675483703613, + "step": 1445 + }, + { + "epoch": 0.17, + "learning_rate": 2.5333648281563716e-07, + "logits/chosen": -2.5842723846435547, + "logits/rejected": -2.622328519821167, + "logps/chosen": -227.978515625, + "logps/rejected": -337.231689453125, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08642464876174927, + "rewards/margins": 3.774301528930664, + "rewards/rejected": -3.8607263565063477, + "step": 1446 + }, + { + "epoch": 0.17, + "learning_rate": 2.533010511397189e-07, + "logits/chosen": -2.3159894943237305, + "logits/rejected": -2.339876174926758, + "logps/chosen": -424.4003601074219, + "logps/rejected": -246.01416015625, + "loss": 0.6247, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.890349268913269, + "rewards/margins": 0.9741147756576538, + "rewards/rejected": -2.864464044570923, + "step": 1447 + }, + { + "epoch": 0.17, + "learning_rate": 2.5326561946380065e-07, + "logits/chosen": -2.5215792655944824, + "logits/rejected": -2.537692070007324, + "logps/chosen": -143.82635498046875, + "logps/rejected": -120.31312561035156, + "loss": 0.7959, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.347123622894287, + "rewards/margins": 1.232596755027771, + "rewards/rejected": -2.5797204971313477, + "step": 1448 + }, + { + "epoch": 0.17, + "learning_rate": 2.5323018778788235e-07, + "logits/chosen": -2.9211254119873047, + "logits/rejected": -2.835559606552124, + "logps/chosen": -152.33050537109375, + "logps/rejected": -218.90386962890625, + "loss": 0.588, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2231467962265015, + "rewards/margins": 1.9937942028045654, + "rewards/rejected": -3.2169408798217773, + "step": 1449 + }, + { + "epoch": 0.17, + "learning_rate": 2.531947561119641e-07, + "logits/chosen": -2.105167865753174, + "logits/rejected": -2.3442540168762207, + "logps/chosen": -506.58294677734375, + "logps/rejected": -351.8052978515625, + "loss": 0.5419, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22831220924854279, + "rewards/margins": 1.669630765914917, + "rewards/rejected": -1.8979430198669434, + "step": 1450 + }, + { + "epoch": 0.17, + "learning_rate": 2.531593244360458e-07, + "logits/chosen": -2.279019594192505, + "logits/rejected": -2.3845438957214355, + "logps/chosen": -438.55230712890625, + "logps/rejected": -406.7987976074219, + "loss": 0.2837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5728898644447327, + "rewards/margins": 1.429671287536621, + "rewards/rejected": -2.002561092376709, + "step": 1451 + }, + { + "epoch": 0.17, + "learning_rate": 2.5312389276012754e-07, + "logits/chosen": -1.4532936811447144, + "logits/rejected": -1.802952527999878, + "logps/chosen": -473.18121337890625, + "logps/rejected": -335.4903564453125, + "loss": 0.6637, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6462604999542236, + "rewards/margins": 0.34779414534568787, + "rewards/rejected": -0.9940545558929443, + "step": 1452 + }, + { + "epoch": 0.17, + "learning_rate": 2.5308846108420924e-07, + "logits/chosen": -2.804143190383911, + "logits/rejected": -2.635232925415039, + "logps/chosen": -270.97552490234375, + "logps/rejected": -190.22747802734375, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1187671422958374, + "rewards/margins": 1.8212106227874756, + "rewards/rejected": -1.939977765083313, + "step": 1453 + }, + { + "epoch": 0.17, + "learning_rate": 2.53053029408291e-07, + "logits/chosen": -2.361576557159424, + "logits/rejected": -2.020791530609131, + "logps/chosen": -186.4981689453125, + "logps/rejected": -302.084716796875, + "loss": 0.5477, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.264258623123169, + "rewards/margins": 0.8958760499954224, + "rewards/rejected": -2.1601345539093018, + "step": 1454 + }, + { + "epoch": 0.17, + "learning_rate": 2.5301759773237273e-07, + "logits/chosen": -2.504761219024658, + "logits/rejected": -2.341108560562134, + "logps/chosen": -380.6441345214844, + "logps/rejected": -370.5119934082031, + "loss": 0.3151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.039496660232544, + "rewards/margins": 1.3657381534576416, + "rewards/rejected": -2.4052350521087646, + "step": 1455 + }, + { + "epoch": 0.17, + "learning_rate": 2.529821660564545e-07, + "logits/chosen": -2.1535794734954834, + "logits/rejected": -1.9789355993270874, + "logps/chosen": -170.31314086914062, + "logps/rejected": -259.0187683105469, + "loss": 0.6131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8147975206375122, + "rewards/margins": 0.7351446747779846, + "rewards/rejected": -1.5499422550201416, + "step": 1456 + }, + { + "epoch": 0.17, + "learning_rate": 2.529467343805362e-07, + "logits/chosen": -2.1239097118377686, + "logits/rejected": -2.1084089279174805, + "logps/chosen": -238.11167907714844, + "logps/rejected": -223.52252197265625, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40069636702537537, + "rewards/margins": 1.4288387298583984, + "rewards/rejected": -1.829534888267517, + "step": 1457 + }, + { + "epoch": 0.17, + "learning_rate": 2.5291130270461793e-07, + "logits/chosen": -2.118220090866089, + "logits/rejected": -2.2238495349884033, + "logps/chosen": -342.74456787109375, + "logps/rejected": -306.6408996582031, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6236253976821899, + "rewards/margins": 2.4561071395874023, + "rewards/rejected": -3.079732656478882, + "step": 1458 + }, + { + "epoch": 0.17, + "learning_rate": 2.528758710286997e-07, + "logits/chosen": -2.684354305267334, + "logits/rejected": -2.5362372398376465, + "logps/chosen": -349.3921203613281, + "logps/rejected": -269.8887939453125, + "loss": 0.2044, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28179481625556946, + "rewards/margins": 2.8368444442749023, + "rewards/rejected": -3.1186392307281494, + "step": 1459 + }, + { + "epoch": 0.17, + "learning_rate": 2.5284043935278137e-07, + "logits/chosen": -2.517568349838257, + "logits/rejected": -2.8301491737365723, + "logps/chosen": -182.41664123535156, + "logps/rejected": -193.70089721679688, + "loss": 0.3487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.511802613735199, + "rewards/margins": 1.7629618644714355, + "rewards/rejected": -2.2747645378112793, + "step": 1460 + }, + { + "epoch": 0.17, + "learning_rate": 2.528050076768631e-07, + "logits/chosen": -2.5903003215789795, + "logits/rejected": -2.3646419048309326, + "logps/chosen": -289.8793029785156, + "logps/rejected": -405.5638732910156, + "loss": 0.3027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6927700042724609, + "rewards/margins": 1.9571034908294678, + "rewards/rejected": -2.6498734951019287, + "step": 1461 + }, + { + "epoch": 0.17, + "learning_rate": 2.527695760009448e-07, + "logits/chosen": -2.493424415588379, + "logits/rejected": -2.4858362674713135, + "logps/chosen": -139.37237548828125, + "logps/rejected": -178.51715087890625, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.571618914604187, + "rewards/margins": 2.6915283203125, + "rewards/rejected": -2.1199095249176025, + "step": 1462 + }, + { + "epoch": 0.17, + "learning_rate": 2.5273414432502656e-07, + "logits/chosen": -2.5667154788970947, + "logits/rejected": -2.4331793785095215, + "logps/chosen": -407.3218994140625, + "logps/rejected": -395.99395751953125, + "loss": 0.536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4371188282966614, + "rewards/margins": 1.0734144449234009, + "rewards/rejected": -1.5105332136154175, + "step": 1463 + }, + { + "epoch": 0.17, + "learning_rate": 2.5269871264910826e-07, + "logits/chosen": -2.50931453704834, + "logits/rejected": -2.313822031021118, + "logps/chosen": -254.01461791992188, + "logps/rejected": -428.7002258300781, + "loss": 0.7307, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.560958981513977, + "rewards/margins": 0.4432069659233093, + "rewards/rejected": -1.0041660070419312, + "step": 1464 + }, + { + "epoch": 0.17, + "learning_rate": 2.5266328097319e-07, + "logits/chosen": -2.5206174850463867, + "logits/rejected": -2.7586684226989746, + "logps/chosen": -256.38397216796875, + "logps/rejected": -162.2035369873047, + "loss": 0.6075, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0403635501861572, + "rewards/margins": 0.5496081709861755, + "rewards/rejected": -1.5899717807769775, + "step": 1465 + }, + { + "epoch": 0.17, + "learning_rate": 2.5262784929727176e-07, + "logits/chosen": -2.9470343589782715, + "logits/rejected": -2.9561822414398193, + "logps/chosen": -140.767578125, + "logps/rejected": -216.20858764648438, + "loss": 0.5327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9156724810600281, + "rewards/margins": 0.7848986387252808, + "rewards/rejected": -1.7005711793899536, + "step": 1466 + }, + { + "epoch": 0.17, + "learning_rate": 2.525924176213535e-07, + "logits/chosen": -2.6634774208068848, + "logits/rejected": -2.582221269607544, + "logps/chosen": -126.55690002441406, + "logps/rejected": -262.790283203125, + "loss": 0.2676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44727328419685364, + "rewards/margins": 2.4207444190979004, + "rewards/rejected": -2.8680176734924316, + "step": 1467 + }, + { + "epoch": 0.17, + "learning_rate": 2.525569859454352e-07, + "logits/chosen": -1.8738160133361816, + "logits/rejected": -1.6901065111160278, + "logps/chosen": -353.93524169921875, + "logps/rejected": -404.9190368652344, + "loss": 0.462, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6788508892059326, + "rewards/margins": 2.2588632106781006, + "rewards/rejected": -2.937714099884033, + "step": 1468 + }, + { + "epoch": 0.17, + "learning_rate": 2.5252155426951695e-07, + "logits/chosen": -2.218775510787964, + "logits/rejected": -2.259687900543213, + "logps/chosen": -258.6424560546875, + "logps/rejected": -302.3805236816406, + "loss": 0.3018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23344296216964722, + "rewards/margins": 1.8039829730987549, + "rewards/rejected": -2.037425994873047, + "step": 1469 + }, + { + "epoch": 0.17, + "learning_rate": 2.5248612259359865e-07, + "logits/chosen": -2.7824223041534424, + "logits/rejected": -2.6774966716766357, + "logps/chosen": -451.36932373046875, + "logps/rejected": -418.739990234375, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04473362863063812, + "rewards/margins": 3.5978102684020996, + "rewards/rejected": -3.5530762672424316, + "step": 1470 + }, + { + "epoch": 0.17, + "learning_rate": 2.524506909176804e-07, + "logits/chosen": -2.01977801322937, + "logits/rejected": -1.8278520107269287, + "logps/chosen": -221.29095458984375, + "logps/rejected": -286.4285888671875, + "loss": 0.6296, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.226374864578247, + "rewards/margins": 0.9805516004562378, + "rewards/rejected": -2.2069263458251953, + "step": 1471 + }, + { + "epoch": 0.17, + "learning_rate": 2.5241525924176214e-07, + "logits/chosen": -2.4640140533447266, + "logits/rejected": -2.4418015480041504, + "logps/chosen": -175.91354370117188, + "logps/rejected": -89.90542602539062, + "loss": 0.6366, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4747963845729828, + "rewards/margins": 0.9292072057723999, + "rewards/rejected": -1.404003620147705, + "step": 1472 + }, + { + "epoch": 0.17, + "learning_rate": 2.5237982756584384e-07, + "logits/chosen": -2.6665611267089844, + "logits/rejected": -2.36313533782959, + "logps/chosen": -298.3760070800781, + "logps/rejected": -326.38592529296875, + "loss": 0.393, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9506881237030029, + "rewards/margins": 1.1170557737350464, + "rewards/rejected": -2.0677437782287598, + "step": 1473 + }, + { + "epoch": 0.17, + "learning_rate": 2.523443958899256e-07, + "logits/chosen": -2.282341480255127, + "logits/rejected": -2.2311012744903564, + "logps/chosen": -366.27227783203125, + "logps/rejected": -271.2642822265625, + "loss": 0.4887, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6854420304298401, + "rewards/margins": 1.458312749862671, + "rewards/rejected": -2.1437549591064453, + "step": 1474 + }, + { + "epoch": 0.17, + "learning_rate": 2.523089642140073e-07, + "logits/chosen": -2.5329833030700684, + "logits/rejected": -2.6229522228240967, + "logps/chosen": -196.85362243652344, + "logps/rejected": -234.38980102539062, + "loss": 0.5354, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2521753311157227, + "rewards/margins": 1.101252555847168, + "rewards/rejected": -2.3534281253814697, + "step": 1475 + }, + { + "epoch": 0.17, + "learning_rate": 2.5227353253808903e-07, + "logits/chosen": -2.8712286949157715, + "logits/rejected": -2.6965372562408447, + "logps/chosen": -265.0549011230469, + "logps/rejected": -357.2110290527344, + "loss": 0.438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6656264066696167, + "rewards/margins": 1.6585111618041992, + "rewards/rejected": -2.3241376876831055, + "step": 1476 + }, + { + "epoch": 0.17, + "learning_rate": 2.522381008621708e-07, + "logits/chosen": -2.3227853775024414, + "logits/rejected": -2.1487667560577393, + "logps/chosen": -374.4190979003906, + "logps/rejected": -251.34140014648438, + "loss": 0.4249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7997742891311646, + "rewards/margins": 1.7518787384033203, + "rewards/rejected": -2.5516531467437744, + "step": 1477 + }, + { + "epoch": 0.17, + "learning_rate": 2.5220266918625253e-07, + "logits/chosen": -2.3767035007476807, + "logits/rejected": -2.31461501121521, + "logps/chosen": -222.5201873779297, + "logps/rejected": -286.67578125, + "loss": 0.7692, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2054404020309448, + "rewards/margins": 1.1064964532852173, + "rewards/rejected": -2.311936855316162, + "step": 1478 + }, + { + "epoch": 0.17, + "learning_rate": 2.521672375103342e-07, + "logits/chosen": -2.448293685913086, + "logits/rejected": -2.4010252952575684, + "logps/chosen": -237.29776000976562, + "logps/rejected": -248.95242309570312, + "loss": 0.326, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8617949485778809, + "rewards/margins": 2.5980401039123535, + "rewards/rejected": -3.4598350524902344, + "step": 1479 + }, + { + "epoch": 0.17, + "learning_rate": 2.5213180583441597e-07, + "logits/chosen": -2.286609172821045, + "logits/rejected": -2.2503767013549805, + "logps/chosen": -252.99923706054688, + "logps/rejected": -294.22064208984375, + "loss": 0.3255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6527867913246155, + "rewards/margins": 1.88228440284729, + "rewards/rejected": -2.5350711345672607, + "step": 1480 + }, + { + "epoch": 0.17, + "learning_rate": 2.5209637415849767e-07, + "logits/chosen": -2.2258667945861816, + "logits/rejected": -2.4330830574035645, + "logps/chosen": -485.60833740234375, + "logps/rejected": -304.7433776855469, + "loss": 0.6107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4552658796310425, + "rewards/margins": 0.5030224323272705, + "rewards/rejected": -0.9582882523536682, + "step": 1481 + }, + { + "epoch": 0.17, + "learning_rate": 2.520609424825794e-07, + "logits/chosen": -1.648511528968811, + "logits/rejected": -1.666149377822876, + "logps/chosen": -280.4410095214844, + "logps/rejected": -294.32696533203125, + "loss": 0.4216, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2772990465164185, + "rewards/margins": 1.4583731889724731, + "rewards/rejected": -2.7356722354888916, + "step": 1482 + }, + { + "epoch": 0.17, + "learning_rate": 2.5202551080666117e-07, + "logits/chosen": -1.572220802307129, + "logits/rejected": -1.4331250190734863, + "logps/chosen": -342.74383544921875, + "logps/rejected": -325.2814025878906, + "loss": 0.6678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7219731211662292, + "rewards/margins": 1.875354290008545, + "rewards/rejected": -2.597327470779419, + "step": 1483 + }, + { + "epoch": 0.17, + "learning_rate": 2.5199007913074286e-07, + "logits/chosen": -2.4022998809814453, + "logits/rejected": -2.5381269454956055, + "logps/chosen": -249.94735717773438, + "logps/rejected": -174.20851135253906, + "loss": 0.6335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21491488814353943, + "rewards/margins": 0.3866581618785858, + "rewards/rejected": -0.6015730500221252, + "step": 1484 + }, + { + "epoch": 0.17, + "learning_rate": 2.519546474548246e-07, + "logits/chosen": -2.5062367916107178, + "logits/rejected": -2.5851614475250244, + "logps/chosen": -171.6474609375, + "logps/rejected": -153.98004150390625, + "loss": 0.6289, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.000792384147644, + "rewards/margins": 0.7175020575523376, + "rewards/rejected": -1.718294382095337, + "step": 1485 + }, + { + "epoch": 0.17, + "learning_rate": 2.519192157789063e-07, + "logits/chosen": -2.7694687843322754, + "logits/rejected": -2.8860514163970947, + "logps/chosen": -214.14553833007812, + "logps/rejected": -153.65623474121094, + "loss": 0.5742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9531052112579346, + "rewards/margins": 1.4404003620147705, + "rewards/rejected": -2.393505573272705, + "step": 1486 + }, + { + "epoch": 0.17, + "learning_rate": 2.5188378410298805e-07, + "logits/chosen": -2.143268585205078, + "logits/rejected": -2.3119587898254395, + "logps/chosen": -329.4967041015625, + "logps/rejected": -239.57135009765625, + "loss": 0.4642, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7891160249710083, + "rewards/margins": 1.988013744354248, + "rewards/rejected": -2.777129650115967, + "step": 1487 + }, + { + "epoch": 0.17, + "learning_rate": 2.518483524270698e-07, + "logits/chosen": -2.0523874759674072, + "logits/rejected": -2.098930597305298, + "logps/chosen": -361.4208984375, + "logps/rejected": -363.96978759765625, + "loss": 0.2568, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7395920753479004, + "rewards/margins": 2.080350160598755, + "rewards/rejected": -2.8199422359466553, + "step": 1488 + }, + { + "epoch": 0.17, + "learning_rate": 2.518129207511515e-07, + "logits/chosen": -2.91645884513855, + "logits/rejected": -2.985250949859619, + "logps/chosen": -123.6478271484375, + "logps/rejected": -186.11561584472656, + "loss": 0.5719, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.570517361164093, + "rewards/margins": 0.915500283241272, + "rewards/rejected": -1.4860177040100098, + "step": 1489 + }, + { + "epoch": 0.17, + "learning_rate": 2.5177748907523325e-07, + "logits/chosen": -2.481931209564209, + "logits/rejected": -2.4050991535186768, + "logps/chosen": -122.435302734375, + "logps/rejected": -148.4688720703125, + "loss": 1.0949, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.07850980758667, + "rewards/margins": 1.361609697341919, + "rewards/rejected": -2.440119504928589, + "step": 1490 + }, + { + "epoch": 0.17, + "learning_rate": 2.51742057399315e-07, + "logits/chosen": -2.098560333251953, + "logits/rejected": -1.9676381349563599, + "logps/chosen": -258.23199462890625, + "logps/rejected": -277.38873291015625, + "loss": 0.3262, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44052034616470337, + "rewards/margins": 1.443532943725586, + "rewards/rejected": -1.884053349494934, + "step": 1491 + }, + { + "epoch": 0.17, + "learning_rate": 2.517066257233967e-07, + "logits/chosen": -2.5159173011779785, + "logits/rejected": -2.380880832672119, + "logps/chosen": -375.6462097167969, + "logps/rejected": -298.427978515625, + "loss": 0.369, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1143500804901123, + "rewards/margins": 2.1153600215911865, + "rewards/rejected": -3.229710102081299, + "step": 1492 + }, + { + "epoch": 0.17, + "learning_rate": 2.5167119404747844e-07, + "logits/chosen": -2.467053174972534, + "logits/rejected": -2.1456637382507324, + "logps/chosen": -225.94448852539062, + "logps/rejected": -333.5818786621094, + "loss": 0.0667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23798520863056183, + "rewards/margins": 3.728442430496216, + "rewards/rejected": -3.966427803039551, + "step": 1493 + }, + { + "epoch": 0.17, + "learning_rate": 2.516357623715602e-07, + "logits/chosen": -2.2891793251037598, + "logits/rejected": -2.242398738861084, + "logps/chosen": -242.3644561767578, + "logps/rejected": -213.1519012451172, + "loss": 0.433, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5102680325508118, + "rewards/margins": 1.1294386386871338, + "rewards/rejected": -1.6397066116333008, + "step": 1494 + }, + { + "epoch": 0.17, + "learning_rate": 2.516003306956419e-07, + "logits/chosen": -2.814871311187744, + "logits/rejected": -2.8710761070251465, + "logps/chosen": -205.79241943359375, + "logps/rejected": -331.1839599609375, + "loss": 0.4704, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1354008913040161, + "rewards/margins": 1.0878069400787354, + "rewards/rejected": -2.223207950592041, + "step": 1495 + }, + { + "epoch": 0.17, + "learning_rate": 2.5156489901972363e-07, + "logits/chosen": -2.330258846282959, + "logits/rejected": -2.1349592208862305, + "logps/chosen": -176.95285034179688, + "logps/rejected": -260.0544128417969, + "loss": 0.264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3431151509284973, + "rewards/margins": 3.0848309993743896, + "rewards/rejected": -3.427946090698242, + "step": 1496 + }, + { + "epoch": 0.17, + "learning_rate": 2.5152946734380533e-07, + "logits/chosen": -2.4142892360687256, + "logits/rejected": -2.5818979740142822, + "logps/chosen": -245.11949157714844, + "logps/rejected": -266.0391540527344, + "loss": 0.4893, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3396854400634766, + "rewards/margins": 1.96656334400177, + "rewards/rejected": -3.306248664855957, + "step": 1497 + }, + { + "epoch": 0.17, + "learning_rate": 2.514940356678871e-07, + "logits/chosen": -2.2812745571136475, + "logits/rejected": -2.6983373165130615, + "logps/chosen": -488.57568359375, + "logps/rejected": -215.8365478515625, + "loss": 0.948, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9289575815200806, + "rewards/margins": 0.8433178663253784, + "rewards/rejected": -2.77227520942688, + "step": 1498 + }, + { + "epoch": 0.17, + "learning_rate": 2.5145860399196877e-07, + "logits/chosen": -2.6693925857543945, + "logits/rejected": -2.780869960784912, + "logps/chosen": -201.55117797851562, + "logps/rejected": -144.3550567626953, + "loss": 0.6541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8537842035293579, + "rewards/margins": 1.250685214996338, + "rewards/rejected": -2.1044692993164062, + "step": 1499 + }, + { + "epoch": 0.17, + "learning_rate": 2.514231723160505e-07, + "logits/chosen": -2.7454564571380615, + "logits/rejected": -2.447235107421875, + "logps/chosen": -166.00390625, + "logps/rejected": -311.5794372558594, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7722806930541992, + "rewards/margins": 3.235421657562256, + "rewards/rejected": -4.007702827453613, + "step": 1500 + }, + { + "epoch": 0.17, + "learning_rate": 2.5138774064013227e-07, + "logits/chosen": -2.5107481479644775, + "logits/rejected": -2.718550205230713, + "logps/chosen": -202.51046752929688, + "logps/rejected": -229.15499877929688, + "loss": 0.595, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7731063365936279, + "rewards/margins": 0.5997617840766907, + "rewards/rejected": -1.3728680610656738, + "step": 1501 + }, + { + "epoch": 0.17, + "learning_rate": 2.51352308964214e-07, + "logits/chosen": -1.7060024738311768, + "logits/rejected": -2.0919294357299805, + "logps/chosen": -407.52056884765625, + "logps/rejected": -325.4770812988281, + "loss": 0.5004, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06882903724908829, + "rewards/margins": 2.1670894622802734, + "rewards/rejected": -2.2359185218811035, + "step": 1502 + }, + { + "epoch": 0.17, + "learning_rate": 2.513168772882957e-07, + "logits/chosen": -2.310089588165283, + "logits/rejected": -2.7284066677093506, + "logps/chosen": -445.20355224609375, + "logps/rejected": -306.43377685546875, + "loss": 0.8819, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2190284729003906, + "rewards/margins": 1.0821205377578735, + "rewards/rejected": -2.3011488914489746, + "step": 1503 + }, + { + "epoch": 0.17, + "learning_rate": 2.5128144561237746e-07, + "logits/chosen": -2.676833152770996, + "logits/rejected": -2.618711471557617, + "logps/chosen": -323.45513916015625, + "logps/rejected": -291.5965576171875, + "loss": 0.5727, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5250710248947144, + "rewards/margins": 1.2448660135269165, + "rewards/rejected": -1.7699370384216309, + "step": 1504 + }, + { + "epoch": 0.18, + "learning_rate": 2.512460139364592e-07, + "logits/chosen": -2.694737434387207, + "logits/rejected": -2.798943519592285, + "logps/chosen": -247.27523803710938, + "logps/rejected": -242.6188201904297, + "loss": 0.605, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0828218460083008, + "rewards/margins": 0.8403846621513367, + "rewards/rejected": -1.9232065677642822, + "step": 1505 + }, + { + "epoch": 0.18, + "learning_rate": 2.512105822605409e-07, + "logits/chosen": -2.408123254776001, + "logits/rejected": -2.6716556549072266, + "logps/chosen": -303.4869384765625, + "logps/rejected": -263.4012145996094, + "loss": 0.1863, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1410737037658691, + "rewards/margins": 2.0144271850585938, + "rewards/rejected": -3.1555004119873047, + "step": 1506 + }, + { + "epoch": 0.18, + "learning_rate": 2.5117515058462266e-07, + "logits/chosen": -2.3236141204833984, + "logits/rejected": -2.5322561264038086, + "logps/chosen": -339.0780944824219, + "logps/rejected": -256.5987854003906, + "loss": 0.3905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7493884563446045, + "rewards/margins": 1.286633014678955, + "rewards/rejected": -2.0360214710235596, + "step": 1507 + }, + { + "epoch": 0.18, + "learning_rate": 2.5113971890870435e-07, + "logits/chosen": -2.2775211334228516, + "logits/rejected": -2.094860076904297, + "logps/chosen": -119.67781829833984, + "logps/rejected": -168.27798461914062, + "loss": 0.3962, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5066086053848267, + "rewards/margins": 1.8082300424575806, + "rewards/rejected": -3.3148386478424072, + "step": 1508 + }, + { + "epoch": 0.18, + "learning_rate": 2.511042872327861e-07, + "logits/chosen": -2.970726251602173, + "logits/rejected": -3.0316827297210693, + "logps/chosen": -165.9956817626953, + "logps/rejected": -187.49472045898438, + "loss": 0.5006, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6500924825668335, + "rewards/margins": 1.3673573732376099, + "rewards/rejected": -2.0174498558044434, + "step": 1509 + }, + { + "epoch": 0.18, + "learning_rate": 2.510688555568678e-07, + "logits/chosen": -2.691666603088379, + "logits/rejected": -2.638864517211914, + "logps/chosen": -272.77398681640625, + "logps/rejected": -287.48046875, + "loss": 0.5889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8160590529441833, + "rewards/margins": 0.33200401067733765, + "rewards/rejected": -1.148063063621521, + "step": 1510 + }, + { + "epoch": 0.18, + "learning_rate": 2.5103342388094954e-07, + "logits/chosen": -1.9297447204589844, + "logits/rejected": -2.050508737564087, + "logps/chosen": -412.77947998046875, + "logps/rejected": -310.130126953125, + "loss": 0.6256, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4392031729221344, + "rewards/margins": 0.4176883399486542, + "rewards/rejected": -0.8568915724754333, + "step": 1511 + }, + { + "epoch": 0.18, + "learning_rate": 2.509979922050313e-07, + "logits/chosen": -2.0909669399261475, + "logits/rejected": -2.3059537410736084, + "logps/chosen": -548.546142578125, + "logps/rejected": -271.6788024902344, + "loss": 0.3569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28837189078330994, + "rewards/margins": 1.3042964935302734, + "rewards/rejected": -1.5926682949066162, + "step": 1512 + }, + { + "epoch": 0.18, + "learning_rate": 2.5096256052911304e-07, + "logits/chosen": -2.0764822959899902, + "logits/rejected": -2.0307776927948, + "logps/chosen": -141.51707458496094, + "logps/rejected": -207.7806396484375, + "loss": 0.9553, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1588444709777832, + "rewards/margins": 1.1488786935806274, + "rewards/rejected": -2.3077232837677, + "step": 1513 + }, + { + "epoch": 0.18, + "learning_rate": 2.5092712885319474e-07, + "logits/chosen": -2.116191864013672, + "logits/rejected": -2.328605890274048, + "logps/chosen": -332.21551513671875, + "logps/rejected": -208.269287109375, + "loss": 0.6543, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.82952880859375, + "rewards/margins": 1.8702086210250854, + "rewards/rejected": -2.699737310409546, + "step": 1514 + }, + { + "epoch": 0.18, + "learning_rate": 2.508916971772765e-07, + "logits/chosen": -2.771699905395508, + "logits/rejected": -2.7069451808929443, + "logps/chosen": -148.96571350097656, + "logps/rejected": -241.12701416015625, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2643533945083618, + "rewards/margins": 2.9518089294433594, + "rewards/rejected": -3.2161622047424316, + "step": 1515 + }, + { + "epoch": 0.18, + "learning_rate": 2.5085626550135823e-07, + "logits/chosen": -2.319643974304199, + "logits/rejected": -2.509733200073242, + "logps/chosen": -299.4090881347656, + "logps/rejected": -195.2904510498047, + "loss": 0.6658, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5797232389450073, + "rewards/margins": 1.4602982997894287, + "rewards/rejected": -2.0400214195251465, + "step": 1516 + }, + { + "epoch": 0.18, + "learning_rate": 2.5082083382543993e-07, + "logits/chosen": -2.4056739807128906, + "logits/rejected": -2.429441452026367, + "logps/chosen": -193.17047119140625, + "logps/rejected": -396.156494140625, + "loss": 0.2096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11505136638879776, + "rewards/margins": 3.0897388458251953, + "rewards/rejected": -3.2047903537750244, + "step": 1517 + }, + { + "epoch": 0.18, + "learning_rate": 2.507854021495217e-07, + "logits/chosen": -2.002331018447876, + "logits/rejected": -2.284052848815918, + "logps/chosen": -229.08644104003906, + "logps/rejected": -248.72535705566406, + "loss": 0.7581, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1977550983428955, + "rewards/margins": 0.9308830499649048, + "rewards/rejected": -2.1286380290985107, + "step": 1518 + }, + { + "epoch": 0.18, + "learning_rate": 2.507499704736034e-07, + "logits/chosen": -2.3316404819488525, + "logits/rejected": -2.146022319793701, + "logps/chosen": -263.36114501953125, + "logps/rejected": -375.27020263671875, + "loss": 0.1333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30984193086624146, + "rewards/margins": 2.6746063232421875, + "rewards/rejected": -2.9844484329223633, + "step": 1519 + }, + { + "epoch": 0.18, + "learning_rate": 2.507145387976851e-07, + "logits/chosen": -2.1641502380371094, + "logits/rejected": -2.245091438293457, + "logps/chosen": -200.29647827148438, + "logps/rejected": -186.30877685546875, + "loss": 0.3509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.576932430267334, + "rewards/margins": 1.4339933395385742, + "rewards/rejected": -2.010925531387329, + "step": 1520 + }, + { + "epoch": 0.18, + "learning_rate": 2.506791071217668e-07, + "logits/chosen": -2.2197673320770264, + "logits/rejected": -2.269116163253784, + "logps/chosen": -240.0064239501953, + "logps/rejected": -172.71014404296875, + "loss": 1.5375, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.226947784423828, + "rewards/margins": 0.20662975311279297, + "rewards/rejected": -2.4335777759552, + "step": 1521 + }, + { + "epoch": 0.18, + "learning_rate": 2.5064367544584857e-07, + "logits/chosen": -1.9302241802215576, + "logits/rejected": -2.5047032833099365, + "logps/chosen": -417.939697265625, + "logps/rejected": -231.62835693359375, + "loss": 0.4084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9730070233345032, + "rewards/margins": 1.7290160655975342, + "rewards/rejected": -2.7020230293273926, + "step": 1522 + }, + { + "epoch": 0.18, + "learning_rate": 2.506082437699303e-07, + "logits/chosen": -1.79429292678833, + "logits/rejected": -2.072009801864624, + "logps/chosen": -503.75482177734375, + "logps/rejected": -350.647705078125, + "loss": 0.6102, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7235039472579956, + "rewards/margins": 0.7640112638473511, + "rewards/rejected": -2.4875152111053467, + "step": 1523 + }, + { + "epoch": 0.18, + "learning_rate": 2.50572812094012e-07, + "logits/chosen": -2.486649751663208, + "logits/rejected": -2.353630304336548, + "logps/chosen": -415.8222351074219, + "logps/rejected": -319.5169372558594, + "loss": 0.3025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.051511406898498535, + "rewards/margins": 1.9093295335769653, + "rewards/rejected": -1.9608409404754639, + "step": 1524 + }, + { + "epoch": 0.18, + "learning_rate": 2.5053738041809376e-07, + "logits/chosen": -1.9334419965744019, + "logits/rejected": -1.8891215324401855, + "logps/chosen": -238.2892608642578, + "logps/rejected": -355.1315002441406, + "loss": 0.2152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4996374845504761, + "rewards/margins": 2.9236738681793213, + "rewards/rejected": -3.423311233520508, + "step": 1525 + }, + { + "epoch": 0.18, + "learning_rate": 2.505019487421755e-07, + "logits/chosen": -1.782972812652588, + "logits/rejected": -2.0354576110839844, + "logps/chosen": -426.4249267578125, + "logps/rejected": -413.3320007324219, + "loss": 0.3341, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021389901638031006, + "rewards/margins": 1.850546956062317, + "rewards/rejected": -1.8719367980957031, + "step": 1526 + }, + { + "epoch": 0.18, + "learning_rate": 2.5046651706625726e-07, + "logits/chosen": -2.332895517349243, + "logits/rejected": -2.4367105960845947, + "logps/chosen": -272.3119812011719, + "logps/rejected": -290.2843933105469, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10114973783493042, + "rewards/margins": 2.8727073669433594, + "rewards/rejected": -2.771557569503784, + "step": 1527 + }, + { + "epoch": 0.18, + "learning_rate": 2.5043108539033895e-07, + "logits/chosen": -2.5286829471588135, + "logits/rejected": -2.5712099075317383, + "logps/chosen": -119.3431396484375, + "logps/rejected": -193.16403198242188, + "loss": 0.4972, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03827935829758644, + "rewards/margins": 2.533092498779297, + "rewards/rejected": -2.4948129653930664, + "step": 1528 + }, + { + "epoch": 0.18, + "learning_rate": 2.503956537144207e-07, + "logits/chosen": -2.456690788269043, + "logits/rejected": -2.5445916652679443, + "logps/chosen": -421.7591857910156, + "logps/rejected": -218.651611328125, + "loss": 0.2689, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6789423227310181, + "rewards/margins": 1.5002985000610352, + "rewards/rejected": -2.1792409420013428, + "step": 1529 + }, + { + "epoch": 0.18, + "learning_rate": 2.503602220385024e-07, + "logits/chosen": -2.3817851543426514, + "logits/rejected": -2.317093849182129, + "logps/chosen": -342.08489990234375, + "logps/rejected": -358.735107421875, + "loss": 0.48, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26870760321617126, + "rewards/margins": 1.1289433240890503, + "rewards/rejected": -1.397650957107544, + "step": 1530 + }, + { + "epoch": 0.18, + "learning_rate": 2.5032479036258415e-07, + "logits/chosen": -2.820840358734131, + "logits/rejected": -2.798705577850342, + "logps/chosen": -112.78612518310547, + "logps/rejected": -151.77853393554688, + "loss": 0.3404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28455623984336853, + "rewards/margins": 1.9404067993164062, + "rewards/rejected": -2.2249629497528076, + "step": 1531 + }, + { + "epoch": 0.18, + "learning_rate": 2.5028935868666584e-07, + "logits/chosen": -2.0861053466796875, + "logits/rejected": -2.261622905731201, + "logps/chosen": -163.0729217529297, + "logps/rejected": -126.7846450805664, + "loss": 0.2721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1815764605998993, + "rewards/margins": 1.646965503692627, + "rewards/rejected": -1.8285419940948486, + "step": 1532 + }, + { + "epoch": 0.18, + "learning_rate": 2.502539270107476e-07, + "logits/chosen": -2.575253486633301, + "logits/rejected": -2.467475652694702, + "logps/chosen": -220.7331085205078, + "logps/rejected": -287.87469482421875, + "loss": 0.3955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6447773575782776, + "rewards/margins": 1.8861191272735596, + "rewards/rejected": -2.5308966636657715, + "step": 1533 + }, + { + "epoch": 0.18, + "learning_rate": 2.5021849533482934e-07, + "logits/chosen": -2.555385112762451, + "logits/rejected": -2.70149302482605, + "logps/chosen": -356.900146484375, + "logps/rejected": -274.7266540527344, + "loss": 0.3525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.560936689376831, + "rewards/margins": 1.5035853385925293, + "rewards/rejected": -2.0645222663879395, + "step": 1534 + }, + { + "epoch": 0.18, + "learning_rate": 2.5018306365891103e-07, + "logits/chosen": -1.6017833948135376, + "logits/rejected": -2.0642645359039307, + "logps/chosen": -419.71417236328125, + "logps/rejected": -233.4368896484375, + "loss": 0.4205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1526518017053604, + "rewards/margins": 1.17689049243927, + "rewards/rejected": -1.3295423984527588, + "step": 1535 + }, + { + "epoch": 0.18, + "learning_rate": 2.501476319829928e-07, + "logits/chosen": -2.3041820526123047, + "logits/rejected": -2.480492353439331, + "logps/chosen": -194.6871337890625, + "logps/rejected": -186.84197998046875, + "loss": 0.5952, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9411494731903076, + "rewards/margins": 1.057884931564331, + "rewards/rejected": -1.9990344047546387, + "step": 1536 + }, + { + "epoch": 0.18, + "learning_rate": 2.5011220030707453e-07, + "logits/chosen": -2.2466225624084473, + "logits/rejected": -2.128579616546631, + "logps/chosen": -299.0981750488281, + "logps/rejected": -321.57757568359375, + "loss": 0.3536, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16149570047855377, + "rewards/margins": 2.5537667274475098, + "rewards/rejected": -2.7152624130249023, + "step": 1537 + }, + { + "epoch": 0.18, + "learning_rate": 2.500767686311563e-07, + "logits/chosen": -1.9885157346725464, + "logits/rejected": -2.50860595703125, + "logps/chosen": -519.5257568359375, + "logps/rejected": -193.3264923095703, + "loss": 0.7358, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.562435507774353, + "rewards/margins": 0.36027100682258606, + "rewards/rejected": -0.9227065443992615, + "step": 1538 + }, + { + "epoch": 0.18, + "learning_rate": 2.50041336955238e-07, + "logits/chosen": -2.0376358032226562, + "logits/rejected": -2.376227378845215, + "logps/chosen": -252.80661010742188, + "logps/rejected": -237.258056640625, + "loss": 0.7729, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.784058690071106, + "rewards/margins": 2.472616195678711, + "rewards/rejected": -4.256674766540527, + "step": 1539 + }, + { + "epoch": 0.18, + "learning_rate": 2.500059052793197e-07, + "logits/chosen": -2.5512025356292725, + "logits/rejected": -2.6009674072265625, + "logps/chosen": -247.01156616210938, + "logps/rejected": -182.61239624023438, + "loss": 0.2311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09920815378427505, + "rewards/margins": 1.5679889917373657, + "rewards/rejected": -1.6671972274780273, + "step": 1540 + }, + { + "epoch": 0.18, + "learning_rate": 2.499704736034014e-07, + "logits/chosen": -2.2715861797332764, + "logits/rejected": -2.4705450534820557, + "logps/chosen": -377.1352233886719, + "logps/rejected": -197.96771240234375, + "loss": 0.4618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5443218350410461, + "rewards/margins": 1.310238003730774, + "rewards/rejected": -1.8545598983764648, + "step": 1541 + }, + { + "epoch": 0.18, + "learning_rate": 2.4993504192748317e-07, + "logits/chosen": -2.1200127601623535, + "logits/rejected": -1.989243507385254, + "logps/chosen": -120.77857971191406, + "logps/rejected": -208.59091186523438, + "loss": 0.4189, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2152397632598877, + "rewards/margins": 1.7213993072509766, + "rewards/rejected": -1.9366391897201538, + "step": 1542 + }, + { + "epoch": 0.18, + "learning_rate": 2.4989961025156486e-07, + "logits/chosen": -2.3126885890960693, + "logits/rejected": -2.4105472564697266, + "logps/chosen": -293.9986877441406, + "logps/rejected": -313.76531982421875, + "loss": 0.1994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23705968260765076, + "rewards/margins": 2.400604009628296, + "rewards/rejected": -2.6376638412475586, + "step": 1543 + }, + { + "epoch": 0.18, + "learning_rate": 2.498641785756466e-07, + "logits/chosen": -2.412146806716919, + "logits/rejected": -2.2679452896118164, + "logps/chosen": -284.7437744140625, + "logps/rejected": -249.28123474121094, + "loss": 0.6143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45002251863479614, + "rewards/margins": 0.7889208793640137, + "rewards/rejected": -1.238943338394165, + "step": 1544 + }, + { + "epoch": 0.18, + "learning_rate": 2.4982874689972836e-07, + "logits/chosen": -2.254892349243164, + "logits/rejected": -2.2283618450164795, + "logps/chosen": -252.6599578857422, + "logps/rejected": -270.6684875488281, + "loss": 0.4212, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9043716192245483, + "rewards/margins": 1.2205901145935059, + "rewards/rejected": -2.1249618530273438, + "step": 1545 + }, + { + "epoch": 0.18, + "learning_rate": 2.4979331522381006e-07, + "logits/chosen": -2.9894254207611084, + "logits/rejected": -3.0304489135742188, + "logps/chosen": -221.2386474609375, + "logps/rejected": -275.53564453125, + "loss": 0.2116, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.785041868686676, + "rewards/margins": 2.2690277099609375, + "rewards/rejected": -3.054069757461548, + "step": 1546 + }, + { + "epoch": 0.18, + "learning_rate": 2.497578835478918e-07, + "logits/chosen": -1.704725980758667, + "logits/rejected": -1.6853471994400024, + "logps/chosen": -353.46514892578125, + "logps/rejected": -442.17962646484375, + "loss": 0.7695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8085172176361084, + "rewards/margins": 0.3466312289237976, + "rewards/rejected": -1.1551483869552612, + "step": 1547 + }, + { + "epoch": 0.18, + "learning_rate": 2.4972245187197355e-07, + "logits/chosen": -2.9490342140197754, + "logits/rejected": -3.0080983638763428, + "logps/chosen": -256.296142578125, + "logps/rejected": -336.89080810546875, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3070848882198334, + "rewards/margins": 3.4800562858581543, + "rewards/rejected": -3.7871413230895996, + "step": 1548 + }, + { + "epoch": 0.18, + "learning_rate": 2.496870201960553e-07, + "logits/chosen": -1.6188452243804932, + "logits/rejected": -1.8767045736312866, + "logps/chosen": -288.3042907714844, + "logps/rejected": -251.66665649414062, + "loss": 0.8084, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3664723634719849, + "rewards/margins": 0.5599867105484009, + "rewards/rejected": -1.9264590740203857, + "step": 1549 + }, + { + "epoch": 0.18, + "learning_rate": 2.49651588520137e-07, + "logits/chosen": -2.3736724853515625, + "logits/rejected": -2.232447385787964, + "logps/chosen": -147.672119140625, + "logps/rejected": -301.1698913574219, + "loss": 0.1984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4978691637516022, + "rewards/margins": 4.263669013977051, + "rewards/rejected": -4.761538028717041, + "step": 1550 + }, + { + "epoch": 0.18, + "learning_rate": 2.4961615684421875e-07, + "logits/chosen": -2.8217196464538574, + "logits/rejected": -2.7765698432922363, + "logps/chosen": -156.39334106445312, + "logps/rejected": -95.13815307617188, + "loss": 0.4199, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2756620943546295, + "rewards/margins": 0.8543390035629272, + "rewards/rejected": -1.1300010681152344, + "step": 1551 + }, + { + "epoch": 0.18, + "learning_rate": 2.4958072516830044e-07, + "logits/chosen": -2.1318089962005615, + "logits/rejected": -2.181318521499634, + "logps/chosen": -336.5406494140625, + "logps/rejected": -261.7124328613281, + "loss": 0.6391, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7594951391220093, + "rewards/margins": 0.637673020362854, + "rewards/rejected": -1.3971681594848633, + "step": 1552 + }, + { + "epoch": 0.18, + "learning_rate": 2.495452934923822e-07, + "logits/chosen": -2.3397531509399414, + "logits/rejected": -2.337611198425293, + "logps/chosen": -362.28680419921875, + "logps/rejected": -319.5302429199219, + "loss": 0.5237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0497620105743408, + "rewards/margins": 0.6509684920310974, + "rewards/rejected": -1.700730323791504, + "step": 1553 + }, + { + "epoch": 0.18, + "learning_rate": 2.495098618164639e-07, + "logits/chosen": -2.3607804775238037, + "logits/rejected": -2.3720505237579346, + "logps/chosen": -215.00918579101562, + "logps/rejected": -299.317138671875, + "loss": 0.5216, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0098356008529663, + "rewards/margins": 0.6015739440917969, + "rewards/rejected": -1.6114094257354736, + "step": 1554 + }, + { + "epoch": 0.18, + "learning_rate": 2.4947443014054563e-07, + "logits/chosen": -2.1997742652893066, + "logits/rejected": -2.315394878387451, + "logps/chosen": -182.441650390625, + "logps/rejected": -265.46026611328125, + "loss": 0.3587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34851276874542236, + "rewards/margins": 2.4511661529541016, + "rewards/rejected": -2.7996788024902344, + "step": 1555 + }, + { + "epoch": 0.18, + "learning_rate": 2.494389984646274e-07, + "logits/chosen": -2.161530017852783, + "logits/rejected": -2.1934127807617188, + "logps/chosen": -287.91009521484375, + "logps/rejected": -365.0387878417969, + "loss": 0.2391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46796494722366333, + "rewards/margins": 3.2339091300964355, + "rewards/rejected": -3.701873779296875, + "step": 1556 + }, + { + "epoch": 0.18, + "learning_rate": 2.494035667887091e-07, + "logits/chosen": -1.258085012435913, + "logits/rejected": -1.5871508121490479, + "logps/chosen": -341.570068359375, + "logps/rejected": -349.7861328125, + "loss": 0.7188, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4134564399719238, + "rewards/margins": 0.8886083960533142, + "rewards/rejected": -2.302064895629883, + "step": 1557 + }, + { + "epoch": 0.18, + "learning_rate": 2.4936813511279083e-07, + "logits/chosen": -2.3276209831237793, + "logits/rejected": -2.4646005630493164, + "logps/chosen": -415.11175537109375, + "logps/rejected": -320.3667907714844, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7355742454528809, + "rewards/margins": 2.945573568344116, + "rewards/rejected": -3.681147813796997, + "step": 1558 + }, + { + "epoch": 0.18, + "learning_rate": 2.493327034368725e-07, + "logits/chosen": -2.492936849594116, + "logits/rejected": -2.308199882507324, + "logps/chosen": -220.50656127929688, + "logps/rejected": -416.9328918457031, + "loss": 0.2885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5229282975196838, + "rewards/margins": 1.9308075904846191, + "rewards/rejected": -2.453735828399658, + "step": 1559 + }, + { + "epoch": 0.18, + "learning_rate": 2.4929727176095427e-07, + "logits/chosen": -2.207489013671875, + "logits/rejected": -2.4328033924102783, + "logps/chosen": -208.309814453125, + "logps/rejected": -186.4434814453125, + "loss": 0.5231, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.767505168914795, + "rewards/margins": 1.68399977684021, + "rewards/rejected": -3.451504945755005, + "step": 1560 + }, + { + "epoch": 0.18, + "learning_rate": 2.49261840085036e-07, + "logits/chosen": -2.080179452896118, + "logits/rejected": -1.813443899154663, + "logps/chosen": -338.6604919433594, + "logps/rejected": -389.72528076171875, + "loss": 0.1733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8504956364631653, + "rewards/margins": 2.295844078063965, + "rewards/rejected": -3.1463398933410645, + "step": 1561 + }, + { + "epoch": 0.18, + "learning_rate": 2.4922640840911777e-07, + "logits/chosen": -2.216139793395996, + "logits/rejected": -2.17146372795105, + "logps/chosen": -503.2596740722656, + "logps/rejected": -366.78765869140625, + "loss": 0.1294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41000109910964966, + "rewards/margins": 2.8530595302581787, + "rewards/rejected": -3.2630605697631836, + "step": 1562 + }, + { + "epoch": 0.18, + "learning_rate": 2.4919097673319946e-07, + "logits/chosen": -2.6860244274139404, + "logits/rejected": -2.766765594482422, + "logps/chosen": -200.458740234375, + "logps/rejected": -150.7438201904297, + "loss": 0.6081, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1114439964294434, + "rewards/margins": 0.7730604410171509, + "rewards/rejected": -1.8845045566558838, + "step": 1563 + }, + { + "epoch": 0.18, + "learning_rate": 2.491555450572812e-07, + "logits/chosen": -2.2474124431610107, + "logits/rejected": -2.1233408451080322, + "logps/chosen": -210.52536010742188, + "logps/rejected": -352.32879638671875, + "loss": 0.2111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4565039277076721, + "rewards/margins": 2.5767085552215576, + "rewards/rejected": -3.033212423324585, + "step": 1564 + }, + { + "epoch": 0.18, + "learning_rate": 2.491201133813629e-07, + "logits/chosen": -2.098848342895508, + "logits/rejected": -1.7748303413391113, + "logps/chosen": -165.5954132080078, + "logps/rejected": -325.849609375, + "loss": 0.2319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8838582634925842, + "rewards/margins": 1.9544262886047363, + "rewards/rejected": -2.838284492492676, + "step": 1565 + }, + { + "epoch": 0.18, + "learning_rate": 2.4908468170544466e-07, + "logits/chosen": -2.384402275085449, + "logits/rejected": -2.451805830001831, + "logps/chosen": -192.85546875, + "logps/rejected": -197.05657958984375, + "loss": 0.6436, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.249718427658081, + "rewards/margins": 0.6748732328414917, + "rewards/rejected": -1.9245917797088623, + "step": 1566 + }, + { + "epoch": 0.18, + "learning_rate": 2.490492500295264e-07, + "logits/chosen": -1.9718502759933472, + "logits/rejected": -2.046419620513916, + "logps/chosen": -162.57589721679688, + "logps/rejected": -243.692138671875, + "loss": 0.2601, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03492674231529236, + "rewards/margins": 1.9404371976852417, + "rewards/rejected": -1.9753642082214355, + "step": 1567 + }, + { + "epoch": 0.18, + "learning_rate": 2.490138183536081e-07, + "logits/chosen": -2.1151649951934814, + "logits/rejected": -2.110372304916382, + "logps/chosen": -294.4685363769531, + "logps/rejected": -276.30523681640625, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1113777682185173, + "rewards/margins": 1.663012981414795, + "rewards/rejected": -1.774390697479248, + "step": 1568 + }, + { + "epoch": 0.18, + "learning_rate": 2.4897838667768985e-07, + "logits/chosen": -1.8254774808883667, + "logits/rejected": -2.0198965072631836, + "logps/chosen": -473.74884033203125, + "logps/rejected": -358.2305908203125, + "loss": 0.7886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9407311677932739, + "rewards/margins": 0.2967672348022461, + "rewards/rejected": -1.23749840259552, + "step": 1569 + }, + { + "epoch": 0.18, + "learning_rate": 2.4894295500177155e-07, + "logits/chosen": -2.7633907794952393, + "logits/rejected": -2.7207157611846924, + "logps/chosen": -276.8345031738281, + "logps/rejected": -250.54745483398438, + "loss": 0.0817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6509556770324707, + "rewards/margins": 3.0137288570404053, + "rewards/rejected": -3.664684534072876, + "step": 1570 + }, + { + "epoch": 0.18, + "learning_rate": 2.489075233258533e-07, + "logits/chosen": -2.2127819061279297, + "logits/rejected": -2.0117361545562744, + "logps/chosen": -226.91339111328125, + "logps/rejected": -345.9314270019531, + "loss": 0.6639, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.141160011291504, + "rewards/margins": 0.25041306018829346, + "rewards/rejected": -1.3915729522705078, + "step": 1571 + }, + { + "epoch": 0.18, + "learning_rate": 2.4887209164993504e-07, + "logits/chosen": -2.5478482246398926, + "logits/rejected": -2.5732626914978027, + "logps/chosen": -382.9358215332031, + "logps/rejected": -266.1111145019531, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6796671152114868, + "rewards/margins": 2.462045431137085, + "rewards/rejected": -3.1417124271392822, + "step": 1572 + }, + { + "epoch": 0.18, + "learning_rate": 2.488366599740168e-07, + "logits/chosen": -2.8376684188842773, + "logits/rejected": -2.787668228149414, + "logps/chosen": -150.76702880859375, + "logps/rejected": -210.85662841796875, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9955230355262756, + "rewards/margins": 2.8594465255737305, + "rewards/rejected": -3.8549695014953613, + "step": 1573 + }, + { + "epoch": 0.18, + "learning_rate": 2.488012282980985e-07, + "logits/chosen": -1.7406549453735352, + "logits/rejected": -2.188563108444214, + "logps/chosen": -395.1571960449219, + "logps/rejected": -305.38916015625, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3284485340118408, + "rewards/margins": 1.1766144037246704, + "rewards/rejected": -2.5050628185272217, + "step": 1574 + }, + { + "epoch": 0.18, + "learning_rate": 2.4876579662218024e-07, + "logits/chosen": -2.2789371013641357, + "logits/rejected": -2.428443193435669, + "logps/chosen": -235.85421752929688, + "logps/rejected": -300.2201232910156, + "loss": 0.765, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7787348031997681, + "rewards/margins": 0.5990278720855713, + "rewards/rejected": -1.3777626752853394, + "step": 1575 + }, + { + "epoch": 0.18, + "learning_rate": 2.4873036494626193e-07, + "logits/chosen": -2.3398284912109375, + "logits/rejected": -2.3455464839935303, + "logps/chosen": -317.6192626953125, + "logps/rejected": -335.8066101074219, + "loss": 0.2223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7553648948669434, + "rewards/margins": 1.6736154556274414, + "rewards/rejected": -2.4289803504943848, + "step": 1576 + }, + { + "epoch": 0.18, + "learning_rate": 2.486949332703437e-07, + "logits/chosen": -2.6407413482666016, + "logits/rejected": -2.3293187618255615, + "logps/chosen": -361.5087585449219, + "logps/rejected": -410.8941955566406, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2666555643081665, + "rewards/margins": 1.2130881547927856, + "rewards/rejected": -2.479743719100952, + "step": 1577 + }, + { + "epoch": 0.18, + "learning_rate": 2.486595015944254e-07, + "logits/chosen": -2.0700161457061768, + "logits/rejected": -2.2613749504089355, + "logps/chosen": -309.083251953125, + "logps/rejected": -269.8721923828125, + "loss": 0.8522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7533205151557922, + "rewards/margins": 0.5553483366966248, + "rewards/rejected": -1.308668851852417, + "step": 1578 + }, + { + "epoch": 0.18, + "learning_rate": 2.486240699185071e-07, + "logits/chosen": -2.700705051422119, + "logits/rejected": -2.837153911590576, + "logps/chosen": -221.9197998046875, + "logps/rejected": -246.61788940429688, + "loss": 0.3478, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.051607534289360046, + "rewards/margins": 2.577146530151367, + "rewards/rejected": -2.628753900527954, + "step": 1579 + }, + { + "epoch": 0.18, + "learning_rate": 2.4858863824258887e-07, + "logits/chosen": -2.209132671356201, + "logits/rejected": -2.039478063583374, + "logps/chosen": -364.98101806640625, + "logps/rejected": -301.44305419921875, + "loss": 0.4007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5551789999008179, + "rewards/margins": 1.5739115476608276, + "rewards/rejected": -2.1290907859802246, + "step": 1580 + }, + { + "epoch": 0.18, + "learning_rate": 2.4855320656667057e-07, + "logits/chosen": -1.939245343208313, + "logits/rejected": -2.3522422313690186, + "logps/chosen": -312.7124938964844, + "logps/rejected": -155.19241333007812, + "loss": 0.7534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.609461784362793, + "rewards/margins": 1.535059928894043, + "rewards/rejected": -2.144521713256836, + "step": 1581 + }, + { + "epoch": 0.18, + "learning_rate": 2.485177748907523e-07, + "logits/chosen": -2.3637471199035645, + "logits/rejected": -2.4151687622070312, + "logps/chosen": -272.49432373046875, + "logps/rejected": -357.1744079589844, + "loss": 0.4078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6448510885238647, + "rewards/margins": 1.9272202253341675, + "rewards/rejected": -2.5720713138580322, + "step": 1582 + }, + { + "epoch": 0.18, + "learning_rate": 2.4848234321483407e-07, + "logits/chosen": -2.557943820953369, + "logits/rejected": -2.4998703002929688, + "logps/chosen": -214.78375244140625, + "logps/rejected": -206.8654327392578, + "loss": 0.3317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4295068383216858, + "rewards/margins": 1.4204964637756348, + "rewards/rejected": -1.8500032424926758, + "step": 1583 + }, + { + "epoch": 0.18, + "learning_rate": 2.484469115389158e-07, + "logits/chosen": -2.3122305870056152, + "logits/rejected": -2.575441360473633, + "logps/chosen": -402.0369567871094, + "logps/rejected": -300.9462890625, + "loss": 0.5467, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.622820258140564, + "rewards/margins": 0.9086141586303711, + "rewards/rejected": -1.5314345359802246, + "step": 1584 + }, + { + "epoch": 0.18, + "learning_rate": 2.484114798629975e-07, + "logits/chosen": -2.761150360107422, + "logits/rejected": -2.6542038917541504, + "logps/chosen": -278.630126953125, + "logps/rejected": -163.2982177734375, + "loss": 0.4481, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6468818783760071, + "rewards/margins": 1.3342843055725098, + "rewards/rejected": -1.981166124343872, + "step": 1585 + }, + { + "epoch": 0.18, + "learning_rate": 2.4837604818707926e-07, + "logits/chosen": -2.5496127605438232, + "logits/rejected": -2.63279390335083, + "logps/chosen": -120.31170654296875, + "logps/rejected": -187.0506591796875, + "loss": 0.444, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3913326859474182, + "rewards/margins": 1.3959109783172607, + "rewards/rejected": -1.7872437238693237, + "step": 1586 + }, + { + "epoch": 0.18, + "learning_rate": 2.4834061651116095e-07, + "logits/chosen": -1.5563194751739502, + "logits/rejected": -2.171633243560791, + "logps/chosen": -344.95831298828125, + "logps/rejected": -323.2354431152344, + "loss": 0.7187, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8726016283035278, + "rewards/margins": 1.0214619636535645, + "rewards/rejected": -2.894063711166382, + "step": 1587 + }, + { + "epoch": 0.18, + "learning_rate": 2.483051848352427e-07, + "logits/chosen": -2.6318774223327637, + "logits/rejected": -2.3683998584747314, + "logps/chosen": -226.24993896484375, + "logps/rejected": -293.6310729980469, + "loss": 0.3794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9208877086639404, + "rewards/margins": 1.1803008317947388, + "rewards/rejected": -2.1011886596679688, + "step": 1588 + }, + { + "epoch": 0.18, + "learning_rate": 2.482697531593244e-07, + "logits/chosen": -2.240182399749756, + "logits/rejected": -2.3369972705841064, + "logps/chosen": -391.2474670410156, + "logps/rejected": -480.9591369628906, + "loss": 0.5563, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4819144010543823, + "rewards/margins": 1.003433346748352, + "rewards/rejected": -1.4853477478027344, + "step": 1589 + }, + { + "epoch": 0.18, + "learning_rate": 2.4823432148340615e-07, + "logits/chosen": -2.4501278400421143, + "logits/rejected": -2.0264902114868164, + "logps/chosen": -110.75640106201172, + "logps/rejected": -235.8238067626953, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4876578748226166, + "rewards/margins": 1.5461961030960083, + "rewards/rejected": -2.0338540077209473, + "step": 1590 + }, + { + "epoch": 0.19, + "learning_rate": 2.481988898074879e-07, + "logits/chosen": -2.4581921100616455, + "logits/rejected": -2.372211456298828, + "logps/chosen": -163.26498413085938, + "logps/rejected": -149.0720672607422, + "loss": 0.4934, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5460359454154968, + "rewards/margins": 1.2152831554412842, + "rewards/rejected": -1.7613190412521362, + "step": 1591 + }, + { + "epoch": 0.19, + "learning_rate": 2.481634581315696e-07, + "logits/chosen": -2.2796969413757324, + "logits/rejected": -2.4551892280578613, + "logps/chosen": -450.71478271484375, + "logps/rejected": -369.28179931640625, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3657446503639221, + "rewards/margins": 3.0056686401367188, + "rewards/rejected": -3.371412992477417, + "step": 1592 + }, + { + "epoch": 0.19, + "learning_rate": 2.4812802645565134e-07, + "logits/chosen": -2.307183027267456, + "logits/rejected": -2.3147084712982178, + "logps/chosen": -226.97027587890625, + "logps/rejected": -216.875, + "loss": 0.5567, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02982049062848091, + "rewards/margins": 1.4486101865768433, + "rewards/rejected": -1.4784308671951294, + "step": 1593 + }, + { + "epoch": 0.19, + "learning_rate": 2.4809259477973304e-07, + "logits/chosen": -2.0645434856414795, + "logits/rejected": -2.2933509349823, + "logps/chosen": -330.84869384765625, + "logps/rejected": -300.9118347167969, + "loss": 0.4436, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.295011043548584, + "rewards/margins": 1.1833829879760742, + "rewards/rejected": -2.478394031524658, + "step": 1594 + }, + { + "epoch": 0.19, + "learning_rate": 2.4805716310381484e-07, + "logits/chosen": -2.3275811672210693, + "logits/rejected": -2.3218350410461426, + "logps/chosen": -298.8724670410156, + "logps/rejected": -232.14610290527344, + "loss": 0.3714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4331524968147278, + "rewards/margins": 1.6051493883132935, + "rewards/rejected": -2.038301944732666, + "step": 1595 + }, + { + "epoch": 0.19, + "learning_rate": 2.4802173142789653e-07, + "logits/chosen": -2.1744062900543213, + "logits/rejected": -2.1563806533813477, + "logps/chosen": -82.81033325195312, + "logps/rejected": -96.90888977050781, + "loss": 0.6217, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7297319173812866, + "rewards/margins": 0.6425690650939941, + "rewards/rejected": -1.3723009824752808, + "step": 1596 + }, + { + "epoch": 0.19, + "learning_rate": 2.479862997519783e-07, + "logits/chosen": -2.583070755004883, + "logits/rejected": -2.737325668334961, + "logps/chosen": -468.8005676269531, + "logps/rejected": -343.81292724609375, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1997784972190857, + "rewards/margins": 1.5027507543563843, + "rewards/rejected": -1.3029723167419434, + "step": 1597 + }, + { + "epoch": 0.19, + "learning_rate": 2.4795086807606e-07, + "logits/chosen": -2.1398887634277344, + "logits/rejected": -2.3512613773345947, + "logps/chosen": -338.07275390625, + "logps/rejected": -294.638671875, + "loss": 0.2652, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45333316922187805, + "rewards/margins": 2.3017525672912598, + "rewards/rejected": -2.7550857067108154, + "step": 1598 + }, + { + "epoch": 0.19, + "learning_rate": 2.479154364001417e-07, + "logits/chosen": -2.729024887084961, + "logits/rejected": -2.6687569618225098, + "logps/chosen": -101.48443603515625, + "logps/rejected": -130.80653381347656, + "loss": 0.506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.724341869354248, + "rewards/margins": 0.5748811960220337, + "rewards/rejected": -1.2992231845855713, + "step": 1599 + }, + { + "epoch": 0.19, + "learning_rate": 2.478800047242234e-07, + "logits/chosen": -2.228588104248047, + "logits/rejected": -2.1324849128723145, + "logps/chosen": -277.98114013671875, + "logps/rejected": -322.24591064453125, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2622077465057373, + "rewards/margins": 2.598194122314453, + "rewards/rejected": -2.8604016304016113, + "step": 1600 + }, + { + "epoch": 0.19, + "learning_rate": 2.4784457304830517e-07, + "logits/chosen": -2.204655408859253, + "logits/rejected": -2.154161214828491, + "logps/chosen": -263.87445068359375, + "logps/rejected": -304.49224853515625, + "loss": 0.3959, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7854706048965454, + "rewards/margins": 1.3493664264678955, + "rewards/rejected": -2.1348371505737305, + "step": 1601 + }, + { + "epoch": 0.19, + "learning_rate": 2.478091413723869e-07, + "logits/chosen": -2.2698259353637695, + "logits/rejected": -2.225752830505371, + "logps/chosen": -234.9207305908203, + "logps/rejected": -203.18589782714844, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9391934871673584, + "rewards/margins": 0.9642164707183838, + "rewards/rejected": -1.9034098386764526, + "step": 1602 + }, + { + "epoch": 0.19, + "learning_rate": 2.477737096964686e-07, + "logits/chosen": -2.749161958694458, + "logits/rejected": -2.7164313793182373, + "logps/chosen": -612.2333984375, + "logps/rejected": -378.42803955078125, + "loss": 0.5938, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.110091209411621, + "rewards/margins": 1.1412692070007324, + "rewards/rejected": -2.2513604164123535, + "step": 1603 + }, + { + "epoch": 0.19, + "learning_rate": 2.4773827802055036e-07, + "logits/chosen": -2.3034439086914062, + "logits/rejected": -2.4417028427124023, + "logps/chosen": -194.1005859375, + "logps/rejected": -203.76385498046875, + "loss": 0.7465, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3800054788589478, + "rewards/margins": 1.4787821769714355, + "rewards/rejected": -2.8587875366210938, + "step": 1604 + }, + { + "epoch": 0.19, + "learning_rate": 2.4770284634463206e-07, + "logits/chosen": -1.3297688961029053, + "logits/rejected": -1.4826452732086182, + "logps/chosen": -408.37188720703125, + "logps/rejected": -330.91595458984375, + "loss": 0.623, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2023015171289444, + "rewards/margins": 0.3903564214706421, + "rewards/rejected": -0.5926579833030701, + "step": 1605 + }, + { + "epoch": 0.19, + "learning_rate": 2.476674146687138e-07, + "logits/chosen": -2.4535305500030518, + "logits/rejected": -2.406219005584717, + "logps/chosen": -507.44140625, + "logps/rejected": -408.9334411621094, + "loss": 1.0871, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.275648593902588, + "rewards/margins": -0.1464366316795349, + "rewards/rejected": -1.1292119026184082, + "step": 1606 + }, + { + "epoch": 0.19, + "learning_rate": 2.4763198299279556e-07, + "logits/chosen": -2.068693161010742, + "logits/rejected": -2.1565823554992676, + "logps/chosen": -494.1838073730469, + "logps/rejected": -425.1766357421875, + "loss": 0.3269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32804620265960693, + "rewards/margins": 2.312948226928711, + "rewards/rejected": -2.6409945487976074, + "step": 1607 + }, + { + "epoch": 0.19, + "learning_rate": 2.475965513168773e-07, + "logits/chosen": -2.9662556648254395, + "logits/rejected": -2.959144115447998, + "logps/chosen": -126.59747314453125, + "logps/rejected": -191.7133026123047, + "loss": 0.1927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5092255473136902, + "rewards/margins": 2.4579687118530273, + "rewards/rejected": -2.9671945571899414, + "step": 1608 + }, + { + "epoch": 0.19, + "learning_rate": 2.47561119640959e-07, + "logits/chosen": -1.9998652935028076, + "logits/rejected": -2.4058237075805664, + "logps/chosen": -337.8708190917969, + "logps/rejected": -211.22630310058594, + "loss": 0.5098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37040379643440247, + "rewards/margins": 0.8850733041763306, + "rewards/rejected": -1.2554770708084106, + "step": 1609 + }, + { + "epoch": 0.19, + "learning_rate": 2.4752568796504075e-07, + "logits/chosen": -1.973876953125, + "logits/rejected": -1.842827558517456, + "logps/chosen": -300.53106689453125, + "logps/rejected": -287.74493408203125, + "loss": 0.1765, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03964511677622795, + "rewards/margins": 3.9510371685028076, + "rewards/rejected": -3.9113922119140625, + "step": 1610 + }, + { + "epoch": 0.19, + "learning_rate": 2.4749025628912244e-07, + "logits/chosen": -2.706831216812134, + "logits/rejected": -2.89486026763916, + "logps/chosen": -220.701171875, + "logps/rejected": -188.09579467773438, + "loss": 0.5161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30700773000717163, + "rewards/margins": 2.026865005493164, + "rewards/rejected": -2.3338727951049805, + "step": 1611 + }, + { + "epoch": 0.19, + "learning_rate": 2.474548246132042e-07, + "logits/chosen": -2.5341427326202393, + "logits/rejected": -2.3706843852996826, + "logps/chosen": -345.1861877441406, + "logps/rejected": -383.37677001953125, + "loss": 0.6979, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.391100525856018, + "rewards/margins": 1.091567039489746, + "rewards/rejected": -2.4826674461364746, + "step": 1612 + }, + { + "epoch": 0.19, + "learning_rate": 2.4741939293728594e-07, + "logits/chosen": -2.5054895877838135, + "logits/rejected": -2.702303409576416, + "logps/chosen": -414.1036376953125, + "logps/rejected": -323.4469299316406, + "loss": 0.352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33217793703079224, + "rewards/margins": 1.9536728858947754, + "rewards/rejected": -2.285851001739502, + "step": 1613 + }, + { + "epoch": 0.19, + "learning_rate": 2.4738396126136764e-07, + "logits/chosen": -2.337235927581787, + "logits/rejected": -2.364502191543579, + "logps/chosen": -167.00189208984375, + "logps/rejected": -171.8983612060547, + "loss": 0.3064, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6457769870758057, + "rewards/margins": 1.5658310651779175, + "rewards/rejected": -2.2116081714630127, + "step": 1614 + }, + { + "epoch": 0.19, + "learning_rate": 2.473485295854494e-07, + "logits/chosen": -2.6664135456085205, + "logits/rejected": -2.4973599910736084, + "logps/chosen": -110.15576934814453, + "logps/rejected": -206.4669189453125, + "loss": 0.4191, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48772403597831726, + "rewards/margins": 1.1777324676513672, + "rewards/rejected": -1.6654565334320068, + "step": 1615 + }, + { + "epoch": 0.19, + "learning_rate": 2.473130979095311e-07, + "logits/chosen": -2.620840549468994, + "logits/rejected": -2.348323345184326, + "logps/chosen": -270.09375, + "logps/rejected": -467.27288818359375, + "loss": 0.4726, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2675047814846039, + "rewards/margins": 1.3290588855743408, + "rewards/rejected": -1.5965638160705566, + "step": 1616 + }, + { + "epoch": 0.19, + "learning_rate": 2.4727766623361283e-07, + "logits/chosen": -2.6567177772521973, + "logits/rejected": -2.418938398361206, + "logps/chosen": -201.27809143066406, + "logps/rejected": -192.06259155273438, + "loss": 1.1171, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2269737720489502, + "rewards/margins": 0.3122572898864746, + "rewards/rejected": -1.5392310619354248, + "step": 1617 + }, + { + "epoch": 0.19, + "learning_rate": 2.472422345576945e-07, + "logits/chosen": -2.4071056842803955, + "logits/rejected": -2.4194586277008057, + "logps/chosen": -298.5013122558594, + "logps/rejected": -293.9930419921875, + "loss": 0.1454, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10188588500022888, + "rewards/margins": 3.5577855110168457, + "rewards/rejected": -3.455899238586426, + "step": 1618 + }, + { + "epoch": 0.19, + "learning_rate": 2.4720680288177633e-07, + "logits/chosen": -1.9060449600219727, + "logits/rejected": -2.1322808265686035, + "logps/chosen": -426.47613525390625, + "logps/rejected": -470.16729736328125, + "loss": 0.3165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46652740240097046, + "rewards/margins": 2.090425968170166, + "rewards/rejected": -2.5569534301757812, + "step": 1619 + }, + { + "epoch": 0.19, + "learning_rate": 2.47171371205858e-07, + "logits/chosen": -1.8772386312484741, + "logits/rejected": -1.9901789426803589, + "logps/chosen": -371.2685546875, + "logps/rejected": -292.62823486328125, + "loss": 0.1938, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6496948003768921, + "rewards/margins": 2.4084155559539795, + "rewards/rejected": -3.058109998703003, + "step": 1620 + }, + { + "epoch": 0.19, + "learning_rate": 2.4713593952993977e-07, + "logits/chosen": -2.1469171047210693, + "logits/rejected": -2.1494839191436768, + "logps/chosen": -733.3466186523438, + "logps/rejected": -340.6827392578125, + "loss": 0.886, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7093323469161987, + "rewards/margins": -0.022974446415901184, + "rewards/rejected": -1.686357855796814, + "step": 1621 + }, + { + "epoch": 0.19, + "learning_rate": 2.4710050785402147e-07, + "logits/chosen": -2.586357831954956, + "logits/rejected": -2.54103946685791, + "logps/chosen": -172.69334411621094, + "logps/rejected": -233.7097930908203, + "loss": 0.464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8136546611785889, + "rewards/margins": 1.3482310771942139, + "rewards/rejected": -2.1618854999542236, + "step": 1622 + }, + { + "epoch": 0.19, + "learning_rate": 2.470650761781032e-07, + "logits/chosen": -1.8564023971557617, + "logits/rejected": -1.7837629318237305, + "logps/chosen": -319.5545349121094, + "logps/rejected": -362.03765869140625, + "loss": 0.4426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6021819710731506, + "rewards/margins": 1.3160587549209595, + "rewards/rejected": -1.9182407855987549, + "step": 1623 + }, + { + "epoch": 0.19, + "learning_rate": 2.4702964450218496e-07, + "logits/chosen": -2.212630033493042, + "logits/rejected": -2.3848471641540527, + "logps/chosen": -335.470458984375, + "logps/rejected": -296.99810791015625, + "loss": 0.1804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1617671549320221, + "rewards/margins": 2.6009302139282227, + "rewards/rejected": -2.762697219848633, + "step": 1624 + }, + { + "epoch": 0.19, + "learning_rate": 2.4699421282626666e-07, + "logits/chosen": -2.035496711730957, + "logits/rejected": -1.8638505935668945, + "logps/chosen": -128.8785858154297, + "logps/rejected": -226.30331420898438, + "loss": 1.1301, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4559930562973022, + "rewards/margins": 1.1078910827636719, + "rewards/rejected": -2.5638840198516846, + "step": 1625 + }, + { + "epoch": 0.19, + "learning_rate": 2.469587811503484e-07, + "logits/chosen": -2.193399667739868, + "logits/rejected": -2.013120174407959, + "logps/chosen": -139.99827575683594, + "logps/rejected": -288.7445983886719, + "loss": 0.4829, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2497853934764862, + "rewards/margins": 1.8501697778701782, + "rewards/rejected": -2.0999550819396973, + "step": 1626 + }, + { + "epoch": 0.19, + "learning_rate": 2.469233494744301e-07, + "logits/chosen": -2.6202967166900635, + "logits/rejected": -2.762650966644287, + "logps/chosen": -255.84738159179688, + "logps/rejected": -134.32598876953125, + "loss": 0.76, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0226532220840454, + "rewards/margins": 0.8043995499610901, + "rewards/rejected": -1.8270527124404907, + "step": 1627 + }, + { + "epoch": 0.19, + "learning_rate": 2.4688791779851185e-07, + "logits/chosen": -2.3699445724487305, + "logits/rejected": -2.646303415298462, + "logps/chosen": -317.4638671875, + "logps/rejected": -190.20298767089844, + "loss": 0.388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8943278193473816, + "rewards/margins": 1.3383461236953735, + "rewards/rejected": -2.2326736450195312, + "step": 1628 + }, + { + "epoch": 0.19, + "learning_rate": 2.4685248612259355e-07, + "logits/chosen": -2.3288087844848633, + "logits/rejected": -2.0339713096618652, + "logps/chosen": -215.6480712890625, + "logps/rejected": -349.1902160644531, + "loss": 0.2556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5649926066398621, + "rewards/margins": 2.178818464279175, + "rewards/rejected": -2.7438108921051025, + "step": 1629 + }, + { + "epoch": 0.19, + "learning_rate": 2.4681705444667535e-07, + "logits/chosen": -2.1724677085876465, + "logits/rejected": -2.3439793586730957, + "logps/chosen": -166.39064025878906, + "logps/rejected": -233.6205596923828, + "loss": 0.7274, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1248620748519897, + "rewards/margins": 1.6157026290893555, + "rewards/rejected": -2.7405645847320557, + "step": 1630 + }, + { + "epoch": 0.19, + "learning_rate": 2.4678162277075705e-07, + "logits/chosen": -1.8891522884368896, + "logits/rejected": -1.6884751319885254, + "logps/chosen": -329.92291259765625, + "logps/rejected": -384.15570068359375, + "loss": 0.2456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.531204879283905, + "rewards/margins": 2.9794764518737793, + "rewards/rejected": -3.510681629180908, + "step": 1631 + }, + { + "epoch": 0.19, + "learning_rate": 2.467461910948388e-07, + "logits/chosen": -2.124361753463745, + "logits/rejected": -2.388129234313965, + "logps/chosen": -218.65716552734375, + "logps/rejected": -177.06634521484375, + "loss": 0.7028, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.683070182800293, + "rewards/margins": 0.6870852708816528, + "rewards/rejected": -2.3701555728912354, + "step": 1632 + }, + { + "epoch": 0.19, + "learning_rate": 2.467107594189205e-07, + "logits/chosen": -2.3786916732788086, + "logits/rejected": -2.60400390625, + "logps/chosen": -225.3875274658203, + "logps/rejected": -274.54571533203125, + "loss": 0.4018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6577668190002441, + "rewards/margins": 1.3098082542419434, + "rewards/rejected": -1.9675750732421875, + "step": 1633 + }, + { + "epoch": 0.19, + "learning_rate": 2.4667532774300224e-07, + "logits/chosen": -2.202277183532715, + "logits/rejected": -2.0530648231506348, + "logps/chosen": -306.07611083984375, + "logps/rejected": -276.73834228515625, + "loss": 0.403, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0460026264190674, + "rewards/margins": 1.4301319122314453, + "rewards/rejected": -2.4761343002319336, + "step": 1634 + }, + { + "epoch": 0.19, + "learning_rate": 2.46639896067084e-07, + "logits/chosen": -2.3885772228240967, + "logits/rejected": -2.242403984069824, + "logps/chosen": -177.55831909179688, + "logps/rejected": -174.19418334960938, + "loss": 0.5828, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6558078527450562, + "rewards/margins": 0.270005464553833, + "rewards/rejected": -0.9258133769035339, + "step": 1635 + }, + { + "epoch": 0.19, + "learning_rate": 2.466044643911657e-07, + "logits/chosen": -2.704501152038574, + "logits/rejected": -2.5327951908111572, + "logps/chosen": -454.31817626953125, + "logps/rejected": -354.42816162109375, + "loss": 0.5104, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4679523706436157, + "rewards/margins": 1.1737794876098633, + "rewards/rejected": -2.6417319774627686, + "step": 1636 + }, + { + "epoch": 0.19, + "learning_rate": 2.4656903271524743e-07, + "logits/chosen": -2.1108548641204834, + "logits/rejected": -2.061577796936035, + "logps/chosen": -121.50480651855469, + "logps/rejected": -176.50103759765625, + "loss": 0.471, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8984793424606323, + "rewards/margins": 1.512269139289856, + "rewards/rejected": -2.4107484817504883, + "step": 1637 + }, + { + "epoch": 0.19, + "learning_rate": 2.4653360103932913e-07, + "logits/chosen": -2.217385768890381, + "logits/rejected": -2.501033306121826, + "logps/chosen": -197.13693237304688, + "logps/rejected": -137.0806121826172, + "loss": 0.6157, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1189005374908447, + "rewards/margins": 0.5807891488075256, + "rewards/rejected": -1.6996896266937256, + "step": 1638 + }, + { + "epoch": 0.19, + "learning_rate": 2.464981693634109e-07, + "logits/chosen": -2.185270071029663, + "logits/rejected": -2.253105401992798, + "logps/chosen": -363.3568115234375, + "logps/rejected": -346.2731018066406, + "loss": 0.4347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8733185529708862, + "rewards/margins": 1.4368016719818115, + "rewards/rejected": -2.310120105743408, + "step": 1639 + }, + { + "epoch": 0.19, + "learning_rate": 2.4646273768749257e-07, + "logits/chosen": -2.6977291107177734, + "logits/rejected": -2.575279712677002, + "logps/chosen": -139.70884704589844, + "logps/rejected": -201.92620849609375, + "loss": 0.4039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6454865336418152, + "rewards/margins": 1.1370195150375366, + "rewards/rejected": -1.782505989074707, + "step": 1640 + }, + { + "epoch": 0.19, + "learning_rate": 2.464273060115743e-07, + "logits/chosen": -2.3424806594848633, + "logits/rejected": -2.342161178588867, + "logps/chosen": -302.66943359375, + "logps/rejected": -252.23831176757812, + "loss": 0.5064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5033154487609863, + "rewards/margins": 1.184190034866333, + "rewards/rejected": -1.6875056028366089, + "step": 1641 + }, + { + "epoch": 0.19, + "learning_rate": 2.4639187433565607e-07, + "logits/chosen": -2.7281360626220703, + "logits/rejected": -2.693927526473999, + "logps/chosen": -222.90823364257812, + "logps/rejected": -261.49078369140625, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5387765765190125, + "rewards/margins": 1.818695306777954, + "rewards/rejected": -2.3574721813201904, + "step": 1642 + }, + { + "epoch": 0.19, + "learning_rate": 2.463564426597378e-07, + "logits/chosen": -1.4799219369888306, + "logits/rejected": -1.6706809997558594, + "logps/chosen": -382.2003479003906, + "logps/rejected": -265.8782653808594, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.490817904472351, + "rewards/margins": 0.4824899435043335, + "rewards/rejected": -1.9733078479766846, + "step": 1643 + }, + { + "epoch": 0.19, + "learning_rate": 2.463210109838195e-07, + "logits/chosen": -2.345161199569702, + "logits/rejected": -2.1760034561157227, + "logps/chosen": -153.2363739013672, + "logps/rejected": -314.918701171875, + "loss": 0.5383, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2774553298950195, + "rewards/margins": 1.6682074069976807, + "rewards/rejected": -2.9456629753112793, + "step": 1644 + }, + { + "epoch": 0.19, + "learning_rate": 2.4628557930790126e-07, + "logits/chosen": -2.497775077819824, + "logits/rejected": -2.461188554763794, + "logps/chosen": -250.63442993164062, + "logps/rejected": -219.34765625, + "loss": 0.3639, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03010420873761177, + "rewards/margins": 2.285468339920044, + "rewards/rejected": -2.315572500228882, + "step": 1645 + }, + { + "epoch": 0.19, + "learning_rate": 2.46250147631983e-07, + "logits/chosen": -2.8053550720214844, + "logits/rejected": -2.863651990890503, + "logps/chosen": -216.71046447753906, + "logps/rejected": -241.51486206054688, + "loss": 0.4356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.599645733833313, + "rewards/margins": 1.560119867324829, + "rewards/rejected": -2.1597657203674316, + "step": 1646 + }, + { + "epoch": 0.19, + "learning_rate": 2.462147159560647e-07, + "logits/chosen": -2.4533729553222656, + "logits/rejected": -2.4672818183898926, + "logps/chosen": -146.9352569580078, + "logps/rejected": -213.19842529296875, + "loss": 0.1786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36431729793548584, + "rewards/margins": 2.724088191986084, + "rewards/rejected": -3.0884053707122803, + "step": 1647 + }, + { + "epoch": 0.19, + "learning_rate": 2.4617928428014645e-07, + "logits/chosen": -2.381521701812744, + "logits/rejected": -2.6353683471679688, + "logps/chosen": -393.6332702636719, + "logps/rejected": -341.0672912597656, + "loss": 0.8122, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5208781957626343, + "rewards/margins": 0.5866612195968628, + "rewards/rejected": -2.107539415359497, + "step": 1648 + }, + { + "epoch": 0.19, + "learning_rate": 2.4614385260422815e-07, + "logits/chosen": -1.7442585229873657, + "logits/rejected": -1.8897044658660889, + "logps/chosen": -301.6856384277344, + "logps/rejected": -255.666259765625, + "loss": 0.6217, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47638359665870667, + "rewards/margins": 0.3181024491786957, + "rewards/rejected": -0.7944860458374023, + "step": 1649 + }, + { + "epoch": 0.19, + "learning_rate": 2.461084209283099e-07, + "logits/chosen": -2.0834336280822754, + "logits/rejected": -1.9874670505523682, + "logps/chosen": -431.22625732421875, + "logps/rejected": -451.61602783203125, + "loss": 0.3955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8054955005645752, + "rewards/margins": 1.2469618320465088, + "rewards/rejected": -2.052457332611084, + "step": 1650 + }, + { + "epoch": 0.19, + "learning_rate": 2.460729892523916e-07, + "logits/chosen": -2.6949408054351807, + "logits/rejected": -2.643350839614868, + "logps/chosen": -232.38587951660156, + "logps/rejected": -181.9983673095703, + "loss": 0.4744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41399484872817993, + "rewards/margins": 2.2576088905334473, + "rewards/rejected": -2.6716039180755615, + "step": 1651 + }, + { + "epoch": 0.19, + "learning_rate": 2.4603755757647334e-07, + "logits/chosen": -1.906083106994629, + "logits/rejected": -2.0011038780212402, + "logps/chosen": -134.99073791503906, + "logps/rejected": -179.1443328857422, + "loss": 0.6521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9401741027832031, + "rewards/margins": 1.3125808238983154, + "rewards/rejected": -2.2527549266815186, + "step": 1652 + }, + { + "epoch": 0.19, + "learning_rate": 2.460021259005551e-07, + "logits/chosen": -2.2177886962890625, + "logits/rejected": -2.2340290546417236, + "logps/chosen": -216.84971618652344, + "logps/rejected": -298.9881896972656, + "loss": 0.5447, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7549578547477722, + "rewards/margins": 1.1231160163879395, + "rewards/rejected": -1.8780736923217773, + "step": 1653 + }, + { + "epoch": 0.19, + "learning_rate": 2.4596669422463684e-07, + "logits/chosen": -2.5723934173583984, + "logits/rejected": -2.2472755908966064, + "logps/chosen": -236.8076171875, + "logps/rejected": -295.8975830078125, + "loss": 0.5187, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6546620726585388, + "rewards/margins": 1.9492071866989136, + "rewards/rejected": -2.6038694381713867, + "step": 1654 + }, + { + "epoch": 0.19, + "learning_rate": 2.4593126254871854e-07, + "logits/chosen": -2.250518321990967, + "logits/rejected": -2.2545807361602783, + "logps/chosen": -331.33544921875, + "logps/rejected": -317.98345947265625, + "loss": 0.2505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6543898582458496, + "rewards/margins": 1.596533179283142, + "rewards/rejected": -2.2509231567382812, + "step": 1655 + }, + { + "epoch": 0.19, + "learning_rate": 2.458958308728003e-07, + "logits/chosen": -2.544801712036133, + "logits/rejected": -2.5372486114501953, + "logps/chosen": -276.8548889160156, + "logps/rejected": -282.4296875, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.291813462972641, + "rewards/margins": 3.296168804168701, + "rewards/rejected": -3.587982177734375, + "step": 1656 + }, + { + "epoch": 0.19, + "learning_rate": 2.4586039919688203e-07, + "logits/chosen": -2.466866970062256, + "logits/rejected": -2.5726068019866943, + "logps/chosen": -362.256103515625, + "logps/rejected": -311.67041015625, + "loss": 0.5525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6920166015625, + "rewards/margins": 1.1808189153671265, + "rewards/rejected": -1.872835397720337, + "step": 1657 + }, + { + "epoch": 0.19, + "learning_rate": 2.4582496752096373e-07, + "logits/chosen": -2.522550582885742, + "logits/rejected": -2.6290950775146484, + "logps/chosen": -272.4031677246094, + "logps/rejected": -217.59832763671875, + "loss": 0.3586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5821040868759155, + "rewards/margins": 1.3093690872192383, + "rewards/rejected": -1.8914731740951538, + "step": 1658 + }, + { + "epoch": 0.19, + "learning_rate": 2.457895358450455e-07, + "logits/chosen": -2.2223665714263916, + "logits/rejected": -2.523585319519043, + "logps/chosen": -392.3723449707031, + "logps/rejected": -327.9945068359375, + "loss": 0.3464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03784913569688797, + "rewards/margins": 1.205685019493103, + "rewards/rejected": -1.2435342073440552, + "step": 1659 + }, + { + "epoch": 0.19, + "learning_rate": 2.4575410416912717e-07, + "logits/chosen": -2.2344162464141846, + "logits/rejected": -2.3971595764160156, + "logps/chosen": -261.149169921875, + "logps/rejected": -270.95501708984375, + "loss": 0.2615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02218186855316162, + "rewards/margins": 1.8940696716308594, + "rewards/rejected": -1.9162514209747314, + "step": 1660 + }, + { + "epoch": 0.19, + "learning_rate": 2.457186724932089e-07, + "logits/chosen": -2.402071714401245, + "logits/rejected": -2.302401542663574, + "logps/chosen": -365.9566345214844, + "logps/rejected": -309.6323547363281, + "loss": 0.2403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4284933805465698, + "rewards/margins": 1.7880432605743408, + "rewards/rejected": -2.2165367603302, + "step": 1661 + }, + { + "epoch": 0.19, + "learning_rate": 2.456832408172906e-07, + "logits/chosen": -2.629833698272705, + "logits/rejected": -2.5907821655273438, + "logps/chosen": -170.3055419921875, + "logps/rejected": -206.78289794921875, + "loss": 0.4387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15459054708480835, + "rewards/margins": 1.1244741678237915, + "rewards/rejected": -1.279064655303955, + "step": 1662 + }, + { + "epoch": 0.19, + "learning_rate": 2.4564780914137237e-07, + "logits/chosen": -1.9750040769577026, + "logits/rejected": -2.087946891784668, + "logps/chosen": -222.98129272460938, + "logps/rejected": -227.36981201171875, + "loss": 0.4206, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0134376287460327, + "rewards/margins": 0.8043762445449829, + "rewards/rejected": -1.8178138732910156, + "step": 1663 + }, + { + "epoch": 0.19, + "learning_rate": 2.456123774654541e-07, + "logits/chosen": -2.5260679721832275, + "logits/rejected": -2.8178257942199707, + "logps/chosen": -180.5194091796875, + "logps/rejected": -196.1873016357422, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48911577463150024, + "rewards/margins": 2.1488428115844727, + "rewards/rejected": -2.637958288192749, + "step": 1664 + }, + { + "epoch": 0.19, + "learning_rate": 2.4557694578953586e-07, + "logits/chosen": -2.1998372077941895, + "logits/rejected": -2.041743755340576, + "logps/chosen": -171.25265502929688, + "logps/rejected": -278.51043701171875, + "loss": 0.6175, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6311626434326172, + "rewards/margins": 1.034217357635498, + "rewards/rejected": -1.6653801202774048, + "step": 1665 + }, + { + "epoch": 0.19, + "learning_rate": 2.4554151411361756e-07, + "logits/chosen": -1.9722144603729248, + "logits/rejected": -2.081778049468994, + "logps/chosen": -252.91864013671875, + "logps/rejected": -268.276123046875, + "loss": 0.4252, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.319704532623291, + "rewards/margins": 1.5936336517333984, + "rewards/rejected": -2.9133381843566895, + "step": 1666 + }, + { + "epoch": 0.19, + "learning_rate": 2.455060824376993e-07, + "logits/chosen": -2.25974702835083, + "logits/rejected": -2.577329635620117, + "logps/chosen": -366.6850280761719, + "logps/rejected": -280.7003173828125, + "loss": 0.3052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6532570719718933, + "rewards/margins": 2.4932761192321777, + "rewards/rejected": -3.146533250808716, + "step": 1667 + }, + { + "epoch": 0.19, + "learning_rate": 2.45470650761781e-07, + "logits/chosen": -2.4962587356567383, + "logits/rejected": -2.619797468185425, + "logps/chosen": -374.7917785644531, + "logps/rejected": -343.32672119140625, + "loss": 0.8062, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2500332593917847, + "rewards/margins": 0.9725916981697083, + "rewards/rejected": -2.2226247787475586, + "step": 1668 + }, + { + "epoch": 0.19, + "learning_rate": 2.4543521908586275e-07, + "logits/chosen": -2.8976550102233887, + "logits/rejected": -2.7989583015441895, + "logps/chosen": -79.97217559814453, + "logps/rejected": -212.48098754882812, + "loss": 0.1916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1216549202799797, + "rewards/margins": 3.2446837425231934, + "rewards/rejected": -3.3663384914398193, + "step": 1669 + }, + { + "epoch": 0.19, + "learning_rate": 2.453997874099445e-07, + "logits/chosen": -2.394810914993286, + "logits/rejected": -2.648609161376953, + "logps/chosen": -387.6202392578125, + "logps/rejected": -223.49432373046875, + "loss": 0.6886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9422688484191895, + "rewards/margins": 0.30546462535858154, + "rewards/rejected": -1.247733473777771, + "step": 1670 + }, + { + "epoch": 0.19, + "learning_rate": 2.453643557340262e-07, + "logits/chosen": -1.7729119062423706, + "logits/rejected": -1.8861068487167358, + "logps/chosen": -343.9400939941406, + "logps/rejected": -301.35186767578125, + "loss": 0.3627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48091018199920654, + "rewards/margins": 1.2555638551712036, + "rewards/rejected": -1.7364740371704102, + "step": 1671 + }, + { + "epoch": 0.19, + "learning_rate": 2.4532892405810794e-07, + "logits/chosen": -2.526780128479004, + "logits/rejected": -2.3439173698425293, + "logps/chosen": -110.61292266845703, + "logps/rejected": -217.36231994628906, + "loss": 0.4063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.654490053653717, + "rewards/margins": 1.2306182384490967, + "rewards/rejected": -1.885108232498169, + "step": 1672 + }, + { + "epoch": 0.19, + "learning_rate": 2.4529349238218964e-07, + "logits/chosen": -2.3020904064178467, + "logits/rejected": -2.290064811706543, + "logps/chosen": -234.40968322753906, + "logps/rejected": -259.13128662109375, + "loss": 0.3523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46267086267471313, + "rewards/margins": 2.5941081047058105, + "rewards/rejected": -3.056779384613037, + "step": 1673 + }, + { + "epoch": 0.19, + "learning_rate": 2.452580607062714e-07, + "logits/chosen": -2.213320732116699, + "logits/rejected": -2.45706844329834, + "logps/chosen": -439.188232421875, + "logps/rejected": -267.59515380859375, + "loss": 0.2236, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41147154569625854, + "rewards/margins": 2.104182481765747, + "rewards/rejected": -2.5156538486480713, + "step": 1674 + }, + { + "epoch": 0.19, + "learning_rate": 2.4522262903035314e-07, + "logits/chosen": -1.7336218357086182, + "logits/rejected": -2.2416436672210693, + "logps/chosen": -522.2005004882812, + "logps/rejected": -300.19049072265625, + "loss": 0.4166, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23829028010368347, + "rewards/margins": 1.2266545295715332, + "rewards/rejected": -1.464944839477539, + "step": 1675 + }, + { + "epoch": 0.19, + "learning_rate": 2.4518719735443483e-07, + "logits/chosen": -2.3902876377105713, + "logits/rejected": -2.3526713848114014, + "logps/chosen": -266.8745422363281, + "logps/rejected": -222.37503051757812, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6263242959976196, + "rewards/margins": 1.5320783853530884, + "rewards/rejected": -2.158402681350708, + "step": 1676 + }, + { + "epoch": 0.2, + "learning_rate": 2.451517656785166e-07, + "logits/chosen": -2.2450807094573975, + "logits/rejected": -1.9490177631378174, + "logps/chosen": -215.37759399414062, + "logps/rejected": -284.7712707519531, + "loss": 1.0238, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4993760585784912, + "rewards/margins": -0.12135636806488037, + "rewards/rejected": -1.3780198097229004, + "step": 1677 + }, + { + "epoch": 0.2, + "learning_rate": 2.4511633400259833e-07, + "logits/chosen": -1.7775871753692627, + "logits/rejected": -1.8299651145935059, + "logps/chosen": -213.48779296875, + "logps/rejected": -259.5183410644531, + "loss": 0.6161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5519853830337524, + "rewards/margins": 0.9574552178382874, + "rewards/rejected": -1.509440541267395, + "step": 1678 + }, + { + "epoch": 0.2, + "learning_rate": 2.4508090232668e-07, + "logits/chosen": -2.671874761581421, + "logits/rejected": -2.7615647315979004, + "logps/chosen": -232.97877502441406, + "logps/rejected": -269.1032409667969, + "loss": 0.5784, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9884816408157349, + "rewards/margins": 2.013230562210083, + "rewards/rejected": -3.0017120838165283, + "step": 1679 + }, + { + "epoch": 0.2, + "learning_rate": 2.450454706507618e-07, + "logits/chosen": -2.2875802516937256, + "logits/rejected": -2.2927958965301514, + "logps/chosen": -170.98037719726562, + "logps/rejected": -205.88623046875, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095455765724182, + "rewards/margins": 2.273000717163086, + "rewards/rejected": -3.0825462341308594, + "step": 1680 + }, + { + "epoch": 0.2, + "learning_rate": 2.450100389748435e-07, + "logits/chosen": -2.0003745555877686, + "logits/rejected": -1.6723541021347046, + "logps/chosen": -156.29800415039062, + "logps/rejected": -292.14337158203125, + "loss": 0.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8538736701011658, + "rewards/margins": 1.4568774700164795, + "rewards/rejected": -2.31075119972229, + "step": 1681 + }, + { + "epoch": 0.2, + "learning_rate": 2.449746072989252e-07, + "logits/chosen": -1.869611382484436, + "logits/rejected": -1.941335916519165, + "logps/chosen": -413.1986083984375, + "logps/rejected": -372.622314453125, + "loss": 0.4483, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.547063946723938, + "rewards/margins": 1.2055723667144775, + "rewards/rejected": -1.752636194229126, + "step": 1682 + }, + { + "epoch": 0.2, + "learning_rate": 2.4493917562300697e-07, + "logits/chosen": -2.2228286266326904, + "logits/rejected": -2.206742525100708, + "logps/chosen": -199.76353454589844, + "logps/rejected": -235.65126037597656, + "loss": 0.4029, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7561392784118652, + "rewards/margins": 1.791888952255249, + "rewards/rejected": -2.5480282306671143, + "step": 1683 + }, + { + "epoch": 0.2, + "learning_rate": 2.4490374394708866e-07, + "logits/chosen": -1.8739206790924072, + "logits/rejected": -2.337437152862549, + "logps/chosen": -295.287841796875, + "logps/rejected": -192.88650512695312, + "loss": 0.3685, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3055373430252075, + "rewards/margins": 1.8031154870986938, + "rewards/rejected": -2.1086528301239014, + "step": 1684 + }, + { + "epoch": 0.2, + "learning_rate": 2.448683122711704e-07, + "logits/chosen": -2.243102550506592, + "logits/rejected": -2.0844855308532715, + "logps/chosen": -127.83473205566406, + "logps/rejected": -136.898681640625, + "loss": 0.7257, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3347032070159912, + "rewards/margins": 0.23142650723457336, + "rewards/rejected": -1.5661296844482422, + "step": 1685 + }, + { + "epoch": 0.2, + "learning_rate": 2.4483288059525216e-07, + "logits/chosen": -2.5224738121032715, + "logits/rejected": -2.3596463203430176, + "logps/chosen": -229.8944549560547, + "logps/rejected": -501.04241943359375, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7905840277671814, + "rewards/margins": 1.1842104196548462, + "rewards/rejected": -1.9747943878173828, + "step": 1686 + }, + { + "epoch": 0.2, + "learning_rate": 2.4479744891933386e-07, + "logits/chosen": -2.397657632827759, + "logits/rejected": -2.67763090133667, + "logps/chosen": -332.201904296875, + "logps/rejected": -306.04638671875, + "loss": 0.2177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1924385130405426, + "rewards/margins": 2.966355323791504, + "rewards/rejected": -3.1587939262390137, + "step": 1687 + }, + { + "epoch": 0.2, + "learning_rate": 2.447620172434156e-07, + "logits/chosen": -2.4252774715423584, + "logits/rejected": -2.042975664138794, + "logps/chosen": -213.68051147460938, + "logps/rejected": -332.4058532714844, + "loss": 0.1683, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5314593315124512, + "rewards/margins": 2.5401315689086914, + "rewards/rejected": -3.0715906620025635, + "step": 1688 + }, + { + "epoch": 0.2, + "learning_rate": 2.4472658556749735e-07, + "logits/chosen": -2.3236591815948486, + "logits/rejected": -2.1976144313812256, + "logps/chosen": -224.82814025878906, + "logps/rejected": -285.0279541015625, + "loss": 0.1829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5715979337692261, + "rewards/margins": 2.3901619911193848, + "rewards/rejected": -2.9617598056793213, + "step": 1689 + }, + { + "epoch": 0.2, + "learning_rate": 2.4469115389157905e-07, + "logits/chosen": -2.4370248317718506, + "logits/rejected": -2.330864667892456, + "logps/chosen": -195.62496948242188, + "logps/rejected": -172.05862426757812, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6914937496185303, + "rewards/margins": 3.0850682258605957, + "rewards/rejected": -3.776562213897705, + "step": 1690 + }, + { + "epoch": 0.2, + "learning_rate": 2.446557222156608e-07, + "logits/chosen": -2.6006290912628174, + "logits/rejected": -2.378993511199951, + "logps/chosen": -300.4176330566406, + "logps/rejected": -288.8329772949219, + "loss": 0.5316, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6617050170898438, + "rewards/margins": 1.3923941850662231, + "rewards/rejected": -2.0540993213653564, + "step": 1691 + }, + { + "epoch": 0.2, + "learning_rate": 2.4462029053974255e-07, + "logits/chosen": -2.4011642932891846, + "logits/rejected": -2.38321590423584, + "logps/chosen": -274.719482421875, + "logps/rejected": -334.5320739746094, + "loss": 0.1688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8670014142990112, + "rewards/margins": 2.6068618297576904, + "rewards/rejected": -3.473863124847412, + "step": 1692 + }, + { + "epoch": 0.2, + "learning_rate": 2.4458485886382424e-07, + "logits/chosen": -2.7992653846740723, + "logits/rejected": -2.691972255706787, + "logps/chosen": -221.02493286132812, + "logps/rejected": -216.6522979736328, + "loss": 0.5473, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.495931625366211, + "rewards/margins": 0.8376299142837524, + "rewards/rejected": -2.333561658859253, + "step": 1693 + }, + { + "epoch": 0.2, + "learning_rate": 2.44549427187906e-07, + "logits/chosen": -2.4074881076812744, + "logits/rejected": -2.5063364505767822, + "logps/chosen": -427.97503662109375, + "logps/rejected": -287.05535888671875, + "loss": 0.5613, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0978846549987793, + "rewards/margins": 2.203669786453247, + "rewards/rejected": -3.3015546798706055, + "step": 1694 + }, + { + "epoch": 0.2, + "learning_rate": 2.445139955119877e-07, + "logits/chosen": -2.5927371978759766, + "logits/rejected": -2.790435791015625, + "logps/chosen": -257.138916015625, + "logps/rejected": -224.1407470703125, + "loss": 0.8213, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3167378902435303, + "rewards/margins": 2.021420478820801, + "rewards/rejected": -3.33815860748291, + "step": 1695 + }, + { + "epoch": 0.2, + "learning_rate": 2.4447856383606943e-07, + "logits/chosen": -2.9457216262817383, + "logits/rejected": -2.7275028228759766, + "logps/chosen": -178.66099548339844, + "logps/rejected": -268.76544189453125, + "loss": 0.3225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5670561790466309, + "rewards/margins": 1.6274358034133911, + "rewards/rejected": -2.1944918632507324, + "step": 1696 + }, + { + "epoch": 0.2, + "learning_rate": 2.4444313216015113e-07, + "logits/chosen": -2.6083035469055176, + "logits/rejected": -2.3719499111175537, + "logps/chosen": -431.0684814453125, + "logps/rejected": -250.6070556640625, + "loss": 0.2972, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4644041955471039, + "rewards/margins": 1.6125348806381226, + "rewards/rejected": -2.076939105987549, + "step": 1697 + }, + { + "epoch": 0.2, + "learning_rate": 2.444077004842329e-07, + "logits/chosen": -2.3073740005493164, + "logits/rejected": -2.217200994491577, + "logps/chosen": -283.94134521484375, + "logps/rejected": -496.77032470703125, + "loss": 0.6763, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4758017659187317, + "rewards/margins": 0.22290386259555817, + "rewards/rejected": -0.6987056136131287, + "step": 1698 + }, + { + "epoch": 0.2, + "learning_rate": 2.4437226880831463e-07, + "logits/chosen": -2.6404643058776855, + "logits/rejected": -2.5165669918060303, + "logps/chosen": -246.29473876953125, + "logps/rejected": -383.0121765136719, + "loss": 0.533, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3632125854492188, + "rewards/margins": 0.9686402082443237, + "rewards/rejected": -2.331852674484253, + "step": 1699 + }, + { + "epoch": 0.2, + "learning_rate": 2.443368371323964e-07, + "logits/chosen": -2.692833185195923, + "logits/rejected": -2.749985694885254, + "logps/chosen": -288.43060302734375, + "logps/rejected": -226.47560119628906, + "loss": 0.1727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6061241626739502, + "rewards/margins": 2.48777174949646, + "rewards/rejected": -3.093895673751831, + "step": 1700 + }, + { + "epoch": 0.2, + "learning_rate": 2.4430140545647807e-07, + "logits/chosen": -2.6619269847869873, + "logits/rejected": -2.515246629714966, + "logps/chosen": -437.6355895996094, + "logps/rejected": -348.0819396972656, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1193317174911499, + "rewards/margins": 2.6971993446350098, + "rewards/rejected": -2.816531181335449, + "step": 1701 + }, + { + "epoch": 0.2, + "learning_rate": 2.442659737805598e-07, + "logits/chosen": -2.635606288909912, + "logits/rejected": -2.6611692905426025, + "logps/chosen": -217.91265869140625, + "logps/rejected": -276.42266845703125, + "loss": 0.2388, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0138905048370361, + "rewards/margins": 2.4784152507781982, + "rewards/rejected": -3.4923057556152344, + "step": 1702 + }, + { + "epoch": 0.2, + "learning_rate": 2.4423054210464157e-07, + "logits/chosen": -1.7425665855407715, + "logits/rejected": -1.7193001508712769, + "logps/chosen": -290.45196533203125, + "logps/rejected": -254.73318481445312, + "loss": 0.3901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5442890524864197, + "rewards/margins": 0.9033942222595215, + "rewards/rejected": -1.447683334350586, + "step": 1703 + }, + { + "epoch": 0.2, + "learning_rate": 2.4419511042872326e-07, + "logits/chosen": -2.8911690711975098, + "logits/rejected": -2.672600746154785, + "logps/chosen": -64.15889739990234, + "logps/rejected": -203.60073852539062, + "loss": 0.2929, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5320387482643127, + "rewards/margins": 1.5270730257034302, + "rewards/rejected": -2.0591118335723877, + "step": 1704 + }, + { + "epoch": 0.2, + "learning_rate": 2.44159678752805e-07, + "logits/chosen": -2.687408447265625, + "logits/rejected": -2.487170934677124, + "logps/chosen": -192.8691864013672, + "logps/rejected": -398.520751953125, + "loss": 0.6108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3932178020477295, + "rewards/margins": 1.695877194404602, + "rewards/rejected": -3.089094877243042, + "step": 1705 + }, + { + "epoch": 0.2, + "learning_rate": 2.441242470768867e-07, + "logits/chosen": -2.171093463897705, + "logits/rejected": -2.3537776470184326, + "logps/chosen": -437.00201416015625, + "logps/rejected": -279.71221923828125, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1990809440612793, + "rewards/margins": 0.5425747632980347, + "rewards/rejected": -1.7416558265686035, + "step": 1706 + }, + { + "epoch": 0.2, + "learning_rate": 2.4408881540096846e-07, + "logits/chosen": -2.5590147972106934, + "logits/rejected": -2.745074510574341, + "logps/chosen": -236.73760986328125, + "logps/rejected": -228.22027587890625, + "loss": 0.3705, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9400160908699036, + "rewards/margins": 2.9647109508514404, + "rewards/rejected": -3.904726982116699, + "step": 1707 + }, + { + "epoch": 0.2, + "learning_rate": 2.4405338372505015e-07, + "logits/chosen": -2.1368656158447266, + "logits/rejected": -2.1610143184661865, + "logps/chosen": -165.5930938720703, + "logps/rejected": -204.6591339111328, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3611239194869995, + "rewards/margins": 2.2286953926086426, + "rewards/rejected": -2.5898194313049316, + "step": 1708 + }, + { + "epoch": 0.2, + "learning_rate": 2.440179520491319e-07, + "logits/chosen": -2.3005166053771973, + "logits/rejected": -2.191126585006714, + "logps/chosen": -198.12051391601562, + "logps/rejected": -221.3509063720703, + "loss": 0.7289, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7328636646270752, + "rewards/margins": 0.2828371524810791, + "rewards/rejected": -2.0157008171081543, + "step": 1709 + }, + { + "epoch": 0.2, + "learning_rate": 2.4398252037321365e-07, + "logits/chosen": -2.8378093242645264, + "logits/rejected": -2.832202911376953, + "logps/chosen": -306.0372314453125, + "logps/rejected": -284.5621643066406, + "loss": 0.1019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6866308450698853, + "rewards/margins": 3.0838046073913574, + "rewards/rejected": -3.7704360485076904, + "step": 1710 + }, + { + "epoch": 0.2, + "learning_rate": 2.4394708869729535e-07, + "logits/chosen": -2.4000842571258545, + "logits/rejected": -2.225205421447754, + "logps/chosen": -216.18511962890625, + "logps/rejected": -300.9520263671875, + "loss": 0.7521, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1064001321792603, + "rewards/margins": 1.5065675973892212, + "rewards/rejected": -2.6129677295684814, + "step": 1711 + }, + { + "epoch": 0.2, + "learning_rate": 2.439116570213771e-07, + "logits/chosen": -2.0285186767578125, + "logits/rejected": -1.8265560865402222, + "logps/chosen": -202.61000061035156, + "logps/rejected": -271.3106689453125, + "loss": 0.2628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7479815483093262, + "rewards/margins": 2.069674253463745, + "rewards/rejected": -2.8176558017730713, + "step": 1712 + }, + { + "epoch": 0.2, + "learning_rate": 2.4387622534545884e-07, + "logits/chosen": -2.3101818561553955, + "logits/rejected": -2.424459934234619, + "logps/chosen": -344.6812744140625, + "logps/rejected": -302.7609558105469, + "loss": 0.3615, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.123995065689087, + "rewards/margins": 1.4284008741378784, + "rewards/rejected": -2.552396059036255, + "step": 1713 + }, + { + "epoch": 0.2, + "learning_rate": 2.438407936695406e-07, + "logits/chosen": -2.2773690223693848, + "logits/rejected": -2.1974005699157715, + "logps/chosen": -279.6014404296875, + "logps/rejected": -315.0476379394531, + "loss": 0.1055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09783801436424255, + "rewards/margins": 2.8355906009674072, + "rewards/rejected": -2.9334287643432617, + "step": 1714 + }, + { + "epoch": 0.2, + "learning_rate": 2.438053619936223e-07, + "logits/chosen": -2.0184121131896973, + "logits/rejected": -2.0313773155212402, + "logps/chosen": -219.04071044921875, + "logps/rejected": -188.76881408691406, + "loss": 0.2332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17230436205863953, + "rewards/margins": 2.386626720428467, + "rewards/rejected": -2.5589311122894287, + "step": 1715 + }, + { + "epoch": 0.2, + "learning_rate": 2.4376993031770404e-07, + "logits/chosen": -2.68499755859375, + "logits/rejected": -2.846951961517334, + "logps/chosen": -123.61669158935547, + "logps/rejected": -182.27523803710938, + "loss": 0.3136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41845184564590454, + "rewards/margins": 2.9919564723968506, + "rewards/rejected": -3.4104082584381104, + "step": 1716 + }, + { + "epoch": 0.2, + "learning_rate": 2.4373449864178573e-07, + "logits/chosen": -1.7609859704971313, + "logits/rejected": -2.1357250213623047, + "logps/chosen": -267.39202880859375, + "logps/rejected": -194.7054901123047, + "loss": 0.9597, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4059154987335205, + "rewards/margins": 0.09404361248016357, + "rewards/rejected": -1.499959111213684, + "step": 1717 + }, + { + "epoch": 0.2, + "learning_rate": 2.436990669658675e-07, + "logits/chosen": -2.9427547454833984, + "logits/rejected": -2.8305983543395996, + "logps/chosen": -282.66192626953125, + "logps/rejected": -212.61468505859375, + "loss": 0.4891, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9498907327651978, + "rewards/margins": 2.364185333251953, + "rewards/rejected": -3.3140759468078613, + "step": 1718 + }, + { + "epoch": 0.2, + "learning_rate": 2.436636352899492e-07, + "logits/chosen": -2.3011960983276367, + "logits/rejected": -2.2079832553863525, + "logps/chosen": -302.9136047363281, + "logps/rejected": -407.46063232421875, + "loss": 0.2254, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1067568063735962, + "rewards/margins": 3.2774291038513184, + "rewards/rejected": -4.384185791015625, + "step": 1719 + }, + { + "epoch": 0.2, + "learning_rate": 2.436282036140309e-07, + "logits/chosen": -1.4618102312088013, + "logits/rejected": -1.0709110498428345, + "logps/chosen": -415.23162841796875, + "logps/rejected": -532.1278686523438, + "loss": 0.2288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42020225524902344, + "rewards/margins": 2.2518253326416016, + "rewards/rejected": -2.672027587890625, + "step": 1720 + }, + { + "epoch": 0.2, + "learning_rate": 2.4359277193811267e-07, + "logits/chosen": -2.912400007247925, + "logits/rejected": -2.772385358810425, + "logps/chosen": -181.66525268554688, + "logps/rejected": -224.1291961669922, + "loss": 0.092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6237964630126953, + "rewards/margins": 3.556155204772949, + "rewards/rejected": -4.179951190948486, + "step": 1721 + }, + { + "epoch": 0.2, + "learning_rate": 2.4355734026219437e-07, + "logits/chosen": -2.3696835041046143, + "logits/rejected": -2.6130361557006836, + "logps/chosen": -264.657958984375, + "logps/rejected": -206.3082275390625, + "loss": 0.3611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6540833711624146, + "rewards/margins": 1.9865055084228516, + "rewards/rejected": -2.6405887603759766, + "step": 1722 + }, + { + "epoch": 0.2, + "learning_rate": 2.435219085862761e-07, + "logits/chosen": -2.2988734245300293, + "logits/rejected": -2.220130443572998, + "logps/chosen": -213.69557189941406, + "logps/rejected": -308.3348083496094, + "loss": 0.2898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.554688036441803, + "rewards/margins": 2.5775656700134277, + "rewards/rejected": -3.132253885269165, + "step": 1723 + }, + { + "epoch": 0.2, + "learning_rate": 2.4348647691035787e-07, + "logits/chosen": -2.378164768218994, + "logits/rejected": -2.1779351234436035, + "logps/chosen": -308.5841979980469, + "logps/rejected": -449.5793151855469, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33357128500938416, + "rewards/margins": 3.676621437072754, + "rewards/rejected": -4.01019287109375, + "step": 1724 + }, + { + "epoch": 0.2, + "learning_rate": 2.434510452344396e-07, + "logits/chosen": -2.0856363773345947, + "logits/rejected": -2.1840147972106934, + "logps/chosen": -346.25537109375, + "logps/rejected": -262.4714660644531, + "loss": 0.2715, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20403826236724854, + "rewards/margins": 2.0922205448150635, + "rewards/rejected": -2.2962586879730225, + "step": 1725 + }, + { + "epoch": 0.2, + "learning_rate": 2.434156135585213e-07, + "logits/chosen": -2.3825149536132812, + "logits/rejected": -2.3598337173461914, + "logps/chosen": -332.9206237792969, + "logps/rejected": -318.6062927246094, + "loss": 0.3573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9182248711585999, + "rewards/margins": 2.081834077835083, + "rewards/rejected": -3.000059127807617, + "step": 1726 + }, + { + "epoch": 0.2, + "learning_rate": 2.4338018188260306e-07, + "logits/chosen": -2.05372953414917, + "logits/rejected": -2.3898844718933105, + "logps/chosen": -401.6635437011719, + "logps/rejected": -204.50555419921875, + "loss": 0.9492, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8602542281150818, + "rewards/margins": 0.16262242197990417, + "rewards/rejected": -1.0228767395019531, + "step": 1727 + }, + { + "epoch": 0.2, + "learning_rate": 2.4334475020668475e-07, + "logits/chosen": -3.0013933181762695, + "logits/rejected": -3.0418941974639893, + "logps/chosen": -333.9551086425781, + "logps/rejected": -355.92230224609375, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8301363587379456, + "rewards/margins": 1.1164392232894897, + "rewards/rejected": -1.94657564163208, + "step": 1728 + }, + { + "epoch": 0.2, + "learning_rate": 2.433093185307665e-07, + "logits/chosen": -2.230476140975952, + "logits/rejected": -2.489081859588623, + "logps/chosen": -566.677734375, + "logps/rejected": -404.4932556152344, + "loss": 0.3496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5814450979232788, + "rewards/margins": 1.8401529788970947, + "rewards/rejected": -2.421597957611084, + "step": 1729 + }, + { + "epoch": 0.2, + "learning_rate": 2.432738868548482e-07, + "logits/chosen": -2.016127824783325, + "logits/rejected": -2.083613872528076, + "logps/chosen": -297.24267578125, + "logps/rejected": -208.52816772460938, + "loss": 0.4982, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2986968457698822, + "rewards/margins": 0.9703996181488037, + "rewards/rejected": -1.2690964937210083, + "step": 1730 + }, + { + "epoch": 0.2, + "learning_rate": 2.4323845517892995e-07, + "logits/chosen": -1.9601771831512451, + "logits/rejected": -2.4126944541931152, + "logps/chosen": -478.25213623046875, + "logps/rejected": -438.8279113769531, + "loss": 0.266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23452526330947876, + "rewards/margins": 2.1234560012817383, + "rewards/rejected": -2.3579814434051514, + "step": 1731 + }, + { + "epoch": 0.2, + "learning_rate": 2.432030235030117e-07, + "logits/chosen": -2.0786635875701904, + "logits/rejected": -2.107469081878662, + "logps/chosen": -383.38592529296875, + "logps/rejected": -344.9381103515625, + "loss": 0.3009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9960392713546753, + "rewards/margins": 3.0186519622802734, + "rewards/rejected": -4.014691352844238, + "step": 1732 + }, + { + "epoch": 0.2, + "learning_rate": 2.431675918270934e-07, + "logits/chosen": -2.1233458518981934, + "logits/rejected": -2.3950748443603516, + "logps/chosen": -426.977294921875, + "logps/rejected": -368.4346008300781, + "loss": 0.2884, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09678052365779877, + "rewards/margins": 1.8052647113800049, + "rewards/rejected": -1.9020452499389648, + "step": 1733 + }, + { + "epoch": 0.2, + "learning_rate": 2.4313216015117514e-07, + "logits/chosen": -2.070875883102417, + "logits/rejected": -2.2786788940429688, + "logps/chosen": -222.96792602539062, + "logps/rejected": -244.6705780029297, + "loss": 0.8561, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.727665901184082, + "rewards/margins": 1.8865258693695068, + "rewards/rejected": -3.614192247390747, + "step": 1734 + }, + { + "epoch": 0.2, + "learning_rate": 2.430967284752569e-07, + "logits/chosen": -2.0697665214538574, + "logits/rejected": -2.15297794342041, + "logps/chosen": -223.64581298828125, + "logps/rejected": -301.3560791015625, + "loss": 0.2995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33793821930885315, + "rewards/margins": 2.382056713104248, + "rewards/rejected": -2.7199950218200684, + "step": 1735 + }, + { + "epoch": 0.2, + "learning_rate": 2.4306129679933864e-07, + "logits/chosen": -1.9906092882156372, + "logits/rejected": -1.9184799194335938, + "logps/chosen": -297.36273193359375, + "logps/rejected": -299.9766845703125, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8131616115570068, + "rewards/margins": 1.6239393949508667, + "rewards/rejected": -2.437101125717163, + "step": 1736 + }, + { + "epoch": 0.2, + "learning_rate": 2.4302586512342033e-07, + "logits/chosen": -1.6368451118469238, + "logits/rejected": -2.1341922283172607, + "logps/chosen": -253.68731689453125, + "logps/rejected": -160.78619384765625, + "loss": 0.4431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45115694403648376, + "rewards/margins": 1.5704196691513062, + "rewards/rejected": -2.0215766429901123, + "step": 1737 + }, + { + "epoch": 0.2, + "learning_rate": 2.429904334475021e-07, + "logits/chosen": -2.172466993331909, + "logits/rejected": -2.4013924598693848, + "logps/chosen": -292.63861083984375, + "logps/rejected": -255.68191528320312, + "loss": 0.5456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6883822083473206, + "rewards/margins": 1.114113211631775, + "rewards/rejected": -1.8024954795837402, + "step": 1738 + }, + { + "epoch": 0.2, + "learning_rate": 2.429550017715838e-07, + "logits/chosen": -2.3342628479003906, + "logits/rejected": -2.4550535678863525, + "logps/chosen": -387.2298889160156, + "logps/rejected": -298.09002685546875, + "loss": 0.2482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24177367985248566, + "rewards/margins": 2.0086538791656494, + "rewards/rejected": -2.250427722930908, + "step": 1739 + }, + { + "epoch": 0.2, + "learning_rate": 2.429195700956655e-07, + "logits/chosen": -2.771385908126831, + "logits/rejected": -2.8684158325195312, + "logps/chosen": -150.4694061279297, + "logps/rejected": -164.94189453125, + "loss": 0.3297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6034387946128845, + "rewards/margins": 2.2737200260162354, + "rewards/rejected": -2.8771588802337646, + "step": 1740 + }, + { + "epoch": 0.2, + "learning_rate": 2.428841384197472e-07, + "logits/chosen": -2.265669584274292, + "logits/rejected": -2.398066520690918, + "logps/chosen": -104.03521728515625, + "logps/rejected": -161.91632080078125, + "loss": 0.3678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.517264723777771, + "rewards/margins": 1.8584647178649902, + "rewards/rejected": -2.3757293224334717, + "step": 1741 + }, + { + "epoch": 0.2, + "learning_rate": 2.4284870674382897e-07, + "logits/chosen": -2.5113914012908936, + "logits/rejected": -2.400177240371704, + "logps/chosen": -377.9798583984375, + "logps/rejected": -319.6923828125, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2727707028388977, + "rewards/margins": 1.5354102849960327, + "rewards/rejected": -1.8081809282302856, + "step": 1742 + }, + { + "epoch": 0.2, + "learning_rate": 2.428132750679107e-07, + "logits/chosen": -2.472517490386963, + "logits/rejected": -2.654442310333252, + "logps/chosen": -259.4212646484375, + "logps/rejected": -281.5700988769531, + "loss": 0.6438, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9785155653953552, + "rewards/margins": 1.7906217575073242, + "rewards/rejected": -2.769137382507324, + "step": 1743 + }, + { + "epoch": 0.2, + "learning_rate": 2.427778433919924e-07, + "logits/chosen": -1.9710922241210938, + "logits/rejected": -1.8633582592010498, + "logps/chosen": -188.33795166015625, + "logps/rejected": -204.1896209716797, + "loss": 0.5351, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.922791600227356, + "rewards/margins": 1.250744342803955, + "rewards/rejected": -2.1735358238220215, + "step": 1744 + }, + { + "epoch": 0.2, + "learning_rate": 2.4274241171607416e-07, + "logits/chosen": -1.8367559909820557, + "logits/rejected": -1.5992932319641113, + "logps/chosen": -257.70367431640625, + "logps/rejected": -322.8680114746094, + "loss": 0.3123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.879590630531311, + "rewards/margins": 2.3005189895629883, + "rewards/rejected": -3.1801095008850098, + "step": 1745 + }, + { + "epoch": 0.2, + "learning_rate": 2.4270698004015586e-07, + "logits/chosen": -1.7887511253356934, + "logits/rejected": -1.8966578245162964, + "logps/chosen": -276.3487548828125, + "logps/rejected": -373.684326171875, + "loss": 0.53, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8763235211372375, + "rewards/margins": 1.4357131719589233, + "rewards/rejected": -2.3120367527008057, + "step": 1746 + }, + { + "epoch": 0.2, + "learning_rate": 2.426715483642376e-07, + "logits/chosen": -2.656315565109253, + "logits/rejected": -2.579333782196045, + "logps/chosen": -315.7026062011719, + "logps/rejected": -255.6226043701172, + "loss": 0.211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47250866889953613, + "rewards/margins": 2.1694719791412354, + "rewards/rejected": -2.6419806480407715, + "step": 1747 + }, + { + "epoch": 0.2, + "learning_rate": 2.4263611668831935e-07, + "logits/chosen": -2.2009036540985107, + "logits/rejected": -2.043870687484741, + "logps/chosen": -120.70841217041016, + "logps/rejected": -248.37046813964844, + "loss": 0.4674, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5444543361663818, + "rewards/margins": 1.8763632774353027, + "rewards/rejected": -2.4208176136016846, + "step": 1748 + }, + { + "epoch": 0.2, + "learning_rate": 2.426006850124011e-07, + "logits/chosen": -1.8161256313323975, + "logits/rejected": -1.8112542629241943, + "logps/chosen": -337.2427978515625, + "logps/rejected": -284.72509765625, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07423257827758789, + "rewards/margins": 1.8466020822525024, + "rewards/rejected": -1.9208346605300903, + "step": 1749 + }, + { + "epoch": 0.2, + "learning_rate": 2.425652533364828e-07, + "logits/chosen": -2.2315027713775635, + "logits/rejected": -2.217707395553589, + "logps/chosen": -246.38601684570312, + "logps/rejected": -317.8531494140625, + "loss": 0.4543, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8921371698379517, + "rewards/margins": 1.5083684921264648, + "rewards/rejected": -2.400505542755127, + "step": 1750 + }, + { + "epoch": 0.2, + "learning_rate": 2.4252982166056455e-07, + "logits/chosen": -2.1611251831054688, + "logits/rejected": -2.232698917388916, + "logps/chosen": -332.2293701171875, + "logps/rejected": -619.751953125, + "loss": 0.7693, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.820287823677063, + "rewards/margins": 2.303734064102173, + "rewards/rejected": -3.124021530151367, + "step": 1751 + }, + { + "epoch": 0.2, + "learning_rate": 2.4249438998464624e-07, + "logits/chosen": -2.4108920097351074, + "logits/rejected": -2.2093849182128906, + "logps/chosen": -165.78768920898438, + "logps/rejected": -231.1561279296875, + "loss": 0.1882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3740214407444, + "rewards/margins": 2.5665183067321777, + "rewards/rejected": -2.940539598464966, + "step": 1752 + }, + { + "epoch": 0.2, + "learning_rate": 2.42458958308728e-07, + "logits/chosen": -2.602820873260498, + "logits/rejected": -2.354285955429077, + "logps/chosen": -268.86834716796875, + "logps/rejected": -301.0560302734375, + "loss": 0.5991, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9442713260650635, + "rewards/margins": 1.6500985622406006, + "rewards/rejected": -3.594370126724243, + "step": 1753 + }, + { + "epoch": 0.2, + "learning_rate": 2.4242352663280974e-07, + "logits/chosen": -2.9923768043518066, + "logits/rejected": -3.011714220046997, + "logps/chosen": -254.20814514160156, + "logps/rejected": -225.48373413085938, + "loss": 0.191, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5736883878707886, + "rewards/margins": 2.286609649658203, + "rewards/rejected": -2.8602981567382812, + "step": 1754 + }, + { + "epoch": 0.2, + "learning_rate": 2.4238809495689144e-07, + "logits/chosen": -2.4682748317718506, + "logits/rejected": -2.3028483390808105, + "logps/chosen": -319.2618103027344, + "logps/rejected": -280.13507080078125, + "loss": 0.3517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8028172254562378, + "rewards/margins": 1.7629976272583008, + "rewards/rejected": -2.565814971923828, + "step": 1755 + }, + { + "epoch": 0.2, + "learning_rate": 2.423526632809732e-07, + "logits/chosen": -2.122105836868286, + "logits/rejected": -2.2178194522857666, + "logps/chosen": -522.3229370117188, + "logps/rejected": -415.02410888671875, + "loss": 0.1862, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35548821091651917, + "rewards/margins": 2.657902240753174, + "rewards/rejected": -3.01339054107666, + "step": 1756 + }, + { + "epoch": 0.2, + "learning_rate": 2.423172316050549e-07, + "logits/chosen": -2.711503505706787, + "logits/rejected": -2.771531820297241, + "logps/chosen": -221.61416625976562, + "logps/rejected": -331.5538024902344, + "loss": 0.3237, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.090427279472351, + "rewards/margins": 2.226104736328125, + "rewards/rejected": -3.3165318965911865, + "step": 1757 + }, + { + "epoch": 0.2, + "learning_rate": 2.4228179992913663e-07, + "logits/chosen": -2.7729597091674805, + "logits/rejected": -2.800952196121216, + "logps/chosen": -112.35714721679688, + "logps/rejected": -215.96092224121094, + "loss": 0.1733, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006350010633468628, + "rewards/margins": 3.6231441497802734, + "rewards/rejected": -3.6167938709259033, + "step": 1758 + }, + { + "epoch": 0.2, + "learning_rate": 2.422463682532184e-07, + "logits/chosen": -2.522773504257202, + "logits/rejected": -2.6411361694335938, + "logps/chosen": -350.42974853515625, + "logps/rejected": -279.61151123046875, + "loss": 1.4841, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.69384765625, + "rewards/margins": -0.13438910245895386, + "rewards/rejected": -2.5594584941864014, + "step": 1759 + }, + { + "epoch": 0.2, + "learning_rate": 2.422109365773001e-07, + "logits/chosen": -2.0582802295684814, + "logits/rejected": -1.9488418102264404, + "logps/chosen": -331.8688049316406, + "logps/rejected": -379.86431884765625, + "loss": 0.4142, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3662176132202148, + "rewards/margins": 2.1332383155822754, + "rewards/rejected": -3.4994559288024902, + "step": 1760 + }, + { + "epoch": 0.2, + "learning_rate": 2.421755049013818e-07, + "logits/chosen": -2.588632822036743, + "logits/rejected": -2.7747089862823486, + "logps/chosen": -215.11990356445312, + "logps/rejected": -201.19464111328125, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5133243203163147, + "rewards/margins": 1.2404953241348267, + "rewards/rejected": -1.7538195848464966, + "step": 1761 + }, + { + "epoch": 0.2, + "learning_rate": 2.4214007322546357e-07, + "logits/chosen": -2.286323070526123, + "logits/rejected": -1.9844850301742554, + "logps/chosen": -239.16629028320312, + "logps/rejected": -264.3179931640625, + "loss": 0.2999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7090801000595093, + "rewards/margins": 3.5325169563293457, + "rewards/rejected": -4.2415971755981445, + "step": 1762 + }, + { + "epoch": 0.21, + "learning_rate": 2.4210464154954527e-07, + "logits/chosen": -2.5698742866516113, + "logits/rejected": -2.4063408374786377, + "logps/chosen": -335.230224609375, + "logps/rejected": -375.5904541015625, + "loss": 0.2984, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19878751039505005, + "rewards/margins": 2.9006171226501465, + "rewards/rejected": -3.0994043350219727, + "step": 1763 + }, + { + "epoch": 0.21, + "learning_rate": 2.42069209873627e-07, + "logits/chosen": -2.5678648948669434, + "logits/rejected": -2.040738105773926, + "logps/chosen": -111.26048278808594, + "logps/rejected": -378.9677734375, + "loss": 0.3701, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9800238609313965, + "rewards/margins": 2.095574378967285, + "rewards/rejected": -3.0755982398986816, + "step": 1764 + }, + { + "epoch": 0.21, + "learning_rate": 2.4203377819770876e-07, + "logits/chosen": -2.012361526489258, + "logits/rejected": -2.108659505844116, + "logps/chosen": -116.91748809814453, + "logps/rejected": -198.938232421875, + "loss": 0.2235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9344013929367065, + "rewards/margins": 1.8535056114196777, + "rewards/rejected": -2.787907123565674, + "step": 1765 + }, + { + "epoch": 0.21, + "learning_rate": 2.4199834652179046e-07, + "logits/chosen": -2.162045955657959, + "logits/rejected": -2.70467209815979, + "logps/chosen": -532.46826171875, + "logps/rejected": -295.8982238769531, + "loss": 0.5583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8712948560714722, + "rewards/margins": 1.359626293182373, + "rewards/rejected": -2.2309212684631348, + "step": 1766 + }, + { + "epoch": 0.21, + "learning_rate": 2.419629148458722e-07, + "logits/chosen": -2.1358556747436523, + "logits/rejected": -2.148402690887451, + "logps/chosen": -250.0975341796875, + "logps/rejected": -268.36871337890625, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.290607750415802, + "rewards/margins": 1.5199196338653564, + "rewards/rejected": -1.8105273246765137, + "step": 1767 + }, + { + "epoch": 0.21, + "learning_rate": 2.419274831699539e-07, + "logits/chosen": -2.3079538345336914, + "logits/rejected": -2.667905330657959, + "logps/chosen": -251.67478942871094, + "logps/rejected": -176.84173583984375, + "loss": 0.4451, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6356084942817688, + "rewards/margins": 1.8497332334518433, + "rewards/rejected": -2.4853415489196777, + "step": 1768 + }, + { + "epoch": 0.21, + "learning_rate": 2.4189205149403565e-07, + "logits/chosen": -3.1554512977600098, + "logits/rejected": -3.1036746501922607, + "logps/chosen": -356.59979248046875, + "logps/rejected": -308.8307189941406, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8076197504997253, + "rewards/margins": 1.043885588645935, + "rewards/rejected": -1.8515052795410156, + "step": 1769 + }, + { + "epoch": 0.21, + "learning_rate": 2.418566198181174e-07, + "logits/chosen": -1.6423438787460327, + "logits/rejected": -1.8809717893600464, + "logps/chosen": -471.5633850097656, + "logps/rejected": -376.1085510253906, + "loss": 0.3424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6496620774269104, + "rewards/margins": 2.1963248252868652, + "rewards/rejected": -2.845987319946289, + "step": 1770 + }, + { + "epoch": 0.21, + "learning_rate": 2.4182118814219915e-07, + "logits/chosen": -2.3699305057525635, + "logits/rejected": -2.3077030181884766, + "logps/chosen": -152.7406463623047, + "logps/rejected": -99.40492248535156, + "loss": 0.3901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7054970264434814, + "rewards/margins": 1.1497149467468262, + "rewards/rejected": -1.8552119731903076, + "step": 1771 + }, + { + "epoch": 0.21, + "learning_rate": 2.4178575646628084e-07, + "logits/chosen": -2.260051727294922, + "logits/rejected": -2.225996494293213, + "logps/chosen": -182.92218017578125, + "logps/rejected": -171.13858032226562, + "loss": 0.7437, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9591944217681885, + "rewards/margins": 1.1684916019439697, + "rewards/rejected": -2.127686023712158, + "step": 1772 + }, + { + "epoch": 0.21, + "learning_rate": 2.417503247903626e-07, + "logits/chosen": -2.965254545211792, + "logits/rejected": -2.9681715965270996, + "logps/chosen": -477.01141357421875, + "logps/rejected": -319.0557861328125, + "loss": 0.4576, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2657482624053955, + "rewards/margins": 2.438108444213867, + "rewards/rejected": -3.7038567066192627, + "step": 1773 + }, + { + "epoch": 0.21, + "learning_rate": 2.417148931144443e-07, + "logits/chosen": -2.0817630290985107, + "logits/rejected": -2.1232998371124268, + "logps/chosen": -216.533447265625, + "logps/rejected": -231.09722900390625, + "loss": 0.5355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.630585253238678, + "rewards/margins": 1.9499613046646118, + "rewards/rejected": -2.5805463790893555, + "step": 1774 + }, + { + "epoch": 0.21, + "learning_rate": 2.4167946143852604e-07, + "logits/chosen": -2.634916305541992, + "logits/rejected": -2.901615858078003, + "logps/chosen": -222.041748046875, + "logps/rejected": -229.88272094726562, + "loss": 0.6299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9937788248062134, + "rewards/margins": 1.422478437423706, + "rewards/rejected": -2.41625714302063, + "step": 1775 + }, + { + "epoch": 0.21, + "learning_rate": 2.4164402976260773e-07, + "logits/chosen": -2.301354169845581, + "logits/rejected": -2.17429256439209, + "logps/chosen": -323.59344482421875, + "logps/rejected": -327.7535705566406, + "loss": 0.3608, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29222571849823, + "rewards/margins": 1.6469990015029907, + "rewards/rejected": -1.9392247200012207, + "step": 1776 + }, + { + "epoch": 0.21, + "learning_rate": 2.416085980866895e-07, + "logits/chosen": -2.3338112831115723, + "logits/rejected": -2.3443639278411865, + "logps/chosen": -278.14990234375, + "logps/rejected": -229.26962280273438, + "loss": 0.3551, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.157565951347351, + "rewards/margins": 1.7625893354415894, + "rewards/rejected": -2.9201552867889404, + "step": 1777 + }, + { + "epoch": 0.21, + "learning_rate": 2.4157316641077123e-07, + "logits/chosen": -1.6919491291046143, + "logits/rejected": -2.1408681869506836, + "logps/chosen": -296.6734619140625, + "logps/rejected": -270.4972839355469, + "loss": 0.6078, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4082125425338745, + "rewards/margins": 1.1235957145690918, + "rewards/rejected": -2.531808376312256, + "step": 1778 + }, + { + "epoch": 0.21, + "learning_rate": 2.415377347348529e-07, + "logits/chosen": -2.616039276123047, + "logits/rejected": -2.743342876434326, + "logps/chosen": -260.18988037109375, + "logps/rejected": -270.3879699707031, + "loss": 0.2968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6332108974456787, + "rewards/margins": 1.984546184539795, + "rewards/rejected": -2.6177573204040527, + "step": 1779 + }, + { + "epoch": 0.21, + "learning_rate": 2.415023030589347e-07, + "logits/chosen": -2.79740047454834, + "logits/rejected": -2.512173652648926, + "logps/chosen": -205.47618103027344, + "logps/rejected": -259.50445556640625, + "loss": 0.2686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17176327109336853, + "rewards/margins": 2.763949394226074, + "rewards/rejected": -2.9357125759124756, + "step": 1780 + }, + { + "epoch": 0.21, + "learning_rate": 2.4146687138301637e-07, + "logits/chosen": -2.6682446002960205, + "logits/rejected": -2.6388533115386963, + "logps/chosen": -308.41558837890625, + "logps/rejected": -203.4622344970703, + "loss": 0.2529, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7370229363441467, + "rewards/margins": 2.0111355781555176, + "rewards/rejected": -2.7481584548950195, + "step": 1781 + }, + { + "epoch": 0.21, + "learning_rate": 2.4143143970709817e-07, + "logits/chosen": -1.6870448589324951, + "logits/rejected": -2.1154861450195312, + "logps/chosen": -452.85601806640625, + "logps/rejected": -342.8895263671875, + "loss": 0.9876, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5582736730575562, + "rewards/margins": 2.331254005432129, + "rewards/rejected": -3.8895277976989746, + "step": 1782 + }, + { + "epoch": 0.21, + "learning_rate": 2.4139600803117987e-07, + "logits/chosen": -2.5333008766174316, + "logits/rejected": -2.8384664058685303, + "logps/chosen": -314.5324401855469, + "logps/rejected": -202.55662536621094, + "loss": 0.4603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46156880259513855, + "rewards/margins": 2.368114709854126, + "rewards/rejected": -2.829683780670166, + "step": 1783 + }, + { + "epoch": 0.21, + "learning_rate": 2.413605763552616e-07, + "logits/chosen": -3.010484218597412, + "logits/rejected": -2.9420018196105957, + "logps/chosen": -310.7889709472656, + "logps/rejected": -273.9451599121094, + "loss": 0.3769, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0498713254928589, + "rewards/margins": 2.0087215900421143, + "rewards/rejected": -3.058593273162842, + "step": 1784 + }, + { + "epoch": 0.21, + "learning_rate": 2.413251446793433e-07, + "logits/chosen": -2.360142230987549, + "logits/rejected": -2.3157296180725098, + "logps/chosen": -490.0210266113281, + "logps/rejected": -398.24127197265625, + "loss": 0.2676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4722224175930023, + "rewards/margins": 1.8734838962554932, + "rewards/rejected": -2.345705986022949, + "step": 1785 + }, + { + "epoch": 0.21, + "learning_rate": 2.4128971300342506e-07, + "logits/chosen": -2.257172107696533, + "logits/rejected": -2.4716625213623047, + "logps/chosen": -417.0865783691406, + "logps/rejected": -339.0855407714844, + "loss": 0.6129, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5448400378227234, + "rewards/margins": 0.804780900478363, + "rewards/rejected": -1.3496208190917969, + "step": 1786 + }, + { + "epoch": 0.21, + "learning_rate": 2.4125428132750676e-07, + "logits/chosen": -2.949373245239258, + "logits/rejected": -2.935990810394287, + "logps/chosen": -314.9571838378906, + "logps/rejected": -222.14329528808594, + "loss": 0.4192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5069339275360107, + "rewards/margins": 2.4671695232391357, + "rewards/rejected": -2.9741034507751465, + "step": 1787 + }, + { + "epoch": 0.21, + "learning_rate": 2.412188496515885e-07, + "logits/chosen": -2.1952579021453857, + "logits/rejected": -2.5754575729370117, + "logps/chosen": -438.81036376953125, + "logps/rejected": -258.2390441894531, + "loss": 0.6286, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1901628971099854, + "rewards/margins": 0.31962358951568604, + "rewards/rejected": -1.509786605834961, + "step": 1788 + }, + { + "epoch": 0.21, + "learning_rate": 2.4118341797567025e-07, + "logits/chosen": -2.2454657554626465, + "logits/rejected": -1.98702073097229, + "logps/chosen": -161.27996826171875, + "logps/rejected": -250.56195068359375, + "loss": 0.1593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21291275322437286, + "rewards/margins": 2.3598310947418213, + "rewards/rejected": -2.146918296813965, + "step": 1789 + }, + { + "epoch": 0.21, + "learning_rate": 2.4114798629975195e-07, + "logits/chosen": -2.7658071517944336, + "logits/rejected": -2.9031317234039307, + "logps/chosen": -196.84999084472656, + "logps/rejected": -393.04248046875, + "loss": 0.0727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5813136100769043, + "rewards/margins": 4.383693695068359, + "rewards/rejected": -4.965007781982422, + "step": 1790 + }, + { + "epoch": 0.21, + "learning_rate": 2.411125546238337e-07, + "logits/chosen": -2.118443489074707, + "logits/rejected": -1.9644920825958252, + "logps/chosen": -183.48013305664062, + "logps/rejected": -191.82369995117188, + "loss": 0.7392, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.3673465251922607, + "rewards/margins": 0.4686541259288788, + "rewards/rejected": -2.836000680923462, + "step": 1791 + }, + { + "epoch": 0.21, + "learning_rate": 2.410771229479154e-07, + "logits/chosen": -2.101119041442871, + "logits/rejected": -2.319307565689087, + "logps/chosen": -491.99066162109375, + "logps/rejected": -331.24652099609375, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33622315526008606, + "rewards/margins": 2.8945205211639404, + "rewards/rejected": -3.230743646621704, + "step": 1792 + }, + { + "epoch": 0.21, + "learning_rate": 2.4104169127199714e-07, + "logits/chosen": -2.5244228839874268, + "logits/rejected": -2.6231677532196045, + "logps/chosen": -281.930908203125, + "logps/rejected": -297.80487060546875, + "loss": 0.3301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6820855736732483, + "rewards/margins": 1.7326068878173828, + "rewards/rejected": -2.4146924018859863, + "step": 1793 + }, + { + "epoch": 0.21, + "learning_rate": 2.410062595960789e-07, + "logits/chosen": -2.1775941848754883, + "logits/rejected": -2.140559196472168, + "logps/chosen": -265.06561279296875, + "logps/rejected": -257.12774658203125, + "loss": 0.445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4523833692073822, + "rewards/margins": 1.183227300643921, + "rewards/rejected": -1.635610818862915, + "step": 1794 + }, + { + "epoch": 0.21, + "learning_rate": 2.4097082792016064e-07, + "logits/chosen": -2.1056160926818848, + "logits/rejected": -2.1519041061401367, + "logps/chosen": -256.8416442871094, + "logps/rejected": -261.8810729980469, + "loss": 0.1848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.434116005897522, + "rewards/margins": 2.165034532546997, + "rewards/rejected": -2.5991506576538086, + "step": 1795 + }, + { + "epoch": 0.21, + "learning_rate": 2.4093539624424233e-07, + "logits/chosen": -2.406691074371338, + "logits/rejected": -2.403996706008911, + "logps/chosen": -210.34896850585938, + "logps/rejected": -205.80401611328125, + "loss": 0.9085, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.448098063468933, + "rewards/margins": 0.10742287337779999, + "rewards/rejected": -1.555521011352539, + "step": 1796 + }, + { + "epoch": 0.21, + "learning_rate": 2.408999645683241e-07, + "logits/chosen": -2.3813364505767822, + "logits/rejected": -2.5804226398468018, + "logps/chosen": -229.07826232910156, + "logps/rejected": -206.28733825683594, + "loss": 0.6565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5977106094360352, + "rewards/margins": 0.8167107105255127, + "rewards/rejected": -1.4144212007522583, + "step": 1797 + }, + { + "epoch": 0.21, + "learning_rate": 2.408645328924058e-07, + "logits/chosen": -2.222550868988037, + "logits/rejected": -2.104487180709839, + "logps/chosen": -200.4759979248047, + "logps/rejected": -251.73898315429688, + "loss": 0.1941, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.067495346069336, + "rewards/margins": 2.917901039123535, + "rewards/rejected": -3.985396385192871, + "step": 1798 + }, + { + "epoch": 0.21, + "learning_rate": 2.4082910121648753e-07, + "logits/chosen": -2.2080435752868652, + "logits/rejected": -1.9050899744033813, + "logps/chosen": -214.06280517578125, + "logps/rejected": -409.6534118652344, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07568755745887756, + "rewards/margins": 1.724006175994873, + "rewards/rejected": -1.6483186483383179, + "step": 1799 + }, + { + "epoch": 0.21, + "learning_rate": 2.407936695405693e-07, + "logits/chosen": -2.782255172729492, + "logits/rejected": -2.5283596515655518, + "logps/chosen": -290.778076171875, + "logps/rejected": -332.7655334472656, + "loss": 0.5626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6284094452857971, + "rewards/margins": 1.9699878692626953, + "rewards/rejected": -2.5983974933624268, + "step": 1800 + }, + { + "epoch": 0.21, + "learning_rate": 2.4075823786465097e-07, + "logits/chosen": -2.087553024291992, + "logits/rejected": -2.0506093502044678, + "logps/chosen": -495.3292236328125, + "logps/rejected": -430.4928283691406, + "loss": 0.5496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7543399333953857, + "rewards/margins": 0.6078133583068848, + "rewards/rejected": -1.3621532917022705, + "step": 1801 + }, + { + "epoch": 0.21, + "learning_rate": 2.407228061887327e-07, + "logits/chosen": -2.0319645404815674, + "logits/rejected": -2.2050719261169434, + "logps/chosen": -441.437255859375, + "logps/rejected": -355.9711608886719, + "loss": 0.4962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9836417436599731, + "rewards/margins": 0.9870733022689819, + "rewards/rejected": -1.970715045928955, + "step": 1802 + }, + { + "epoch": 0.21, + "learning_rate": 2.406873745128144e-07, + "logits/chosen": -1.780432939529419, + "logits/rejected": -1.7204499244689941, + "logps/chosen": -320.8402099609375, + "logps/rejected": -277.77276611328125, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1337292194366455, + "rewards/margins": 1.7460973262786865, + "rewards/rejected": -2.879826545715332, + "step": 1803 + }, + { + "epoch": 0.21, + "learning_rate": 2.4065194283689616e-07, + "logits/chosen": -2.732217788696289, + "logits/rejected": -2.3263778686523438, + "logps/chosen": -412.921630859375, + "logps/rejected": -264.34619140625, + "loss": 0.1524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6698638796806335, + "rewards/margins": 2.2237887382507324, + "rewards/rejected": -2.8936524391174316, + "step": 1804 + }, + { + "epoch": 0.21, + "learning_rate": 2.406165111609779e-07, + "logits/chosen": -2.475923538208008, + "logits/rejected": -2.8211891651153564, + "logps/chosen": -290.59576416015625, + "logps/rejected": -168.78836059570312, + "loss": 0.9308, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0191001892089844, + "rewards/margins": 1.2628498077392578, + "rewards/rejected": -2.281949996948242, + "step": 1805 + }, + { + "epoch": 0.21, + "learning_rate": 2.4058107948505966e-07, + "logits/chosen": -2.2517244815826416, + "logits/rejected": -2.577059745788574, + "logps/chosen": -356.13153076171875, + "logps/rejected": -245.3402099609375, + "loss": 0.3627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6635759472846985, + "rewards/margins": 1.917927622795105, + "rewards/rejected": -2.581503391265869, + "step": 1806 + }, + { + "epoch": 0.21, + "learning_rate": 2.4054564780914136e-07, + "logits/chosen": -2.65557861328125, + "logits/rejected": -2.810976982116699, + "logps/chosen": -346.8943176269531, + "logps/rejected": -213.51902770996094, + "loss": 0.7772, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.011732816696167, + "rewards/margins": 0.16379913687705994, + "rewards/rejected": -1.1755318641662598, + "step": 1807 + }, + { + "epoch": 0.21, + "learning_rate": 2.405102161332231e-07, + "logits/chosen": -2.545552968978882, + "logits/rejected": -2.1742825508117676, + "logps/chosen": -266.0611572265625, + "logps/rejected": -246.55587768554688, + "loss": 0.7344, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0296876430511475, + "rewards/margins": 0.5939750671386719, + "rewards/rejected": -1.6236627101898193, + "step": 1808 + }, + { + "epoch": 0.21, + "learning_rate": 2.404747844573048e-07, + "logits/chosen": -2.8086862564086914, + "logits/rejected": -2.844008445739746, + "logps/chosen": -685.3878173828125, + "logps/rejected": -469.28662109375, + "loss": 0.225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28446730971336365, + "rewards/margins": 2.5609359741210938, + "rewards/rejected": -2.8454034328460693, + "step": 1809 + }, + { + "epoch": 0.21, + "learning_rate": 2.4043935278138655e-07, + "logits/chosen": -1.9144043922424316, + "logits/rejected": -1.7782530784606934, + "logps/chosen": -401.7034606933594, + "logps/rejected": -332.67059326171875, + "loss": 0.4218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9646287560462952, + "rewards/margins": 1.4903197288513184, + "rewards/rejected": -2.454948663711548, + "step": 1810 + }, + { + "epoch": 0.21, + "learning_rate": 2.404039211054683e-07, + "logits/chosen": -1.3119378089904785, + "logits/rejected": -1.9025933742523193, + "logps/chosen": -336.310791015625, + "logps/rejected": -204.52452087402344, + "loss": 0.6543, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0531299114227295, + "rewards/margins": 2.0119895935058594, + "rewards/rejected": -3.0651192665100098, + "step": 1811 + }, + { + "epoch": 0.21, + "learning_rate": 2.4036848942955e-07, + "logits/chosen": -2.1743083000183105, + "logits/rejected": -2.46732759475708, + "logps/chosen": -443.3954772949219, + "logps/rejected": -270.8285217285156, + "loss": 0.2361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6953043937683105, + "rewards/margins": 3.064387798309326, + "rewards/rejected": -3.7596921920776367, + "step": 1812 + }, + { + "epoch": 0.21, + "learning_rate": 2.4033305775363174e-07, + "logits/chosen": -2.3740384578704834, + "logits/rejected": -2.2105555534362793, + "logps/chosen": -197.86326599121094, + "logps/rejected": -273.10223388671875, + "loss": 0.3574, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5061732530593872, + "rewards/margins": 1.3640061616897583, + "rewards/rejected": -1.8701794147491455, + "step": 1813 + }, + { + "epoch": 0.21, + "learning_rate": 2.4029762607771344e-07, + "logits/chosen": -2.092151641845703, + "logits/rejected": -2.1698620319366455, + "logps/chosen": -274.0726318359375, + "logps/rejected": -244.6338348388672, + "loss": 1.0721, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.246246099472046, + "rewards/margins": 1.3149744272232056, + "rewards/rejected": -3.561220645904541, + "step": 1814 + }, + { + "epoch": 0.21, + "learning_rate": 2.402621944017952e-07, + "logits/chosen": -2.3072686195373535, + "logits/rejected": -2.0539426803588867, + "logps/chosen": -117.81233978271484, + "logps/rejected": -193.07135009765625, + "loss": 0.4486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5918164253234863, + "rewards/margins": 2.0979573726654053, + "rewards/rejected": -2.6897740364074707, + "step": 1815 + }, + { + "epoch": 0.21, + "learning_rate": 2.402267627258769e-07, + "logits/chosen": -1.7902355194091797, + "logits/rejected": -1.8418197631835938, + "logps/chosen": -307.3411865234375, + "logps/rejected": -366.6976623535156, + "loss": 0.4356, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4473026990890503, + "rewards/margins": 2.647409200668335, + "rewards/rejected": -4.094712257385254, + "step": 1816 + }, + { + "epoch": 0.21, + "learning_rate": 2.401913310499587e-07, + "logits/chosen": -2.324477195739746, + "logits/rejected": -2.446094036102295, + "logps/chosen": -359.21539306640625, + "logps/rejected": -344.5244445800781, + "loss": 0.746, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0030767917633057, + "rewards/margins": 0.9773438572883606, + "rewards/rejected": -1.980420708656311, + "step": 1817 + }, + { + "epoch": 0.21, + "learning_rate": 2.401558993740404e-07, + "logits/chosen": -2.537219285964966, + "logits/rejected": -2.709055185317993, + "logps/chosen": -383.7568054199219, + "logps/rejected": -247.7540283203125, + "loss": 0.2207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8268507719039917, + "rewards/margins": 2.226870059967041, + "rewards/rejected": -3.053720712661743, + "step": 1818 + }, + { + "epoch": 0.21, + "learning_rate": 2.4012046769812213e-07, + "logits/chosen": -1.9707705974578857, + "logits/rejected": -2.0588130950927734, + "logps/chosen": -215.45407104492188, + "logps/rejected": -239.27899169921875, + "loss": 0.3587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.959610641002655, + "rewards/margins": 1.1813466548919678, + "rewards/rejected": -2.1409573554992676, + "step": 1819 + }, + { + "epoch": 0.21, + "learning_rate": 2.400850360222038e-07, + "logits/chosen": -2.424398899078369, + "logits/rejected": -2.6035056114196777, + "logps/chosen": -435.84649658203125, + "logps/rejected": -301.8731384277344, + "loss": 0.2954, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05078011006116867, + "rewards/margins": 2.353275775909424, + "rewards/rejected": -2.3024957180023193, + "step": 1820 + }, + { + "epoch": 0.21, + "learning_rate": 2.4004960434628557e-07, + "logits/chosen": -2.684037685394287, + "logits/rejected": -2.6559371948242188, + "logps/chosen": -247.36549377441406, + "logps/rejected": -201.24359130859375, + "loss": 1.5475, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5226982831954956, + "rewards/margins": -0.05223196744918823, + "rewards/rejected": -1.4704663753509521, + "step": 1821 + }, + { + "epoch": 0.21, + "learning_rate": 2.400141726703673e-07, + "logits/chosen": -2.238429546356201, + "logits/rejected": -2.1038031578063965, + "logps/chosen": -233.472412109375, + "logps/rejected": -350.8326416015625, + "loss": 0.3534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7716639041900635, + "rewards/margins": 2.766087293624878, + "rewards/rejected": -3.5377514362335205, + "step": 1822 + }, + { + "epoch": 0.21, + "learning_rate": 2.39978740994449e-07, + "logits/chosen": -2.219170093536377, + "logits/rejected": -2.0593149662017822, + "logps/chosen": -275.6041564941406, + "logps/rejected": -345.3872985839844, + "loss": 0.3921, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44098731875419617, + "rewards/margins": 1.1421345472335815, + "rewards/rejected": -1.5831218957901, + "step": 1823 + }, + { + "epoch": 0.21, + "learning_rate": 2.3994330931853077e-07, + "logits/chosen": -2.1841928958892822, + "logits/rejected": -2.2610769271850586, + "logps/chosen": -140.24583435058594, + "logps/rejected": -162.65199279785156, + "loss": 0.6778, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3847140073776245, + "rewards/margins": 0.8519527316093445, + "rewards/rejected": -2.2366669178009033, + "step": 1824 + }, + { + "epoch": 0.21, + "learning_rate": 2.3990787764261246e-07, + "logits/chosen": -2.0864861011505127, + "logits/rejected": -2.011284351348877, + "logps/chosen": -292.2127685546875, + "logps/rejected": -524.74560546875, + "loss": 0.6519, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.25495183467865, + "rewards/margins": 1.6638132333755493, + "rewards/rejected": -2.918765068054199, + "step": 1825 + }, + { + "epoch": 0.21, + "learning_rate": 2.398724459666942e-07, + "logits/chosen": -2.393972158432007, + "logits/rejected": -2.4124274253845215, + "logps/chosen": -135.2225341796875, + "logps/rejected": -185.97543334960938, + "loss": 0.6378, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5506224632263184, + "rewards/margins": 0.856365442276001, + "rewards/rejected": -2.4069879055023193, + "step": 1826 + }, + { + "epoch": 0.21, + "learning_rate": 2.398370142907759e-07, + "logits/chosen": -2.539834976196289, + "logits/rejected": -2.832745313644409, + "logps/chosen": -198.1143798828125, + "logps/rejected": -236.12112426757812, + "loss": 0.2266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4517902135848999, + "rewards/margins": 1.8995400667190552, + "rewards/rejected": -2.351330280303955, + "step": 1827 + }, + { + "epoch": 0.21, + "learning_rate": 2.3980158261485765e-07, + "logits/chosen": -2.160886287689209, + "logits/rejected": -2.6089634895324707, + "logps/chosen": -244.024658203125, + "logps/rejected": -189.86734008789062, + "loss": 0.8517, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0040578842163086, + "rewards/margins": 0.3868162930011749, + "rewards/rejected": -1.3908742666244507, + "step": 1828 + }, + { + "epoch": 0.21, + "learning_rate": 2.397661509389394e-07, + "logits/chosen": -2.125411033630371, + "logits/rejected": -2.2947494983673096, + "logps/chosen": -454.7753601074219, + "logps/rejected": -430.8077087402344, + "loss": 0.4029, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6187058091163635, + "rewards/margins": 1.2236459255218506, + "rewards/rejected": -1.8423516750335693, + "step": 1829 + }, + { + "epoch": 0.21, + "learning_rate": 2.3973071926302115e-07, + "logits/chosen": -2.740480422973633, + "logits/rejected": -2.67659330368042, + "logps/chosen": -225.91355895996094, + "logps/rejected": -223.62881469726562, + "loss": 0.2971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6065132021903992, + "rewards/margins": 1.2455552816390991, + "rewards/rejected": -1.8520684242248535, + "step": 1830 + }, + { + "epoch": 0.21, + "learning_rate": 2.3969528758710285e-07, + "logits/chosen": -2.2650599479675293, + "logits/rejected": -2.3431594371795654, + "logps/chosen": -220.01846313476562, + "logps/rejected": -223.2489013671875, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5444383025169373, + "rewards/margins": 1.2333711385726929, + "rewards/rejected": -1.7778096199035645, + "step": 1831 + }, + { + "epoch": 0.21, + "learning_rate": 2.396598559111846e-07, + "logits/chosen": -2.4323949813842773, + "logits/rejected": -2.3808979988098145, + "logps/chosen": -205.6292724609375, + "logps/rejected": -196.97604370117188, + "loss": 0.4376, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7448318600654602, + "rewards/margins": 1.1018013954162598, + "rewards/rejected": -1.8466330766677856, + "step": 1832 + }, + { + "epoch": 0.21, + "learning_rate": 2.3962442423526634e-07, + "logits/chosen": -2.2598514556884766, + "logits/rejected": -2.7316391468048096, + "logps/chosen": -260.2677917480469, + "logps/rejected": -295.92901611328125, + "loss": 0.8556, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7733219861984253, + "rewards/margins": 1.5683618783950806, + "rewards/rejected": -3.3416836261749268, + "step": 1833 + }, + { + "epoch": 0.21, + "learning_rate": 2.3958899255934804e-07, + "logits/chosen": -1.938433051109314, + "logits/rejected": -2.050424575805664, + "logps/chosen": -267.85443115234375, + "logps/rejected": -219.1025390625, + "loss": 0.3592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6399181485176086, + "rewards/margins": 1.650614857673645, + "rewards/rejected": -2.2905330657958984, + "step": 1834 + }, + { + "epoch": 0.21, + "learning_rate": 2.395535608834298e-07, + "logits/chosen": -2.028898239135742, + "logits/rejected": -2.1120588779449463, + "logps/chosen": -286.2508544921875, + "logps/rejected": -265.2257385253906, + "loss": 0.4171, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8999100923538208, + "rewards/margins": 1.5799853801727295, + "rewards/rejected": -2.4798953533172607, + "step": 1835 + }, + { + "epoch": 0.21, + "learning_rate": 2.395181292075115e-07, + "logits/chosen": -2.1489248275756836, + "logits/rejected": -2.2173194885253906, + "logps/chosen": -399.3984375, + "logps/rejected": -279.0289306640625, + "loss": 0.7069, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9595643281936646, + "rewards/margins": 0.9332965016365051, + "rewards/rejected": -1.892860770225525, + "step": 1836 + }, + { + "epoch": 0.21, + "learning_rate": 2.3948269753159323e-07, + "logits/chosen": -1.767270803451538, + "logits/rejected": -1.9848899841308594, + "logps/chosen": -334.19915771484375, + "logps/rejected": -292.3046569824219, + "loss": 0.633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3692512512207031, + "rewards/margins": 0.6491051912307739, + "rewards/rejected": -1.0183563232421875, + "step": 1837 + }, + { + "epoch": 0.21, + "learning_rate": 2.3944726585567493e-07, + "logits/chosen": -2.1437666416168213, + "logits/rejected": -2.232595443725586, + "logps/chosen": -316.97711181640625, + "logps/rejected": -200.85821533203125, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3444887697696686, + "rewards/margins": 2.1315860748291016, + "rewards/rejected": -2.476074457168579, + "step": 1838 + }, + { + "epoch": 0.21, + "learning_rate": 2.394118341797567e-07, + "logits/chosen": -2.4141809940338135, + "logits/rejected": -2.471381902694702, + "logps/chosen": -327.2308044433594, + "logps/rejected": -280.989013671875, + "loss": 0.3944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9609078168869019, + "rewards/margins": 1.5989973545074463, + "rewards/rejected": -2.5599050521850586, + "step": 1839 + }, + { + "epoch": 0.21, + "learning_rate": 2.393764025038384e-07, + "logits/chosen": -1.2056314945220947, + "logits/rejected": -2.028480291366577, + "logps/chosen": -510.6068420410156, + "logps/rejected": -229.74786376953125, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6691687703132629, + "rewards/margins": 0.607108473777771, + "rewards/rejected": -1.2762773036956787, + "step": 1840 + }, + { + "epoch": 0.21, + "learning_rate": 2.393409708279202e-07, + "logits/chosen": -2.1948962211608887, + "logits/rejected": -2.531216621398926, + "logps/chosen": -237.39706420898438, + "logps/rejected": -186.1221160888672, + "loss": 1.0606, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.315165400505066, + "rewards/margins": -0.23163765668869019, + "rewards/rejected": -1.0835278034210205, + "step": 1841 + }, + { + "epoch": 0.21, + "learning_rate": 2.3930553915200187e-07, + "logits/chosen": -2.106248378753662, + "logits/rejected": -2.3820412158966064, + "logps/chosen": -398.3311767578125, + "logps/rejected": -197.82061767578125, + "loss": 0.1997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46229487657546997, + "rewards/margins": 2.288001775741577, + "rewards/rejected": -2.7502965927124023, + "step": 1842 + }, + { + "epoch": 0.21, + "learning_rate": 2.392701074760836e-07, + "logits/chosen": -2.2125391960144043, + "logits/rejected": -2.2547972202301025, + "logps/chosen": -284.9970703125, + "logps/rejected": -291.548828125, + "loss": 0.869, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1225461959838867, + "rewards/margins": 0.5770548582077026, + "rewards/rejected": -1.6996010541915894, + "step": 1843 + }, + { + "epoch": 0.21, + "learning_rate": 2.3923467580016537e-07, + "logits/chosen": -2.118744134902954, + "logits/rejected": -1.8897547721862793, + "logps/chosen": -219.31349182128906, + "logps/rejected": -196.26116943359375, + "loss": 0.476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7955001592636108, + "rewards/margins": 1.586808681488037, + "rewards/rejected": -2.3823089599609375, + "step": 1844 + }, + { + "epoch": 0.21, + "learning_rate": 2.3919924412424706e-07, + "logits/chosen": -2.5527396202087402, + "logits/rejected": -2.2266621589660645, + "logps/chosen": -120.480224609375, + "logps/rejected": -310.6474304199219, + "loss": 0.268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1293451189994812, + "rewards/margins": 2.0820906162261963, + "rewards/rejected": -2.2114357948303223, + "step": 1845 + }, + { + "epoch": 0.21, + "learning_rate": 2.391638124483288e-07, + "logits/chosen": -2.293996572494507, + "logits/rejected": -2.3616414070129395, + "logps/chosen": -388.346435546875, + "logps/rejected": -233.2850341796875, + "loss": 1.168, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.548737645149231, + "rewards/margins": -0.4776061773300171, + "rewards/rejected": -1.0711315870285034, + "step": 1846 + }, + { + "epoch": 0.21, + "learning_rate": 2.391283807724105e-07, + "logits/chosen": -1.8465558290481567, + "logits/rejected": -1.9513492584228516, + "logps/chosen": -404.521728515625, + "logps/rejected": -302.3456115722656, + "loss": 0.2203, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06522838771343231, + "rewards/margins": 2.3514344692230225, + "rewards/rejected": -2.4166626930236816, + "step": 1847 + }, + { + "epoch": 0.21, + "learning_rate": 2.3909294909649226e-07, + "logits/chosen": -1.9971561431884766, + "logits/rejected": -2.1445775032043457, + "logps/chosen": -195.35919189453125, + "logps/rejected": -338.66632080078125, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15782007575035095, + "rewards/margins": 2.248278856277466, + "rewards/rejected": -2.4060990810394287, + "step": 1848 + }, + { + "epoch": 0.22, + "learning_rate": 2.3905751742057395e-07, + "logits/chosen": -2.793036460876465, + "logits/rejected": -2.6389265060424805, + "logps/chosen": -196.2069091796875, + "logps/rejected": -215.1942901611328, + "loss": 0.6496, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0071046352386475, + "rewards/margins": 1.3775696754455566, + "rewards/rejected": -2.384674310684204, + "step": 1849 + }, + { + "epoch": 0.22, + "learning_rate": 2.390220857446557e-07, + "logits/chosen": -2.6398863792419434, + "logits/rejected": -2.6811814308166504, + "logps/chosen": -220.59967041015625, + "logps/rejected": -232.10797119140625, + "loss": 0.1671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11532637476921082, + "rewards/margins": 3.08892822265625, + "rewards/rejected": -3.204254627227783, + "step": 1850 + }, + { + "epoch": 0.22, + "learning_rate": 2.3898665406873745e-07, + "logits/chosen": -2.6183342933654785, + "logits/rejected": -2.753331184387207, + "logps/chosen": -242.7398223876953, + "logps/rejected": -194.97073364257812, + "loss": 0.6219, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.981342613697052, + "rewards/margins": 1.2197438478469849, + "rewards/rejected": -2.2010862827301025, + "step": 1851 + }, + { + "epoch": 0.22, + "learning_rate": 2.389512223928192e-07, + "logits/chosen": -2.4667465686798096, + "logits/rejected": -2.49292254447937, + "logps/chosen": -267.1186218261719, + "logps/rejected": -229.34974670410156, + "loss": 0.2449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4146808683872223, + "rewards/margins": 2.0976667404174805, + "rewards/rejected": -2.512347459793091, + "step": 1852 + }, + { + "epoch": 0.22, + "learning_rate": 2.389157907169009e-07, + "logits/chosen": -2.541045904159546, + "logits/rejected": -2.487985372543335, + "logps/chosen": -357.8911437988281, + "logps/rejected": -431.518310546875, + "loss": 0.3187, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9771466851234436, + "rewards/margins": 2.05190372467041, + "rewards/rejected": -3.029050350189209, + "step": 1853 + }, + { + "epoch": 0.22, + "learning_rate": 2.3888035904098264e-07, + "logits/chosen": -1.7816026210784912, + "logits/rejected": -1.9180233478546143, + "logps/chosen": -382.764404296875, + "logps/rejected": -357.87701416015625, + "loss": 0.4381, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.013568639755249, + "rewards/margins": 1.7097814083099365, + "rewards/rejected": -2.7233500480651855, + "step": 1854 + }, + { + "epoch": 0.22, + "learning_rate": 2.388449273650644e-07, + "logits/chosen": -2.483443021774292, + "logits/rejected": -2.617896556854248, + "logps/chosen": -152.0458526611328, + "logps/rejected": -156.21456909179688, + "loss": 0.5595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6136979460716248, + "rewards/margins": 0.8004008531570435, + "rewards/rejected": -1.4140987396240234, + "step": 1855 + }, + { + "epoch": 0.22, + "learning_rate": 2.388094956891461e-07, + "logits/chosen": -2.2962586879730225, + "logits/rejected": -2.2386789321899414, + "logps/chosen": -232.10865783691406, + "logps/rejected": -269.0420837402344, + "loss": 0.5778, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3749765157699585, + "rewards/margins": 1.1745188236236572, + "rewards/rejected": -2.5494954586029053, + "step": 1856 + }, + { + "epoch": 0.22, + "learning_rate": 2.3877406401322783e-07, + "logits/chosen": -2.349285364151001, + "logits/rejected": -2.4561026096343994, + "logps/chosen": -248.56243896484375, + "logps/rejected": -312.0973815917969, + "loss": 0.3199, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6428518891334534, + "rewards/margins": 2.0816967487335205, + "rewards/rejected": -2.724548578262329, + "step": 1857 + }, + { + "epoch": 0.22, + "learning_rate": 2.3873863233730953e-07, + "logits/chosen": -2.654658079147339, + "logits/rejected": -2.70839262008667, + "logps/chosen": -263.2142639160156, + "logps/rejected": -232.64837646484375, + "loss": 0.3819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24215373396873474, + "rewards/margins": 2.618910074234009, + "rewards/rejected": -2.8610637187957764, + "step": 1858 + }, + { + "epoch": 0.22, + "learning_rate": 2.387032006613913e-07, + "logits/chosen": -2.1176605224609375, + "logits/rejected": -1.8720145225524902, + "logps/chosen": -293.8715515136719, + "logps/rejected": -356.38323974609375, + "loss": 0.7794, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1942709684371948, + "rewards/margins": 0.8505164384841919, + "rewards/rejected": -2.0447874069213867, + "step": 1859 + }, + { + "epoch": 0.22, + "learning_rate": 2.38667768985473e-07, + "logits/chosen": -2.1390013694763184, + "logits/rejected": -2.161271810531616, + "logps/chosen": -243.033935546875, + "logps/rejected": -226.15463256835938, + "loss": 0.5114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1890732049942017, + "rewards/margins": 2.2810869216918945, + "rewards/rejected": -3.4701602458953857, + "step": 1860 + }, + { + "epoch": 0.22, + "learning_rate": 2.386323373095547e-07, + "logits/chosen": -2.2839748859405518, + "logits/rejected": -2.6485610008239746, + "logps/chosen": -315.6589050292969, + "logps/rejected": -172.8271026611328, + "loss": 0.3502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37328648567199707, + "rewards/margins": 1.629736304283142, + "rewards/rejected": -2.0030229091644287, + "step": 1861 + }, + { + "epoch": 0.22, + "learning_rate": 2.3859690563363647e-07, + "logits/chosen": -2.7193374633789062, + "logits/rejected": -2.3855364322662354, + "logps/chosen": -223.84414672851562, + "logps/rejected": -304.731201171875, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.285675048828125, + "rewards/margins": 2.962185859680176, + "rewards/rejected": -3.24786114692688, + "step": 1862 + }, + { + "epoch": 0.22, + "learning_rate": 2.3856147395771817e-07, + "logits/chosen": -1.8834855556488037, + "logits/rejected": -2.037998914718628, + "logps/chosen": -445.2252502441406, + "logps/rejected": -393.1165771484375, + "loss": 0.4185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8777363300323486, + "rewards/margins": 2.065230369567871, + "rewards/rejected": -2.942966938018799, + "step": 1863 + }, + { + "epoch": 0.22, + "learning_rate": 2.385260422817999e-07, + "logits/chosen": -2.3141584396362305, + "logits/rejected": -2.650653600692749, + "logps/chosen": -348.17987060546875, + "logps/rejected": -212.69046020507812, + "loss": 1.9008, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.8680331707000732, + "rewards/margins": -0.04981809854507446, + "rewards/rejected": -2.8182151317596436, + "step": 1864 + }, + { + "epoch": 0.22, + "learning_rate": 2.3849061060588166e-07, + "logits/chosen": -2.130000114440918, + "logits/rejected": -2.181450366973877, + "logps/chosen": -441.27691650390625, + "logps/rejected": -479.3074951171875, + "loss": 0.1648, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0422428846359253, + "rewards/margins": 3.426182746887207, + "rewards/rejected": -4.468425750732422, + "step": 1865 + }, + { + "epoch": 0.22, + "learning_rate": 2.3845517892996336e-07, + "logits/chosen": -3.0332584381103516, + "logits/rejected": -2.930889129638672, + "logps/chosen": -331.23699951171875, + "logps/rejected": -201.7147216796875, + "loss": 0.9153, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2843034267425537, + "rewards/margins": 0.42358124256134033, + "rewards/rejected": -1.7078847885131836, + "step": 1866 + }, + { + "epoch": 0.22, + "learning_rate": 2.384197472540451e-07, + "logits/chosen": -2.251643419265747, + "logits/rejected": -2.4208426475524902, + "logps/chosen": -249.50155639648438, + "logps/rejected": -276.2348327636719, + "loss": 0.2441, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2371821403503418, + "rewards/margins": 1.7915068864822388, + "rewards/rejected": -3.028688907623291, + "step": 1867 + }, + { + "epoch": 0.22, + "learning_rate": 2.3838431557812683e-07, + "logits/chosen": -2.7052016258239746, + "logits/rejected": -2.7283475399017334, + "logps/chosen": -279.57769775390625, + "logps/rejected": -117.5062026977539, + "loss": 0.375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7914484739303589, + "rewards/margins": 1.1875109672546387, + "rewards/rejected": -1.978959560394287, + "step": 1868 + }, + { + "epoch": 0.22, + "learning_rate": 2.3834888390220855e-07, + "logits/chosen": -2.074162244796753, + "logits/rejected": -2.1329944133758545, + "logps/chosen": -326.43621826171875, + "logps/rejected": -305.2041015625, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7786694765090942, + "rewards/margins": 2.6644654273986816, + "rewards/rejected": -3.4431350231170654, + "step": 1869 + }, + { + "epoch": 0.22, + "learning_rate": 2.383134522262903e-07, + "logits/chosen": -2.163203716278076, + "logits/rejected": -2.1183652877807617, + "logps/chosen": -229.25927734375, + "logps/rejected": -267.69451904296875, + "loss": 0.7225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9101919531822205, + "rewards/margins": 1.0462372303009033, + "rewards/rejected": -1.956429362297058, + "step": 1870 + }, + { + "epoch": 0.22, + "learning_rate": 2.3827802055037202e-07, + "logits/chosen": -2.105525493621826, + "logits/rejected": -2.397615909576416, + "logps/chosen": -223.27398681640625, + "logps/rejected": -142.81784057617188, + "loss": 0.4884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5663559436798096, + "rewards/margins": 0.8729592561721802, + "rewards/rejected": -1.4393153190612793, + "step": 1871 + }, + { + "epoch": 0.22, + "learning_rate": 2.3824258887445375e-07, + "logits/chosen": -2.2896339893341064, + "logits/rejected": -2.2466232776641846, + "logps/chosen": -267.2420349121094, + "logps/rejected": -310.07586669921875, + "loss": 0.3021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3726680278778076, + "rewards/margins": 2.13045597076416, + "rewards/rejected": -2.5031239986419678, + "step": 1872 + }, + { + "epoch": 0.22, + "learning_rate": 2.3820715719853547e-07, + "logits/chosen": -2.0047855377197266, + "logits/rejected": -2.0819265842437744, + "logps/chosen": -312.0521240234375, + "logps/rejected": -314.2274169921875, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35509783029556274, + "rewards/margins": 2.9787514209747314, + "rewards/rejected": -3.3338494300842285, + "step": 1873 + }, + { + "epoch": 0.22, + "learning_rate": 2.381717255226172e-07, + "logits/chosen": -2.528146982192993, + "logits/rejected": -2.373764753341675, + "logps/chosen": -254.23388671875, + "logps/rejected": -434.03778076171875, + "loss": 0.2513, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0806447267532349, + "rewards/margins": 2.702732801437378, + "rewards/rejected": -3.7833776473999023, + "step": 1874 + }, + { + "epoch": 0.22, + "learning_rate": 2.3813629384669896e-07, + "logits/chosen": -2.2492728233337402, + "logits/rejected": -2.5492730140686035, + "logps/chosen": -287.4822692871094, + "logps/rejected": -241.01199340820312, + "loss": 1.2629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9821091890335083, + "rewards/margins": 1.0684847831726074, + "rewards/rejected": -3.050593852996826, + "step": 1875 + }, + { + "epoch": 0.22, + "learning_rate": 2.3810086217078069e-07, + "logits/chosen": -1.9452821016311646, + "logits/rejected": -1.8459829092025757, + "logps/chosen": -541.5878295898438, + "logps/rejected": -586.473876953125, + "loss": 0.5774, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.985153079032898, + "rewards/margins": 1.1230075359344482, + "rewards/rejected": -2.1081604957580566, + "step": 1876 + }, + { + "epoch": 0.22, + "learning_rate": 2.380654304948624e-07, + "logits/chosen": -1.9148962497711182, + "logits/rejected": -2.084139823913574, + "logps/chosen": -462.7070617675781, + "logps/rejected": -379.54718017578125, + "loss": 0.523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6314287781715393, + "rewards/margins": 1.5687813758850098, + "rewards/rejected": -2.2002103328704834, + "step": 1877 + }, + { + "epoch": 0.22, + "learning_rate": 2.3802999881894413e-07, + "logits/chosen": -2.085134506225586, + "logits/rejected": -2.197228193283081, + "logps/chosen": -497.1221008300781, + "logps/rejected": -362.1059875488281, + "loss": 0.353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8373152613639832, + "rewards/margins": 1.9410117864608765, + "rewards/rejected": -2.778326988220215, + "step": 1878 + }, + { + "epoch": 0.22, + "learning_rate": 2.3799456714302585e-07, + "logits/chosen": -1.9802736043930054, + "logits/rejected": -2.198188304901123, + "logps/chosen": -217.31658935546875, + "logps/rejected": -150.99002075195312, + "loss": 0.6217, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6188395023345947, + "rewards/margins": 0.4184085726737976, + "rewards/rejected": -2.037248134613037, + "step": 1879 + }, + { + "epoch": 0.22, + "learning_rate": 2.3795913546710758e-07, + "logits/chosen": -2.2546474933624268, + "logits/rejected": -2.3717594146728516, + "logps/chosen": -176.18020629882812, + "logps/rejected": -300.4624328613281, + "loss": 0.4854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41547009348869324, + "rewards/margins": 0.9045188426971436, + "rewards/rejected": -1.3199889659881592, + "step": 1880 + }, + { + "epoch": 0.22, + "learning_rate": 2.3792370379118932e-07, + "logits/chosen": -2.157809257507324, + "logits/rejected": -2.3635897636413574, + "logps/chosen": -298.9660339355469, + "logps/rejected": -222.71099853515625, + "loss": 0.2591, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5718898773193359, + "rewards/margins": 2.52756667137146, + "rewards/rejected": -3.099456548690796, + "step": 1881 + }, + { + "epoch": 0.22, + "learning_rate": 2.3788827211527105e-07, + "logits/chosen": -2.6111104488372803, + "logits/rejected": -2.764491081237793, + "logps/chosen": -316.73480224609375, + "logps/rejected": -301.009521484375, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7221324443817139, + "rewards/margins": 2.25185227394104, + "rewards/rejected": -2.973984718322754, + "step": 1882 + }, + { + "epoch": 0.22, + "learning_rate": 2.3785284043935277e-07, + "logits/chosen": -2.3831427097320557, + "logits/rejected": -2.3915817737579346, + "logps/chosen": -441.51751708984375, + "logps/rejected": -417.6058044433594, + "loss": 0.4829, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7003589868545532, + "rewards/margins": 0.9849268198013306, + "rewards/rejected": -1.6852858066558838, + "step": 1883 + }, + { + "epoch": 0.22, + "learning_rate": 2.378174087634345e-07, + "logits/chosen": -2.1098759174346924, + "logits/rejected": -2.2708804607391357, + "logps/chosen": -359.12237548828125, + "logps/rejected": -263.175537109375, + "loss": 0.4785, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4044111967086792, + "rewards/margins": 1.0543668270111084, + "rewards/rejected": -2.458777904510498, + "step": 1884 + }, + { + "epoch": 0.22, + "learning_rate": 2.377819770875162e-07, + "logits/chosen": -2.520573377609253, + "logits/rejected": -2.5162525177001953, + "logps/chosen": -261.03619384765625, + "logps/rejected": -226.56716918945312, + "loss": 0.2973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8558923602104187, + "rewards/margins": 1.6590983867645264, + "rewards/rejected": -2.51499080657959, + "step": 1885 + }, + { + "epoch": 0.22, + "learning_rate": 2.3774654541159793e-07, + "logits/chosen": -2.594071388244629, + "logits/rejected": -2.5104005336761475, + "logps/chosen": -165.8953094482422, + "logps/rejected": -121.86151885986328, + "loss": 0.5706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8685418963432312, + "rewards/margins": 1.0428014993667603, + "rewards/rejected": -1.9113434553146362, + "step": 1886 + }, + { + "epoch": 0.22, + "learning_rate": 2.377111137356797e-07, + "logits/chosen": -2.0869667530059814, + "logits/rejected": -1.713941216468811, + "logps/chosen": -218.2683868408203, + "logps/rejected": -353.7004699707031, + "loss": 0.5165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7024774551391602, + "rewards/margins": 1.7456401586532593, + "rewards/rejected": -2.448117733001709, + "step": 1887 + }, + { + "epoch": 0.22, + "learning_rate": 2.3767568205976143e-07, + "logits/chosen": -2.164726972579956, + "logits/rejected": -2.13382625579834, + "logps/chosen": -496.8125915527344, + "logps/rejected": -539.302978515625, + "loss": 0.3261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7606878280639648, + "rewards/margins": 1.917716145515442, + "rewards/rejected": -2.678403854370117, + "step": 1888 + }, + { + "epoch": 0.22, + "learning_rate": 2.3764025038384315e-07, + "logits/chosen": -2.454073429107666, + "logits/rejected": -2.3721110820770264, + "logps/chosen": -435.46527099609375, + "logps/rejected": -462.8751220703125, + "loss": 0.6519, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3418333828449249, + "rewards/margins": 1.033921718597412, + "rewards/rejected": -1.375754952430725, + "step": 1889 + }, + { + "epoch": 0.22, + "learning_rate": 2.3760481870792488e-07, + "logits/chosen": -2.7343053817749023, + "logits/rejected": -2.5915627479553223, + "logps/chosen": -232.17544555664062, + "logps/rejected": -186.90301513671875, + "loss": 0.5301, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1238192319869995, + "rewards/margins": 0.8672865629196167, + "rewards/rejected": -1.9911059141159058, + "step": 1890 + }, + { + "epoch": 0.22, + "learning_rate": 2.375693870320066e-07, + "logits/chosen": -2.536543607711792, + "logits/rejected": -2.6756515502929688, + "logps/chosen": -148.28323364257812, + "logps/rejected": -197.31802368164062, + "loss": 0.5705, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7416296601295471, + "rewards/margins": 1.5229395627975464, + "rewards/rejected": -2.2645692825317383, + "step": 1891 + }, + { + "epoch": 0.22, + "learning_rate": 2.3753395535608835e-07, + "logits/chosen": -2.717271089553833, + "logits/rejected": -2.8955729007720947, + "logps/chosen": -217.17567443847656, + "logps/rejected": -211.36114501953125, + "loss": 0.1937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34070247411727905, + "rewards/margins": 2.527327537536621, + "rewards/rejected": -2.868029832839966, + "step": 1892 + }, + { + "epoch": 0.22, + "learning_rate": 2.3749852368017007e-07, + "logits/chosen": -2.5037477016448975, + "logits/rejected": -2.6694488525390625, + "logps/chosen": -273.09356689453125, + "logps/rejected": -174.6703643798828, + "loss": 0.2878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14729967713356018, + "rewards/margins": 1.5316331386566162, + "rewards/rejected": -1.3843333721160889, + "step": 1893 + }, + { + "epoch": 0.22, + "learning_rate": 2.374630920042518e-07, + "logits/chosen": -2.7910709381103516, + "logits/rejected": -2.6893210411071777, + "logps/chosen": -128.33018493652344, + "logps/rejected": -184.21444702148438, + "loss": 1.5366, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9125924110412598, + "rewards/margins": -0.40597808361053467, + "rewards/rejected": -1.5066144466400146, + "step": 1894 + }, + { + "epoch": 0.22, + "learning_rate": 2.374276603283335e-07, + "logits/chosen": -2.7422919273376465, + "logits/rejected": -2.6861977577209473, + "logps/chosen": -187.099609375, + "logps/rejected": -196.91485595703125, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6197161078453064, + "rewards/margins": 1.7339483499526978, + "rewards/rejected": -2.3536643981933594, + "step": 1895 + }, + { + "epoch": 0.22, + "learning_rate": 2.3739222865241524e-07, + "logits/chosen": -2.1273694038391113, + "logits/rejected": -2.6363954544067383, + "logps/chosen": -436.3154602050781, + "logps/rejected": -273.14404296875, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7139080762863159, + "rewards/margins": 2.351698398590088, + "rewards/rejected": -3.0656065940856934, + "step": 1896 + }, + { + "epoch": 0.22, + "learning_rate": 2.3735679697649696e-07, + "logits/chosen": -1.9425008296966553, + "logits/rejected": -2.4603288173675537, + "logps/chosen": -532.032470703125, + "logps/rejected": -298.94549560546875, + "loss": 0.5571, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.990544855594635, + "rewards/margins": 1.7373557090759277, + "rewards/rejected": -2.727900505065918, + "step": 1897 + }, + { + "epoch": 0.22, + "learning_rate": 2.3732136530057868e-07, + "logits/chosen": -2.8450167179107666, + "logits/rejected": -2.76806902885437, + "logps/chosen": -231.7842559814453, + "logps/rejected": -215.12518310546875, + "loss": 0.2112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19720341265201569, + "rewards/margins": 1.9906363487243652, + "rewards/rejected": -2.1878397464752197, + "step": 1898 + }, + { + "epoch": 0.22, + "learning_rate": 2.3728593362466045e-07, + "logits/chosen": -2.5170092582702637, + "logits/rejected": -2.3557422161102295, + "logps/chosen": -190.64340209960938, + "logps/rejected": -473.658203125, + "loss": 0.2103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32688775658607483, + "rewards/margins": 2.3585994243621826, + "rewards/rejected": -2.6854870319366455, + "step": 1899 + }, + { + "epoch": 0.22, + "learning_rate": 2.3725050194874218e-07, + "logits/chosen": -2.546168327331543, + "logits/rejected": -2.6860127449035645, + "logps/chosen": -323.1658020019531, + "logps/rejected": -319.9196472167969, + "loss": 0.355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6752281188964844, + "rewards/margins": 2.7249855995178223, + "rewards/rejected": -3.4002134799957275, + "step": 1900 + }, + { + "epoch": 0.22, + "learning_rate": 2.372150702728239e-07, + "logits/chosen": -2.2880942821502686, + "logits/rejected": -2.0898585319519043, + "logps/chosen": -347.9389343261719, + "logps/rejected": -303.85595703125, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48539644479751587, + "rewards/margins": 3.4133224487304688, + "rewards/rejected": -3.89871883392334, + "step": 1901 + }, + { + "epoch": 0.22, + "learning_rate": 2.3717963859690562e-07, + "logits/chosen": -1.71338951587677, + "logits/rejected": -1.8896095752716064, + "logps/chosen": -401.1963195800781, + "logps/rejected": -244.762939453125, + "loss": 0.6482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6270143985748291, + "rewards/margins": 0.48526421189308167, + "rewards/rejected": -1.1122785806655884, + "step": 1902 + }, + { + "epoch": 0.22, + "learning_rate": 2.3714420692098734e-07, + "logits/chosen": -2.188588857650757, + "logits/rejected": -2.1990718841552734, + "logps/chosen": -206.90773010253906, + "logps/rejected": -206.59719848632812, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5991261005401611, + "rewards/margins": 1.3950114250183105, + "rewards/rejected": -1.9941375255584717, + "step": 1903 + }, + { + "epoch": 0.22, + "learning_rate": 2.371087752450691e-07, + "logits/chosen": -2.695085048675537, + "logits/rejected": -2.350358486175537, + "logps/chosen": -246.888916015625, + "logps/rejected": -285.4039306640625, + "loss": 0.4253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49656784534454346, + "rewards/margins": 2.3885788917541504, + "rewards/rejected": -2.8851468563079834, + "step": 1904 + }, + { + "epoch": 0.22, + "learning_rate": 2.3707334356915081e-07, + "logits/chosen": -2.0312933921813965, + "logits/rejected": -2.07850980758667, + "logps/chosen": -317.5693359375, + "logps/rejected": -287.5160217285156, + "loss": 0.3495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30298808217048645, + "rewards/margins": 1.3230944871902466, + "rewards/rejected": -1.6260825395584106, + "step": 1905 + }, + { + "epoch": 0.22, + "learning_rate": 2.3703791189323254e-07, + "logits/chosen": -2.5914878845214844, + "logits/rejected": -2.727274179458618, + "logps/chosen": -302.0438537597656, + "logps/rejected": -407.56207275390625, + "loss": 0.1321, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5776203870773315, + "rewards/margins": 4.692588806152344, + "rewards/rejected": -5.270208835601807, + "step": 1906 + }, + { + "epoch": 0.22, + "learning_rate": 2.3700248021731426e-07, + "logits/chosen": -2.3409159183502197, + "logits/rejected": -2.3960700035095215, + "logps/chosen": -288.47283935546875, + "logps/rejected": -378.9532775878906, + "loss": 1.0877, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1508336067199707, + "rewards/margins": 0.8084985613822937, + "rewards/rejected": -1.9593323469161987, + "step": 1907 + }, + { + "epoch": 0.22, + "learning_rate": 2.3696704854139598e-07, + "logits/chosen": -2.4041659832000732, + "logits/rejected": -2.312610149383545, + "logps/chosen": -219.34585571289062, + "logps/rejected": -146.05990600585938, + "loss": 0.5602, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6781108975410461, + "rewards/margins": 1.097469687461853, + "rewards/rejected": -1.7755805253982544, + "step": 1908 + }, + { + "epoch": 0.22, + "learning_rate": 2.369316168654777e-07, + "logits/chosen": -2.5388309955596924, + "logits/rejected": -2.544203281402588, + "logps/chosen": -344.69866943359375, + "logps/rejected": -198.77029418945312, + "loss": 0.565, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3352962732315063, + "rewards/margins": 1.5767372846603394, + "rewards/rejected": -2.9120335578918457, + "step": 1909 + }, + { + "epoch": 0.22, + "learning_rate": 2.3689618518955948e-07, + "logits/chosen": -2.34572696685791, + "logits/rejected": -1.9676403999328613, + "logps/chosen": -203.9527587890625, + "logps/rejected": -292.52044677734375, + "loss": 0.4774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7269752621650696, + "rewards/margins": 1.6519368886947632, + "rewards/rejected": -2.3789122104644775, + "step": 1910 + }, + { + "epoch": 0.22, + "learning_rate": 2.368607535136412e-07, + "logits/chosen": -3.0167646408081055, + "logits/rejected": -2.9568276405334473, + "logps/chosen": -166.76547241210938, + "logps/rejected": -158.6016082763672, + "loss": 0.8436, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8339132070541382, + "rewards/margins": 1.053346872329712, + "rewards/rejected": -1.88726007938385, + "step": 1911 + }, + { + "epoch": 0.22, + "learning_rate": 2.3682532183772292e-07, + "logits/chosen": -2.1457624435424805, + "logits/rejected": -1.905735969543457, + "logps/chosen": -264.6837158203125, + "logps/rejected": -288.00994873046875, + "loss": 0.3102, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8385372161865234, + "rewards/margins": 1.6179676055908203, + "rewards/rejected": -3.4565048217773438, + "step": 1912 + }, + { + "epoch": 0.22, + "learning_rate": 2.3678989016180464e-07, + "logits/chosen": -2.1915013790130615, + "logits/rejected": -1.9362857341766357, + "logps/chosen": -294.81787109375, + "logps/rejected": -371.5819091796875, + "loss": 0.2749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7099554538726807, + "rewards/margins": 1.3402166366577148, + "rewards/rejected": -2.0501720905303955, + "step": 1913 + }, + { + "epoch": 0.22, + "learning_rate": 2.3675445848588637e-07, + "logits/chosen": -2.6588778495788574, + "logits/rejected": -2.423527479171753, + "logps/chosen": -82.45245361328125, + "logps/rejected": -239.75526428222656, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029128743335604668, + "rewards/margins": 2.595555543899536, + "rewards/rejected": -2.6246840953826904, + "step": 1914 + }, + { + "epoch": 0.22, + "learning_rate": 2.3671902680996811e-07, + "logits/chosen": -2.0048234462738037, + "logits/rejected": -2.0521535873413086, + "logps/chosen": -333.2558898925781, + "logps/rejected": -279.37567138671875, + "loss": 0.1844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2931111454963684, + "rewards/margins": 2.1779022216796875, + "rewards/rejected": -2.471013069152832, + "step": 1915 + }, + { + "epoch": 0.22, + "learning_rate": 2.3668359513404984e-07, + "logits/chosen": -2.118030548095703, + "logits/rejected": -2.0117831230163574, + "logps/chosen": -345.56512451171875, + "logps/rejected": -250.82711791992188, + "loss": 0.5729, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46400970220565796, + "rewards/margins": 1.0468201637268066, + "rewards/rejected": -1.5108299255371094, + "step": 1916 + }, + { + "epoch": 0.22, + "learning_rate": 2.3664816345813156e-07, + "logits/chosen": -2.4285521507263184, + "logits/rejected": -2.483677864074707, + "logps/chosen": -450.41802978515625, + "logps/rejected": -507.4884033203125, + "loss": 0.5013, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.101522445678711, + "rewards/margins": 1.6162968873977661, + "rewards/rejected": -2.7178192138671875, + "step": 1917 + }, + { + "epoch": 0.22, + "learning_rate": 2.3661273178221328e-07, + "logits/chosen": -2.5896239280700684, + "logits/rejected": -2.7026164531707764, + "logps/chosen": -334.0228271484375, + "logps/rejected": -293.3704833984375, + "loss": 0.4221, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5501595735549927, + "rewards/margins": 1.6361674070358276, + "rewards/rejected": -2.1863269805908203, + "step": 1918 + }, + { + "epoch": 0.22, + "learning_rate": 2.36577300106295e-07, + "logits/chosen": -2.4875712394714355, + "logits/rejected": -1.9761066436767578, + "logps/chosen": -275.9391174316406, + "logps/rejected": -327.94244384765625, + "loss": 0.2748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45149117708206177, + "rewards/margins": 2.6281967163085938, + "rewards/rejected": -3.0796875953674316, + "step": 1919 + }, + { + "epoch": 0.22, + "learning_rate": 2.3654186843037672e-07, + "logits/chosen": -2.7367970943450928, + "logits/rejected": -2.667642593383789, + "logps/chosen": -394.35650634765625, + "logps/rejected": -374.9109802246094, + "loss": 0.2971, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0935614109039307, + "rewards/margins": 2.049429416656494, + "rewards/rejected": -3.1429905891418457, + "step": 1920 + }, + { + "epoch": 0.22, + "learning_rate": 2.3650643675445847e-07, + "logits/chosen": -2.410553455352783, + "logits/rejected": -2.764514923095703, + "logps/chosen": -349.839111328125, + "logps/rejected": -339.2150573730469, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.505952000617981, + "rewards/margins": 1.3651684522628784, + "rewards/rejected": -1.8711204528808594, + "step": 1921 + }, + { + "epoch": 0.22, + "learning_rate": 2.3647100507854022e-07, + "logits/chosen": -2.1740853786468506, + "logits/rejected": -2.0532174110412598, + "logps/chosen": -203.07275390625, + "logps/rejected": -221.76226806640625, + "loss": 0.2656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39534902572631836, + "rewards/margins": 1.7812209129333496, + "rewards/rejected": -2.176569938659668, + "step": 1922 + }, + { + "epoch": 0.22, + "learning_rate": 2.3643557340262194e-07, + "logits/chosen": -2.423549175262451, + "logits/rejected": -2.443378448486328, + "logps/chosen": -266.1633605957031, + "logps/rejected": -336.642578125, + "loss": 0.5955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6327597498893738, + "rewards/margins": 1.6455696821212769, + "rewards/rejected": -2.278329372406006, + "step": 1923 + }, + { + "epoch": 0.22, + "learning_rate": 2.3640014172670367e-07, + "logits/chosen": -2.4912524223327637, + "logits/rejected": -2.5601677894592285, + "logps/chosen": -287.7222900390625, + "logps/rejected": -179.62554931640625, + "loss": 0.436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5092442035675049, + "rewards/margins": 1.5516278743743896, + "rewards/rejected": -2.0608720779418945, + "step": 1924 + }, + { + "epoch": 0.22, + "learning_rate": 2.363647100507854e-07, + "logits/chosen": -2.261140823364258, + "logits/rejected": -2.7423696517944336, + "logps/chosen": -230.692138671875, + "logps/rejected": -211.391845703125, + "loss": 0.6102, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1278530359268188, + "rewards/margins": 1.3393990993499756, + "rewards/rejected": -2.467252016067505, + "step": 1925 + }, + { + "epoch": 0.22, + "learning_rate": 2.3632927837486714e-07, + "logits/chosen": -2.661656618118286, + "logits/rejected": -2.6082255840301514, + "logps/chosen": -126.98728942871094, + "logps/rejected": -172.0341796875, + "loss": 0.3999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38309431076049805, + "rewards/margins": 1.4705787897109985, + "rewards/rejected": -1.8536731004714966, + "step": 1926 + }, + { + "epoch": 0.22, + "learning_rate": 2.3629384669894886e-07, + "logits/chosen": -2.1700048446655273, + "logits/rejected": -2.533985137939453, + "logps/chosen": -405.62750244140625, + "logps/rejected": -283.95703125, + "loss": 0.4207, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9974756240844727, + "rewards/margins": 1.5932140350341797, + "rewards/rejected": -2.5906896591186523, + "step": 1927 + }, + { + "epoch": 0.22, + "learning_rate": 2.3625841502303058e-07, + "logits/chosen": -2.6873040199279785, + "logits/rejected": -2.136427402496338, + "logps/chosen": -193.7666473388672, + "logps/rejected": -244.97193908691406, + "loss": 0.531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8325738310813904, + "rewards/margins": 2.3003652095794678, + "rewards/rejected": -3.132938861846924, + "step": 1928 + }, + { + "epoch": 0.22, + "learning_rate": 2.362229833471123e-07, + "logits/chosen": -2.473618507385254, + "logits/rejected": -2.4008960723876953, + "logps/chosen": -139.99038696289062, + "logps/rejected": -245.99298095703125, + "loss": 0.2713, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4486868679523468, + "rewards/margins": 2.5053346157073975, + "rewards/rejected": -2.9540212154388428, + "step": 1929 + }, + { + "epoch": 0.22, + "learning_rate": 2.3618755167119403e-07, + "logits/chosen": -2.0137226581573486, + "logits/rejected": -2.099252700805664, + "logps/chosen": -379.299560546875, + "logps/rejected": -202.45355224609375, + "loss": 0.4251, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.094517469406128, + "rewards/margins": 1.51676344871521, + "rewards/rejected": -2.611281156539917, + "step": 1930 + }, + { + "epoch": 0.22, + "learning_rate": 2.3615211999527575e-07, + "logits/chosen": -2.228405475616455, + "logits/rejected": -2.389601707458496, + "logps/chosen": -728.1940307617188, + "logps/rejected": -276.8704833984375, + "loss": 0.4111, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1772117614746094, + "rewards/margins": 1.1355983018875122, + "rewards/rejected": -2.312810182571411, + "step": 1931 + }, + { + "epoch": 0.22, + "learning_rate": 2.3611668831935747e-07, + "logits/chosen": -2.2375380992889404, + "logits/rejected": -2.3524980545043945, + "logps/chosen": -263.7438659667969, + "logps/rejected": -262.020751953125, + "loss": 0.2629, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.324380874633789, + "rewards/margins": 2.5025112628936768, + "rewards/rejected": -3.8268918991088867, + "step": 1932 + }, + { + "epoch": 0.22, + "learning_rate": 2.3608125664343922e-07, + "logits/chosen": -2.625051259994507, + "logits/rejected": -2.6038501262664795, + "logps/chosen": -293.1787109375, + "logps/rejected": -164.8754119873047, + "loss": 0.207, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1285419464111328, + "rewards/margins": 2.102248430252075, + "rewards/rejected": -1.9737063646316528, + "step": 1933 + }, + { + "epoch": 0.22, + "learning_rate": 2.3604582496752097e-07, + "logits/chosen": -2.1377415657043457, + "logits/rejected": -2.231637716293335, + "logps/chosen": -173.719970703125, + "logps/rejected": -148.34371948242188, + "loss": 0.9323, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1020605564117432, + "rewards/margins": 0.2662084996700287, + "rewards/rejected": -1.3682689666748047, + "step": 1934 + }, + { + "epoch": 0.23, + "learning_rate": 2.360103932916027e-07, + "logits/chosen": -2.0937788486480713, + "logits/rejected": -1.9793736934661865, + "logps/chosen": -156.51296997070312, + "logps/rejected": -262.4140625, + "loss": 1.0533, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.027639865875244, + "rewards/margins": 1.5333921909332275, + "rewards/rejected": -3.5610318183898926, + "step": 1935 + }, + { + "epoch": 0.23, + "learning_rate": 2.359749616156844e-07, + "logits/chosen": -1.7152124643325806, + "logits/rejected": -1.708038091659546, + "logps/chosen": -463.2947082519531, + "logps/rejected": -535.9517211914062, + "loss": 0.7962, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4877917766571045, + "rewards/margins": 1.05384361743927, + "rewards/rejected": -1.541635274887085, + "step": 1936 + }, + { + "epoch": 0.23, + "learning_rate": 2.3593952993976613e-07, + "logits/chosen": -1.968490719795227, + "logits/rejected": -2.457221746444702, + "logps/chosen": -319.496826171875, + "logps/rejected": -254.2911834716797, + "loss": 0.2835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14511051774024963, + "rewards/margins": 2.079331874847412, + "rewards/rejected": -2.224442481994629, + "step": 1937 + }, + { + "epoch": 0.23, + "learning_rate": 2.3590409826384788e-07, + "logits/chosen": -1.9732520580291748, + "logits/rejected": -2.133970260620117, + "logps/chosen": -355.1065979003906, + "logps/rejected": -320.26568603515625, + "loss": 0.5496, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7588386535644531, + "rewards/margins": 1.116028070449829, + "rewards/rejected": -2.8748669624328613, + "step": 1938 + }, + { + "epoch": 0.23, + "learning_rate": 2.358686665879296e-07, + "logits/chosen": -2.4308066368103027, + "logits/rejected": -2.509688138961792, + "logps/chosen": -265.9458312988281, + "logps/rejected": -215.50738525390625, + "loss": 0.5021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5390626192092896, + "rewards/margins": 1.0280547142028809, + "rewards/rejected": -1.56711745262146, + "step": 1939 + }, + { + "epoch": 0.23, + "learning_rate": 2.3583323491201133e-07, + "logits/chosen": -2.268925905227661, + "logits/rejected": -2.3927552700042725, + "logps/chosen": -147.5147705078125, + "logps/rejected": -249.91552734375, + "loss": 0.4324, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.537254810333252, + "rewards/margins": 1.823635458946228, + "rewards/rejected": -2.3608901500701904, + "step": 1940 + }, + { + "epoch": 0.23, + "learning_rate": 2.3579780323609305e-07, + "logits/chosen": -2.2943806648254395, + "logits/rejected": -2.589171886444092, + "logps/chosen": -316.0751037597656, + "logps/rejected": -225.43399047851562, + "loss": 0.251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49682310223579407, + "rewards/margins": 1.7436914443969727, + "rewards/rejected": -2.2405142784118652, + "step": 1941 + }, + { + "epoch": 0.23, + "learning_rate": 2.3576237156017477e-07, + "logits/chosen": -1.8934153318405151, + "logits/rejected": -2.3351941108703613, + "logps/chosen": -364.35992431640625, + "logps/rejected": -278.6876220703125, + "loss": 0.5707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5959425568580627, + "rewards/margins": 1.0454621315002441, + "rewards/rejected": -1.641404628753662, + "step": 1942 + }, + { + "epoch": 0.23, + "learning_rate": 2.357269398842565e-07, + "logits/chosen": -2.1956911087036133, + "logits/rejected": -2.0930843353271484, + "logps/chosen": -310.4448547363281, + "logps/rejected": -368.12750244140625, + "loss": 0.936, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2604751586914062, + "rewards/margins": -0.1302127093076706, + "rewards/rejected": -1.1302623748779297, + "step": 1943 + }, + { + "epoch": 0.23, + "learning_rate": 2.3569150820833824e-07, + "logits/chosen": -2.2690470218658447, + "logits/rejected": -2.1416778564453125, + "logps/chosen": -278.9833679199219, + "logps/rejected": -265.6446533203125, + "loss": 0.4253, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32209083437919617, + "rewards/margins": 1.1877819299697876, + "rewards/rejected": -1.5098727941513062, + "step": 1944 + }, + { + "epoch": 0.23, + "learning_rate": 2.3565607653241996e-07, + "logits/chosen": -2.8202309608459473, + "logits/rejected": -2.90403151512146, + "logps/chosen": -115.6203842163086, + "logps/rejected": -155.35916137695312, + "loss": 0.4547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6795730590820312, + "rewards/margins": 1.7632499933242798, + "rewards/rejected": -2.4428229331970215, + "step": 1945 + }, + { + "epoch": 0.23, + "learning_rate": 2.356206448565017e-07, + "logits/chosen": -3.018399953842163, + "logits/rejected": -2.945033550262451, + "logps/chosen": -233.0406951904297, + "logps/rejected": -225.58056640625, + "loss": 0.2849, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11904823780059814, + "rewards/margins": 2.310420513153076, + "rewards/rejected": -2.4294686317443848, + "step": 1946 + }, + { + "epoch": 0.23, + "learning_rate": 2.3558521318058343e-07, + "logits/chosen": -2.167405128479004, + "logits/rejected": -2.1109790802001953, + "logps/chosen": -190.81240844726562, + "logps/rejected": -250.09608459472656, + "loss": 0.4934, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6616719961166382, + "rewards/margins": 0.6092509031295776, + "rewards/rejected": -2.270923137664795, + "step": 1947 + }, + { + "epoch": 0.23, + "learning_rate": 2.3554978150466516e-07, + "logits/chosen": -1.8608980178833008, + "logits/rejected": -1.9582914113998413, + "logps/chosen": -297.0230712890625, + "logps/rejected": -319.9097900390625, + "loss": 0.4985, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5452778339385986, + "rewards/margins": 1.4580882787704468, + "rewards/rejected": -3.003365993499756, + "step": 1948 + }, + { + "epoch": 0.23, + "learning_rate": 2.355143498287469e-07, + "logits/chosen": -2.5621790885925293, + "logits/rejected": -2.434837818145752, + "logps/chosen": -118.51498413085938, + "logps/rejected": -211.962158203125, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26106390357017517, + "rewards/margins": 2.9498674869537354, + "rewards/rejected": -3.2109313011169434, + "step": 1949 + }, + { + "epoch": 0.23, + "learning_rate": 2.3547891815282863e-07, + "logits/chosen": -2.096292018890381, + "logits/rejected": -2.5016775131225586, + "logps/chosen": -253.35401916503906, + "logps/rejected": -220.43362426757812, + "loss": 0.3765, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3263643980026245, + "rewards/margins": 0.9609323740005493, + "rewards/rejected": -2.287296772003174, + "step": 1950 + }, + { + "epoch": 0.23, + "learning_rate": 2.3544348647691035e-07, + "logits/chosen": -1.7153205871582031, + "logits/rejected": -2.0528337955474854, + "logps/chosen": -437.9040222167969, + "logps/rejected": -416.1257019042969, + "loss": 0.1373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.504633367061615, + "rewards/margins": 3.1203224658966064, + "rewards/rejected": -3.624955892562866, + "step": 1951 + }, + { + "epoch": 0.23, + "learning_rate": 2.3540805480099207e-07, + "logits/chosen": -1.8872873783111572, + "logits/rejected": -1.938816785812378, + "logps/chosen": -427.9764709472656, + "logps/rejected": -277.7381591796875, + "loss": 0.7244, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0255331993103027, + "rewards/margins": 0.7432510256767273, + "rewards/rejected": -1.7687842845916748, + "step": 1952 + }, + { + "epoch": 0.23, + "learning_rate": 2.353726231250738e-07, + "logits/chosen": -2.3966822624206543, + "logits/rejected": -2.498239278793335, + "logps/chosen": -234.72531127929688, + "logps/rejected": -315.43548583984375, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.548599362373352, + "rewards/margins": 1.6499407291412354, + "rewards/rejected": -2.198539972305298, + "step": 1953 + }, + { + "epoch": 0.23, + "learning_rate": 2.3533719144915552e-07, + "logits/chosen": -2.9425292015075684, + "logits/rejected": -2.8486223220825195, + "logps/chosen": -297.0843505859375, + "logps/rejected": -363.0697021484375, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09073683619499207, + "rewards/margins": 2.4803364276885986, + "rewards/rejected": -2.571073293685913, + "step": 1954 + }, + { + "epoch": 0.23, + "learning_rate": 2.3530175977323726e-07, + "logits/chosen": -2.3794262409210205, + "logits/rejected": -2.4866795539855957, + "logps/chosen": -289.1757507324219, + "logps/rejected": -340.517333984375, + "loss": 1.2129, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3949239253997803, + "rewards/margins": 1.0033807754516602, + "rewards/rejected": -2.3983047008514404, + "step": 1955 + }, + { + "epoch": 0.23, + "learning_rate": 2.3526632809731899e-07, + "logits/chosen": -2.2765073776245117, + "logits/rejected": -2.6103739738464355, + "logps/chosen": -437.8962097167969, + "logps/rejected": -234.457275390625, + "loss": 0.4078, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5877244472503662, + "rewards/margins": 1.8323813676834106, + "rewards/rejected": -2.4201059341430664, + "step": 1956 + }, + { + "epoch": 0.23, + "learning_rate": 2.3523089642140073e-07, + "logits/chosen": -1.8146144151687622, + "logits/rejected": -1.6830052137374878, + "logps/chosen": -165.2733154296875, + "logps/rejected": -289.4575500488281, + "loss": 0.3389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2034711837768555, + "rewards/margins": 3.3609015941619873, + "rewards/rejected": -4.564373016357422, + "step": 1957 + }, + { + "epoch": 0.23, + "learning_rate": 2.3519546474548246e-07, + "logits/chosen": -2.467728614807129, + "logits/rejected": -2.5623514652252197, + "logps/chosen": -155.88412475585938, + "logps/rejected": -266.7792053222656, + "loss": 0.1203, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.181828111410141, + "rewards/margins": 3.773216485977173, + "rewards/rejected": -3.59138822555542, + "step": 1958 + }, + { + "epoch": 0.23, + "learning_rate": 2.3516003306956418e-07, + "logits/chosen": -2.1743507385253906, + "logits/rejected": -2.415201187133789, + "logps/chosen": -434.0635986328125, + "logps/rejected": -273.3455810546875, + "loss": 0.8771, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6704363822937012, + "rewards/margins": 0.5951224565505981, + "rewards/rejected": -2.265558958053589, + "step": 1959 + }, + { + "epoch": 0.23, + "learning_rate": 2.3512460139364593e-07, + "logits/chosen": -2.2718307971954346, + "logits/rejected": -2.4912564754486084, + "logps/chosen": -449.3606262207031, + "logps/rejected": -346.3267822265625, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.021274691447615623, + "rewards/margins": 2.50516939163208, + "rewards/rejected": -2.5264439582824707, + "step": 1960 + }, + { + "epoch": 0.23, + "learning_rate": 2.3508916971772765e-07, + "logits/chosen": -2.204979419708252, + "logits/rejected": -2.037120819091797, + "logps/chosen": -216.04345703125, + "logps/rejected": -308.4803466796875, + "loss": 0.1049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8284422159194946, + "rewards/margins": 3.37876558303833, + "rewards/rejected": -4.207207679748535, + "step": 1961 + }, + { + "epoch": 0.23, + "learning_rate": 2.3505373804180937e-07, + "logits/chosen": -2.1463544368743896, + "logits/rejected": -2.1713199615478516, + "logps/chosen": -254.8804931640625, + "logps/rejected": -267.84039306640625, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5927608013153076, + "rewards/margins": 1.1968047618865967, + "rewards/rejected": -1.7895654439926147, + "step": 1962 + }, + { + "epoch": 0.23, + "learning_rate": 2.350183063658911e-07, + "logits/chosen": -2.13004732131958, + "logits/rejected": -2.4813008308410645, + "logps/chosen": -232.4341583251953, + "logps/rejected": -179.20472717285156, + "loss": 0.6793, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2120013236999512, + "rewards/margins": 0.7421838045120239, + "rewards/rejected": -1.954185128211975, + "step": 1963 + }, + { + "epoch": 0.23, + "learning_rate": 2.3498287468997282e-07, + "logits/chosen": -2.0552053451538086, + "logits/rejected": -2.465153217315674, + "logps/chosen": -370.2318115234375, + "logps/rejected": -184.74234008789062, + "loss": 0.2956, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6746222972869873, + "rewards/margins": 1.4199590682983398, + "rewards/rejected": -2.094581365585327, + "step": 1964 + }, + { + "epoch": 0.23, + "learning_rate": 2.3494744301405454e-07, + "logits/chosen": -2.3425588607788086, + "logits/rejected": -2.1580755710601807, + "logps/chosen": -233.93360900878906, + "logps/rejected": -224.84085083007812, + "loss": 0.3867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4941990077495575, + "rewards/margins": 1.095848798751831, + "rewards/rejected": -1.5900477170944214, + "step": 1965 + }, + { + "epoch": 0.23, + "learning_rate": 2.3491201133813626e-07, + "logits/chosen": -2.7572269439697266, + "logits/rejected": -2.6859066486358643, + "logps/chosen": -432.6898193359375, + "logps/rejected": -297.7322998046875, + "loss": 0.23, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.562412679195404, + "rewards/margins": 2.265326976776123, + "rewards/rejected": -2.8277394771575928, + "step": 1966 + }, + { + "epoch": 0.23, + "learning_rate": 2.34876579662218e-07, + "logits/chosen": -2.519235849380493, + "logits/rejected": -2.2266769409179688, + "logps/chosen": -155.7998504638672, + "logps/rejected": -275.9801330566406, + "loss": 0.2644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7779420018196106, + "rewards/margins": 1.891699194908142, + "rewards/rejected": -2.6696410179138184, + "step": 1967 + }, + { + "epoch": 0.23, + "learning_rate": 2.3484114798629973e-07, + "logits/chosen": -2.0956647396087646, + "logits/rejected": -2.4342117309570312, + "logps/chosen": -292.76593017578125, + "logps/rejected": -222.981689453125, + "loss": 0.4182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7448635697364807, + "rewards/margins": 1.3684239387512207, + "rewards/rejected": -2.1132874488830566, + "step": 1968 + }, + { + "epoch": 0.23, + "learning_rate": 2.3480571631038148e-07, + "logits/chosen": -1.8876087665557861, + "logits/rejected": -1.982627034187317, + "logps/chosen": -394.96636962890625, + "logps/rejected": -292.7491149902344, + "loss": 0.5063, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5040005445480347, + "rewards/margins": 1.3237920999526978, + "rewards/rejected": -2.8277926445007324, + "step": 1969 + }, + { + "epoch": 0.23, + "learning_rate": 2.347702846344632e-07, + "logits/chosen": -2.6826772689819336, + "logits/rejected": -2.7892727851867676, + "logps/chosen": -373.95941162109375, + "logps/rejected": -299.981201171875, + "loss": 0.1881, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6756617426872253, + "rewards/margins": 2.263915777206421, + "rewards/rejected": -2.939577579498291, + "step": 1970 + }, + { + "epoch": 0.23, + "learning_rate": 2.3473485295854495e-07, + "logits/chosen": -2.7992630004882812, + "logits/rejected": -2.9575328826904297, + "logps/chosen": -230.15194702148438, + "logps/rejected": -293.1745300292969, + "loss": 0.3331, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.303811252117157, + "rewards/margins": 2.593111515045166, + "rewards/rejected": -2.8969228267669678, + "step": 1971 + }, + { + "epoch": 0.23, + "learning_rate": 2.3469942128262667e-07, + "logits/chosen": -2.6200759410858154, + "logits/rejected": -2.640418767929077, + "logps/chosen": -342.5304870605469, + "logps/rejected": -345.995361328125, + "loss": 0.3214, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2785898745059967, + "rewards/margins": 1.940861463546753, + "rewards/rejected": -2.219451427459717, + "step": 1972 + }, + { + "epoch": 0.23, + "learning_rate": 2.346639896067084e-07, + "logits/chosen": -2.1326887607574463, + "logits/rejected": -2.063842296600342, + "logps/chosen": -210.5382080078125, + "logps/rejected": -274.0124816894531, + "loss": 0.5267, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6867938041687012, + "rewards/margins": 2.4064230918884277, + "rewards/rejected": -3.0932164192199707, + "step": 1973 + }, + { + "epoch": 0.23, + "learning_rate": 2.3462855793079012e-07, + "logits/chosen": -2.5352542400360107, + "logits/rejected": -2.451993227005005, + "logps/chosen": -327.9621276855469, + "logps/rejected": -298.9266357421875, + "loss": 0.4011, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16905011236667633, + "rewards/margins": 1.6964938640594482, + "rewards/rejected": -1.8655438423156738, + "step": 1974 + }, + { + "epoch": 0.23, + "learning_rate": 2.3459312625487184e-07, + "logits/chosen": -2.5143871307373047, + "logits/rejected": -2.6486411094665527, + "logps/chosen": -317.9917907714844, + "logps/rejected": -165.66217041015625, + "loss": 0.7759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9655543565750122, + "rewards/margins": 0.6170029640197754, + "rewards/rejected": -1.582557201385498, + "step": 1975 + }, + { + "epoch": 0.23, + "learning_rate": 2.3455769457895356e-07, + "logits/chosen": -2.1544029712677, + "logits/rejected": -2.1629600524902344, + "logps/chosen": -313.361328125, + "logps/rejected": -323.8248291015625, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6415922045707703, + "rewards/margins": 2.151778221130371, + "rewards/rejected": -2.793370246887207, + "step": 1976 + }, + { + "epoch": 0.23, + "learning_rate": 2.3452226290303528e-07, + "logits/chosen": -2.2814230918884277, + "logits/rejected": -2.280259132385254, + "logps/chosen": -310.88861083984375, + "logps/rejected": -292.0172119140625, + "loss": 1.1235, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7768259048461914, + "rewards/margins": 0.9707800149917603, + "rewards/rejected": -3.7476062774658203, + "step": 1977 + }, + { + "epoch": 0.23, + "learning_rate": 2.3448683122711703e-07, + "logits/chosen": -2.8088107109069824, + "logits/rejected": -2.6930596828460693, + "logps/chosen": -437.1911926269531, + "logps/rejected": -254.23907470703125, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6103285551071167, + "rewards/margins": 2.8429336547851562, + "rewards/rejected": -3.4532623291015625, + "step": 1978 + }, + { + "epoch": 0.23, + "learning_rate": 2.3445139955119875e-07, + "logits/chosen": -2.281198263168335, + "logits/rejected": -2.3766019344329834, + "logps/chosen": -300.1064147949219, + "logps/rejected": -228.39898681640625, + "loss": 0.7406, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0748196840286255, + "rewards/margins": 0.38440003991127014, + "rewards/rejected": -1.4592196941375732, + "step": 1979 + }, + { + "epoch": 0.23, + "learning_rate": 2.3441596787528048e-07, + "logits/chosen": -2.1105713844299316, + "logits/rejected": -2.339521884918213, + "logps/chosen": -453.9054260253906, + "logps/rejected": -246.70291137695312, + "loss": 0.2837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15994839370250702, + "rewards/margins": 1.74416184425354, + "rewards/rejected": -1.9041101932525635, + "step": 1980 + }, + { + "epoch": 0.23, + "learning_rate": 2.3438053619936222e-07, + "logits/chosen": -2.2441442012786865, + "logits/rejected": -2.398287534713745, + "logps/chosen": -306.375244140625, + "logps/rejected": -322.2066345214844, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.007738925516605377, + "rewards/margins": 2.2194318771362305, + "rewards/rejected": -2.227170705795288, + "step": 1981 + }, + { + "epoch": 0.23, + "learning_rate": 2.3434510452344395e-07, + "logits/chosen": -2.2389562129974365, + "logits/rejected": -2.1819119453430176, + "logps/chosen": -282.5087890625, + "logps/rejected": -339.15948486328125, + "loss": 0.8624, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5152976512908936, + "rewards/margins": 2.8148577213287354, + "rewards/rejected": -4.330155372619629, + "step": 1982 + }, + { + "epoch": 0.23, + "learning_rate": 2.343096728475257e-07, + "logits/chosen": -1.6608740091323853, + "logits/rejected": -2.1132619380950928, + "logps/chosen": -344.1090087890625, + "logps/rejected": -279.04180908203125, + "loss": 0.366, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6082732081413269, + "rewards/margins": 2.6341352462768555, + "rewards/rejected": -3.242408275604248, + "step": 1983 + }, + { + "epoch": 0.23, + "learning_rate": 2.3427424117160742e-07, + "logits/chosen": -1.8645031452178955, + "logits/rejected": -2.065497398376465, + "logps/chosen": -180.1361846923828, + "logps/rejected": -198.21923828125, + "loss": 0.382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49544787406921387, + "rewards/margins": 2.2168498039245605, + "rewards/rejected": -2.7122976779937744, + "step": 1984 + }, + { + "epoch": 0.23, + "learning_rate": 2.3423880949568914e-07, + "logits/chosen": -2.149522304534912, + "logits/rejected": -2.6492624282836914, + "logps/chosen": -512.5096435546875, + "logps/rejected": -283.8351135253906, + "loss": 0.9904, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.106372356414795, + "rewards/margins": 0.4951530694961548, + "rewards/rejected": -1.6015253067016602, + "step": 1985 + }, + { + "epoch": 0.23, + "learning_rate": 2.3420337781977086e-07, + "logits/chosen": -2.1917884349823, + "logits/rejected": -2.109611988067627, + "logps/chosen": -282.18646240234375, + "logps/rejected": -329.8034973144531, + "loss": 0.4203, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.655138373374939, + "rewards/margins": 1.4564317464828491, + "rewards/rejected": -2.111570119857788, + "step": 1986 + }, + { + "epoch": 0.23, + "learning_rate": 2.3416794614385258e-07, + "logits/chosen": -2.290922164916992, + "logits/rejected": -2.383075475692749, + "logps/chosen": -311.380615234375, + "logps/rejected": -415.647216796875, + "loss": 0.833, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3233685493469238, + "rewards/margins": 1.2943097352981567, + "rewards/rejected": -2.617678165435791, + "step": 1987 + }, + { + "epoch": 0.23, + "learning_rate": 2.341325144679343e-07, + "logits/chosen": -2.421081066131592, + "logits/rejected": -2.6070947647094727, + "logps/chosen": -318.7978515625, + "logps/rejected": -294.3724670410156, + "loss": 0.553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4900839924812317, + "rewards/margins": 1.3121429681777954, + "rewards/rejected": -1.8022270202636719, + "step": 1988 + }, + { + "epoch": 0.23, + "learning_rate": 2.3409708279201605e-07, + "logits/chosen": -1.7563356161117554, + "logits/rejected": -1.8937163352966309, + "logps/chosen": -418.55633544921875, + "logps/rejected": -405.0843811035156, + "loss": 0.7327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8827035427093506, + "rewards/margins": 0.8655978441238403, + "rewards/rejected": -1.748301386833191, + "step": 1989 + }, + { + "epoch": 0.23, + "learning_rate": 2.3406165111609778e-07, + "logits/chosen": -2.4359049797058105, + "logits/rejected": -2.4992260932922363, + "logps/chosen": -263.842529296875, + "logps/rejected": -240.83753967285156, + "loss": 0.3228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5593858957290649, + "rewards/margins": 1.442859411239624, + "rewards/rejected": -2.0022454261779785, + "step": 1990 + }, + { + "epoch": 0.23, + "learning_rate": 2.340262194401795e-07, + "logits/chosen": -1.9965806007385254, + "logits/rejected": -1.7110234498977661, + "logps/chosen": -283.6479797363281, + "logps/rejected": -451.0930480957031, + "loss": 0.6586, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4226959943771362, + "rewards/margins": 0.2742460370063782, + "rewards/rejected": -1.6969420909881592, + "step": 1991 + }, + { + "epoch": 0.23, + "learning_rate": 2.3399078776426125e-07, + "logits/chosen": -2.2080020904541016, + "logits/rejected": -2.383505344390869, + "logps/chosen": -280.75390625, + "logps/rejected": -316.1920471191406, + "loss": 0.4667, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0683393478393555, + "rewards/margins": 2.130797863006592, + "rewards/rejected": -3.199136972427368, + "step": 1992 + }, + { + "epoch": 0.23, + "learning_rate": 2.3395535608834297e-07, + "logits/chosen": -2.60107421875, + "logits/rejected": -2.6125307083129883, + "logps/chosen": -300.8184509277344, + "logps/rejected": -224.2725830078125, + "loss": 0.2961, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8431633710861206, + "rewards/margins": 2.1309609413146973, + "rewards/rejected": -2.9741241931915283, + "step": 1993 + }, + { + "epoch": 0.23, + "learning_rate": 2.3391992441242472e-07, + "logits/chosen": -2.338379383087158, + "logits/rejected": -2.4408397674560547, + "logps/chosen": -232.31024169921875, + "logps/rejected": -260.48931884765625, + "loss": 0.3086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34689652919769287, + "rewards/margins": 2.1600465774536133, + "rewards/rejected": -2.5069432258605957, + "step": 1994 + }, + { + "epoch": 0.23, + "learning_rate": 2.3388449273650644e-07, + "logits/chosen": -2.1882948875427246, + "logits/rejected": -2.266958475112915, + "logps/chosen": -202.89471435546875, + "logps/rejected": -314.1727600097656, + "loss": 0.2779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5279630422592163, + "rewards/margins": 2.4742531776428223, + "rewards/rejected": -3.002216339111328, + "step": 1995 + }, + { + "epoch": 0.23, + "learning_rate": 2.3384906106058816e-07, + "logits/chosen": -1.9148709774017334, + "logits/rejected": -1.682118535041809, + "logps/chosen": -323.4260559082031, + "logps/rejected": -398.777587890625, + "loss": 0.3918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8555980920791626, + "rewards/margins": 2.0248217582702637, + "rewards/rejected": -2.880419969558716, + "step": 1996 + }, + { + "epoch": 0.23, + "learning_rate": 2.3381362938466988e-07, + "logits/chosen": -2.2660160064697266, + "logits/rejected": -2.798758029937744, + "logps/chosen": -439.35076904296875, + "logps/rejected": -192.93997192382812, + "loss": 0.3105, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8983439207077026, + "rewards/margins": 1.3224809169769287, + "rewards/rejected": -2.220824956893921, + "step": 1997 + }, + { + "epoch": 0.23, + "learning_rate": 2.337781977087516e-07, + "logits/chosen": -2.2869553565979004, + "logits/rejected": -2.3743093013763428, + "logps/chosen": -316.4087219238281, + "logps/rejected": -177.6245574951172, + "loss": 0.5853, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47578078508377075, + "rewards/margins": 0.6859957575798035, + "rewards/rejected": -1.1617765426635742, + "step": 1998 + }, + { + "epoch": 0.23, + "learning_rate": 2.3374276603283333e-07, + "logits/chosen": -2.9616076946258545, + "logits/rejected": -2.9450843334198, + "logps/chosen": -206.57513427734375, + "logps/rejected": -214.69483947753906, + "loss": 0.5622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8950778245925903, + "rewards/margins": 1.3593403100967407, + "rewards/rejected": -2.254418134689331, + "step": 1999 + }, + { + "epoch": 0.23, + "learning_rate": 2.3370733435691508e-07, + "logits/chosen": -1.951313853263855, + "logits/rejected": -1.8784806728363037, + "logps/chosen": -299.53179931640625, + "logps/rejected": -327.47607421875, + "loss": 0.5171, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5280905961990356, + "rewards/margins": 1.1732994318008423, + "rewards/rejected": -1.701390027999878, + "step": 2000 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -1.735522747039795, + "eval_logits/rejected": -1.734364628791809, + "eval_logps/chosen": -276.79437255859375, + "eval_logps/rejected": -272.12823486328125, + "eval_loss": 0.39564093947410583, + "eval_rewards/accuracies": 0.8275862336158752, + "eval_rewards/chosen": -0.46289071440696716, + "eval_rewards/margins": 1.6492795944213867, + "eval_rewards/rejected": -2.112170457839966, + "eval_runtime": 237.1853, + "eval_samples_per_second": 2.93, + "eval_steps_per_second": 1.467, + "step": 2000 + }, + { + "epoch": 0.23, + "learning_rate": 2.336719026809968e-07, + "logits/chosen": -2.3690707683563232, + "logits/rejected": -2.2364253997802734, + "logps/chosen": -323.9754943847656, + "logps/rejected": -361.447509765625, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6039774417877197, + "rewards/margins": 1.4217994213104248, + "rewards/rejected": -2.0257768630981445, + "step": 2001 + }, + { + "epoch": 0.23, + "learning_rate": 2.3363647100507852e-07, + "logits/chosen": -2.474982500076294, + "logits/rejected": -2.5558054447174072, + "logps/chosen": -322.45361328125, + "logps/rejected": -304.1692199707031, + "loss": 0.3839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8651862144470215, + "rewards/margins": 2.4731554985046387, + "rewards/rejected": -3.33834171295166, + "step": 2002 + }, + { + "epoch": 0.23, + "learning_rate": 2.3360103932916024e-07, + "logits/chosen": -1.9826539754867554, + "logits/rejected": -1.961648941040039, + "logps/chosen": -189.33905029296875, + "logps/rejected": -184.55433654785156, + "loss": 0.2725, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17802608013153076, + "rewards/margins": 2.0691685676574707, + "rewards/rejected": -2.247194766998291, + "step": 2003 + }, + { + "epoch": 0.23, + "learning_rate": 2.33565607653242e-07, + "logits/chosen": -2.3278913497924805, + "logits/rejected": -2.4057881832122803, + "logps/chosen": -411.4052429199219, + "logps/rejected": -282.1052551269531, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6539748907089233, + "rewards/margins": 1.9738472700119019, + "rewards/rejected": -2.6278223991394043, + "step": 2004 + }, + { + "epoch": 0.23, + "learning_rate": 2.3353017597732374e-07, + "logits/chosen": -2.5039446353912354, + "logits/rejected": -2.599365472793579, + "logps/chosen": -254.27488708496094, + "logps/rejected": -146.72772216796875, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8487481474876404, + "rewards/margins": 1.592360019683838, + "rewards/rejected": -2.441108226776123, + "step": 2005 + }, + { + "epoch": 0.23, + "learning_rate": 2.3349474430140546e-07, + "logits/chosen": -2.6036622524261475, + "logits/rejected": -2.564223289489746, + "logps/chosen": -422.2083740234375, + "logps/rejected": -424.9235534667969, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7821944355964661, + "rewards/margins": 1.8510080575942993, + "rewards/rejected": -2.63320255279541, + "step": 2006 + }, + { + "epoch": 0.23, + "learning_rate": 2.3345931262548718e-07, + "logits/chosen": -2.4397385120391846, + "logits/rejected": -2.354215383529663, + "logps/chosen": -186.90576171875, + "logps/rejected": -318.61517333984375, + "loss": 1.0345, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5632215738296509, + "rewards/margins": 1.6802515983581543, + "rewards/rejected": -3.2434730529785156, + "step": 2007 + }, + { + "epoch": 0.23, + "learning_rate": 2.334238809495689e-07, + "logits/chosen": -1.9265174865722656, + "logits/rejected": -2.0805206298828125, + "logps/chosen": -295.6748962402344, + "logps/rejected": -193.3943328857422, + "loss": 0.4987, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7567787766456604, + "rewards/margins": 1.4953749179840088, + "rewards/rejected": -2.2521536350250244, + "step": 2008 + }, + { + "epoch": 0.23, + "learning_rate": 2.3338844927365063e-07, + "logits/chosen": -2.1402738094329834, + "logits/rejected": -2.12782883644104, + "logps/chosen": -223.82821655273438, + "logps/rejected": -219.71551513671875, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41703885793685913, + "rewards/margins": 2.0865871906280518, + "rewards/rejected": -2.5036258697509766, + "step": 2009 + }, + { + "epoch": 0.23, + "learning_rate": 2.3335301759773235e-07, + "logits/chosen": -2.6035897731781006, + "logits/rejected": -2.533144474029541, + "logps/chosen": -185.51358032226562, + "logps/rejected": -318.8678283691406, + "loss": 0.1707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03219519555568695, + "rewards/margins": 3.306462049484253, + "rewards/rejected": -3.3386573791503906, + "step": 2010 + }, + { + "epoch": 0.23, + "learning_rate": 2.3331758592181407e-07, + "logits/chosen": -1.7301537990570068, + "logits/rejected": -1.8537843227386475, + "logps/chosen": -440.43804931640625, + "logps/rejected": -424.9377746582031, + "loss": 0.4584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3460497260093689, + "rewards/margins": 0.6747007369995117, + "rewards/rejected": -1.0207504034042358, + "step": 2011 + }, + { + "epoch": 0.23, + "learning_rate": 2.3328215424589582e-07, + "logits/chosen": -2.189547061920166, + "logits/rejected": -2.338770627975464, + "logps/chosen": -367.4891357421875, + "logps/rejected": -217.9007110595703, + "loss": 0.4429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8467769026756287, + "rewards/margins": 1.5399144887924194, + "rewards/rejected": -2.3866913318634033, + "step": 2012 + }, + { + "epoch": 0.23, + "learning_rate": 2.3324672256997754e-07, + "logits/chosen": -1.6091705560684204, + "logits/rejected": -2.2894623279571533, + "logps/chosen": -495.65777587890625, + "logps/rejected": -186.2374267578125, + "loss": 1.7759, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.099937677383423, + "rewards/margins": 0.2060619592666626, + "rewards/rejected": -3.305999755859375, + "step": 2013 + }, + { + "epoch": 0.23, + "learning_rate": 2.3321129089405927e-07, + "logits/chosen": -2.8091211318969727, + "logits/rejected": -2.8256993293762207, + "logps/chosen": -255.82662963867188, + "logps/rejected": -230.39810180664062, + "loss": 0.4307, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0929681062698364, + "rewards/margins": 1.9111194610595703, + "rewards/rejected": -3.004087448120117, + "step": 2014 + }, + { + "epoch": 0.23, + "learning_rate": 2.33175859218141e-07, + "logits/chosen": -2.4361212253570557, + "logits/rejected": -2.5535519123077393, + "logps/chosen": -201.01461791992188, + "logps/rejected": -214.1597442626953, + "loss": 0.8398, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7802462577819824, + "rewards/margins": 1.396444320678711, + "rewards/rejected": -3.1766905784606934, + "step": 2015 + }, + { + "epoch": 0.23, + "learning_rate": 2.3314042754222276e-07, + "logits/chosen": -2.5939416885375977, + "logits/rejected": -2.6764373779296875, + "logps/chosen": -184.04202270507812, + "logps/rejected": -255.7120361328125, + "loss": 0.205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08808886259794235, + "rewards/margins": 2.2749335765838623, + "rewards/rejected": -2.363022565841675, + "step": 2016 + }, + { + "epoch": 0.23, + "learning_rate": 2.3310499586630449e-07, + "logits/chosen": -2.580514907836914, + "logits/rejected": -2.515012502670288, + "logps/chosen": -160.08331298828125, + "logps/rejected": -161.6158447265625, + "loss": 0.3607, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9722243547439575, + "rewards/margins": 1.1654281616210938, + "rewards/rejected": -2.137652635574341, + "step": 2017 + }, + { + "epoch": 0.23, + "learning_rate": 2.330695641903862e-07, + "logits/chosen": -2.553687572479248, + "logits/rejected": -2.749514579772949, + "logps/chosen": -215.03172302246094, + "logps/rejected": -147.36903381347656, + "loss": 0.3162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38330480456352234, + "rewards/margins": 1.4313088655471802, + "rewards/rejected": -1.8146135807037354, + "step": 2018 + }, + { + "epoch": 0.23, + "learning_rate": 2.3303413251446793e-07, + "logits/chosen": -2.0398356914520264, + "logits/rejected": -2.089282512664795, + "logps/chosen": -247.7024383544922, + "logps/rejected": -266.53082275390625, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3406440019607544, + "rewards/margins": 1.3034788370132446, + "rewards/rejected": -1.6441229581832886, + "step": 2019 + }, + { + "epoch": 0.23, + "learning_rate": 2.3299870083854965e-07, + "logits/chosen": -2.074162721633911, + "logits/rejected": -2.5535104274749756, + "logps/chosen": -314.71136474609375, + "logps/rejected": -236.4280548095703, + "loss": 0.9495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9658845663070679, + "rewards/margins": 0.9747937917709351, + "rewards/rejected": -1.940678358078003, + "step": 2020 + }, + { + "epoch": 0.24, + "learning_rate": 2.3296326916263137e-07, + "logits/chosen": -2.3555169105529785, + "logits/rejected": -2.1334614753723145, + "logps/chosen": -189.9724578857422, + "logps/rejected": -262.4036865234375, + "loss": 0.3364, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4933680295944214, + "rewards/margins": 1.6365783214569092, + "rewards/rejected": -2.129946231842041, + "step": 2021 + }, + { + "epoch": 0.24, + "learning_rate": 2.329278374867131e-07, + "logits/chosen": -2.0900139808654785, + "logits/rejected": -2.155226469039917, + "logps/chosen": -297.7610778808594, + "logps/rejected": -200.39273071289062, + "loss": 0.2205, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5367335081100464, + "rewards/margins": 1.7805781364440918, + "rewards/rejected": -3.3173115253448486, + "step": 2022 + }, + { + "epoch": 0.24, + "learning_rate": 2.3289240581079484e-07, + "logits/chosen": -2.1037867069244385, + "logits/rejected": -2.1060948371887207, + "logps/chosen": -402.76129150390625, + "logps/rejected": -390.22613525390625, + "loss": 0.7194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8463070392608643, + "rewards/margins": 1.1483607292175293, + "rewards/rejected": -1.994667649269104, + "step": 2023 + }, + { + "epoch": 0.24, + "learning_rate": 2.3285697413487657e-07, + "logits/chosen": -2.7792553901672363, + "logits/rejected": -2.5888612270355225, + "logps/chosen": -426.4296875, + "logps/rejected": -374.66754150390625, + "loss": 0.4048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8128154277801514, + "rewards/margins": 1.6679027080535889, + "rewards/rejected": -2.4807181358337402, + "step": 2024 + }, + { + "epoch": 0.24, + "learning_rate": 2.328215424589583e-07, + "logits/chosen": -2.588773250579834, + "logits/rejected": -2.525178909301758, + "logps/chosen": -281.7531433105469, + "logps/rejected": -160.92002868652344, + "loss": 0.4545, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8186485767364502, + "rewards/margins": 1.5059287548065186, + "rewards/rejected": -2.3245773315429688, + "step": 2025 + }, + { + "epoch": 0.24, + "learning_rate": 2.3278611078304e-07, + "logits/chosen": -2.3077287673950195, + "logits/rejected": -2.3561182022094727, + "logps/chosen": -160.13929748535156, + "logps/rejected": -121.38096618652344, + "loss": 0.8238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7729524374008179, + "rewards/margins": 0.37409037351608276, + "rewards/rejected": -1.1470427513122559, + "step": 2026 + }, + { + "epoch": 0.24, + "learning_rate": 2.3275067910712176e-07, + "logits/chosen": -2.769462823867798, + "logits/rejected": -2.6716341972351074, + "logps/chosen": -318.64117431640625, + "logps/rejected": -276.5351257324219, + "loss": 0.7202, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2279462814331055, + "rewards/margins": 0.9949378967285156, + "rewards/rejected": -2.222884178161621, + "step": 2027 + }, + { + "epoch": 0.24, + "learning_rate": 2.327152474312035e-07, + "logits/chosen": -2.302131414413452, + "logits/rejected": -2.2596590518951416, + "logps/chosen": -175.794677734375, + "logps/rejected": -301.22906494140625, + "loss": 0.6166, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6479148864746094, + "rewards/margins": 0.2617035508155823, + "rewards/rejected": -1.9096184968948364, + "step": 2028 + }, + { + "epoch": 0.24, + "learning_rate": 2.3267981575528523e-07, + "logits/chosen": -1.9586384296417236, + "logits/rejected": -2.2462337017059326, + "logps/chosen": -299.5837097167969, + "logps/rejected": -315.59246826171875, + "loss": 0.3702, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28985777497291565, + "rewards/margins": 2.7063097953796387, + "rewards/rejected": -2.9961676597595215, + "step": 2029 + }, + { + "epoch": 0.24, + "learning_rate": 2.3264438407936695e-07, + "logits/chosen": -1.7195850610733032, + "logits/rejected": -1.730041265487671, + "logps/chosen": -349.7305603027344, + "logps/rejected": -293.2488708496094, + "loss": 0.7856, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.515875220298767, + "rewards/margins": 0.4168912172317505, + "rewards/rejected": -1.932766318321228, + "step": 2030 + }, + { + "epoch": 0.24, + "learning_rate": 2.3260895240344867e-07, + "logits/chosen": -2.7275562286376953, + "logits/rejected": -2.449794054031372, + "logps/chosen": -149.19786071777344, + "logps/rejected": -242.50421142578125, + "loss": 0.3971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4693740904331207, + "rewards/margins": 1.6094392538070679, + "rewards/rejected": -2.078813314437866, + "step": 2031 + }, + { + "epoch": 0.24, + "learning_rate": 2.325735207275304e-07, + "logits/chosen": -2.406891345977783, + "logits/rejected": -2.341055154800415, + "logps/chosen": -496.196533203125, + "logps/rejected": -371.0252685546875, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5625265836715698, + "rewards/margins": 3.1403613090515137, + "rewards/rejected": -2.5778346061706543, + "step": 2032 + }, + { + "epoch": 0.24, + "learning_rate": 2.3253808905161212e-07, + "logits/chosen": -2.828106641769409, + "logits/rejected": -2.9514827728271484, + "logps/chosen": -256.3870544433594, + "logps/rejected": -244.3540802001953, + "loss": 0.5676, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.174810528755188, + "rewards/margins": 1.7223913669586182, + "rewards/rejected": -2.8972015380859375, + "step": 2033 + }, + { + "epoch": 0.24, + "learning_rate": 2.3250265737569387e-07, + "logits/chosen": -2.842668056488037, + "logits/rejected": -2.7221367359161377, + "logps/chosen": -243.26437377929688, + "logps/rejected": -200.65142822265625, + "loss": 0.1688, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6657168865203857, + "rewards/margins": 2.4617412090301514, + "rewards/rejected": -3.127458095550537, + "step": 2034 + }, + { + "epoch": 0.24, + "learning_rate": 2.324672256997756e-07, + "logits/chosen": -2.2707204818725586, + "logits/rejected": -2.093201160430908, + "logps/chosen": -214.98446655273438, + "logps/rejected": -341.29180908203125, + "loss": 0.4165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9320346117019653, + "rewards/margins": 1.3567686080932617, + "rewards/rejected": -2.2888031005859375, + "step": 2035 + }, + { + "epoch": 0.24, + "learning_rate": 2.324317940238573e-07, + "logits/chosen": -2.769197702407837, + "logits/rejected": -2.6296226978302, + "logps/chosen": -151.77188110351562, + "logps/rejected": -212.40762329101562, + "loss": 0.369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21906369924545288, + "rewards/margins": 1.6712594032287598, + "rewards/rejected": -1.8903231620788574, + "step": 2036 + }, + { + "epoch": 0.24, + "learning_rate": 2.3239636234793903e-07, + "logits/chosen": -2.212777614593506, + "logits/rejected": -2.278111696243286, + "logps/chosen": -146.70469665527344, + "logps/rejected": -138.28033447265625, + "loss": 0.4649, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22962410748004913, + "rewards/margins": 1.7458806037902832, + "rewards/rejected": -1.975504755973816, + "step": 2037 + }, + { + "epoch": 0.24, + "learning_rate": 2.3236093067202076e-07, + "logits/chosen": -2.291670083999634, + "logits/rejected": -2.3384108543395996, + "logps/chosen": -366.8844299316406, + "logps/rejected": -294.0512390136719, + "loss": 0.1504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8982610702514648, + "rewards/margins": 4.005279541015625, + "rewards/rejected": -4.903540134429932, + "step": 2038 + }, + { + "epoch": 0.24, + "learning_rate": 2.3232549899610253e-07, + "logits/chosen": -1.968712329864502, + "logits/rejected": -2.3144710063934326, + "logps/chosen": -384.4434814453125, + "logps/rejected": -253.3641815185547, + "loss": 0.3642, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9081239700317383, + "rewards/margins": 2.717484951019287, + "rewards/rejected": -3.6256091594696045, + "step": 2039 + }, + { + "epoch": 0.24, + "learning_rate": 2.3229006732018425e-07, + "logits/chosen": -2.510974884033203, + "logits/rejected": -2.609832286834717, + "logps/chosen": -220.684326171875, + "logps/rejected": -263.5408630371094, + "loss": 0.4068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3319597840309143, + "rewards/margins": 2.481935977935791, + "rewards/rejected": -2.8138959407806396, + "step": 2040 + }, + { + "epoch": 0.24, + "learning_rate": 2.3225463564426598e-07, + "logits/chosen": -2.5460615158081055, + "logits/rejected": -2.557543992996216, + "logps/chosen": -360.0933837890625, + "logps/rejected": -213.01080322265625, + "loss": 0.2613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2244347482919693, + "rewards/margins": 1.4214119911193848, + "rewards/rejected": -1.6458466053009033, + "step": 2041 + }, + { + "epoch": 0.24, + "learning_rate": 2.322192039683477e-07, + "logits/chosen": -2.2325141429901123, + "logits/rejected": -2.304518222808838, + "logps/chosen": -110.122314453125, + "logps/rejected": -119.24663543701172, + "loss": 0.2999, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3240876793861389, + "rewards/margins": 1.9563941955566406, + "rewards/rejected": -2.2804818153381348, + "step": 2042 + }, + { + "epoch": 0.24, + "learning_rate": 2.3218377229242942e-07, + "logits/chosen": -2.4074838161468506, + "logits/rejected": -2.5174002647399902, + "logps/chosen": -201.61880493164062, + "logps/rejected": -149.50794982910156, + "loss": 1.0537, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4754530191421509, + "rewards/margins": 0.2362002283334732, + "rewards/rejected": -1.7116531133651733, + "step": 2043 + }, + { + "epoch": 0.24, + "learning_rate": 2.3214834061651114e-07, + "logits/chosen": -2.812570571899414, + "logits/rejected": -2.731611967086792, + "logps/chosen": -143.41311645507812, + "logps/rejected": -195.87518310546875, + "loss": 0.2186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6606496572494507, + "rewards/margins": 2.649125576019287, + "rewards/rejected": -3.309774875640869, + "step": 2044 + }, + { + "epoch": 0.24, + "learning_rate": 2.321129089405929e-07, + "logits/chosen": -2.150832414627075, + "logits/rejected": -2.5207042694091797, + "logps/chosen": -354.78961181640625, + "logps/rejected": -223.37905883789062, + "loss": 0.2641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3215101659297943, + "rewards/margins": 1.7496140003204346, + "rewards/rejected": -2.071124315261841, + "step": 2045 + }, + { + "epoch": 0.24, + "learning_rate": 2.320774772646746e-07, + "logits/chosen": -2.068253993988037, + "logits/rejected": -2.0605204105377197, + "logps/chosen": -510.9864196777344, + "logps/rejected": -365.5048828125, + "loss": 0.7527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7201324701309204, + "rewards/margins": 0.29562777280807495, + "rewards/rejected": -1.0157601833343506, + "step": 2046 + }, + { + "epoch": 0.24, + "learning_rate": 2.3204204558875633e-07, + "logits/chosen": -2.3502566814422607, + "logits/rejected": -2.396901845932007, + "logps/chosen": -319.370361328125, + "logps/rejected": -228.249267578125, + "loss": 0.3779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49296289682388306, + "rewards/margins": 1.6559906005859375, + "rewards/rejected": -2.148953437805176, + "step": 2047 + }, + { + "epoch": 0.24, + "learning_rate": 2.3200661391283806e-07, + "logits/chosen": -2.3833396434783936, + "logits/rejected": -2.4190738201141357, + "logps/chosen": -378.88348388671875, + "logps/rejected": -221.31948852539062, + "loss": 0.439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9592045545578003, + "rewards/margins": 1.101942539215088, + "rewards/rejected": -2.0611469745635986, + "step": 2048 + }, + { + "epoch": 0.24, + "learning_rate": 2.3197118223691978e-07, + "logits/chosen": -1.9409353733062744, + "logits/rejected": -1.886033535003662, + "logps/chosen": -148.40745544433594, + "logps/rejected": -255.3448486328125, + "loss": 0.5596, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3841799199581146, + "rewards/margins": 1.4257230758666992, + "rewards/rejected": -1.8099030256271362, + "step": 2049 + }, + { + "epoch": 0.24, + "learning_rate": 2.319357505610015e-07, + "logits/chosen": -1.6990395784378052, + "logits/rejected": -2.2230329513549805, + "logps/chosen": -297.362548828125, + "logps/rejected": -228.18093872070312, + "loss": 0.7327, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1835384368896484, + "rewards/margins": 1.3047230243682861, + "rewards/rejected": -2.4882614612579346, + "step": 2050 + }, + { + "epoch": 0.24, + "learning_rate": 2.3190031888508328e-07, + "logits/chosen": -2.537649631500244, + "logits/rejected": -2.6452975273132324, + "logps/chosen": -619.759765625, + "logps/rejected": -353.56005859375, + "loss": 0.3772, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49194395542144775, + "rewards/margins": 1.7878797054290771, + "rewards/rejected": -2.2798235416412354, + "step": 2051 + }, + { + "epoch": 0.24, + "learning_rate": 2.31864887209165e-07, + "logits/chosen": -2.764446973800659, + "logits/rejected": -2.6761300563812256, + "logps/chosen": -263.1351623535156, + "logps/rejected": -254.53260803222656, + "loss": 0.5305, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9918043613433838, + "rewards/margins": 1.6356933116912842, + "rewards/rejected": -3.627497673034668, + "step": 2052 + }, + { + "epoch": 0.24, + "learning_rate": 2.3182945553324672e-07, + "logits/chosen": -2.69112229347229, + "logits/rejected": -3.0157575607299805, + "logps/chosen": -408.39666748046875, + "logps/rejected": -309.54595947265625, + "loss": 0.1826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07711735367774963, + "rewards/margins": 2.1510908603668213, + "rewards/rejected": -2.228208065032959, + "step": 2053 + }, + { + "epoch": 0.24, + "learning_rate": 2.3179402385732844e-07, + "logits/chosen": -2.081737518310547, + "logits/rejected": -2.133505344390869, + "logps/chosen": -359.0486755371094, + "logps/rejected": -281.8216857910156, + "loss": 0.7259, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5981441736221313, + "rewards/margins": 0.981544017791748, + "rewards/rejected": -1.579688310623169, + "step": 2054 + }, + { + "epoch": 0.24, + "learning_rate": 2.3175859218141016e-07, + "logits/chosen": -1.7702155113220215, + "logits/rejected": -2.0965635776519775, + "logps/chosen": -402.19921875, + "logps/rejected": -220.8493194580078, + "loss": 0.7826, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4614759683609009, + "rewards/margins": 0.63457852602005, + "rewards/rejected": -2.0960545539855957, + "step": 2055 + }, + { + "epoch": 0.24, + "learning_rate": 2.3172316050549189e-07, + "logits/chosen": -2.2324156761169434, + "logits/rejected": -2.329479932785034, + "logps/chosen": -373.3224182128906, + "logps/rejected": -341.1250305175781, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.242738962173462, + "rewards/margins": 1.1859095096588135, + "rewards/rejected": -2.4286484718322754, + "step": 2056 + }, + { + "epoch": 0.24, + "learning_rate": 2.3168772882957364e-07, + "logits/chosen": -2.249872922897339, + "logits/rejected": -2.153383493423462, + "logps/chosen": -233.1608428955078, + "logps/rejected": -258.5486145019531, + "loss": 0.6108, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.123549222946167, + "rewards/margins": 1.4703047275543213, + "rewards/rejected": -2.5938539505004883, + "step": 2057 + }, + { + "epoch": 0.24, + "learning_rate": 2.3165229715365536e-07, + "logits/chosen": -2.293715000152588, + "logits/rejected": -2.3873379230499268, + "logps/chosen": -162.99310302734375, + "logps/rejected": -145.78494262695312, + "loss": 0.6584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5111904144287109, + "rewards/margins": 0.13250459730625153, + "rewards/rejected": -0.643695056438446, + "step": 2058 + }, + { + "epoch": 0.24, + "learning_rate": 2.3161686547773708e-07, + "logits/chosen": -2.834343910217285, + "logits/rejected": -2.8110439777374268, + "logps/chosen": -252.72042846679688, + "logps/rejected": -183.12646484375, + "loss": 0.5396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23687627911567688, + "rewards/margins": 1.654449701309204, + "rewards/rejected": -1.8913260698318481, + "step": 2059 + }, + { + "epoch": 0.24, + "learning_rate": 2.315814338018188e-07, + "logits/chosen": -2.174685478210449, + "logits/rejected": -2.1113905906677246, + "logps/chosen": -210.0044708251953, + "logps/rejected": -308.68798828125, + "loss": 0.9212, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1340367794036865, + "rewards/margins": 0.7136525511741638, + "rewards/rejected": -1.8476893901824951, + "step": 2060 + }, + { + "epoch": 0.24, + "learning_rate": 2.3154600212590052e-07, + "logits/chosen": -2.350801706314087, + "logits/rejected": -2.3886895179748535, + "logps/chosen": -283.32476806640625, + "logps/rejected": -346.052734375, + "loss": 1.0974, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3154613971710205, + "rewards/margins": -0.07462924718856812, + "rewards/rejected": -1.2408322095870972, + "step": 2061 + }, + { + "epoch": 0.24, + "learning_rate": 2.315105704499823e-07, + "logits/chosen": -2.6257810592651367, + "logits/rejected": -2.564004898071289, + "logps/chosen": -230.0107879638672, + "logps/rejected": -203.06492614746094, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0469653606414795, + "rewards/margins": 1.104737401008606, + "rewards/rejected": -2.151702880859375, + "step": 2062 + }, + { + "epoch": 0.24, + "learning_rate": 2.3147513877406402e-07, + "logits/chosen": -2.4696969985961914, + "logits/rejected": -2.5073649883270264, + "logps/chosen": -111.795654296875, + "logps/rejected": -120.26400756835938, + "loss": 0.5582, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3880670070648193, + "rewards/margins": 0.3678975999355316, + "rewards/rejected": -1.7559646368026733, + "step": 2063 + }, + { + "epoch": 0.24, + "learning_rate": 2.3143970709814574e-07, + "logits/chosen": -2.7043700218200684, + "logits/rejected": -2.7811365127563477, + "logps/chosen": -332.83892822265625, + "logps/rejected": -295.2342529296875, + "loss": 0.2479, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2664511203765869, + "rewards/margins": 2.4638257026672363, + "rewards/rejected": -2.1973745822906494, + "step": 2064 + }, + { + "epoch": 0.24, + "learning_rate": 2.3140427542222747e-07, + "logits/chosen": -2.7371633052825928, + "logits/rejected": -2.7435383796691895, + "logps/chosen": -280.2341003417969, + "logps/rejected": -314.576416015625, + "loss": 0.2323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9098036289215088, + "rewards/margins": 2.62107515335083, + "rewards/rejected": -3.5308785438537598, + "step": 2065 + }, + { + "epoch": 0.24, + "learning_rate": 2.313688437463092e-07, + "logits/chosen": -2.5510506629943848, + "logits/rejected": -2.7809622287750244, + "logps/chosen": -184.58187866210938, + "logps/rejected": -231.81297302246094, + "loss": 0.4834, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46479663252830505, + "rewards/margins": 1.634385108947754, + "rewards/rejected": -2.09918212890625, + "step": 2066 + }, + { + "epoch": 0.24, + "learning_rate": 2.313334120703909e-07, + "logits/chosen": -2.286210536956787, + "logits/rejected": -2.2372682094573975, + "logps/chosen": -380.9281311035156, + "logps/rejected": -415.9924621582031, + "loss": 0.6469, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.122817039489746, + "rewards/margins": 1.3784973621368408, + "rewards/rejected": -2.501314640045166, + "step": 2067 + }, + { + "epoch": 0.24, + "learning_rate": 2.3129798039447266e-07, + "logits/chosen": -1.848660945892334, + "logits/rejected": -2.0682692527770996, + "logps/chosen": -363.060546875, + "logps/rejected": -296.3851623535156, + "loss": 0.5715, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9150069952011108, + "rewards/margins": 0.4359191060066223, + "rewards/rejected": -1.350926160812378, + "step": 2068 + }, + { + "epoch": 0.24, + "learning_rate": 2.3126254871855438e-07, + "logits/chosen": -2.7038397789001465, + "logits/rejected": -2.680751085281372, + "logps/chosen": -261.6009826660156, + "logps/rejected": -305.29888916015625, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40541332960128784, + "rewards/margins": 3.810908317565918, + "rewards/rejected": -4.21632194519043, + "step": 2069 + }, + { + "epoch": 0.24, + "learning_rate": 2.312271170426361e-07, + "logits/chosen": -2.157916307449341, + "logits/rejected": -2.4135901927948, + "logps/chosen": -383.1419372558594, + "logps/rejected": -267.9462890625, + "loss": 0.6058, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.081172227859497, + "rewards/margins": 1.9821442365646362, + "rewards/rejected": -3.0633163452148438, + "step": 2070 + }, + { + "epoch": 0.24, + "learning_rate": 2.3119168536671782e-07, + "logits/chosen": -1.894946575164795, + "logits/rejected": -2.351252794265747, + "logps/chosen": -393.8211669921875, + "logps/rejected": -375.05755615234375, + "loss": 0.3264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8408360481262207, + "rewards/margins": 1.4590222835540771, + "rewards/rejected": -2.299858331680298, + "step": 2071 + }, + { + "epoch": 0.24, + "learning_rate": 2.3115625369079955e-07, + "logits/chosen": -2.423804759979248, + "logits/rejected": -2.4525179862976074, + "logps/chosen": -287.20611572265625, + "logps/rejected": -300.79193115234375, + "loss": 0.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6825218200683594, + "rewards/margins": 1.355987548828125, + "rewards/rejected": -3.0385093688964844, + "step": 2072 + }, + { + "epoch": 0.24, + "learning_rate": 2.3112082201488127e-07, + "logits/chosen": -2.0599029064178467, + "logits/rejected": -2.065988302230835, + "logps/chosen": -741.3494262695312, + "logps/rejected": -731.4830322265625, + "loss": 0.2933, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0429275035858154, + "rewards/margins": 2.9162943363189697, + "rewards/rejected": -3.9592220783233643, + "step": 2073 + }, + { + "epoch": 0.24, + "learning_rate": 2.3108539033896304e-07, + "logits/chosen": -2.25712513923645, + "logits/rejected": -2.23083233833313, + "logps/chosen": -284.7942199707031, + "logps/rejected": -268.1402587890625, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9001434445381165, + "rewards/margins": 2.5516343116760254, + "rewards/rejected": -3.451777696609497, + "step": 2074 + }, + { + "epoch": 0.24, + "learning_rate": 2.3104995866304477e-07, + "logits/chosen": -2.1770951747894287, + "logits/rejected": -2.2381420135498047, + "logps/chosen": -298.89630126953125, + "logps/rejected": -274.75726318359375, + "loss": 0.3309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5474511384963989, + "rewards/margins": 1.3452699184417725, + "rewards/rejected": -1.8927209377288818, + "step": 2075 + }, + { + "epoch": 0.24, + "learning_rate": 2.310145269871265e-07, + "logits/chosen": -2.7789416313171387, + "logits/rejected": -2.640777587890625, + "logps/chosen": -276.1231689453125, + "logps/rejected": -177.1703338623047, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6109340190887451, + "rewards/margins": 3.7640726566314697, + "rewards/rejected": -4.375006675720215, + "step": 2076 + }, + { + "epoch": 0.24, + "learning_rate": 2.309790953112082e-07, + "logits/chosen": -2.224620819091797, + "logits/rejected": -2.350910186767578, + "logps/chosen": -422.27069091796875, + "logps/rejected": -321.5037841796875, + "loss": 0.3775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8151144981384277, + "rewards/margins": 2.2183737754821777, + "rewards/rejected": -3.0334882736206055, + "step": 2077 + }, + { + "epoch": 0.24, + "learning_rate": 2.3094366363528993e-07, + "logits/chosen": -1.8314849138259888, + "logits/rejected": -2.0104141235351562, + "logps/chosen": -375.28900146484375, + "logps/rejected": -277.774169921875, + "loss": 0.6302, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8396937847137451, + "rewards/margins": 1.0412566661834717, + "rewards/rejected": -1.8809503316879272, + "step": 2078 + }, + { + "epoch": 0.24, + "learning_rate": 2.3090823195937168e-07, + "logits/chosen": -2.5636136531829834, + "logits/rejected": -2.265578269958496, + "logps/chosen": -244.00924682617188, + "logps/rejected": -351.1802978515625, + "loss": 0.497, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9172402024269104, + "rewards/margins": 1.0423904657363892, + "rewards/rejected": -1.9596307277679443, + "step": 2079 + }, + { + "epoch": 0.24, + "learning_rate": 2.308728002834534e-07, + "logits/chosen": -2.590925693511963, + "logits/rejected": -2.4717209339141846, + "logps/chosen": -252.8572998046875, + "logps/rejected": -226.3404541015625, + "loss": 0.2072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27060365676879883, + "rewards/margins": 2.2467408180236816, + "rewards/rejected": -2.5173444747924805, + "step": 2080 + }, + { + "epoch": 0.24, + "learning_rate": 2.3083736860753513e-07, + "logits/chosen": -2.6463701725006104, + "logits/rejected": -2.631129741668701, + "logps/chosen": -254.4882049560547, + "logps/rejected": -146.6963348388672, + "loss": 0.3981, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6959709525108337, + "rewards/margins": 1.3422380685806274, + "rewards/rejected": -2.0382089614868164, + "step": 2081 + }, + { + "epoch": 0.24, + "learning_rate": 2.3080193693161685e-07, + "logits/chosen": -1.93552565574646, + "logits/rejected": -2.299076795578003, + "logps/chosen": -249.86318969726562, + "logps/rejected": -191.1592254638672, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.617728590965271, + "rewards/margins": 2.052072525024414, + "rewards/rejected": -2.6698012351989746, + "step": 2082 + }, + { + "epoch": 0.24, + "learning_rate": 2.3076650525569857e-07, + "logits/chosen": -2.4341533184051514, + "logits/rejected": -2.0779552459716797, + "logps/chosen": -448.59710693359375, + "logps/rejected": -466.44964599609375, + "loss": 0.2894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6883320808410645, + "rewards/margins": 1.6192301511764526, + "rewards/rejected": -2.3075623512268066, + "step": 2083 + }, + { + "epoch": 0.24, + "learning_rate": 2.307310735797803e-07, + "logits/chosen": -3.071538209915161, + "logits/rejected": -3.023592233657837, + "logps/chosen": -412.66546630859375, + "logps/rejected": -214.17709350585938, + "loss": 0.9459, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8521295189857483, + "rewards/margins": 1.0258171558380127, + "rewards/rejected": -1.8779468536376953, + "step": 2084 + }, + { + "epoch": 0.24, + "learning_rate": 2.3069564190386201e-07, + "logits/chosen": -2.1012156009674072, + "logits/rejected": -2.3568899631500244, + "logps/chosen": -223.47088623046875, + "logps/rejected": -202.3527069091797, + "loss": 2.0635, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.4038147926330566, + "rewards/margins": -1.3419294357299805, + "rewards/rejected": -1.0618852376937866, + "step": 2085 + }, + { + "epoch": 0.24, + "learning_rate": 2.306602102279438e-07, + "logits/chosen": -2.7428174018859863, + "logits/rejected": -2.579439878463745, + "logps/chosen": -256.74359130859375, + "logps/rejected": -292.8490295410156, + "loss": 0.8754, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.502882957458496, + "rewards/margins": 1.1394391059875488, + "rewards/rejected": -2.642321825027466, + "step": 2086 + }, + { + "epoch": 0.24, + "learning_rate": 2.306247785520255e-07, + "logits/chosen": -2.425887107849121, + "logits/rejected": -2.408629894256592, + "logps/chosen": -243.2975311279297, + "logps/rejected": -291.95831298828125, + "loss": 0.3789, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1353849172592163, + "rewards/margins": 1.7271255254745483, + "rewards/rejected": -2.8625104427337646, + "step": 2087 + }, + { + "epoch": 0.24, + "learning_rate": 2.3058934687610723e-07, + "logits/chosen": -2.4236881732940674, + "logits/rejected": -2.7078330516815186, + "logps/chosen": -253.06715393066406, + "logps/rejected": -166.00115966796875, + "loss": 0.8406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8810558915138245, + "rewards/margins": -0.051469504833221436, + "rewards/rejected": -0.8295864462852478, + "step": 2088 + }, + { + "epoch": 0.24, + "learning_rate": 2.3055391520018895e-07, + "logits/chosen": -2.336369276046753, + "logits/rejected": -2.4930810928344727, + "logps/chosen": -309.11517333984375, + "logps/rejected": -280.1531066894531, + "loss": 0.8765, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0990009307861328, + "rewards/margins": 1.2734899520874023, + "rewards/rejected": -2.372490882873535, + "step": 2089 + }, + { + "epoch": 0.24, + "learning_rate": 2.305184835242707e-07, + "logits/chosen": -2.08042573928833, + "logits/rejected": -2.172767162322998, + "logps/chosen": -380.06939697265625, + "logps/rejected": -305.51788330078125, + "loss": 0.1407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.619519829750061, + "rewards/margins": 2.6680397987365723, + "rewards/rejected": -3.287559747695923, + "step": 2090 + }, + { + "epoch": 0.24, + "learning_rate": 2.3048305184835243e-07, + "logits/chosen": -2.351177215576172, + "logits/rejected": -2.3011081218719482, + "logps/chosen": -266.94195556640625, + "logps/rejected": -361.29876708984375, + "loss": 0.1486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5726664662361145, + "rewards/margins": 2.9676342010498047, + "rewards/rejected": -3.5403010845184326, + "step": 2091 + }, + { + "epoch": 0.24, + "learning_rate": 2.3044762017243415e-07, + "logits/chosen": -1.9984383583068848, + "logits/rejected": -2.188103675842285, + "logps/chosen": -374.37518310546875, + "logps/rejected": -255.70880126953125, + "loss": 0.3173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.048550959676504135, + "rewards/margins": 1.3606481552124023, + "rewards/rejected": -1.4091991186141968, + "step": 2092 + }, + { + "epoch": 0.24, + "learning_rate": 2.3041218849651587e-07, + "logits/chosen": -2.6444292068481445, + "logits/rejected": -2.693577289581299, + "logps/chosen": -358.3814697265625, + "logps/rejected": -341.25885009765625, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.562543511390686, + "rewards/margins": 2.9499905109405518, + "rewards/rejected": -3.5125343799591064, + "step": 2093 + }, + { + "epoch": 0.24, + "learning_rate": 2.303767568205976e-07, + "logits/chosen": -1.6545289754867554, + "logits/rejected": -2.066967248916626, + "logps/chosen": -594.4368286132812, + "logps/rejected": -363.7176208496094, + "loss": 0.637, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6513514518737793, + "rewards/margins": 1.0310990810394287, + "rewards/rejected": -1.6824506521224976, + "step": 2094 + }, + { + "epoch": 0.24, + "learning_rate": 2.3034132514467931e-07, + "logits/chosen": -2.0807619094848633, + "logits/rejected": -1.957549810409546, + "logps/chosen": -184.15469360351562, + "logps/rejected": -188.47393798828125, + "loss": 0.7225, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7239547967910767, + "rewards/margins": 1.2631405591964722, + "rewards/rejected": -1.9870953559875488, + "step": 2095 + }, + { + "epoch": 0.24, + "learning_rate": 2.3030589346876104e-07, + "logits/chosen": -2.6687190532684326, + "logits/rejected": -2.6768581867218018, + "logps/chosen": -399.9473876953125, + "logps/rejected": -389.5585632324219, + "loss": 0.2147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25516819953918457, + "rewards/margins": 2.385730743408203, + "rewards/rejected": -2.640899181365967, + "step": 2096 + }, + { + "epoch": 0.24, + "learning_rate": 2.302704617928428e-07, + "logits/chosen": -2.547635555267334, + "logits/rejected": -2.5019633769989014, + "logps/chosen": -421.4581298828125, + "logps/rejected": -286.7607727050781, + "loss": 0.5365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2664855718612671, + "rewards/margins": 0.6848435401916504, + "rewards/rejected": -0.9513291120529175, + "step": 2097 + }, + { + "epoch": 0.24, + "learning_rate": 2.3023503011692453e-07, + "logits/chosen": -1.9565714597702026, + "logits/rejected": -2.28084659576416, + "logps/chosen": -240.66305541992188, + "logps/rejected": -150.10205078125, + "loss": 0.653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4271824061870575, + "rewards/margins": 0.35893547534942627, + "rewards/rejected": -0.7861179113388062, + "step": 2098 + }, + { + "epoch": 0.24, + "learning_rate": 2.3019959844100626e-07, + "logits/chosen": -2.659397602081299, + "logits/rejected": -2.739529609680176, + "logps/chosen": -242.95175170898438, + "logps/rejected": -167.45614624023438, + "loss": 0.4208, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.819594144821167, + "rewards/margins": 1.6537340879440308, + "rewards/rejected": -2.473328113555908, + "step": 2099 + }, + { + "epoch": 0.24, + "learning_rate": 2.3016416676508798e-07, + "logits/chosen": -1.6965988874435425, + "logits/rejected": -2.2584781646728516, + "logps/chosen": -500.3504943847656, + "logps/rejected": -179.69728088378906, + "loss": 0.3855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19545957446098328, + "rewards/margins": 1.5131888389587402, + "rewards/rejected": -1.708648443222046, + "step": 2100 + }, + { + "epoch": 0.24, + "learning_rate": 2.301287350891697e-07, + "logits/chosen": -2.114398717880249, + "logits/rejected": -2.162353038787842, + "logps/chosen": -525.50634765625, + "logps/rejected": -471.1441345214844, + "loss": 0.6405, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01101897656917572, + "rewards/margins": 1.8904545307159424, + "rewards/rejected": -1.9014735221862793, + "step": 2101 + }, + { + "epoch": 0.24, + "learning_rate": 2.3009330341325145e-07, + "logits/chosen": -2.7504093647003174, + "logits/rejected": -2.8256869316101074, + "logps/chosen": -164.28782653808594, + "logps/rejected": -250.89205932617188, + "loss": 0.1963, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5459045171737671, + "rewards/margins": 2.1295342445373535, + "rewards/rejected": -2.67543888092041, + "step": 2102 + }, + { + "epoch": 0.24, + "learning_rate": 2.3005787173733317e-07, + "logits/chosen": -2.526620388031006, + "logits/rejected": -2.308732032775879, + "logps/chosen": -181.3770294189453, + "logps/rejected": -301.6756286621094, + "loss": 0.3151, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8538342714309692, + "rewards/margins": 2.393160343170166, + "rewards/rejected": -3.2469944953918457, + "step": 2103 + }, + { + "epoch": 0.24, + "learning_rate": 2.300224400614149e-07, + "logits/chosen": -1.8640164136886597, + "logits/rejected": -2.5301032066345215, + "logps/chosen": -557.3271484375, + "logps/rejected": -228.86883544921875, + "loss": 0.6514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7960251569747925, + "rewards/margins": 0.8735105991363525, + "rewards/rejected": -1.6695358753204346, + "step": 2104 + }, + { + "epoch": 0.24, + "learning_rate": 2.2998700838549661e-07, + "logits/chosen": -2.514897346496582, + "logits/rejected": -2.7236521244049072, + "logps/chosen": -417.3526611328125, + "logps/rejected": -199.49081420898438, + "loss": 16.4606, + "rewards/accuracies": 0.625, + "rewards/chosen": -17.218109130859375, + "rewards/margins": -14.884782791137695, + "rewards/rejected": -2.3333230018615723, + "step": 2105 + }, + { + "epoch": 0.24, + "learning_rate": 2.2995157670957834e-07, + "logits/chosen": -2.3351211547851562, + "logits/rejected": -2.4225146770477295, + "logps/chosen": -189.0, + "logps/rejected": -291.29022216796875, + "loss": 0.378, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8758213520050049, + "rewards/margins": 2.2854561805725098, + "rewards/rejected": -3.1612775325775146, + "step": 2106 + }, + { + "epoch": 0.25, + "learning_rate": 2.2991614503366006e-07, + "logits/chosen": -1.9343101978302002, + "logits/rejected": -2.377197742462158, + "logps/chosen": -209.54122924804688, + "logps/rejected": -112.00872039794922, + "loss": 0.395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38213610649108887, + "rewards/margins": 1.0223580598831177, + "rewards/rejected": -1.4044941663742065, + "step": 2107 + }, + { + "epoch": 0.25, + "learning_rate": 2.298807133577418e-07, + "logits/chosen": -1.549855351448059, + "logits/rejected": -1.805354118347168, + "logps/chosen": -521.0791015625, + "logps/rejected": -362.57257080078125, + "loss": 0.6282, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.171830415725708, + "rewards/margins": 0.8442789316177368, + "rewards/rejected": -2.0161094665527344, + "step": 2108 + }, + { + "epoch": 0.25, + "learning_rate": 2.2984528168182356e-07, + "logits/chosen": -1.7286882400512695, + "logits/rejected": -1.9006962776184082, + "logps/chosen": -503.29583740234375, + "logps/rejected": -304.79986572265625, + "loss": 0.8084, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1845356225967407, + "rewards/margins": 0.34847450256347656, + "rewards/rejected": -1.5330100059509277, + "step": 2109 + }, + { + "epoch": 0.25, + "learning_rate": 2.2980985000590528e-07, + "logits/chosen": -2.5560271739959717, + "logits/rejected": -2.4566686153411865, + "logps/chosen": -328.2701416015625, + "logps/rejected": -252.1268768310547, + "loss": 0.3875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1438974142074585, + "rewards/margins": 1.2961591482162476, + "rewards/rejected": -2.440056324005127, + "step": 2110 + }, + { + "epoch": 0.25, + "learning_rate": 2.29774418329987e-07, + "logits/chosen": -2.1737284660339355, + "logits/rejected": -2.262253761291504, + "logps/chosen": -204.8711395263672, + "logps/rejected": -211.63177490234375, + "loss": 0.4168, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18141092360019684, + "rewards/margins": 1.2299103736877441, + "rewards/rejected": -1.0484994649887085, + "step": 2111 + }, + { + "epoch": 0.25, + "learning_rate": 2.2973898665406872e-07, + "logits/chosen": -2.201474189758301, + "logits/rejected": -2.180999755859375, + "logps/chosen": -381.7806091308594, + "logps/rejected": -644.9986572265625, + "loss": 0.0948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7228696942329407, + "rewards/margins": 3.134190797805786, + "rewards/rejected": -3.857060432434082, + "step": 2112 + }, + { + "epoch": 0.25, + "learning_rate": 2.2970355497815047e-07, + "logits/chosen": -2.541313886642456, + "logits/rejected": -2.6406726837158203, + "logps/chosen": -104.00942993164062, + "logps/rejected": -187.33856201171875, + "loss": 0.6473, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.075761318206787, + "rewards/margins": 1.0872206687927246, + "rewards/rejected": -2.1629819869995117, + "step": 2113 + }, + { + "epoch": 0.25, + "learning_rate": 2.296681233022322e-07, + "logits/chosen": -2.585556983947754, + "logits/rejected": -2.6896328926086426, + "logps/chosen": -312.0445556640625, + "logps/rejected": -217.39840698242188, + "loss": 0.5744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7982922792434692, + "rewards/margins": 2.5577845573425293, + "rewards/rejected": -3.356076717376709, + "step": 2114 + }, + { + "epoch": 0.25, + "learning_rate": 2.2963269162631392e-07, + "logits/chosen": -2.1246178150177, + "logits/rejected": -2.2158761024475098, + "logps/chosen": -341.9050598144531, + "logps/rejected": -263.41668701171875, + "loss": 0.3931, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49661314487457275, + "rewards/margins": 1.7788549661636353, + "rewards/rejected": -2.275468111038208, + "step": 2115 + }, + { + "epoch": 0.25, + "learning_rate": 2.2959725995039564e-07, + "logits/chosen": -2.5187501907348633, + "logits/rejected": -2.4576497077941895, + "logps/chosen": -258.60784912109375, + "logps/rejected": -215.1470947265625, + "loss": 0.2985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3413189947605133, + "rewards/margins": 1.5360360145568848, + "rewards/rejected": -1.8773548603057861, + "step": 2116 + }, + { + "epoch": 0.25, + "learning_rate": 2.2956182827447736e-07, + "logits/chosen": -2.502315044403076, + "logits/rejected": -2.363102436065674, + "logps/chosen": -171.89984130859375, + "logps/rejected": -247.88565063476562, + "loss": 0.5386, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1271822452545166, + "rewards/margins": 1.1379424333572388, + "rewards/rejected": -2.265124559402466, + "step": 2117 + }, + { + "epoch": 0.25, + "learning_rate": 2.2952639659855908e-07, + "logits/chosen": -2.2036654949188232, + "logits/rejected": -2.236550807952881, + "logps/chosen": -230.99203491210938, + "logps/rejected": -237.07241821289062, + "loss": 0.4014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44174614548683167, + "rewards/margins": 1.165037751197815, + "rewards/rejected": -1.6067838668823242, + "step": 2118 + }, + { + "epoch": 0.25, + "learning_rate": 2.2949096492264083e-07, + "logits/chosen": -2.558526039123535, + "logits/rejected": -2.632307291030884, + "logps/chosen": -205.89813232421875, + "logps/rejected": -215.0721435546875, + "loss": 1.8303, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.262101888656616, + "rewards/margins": 0.5201675891876221, + "rewards/rejected": -2.7822694778442383, + "step": 2119 + }, + { + "epoch": 0.25, + "learning_rate": 2.2945553324672255e-07, + "logits/chosen": -1.5845973491668701, + "logits/rejected": -1.8394256830215454, + "logps/chosen": -331.51104736328125, + "logps/rejected": -358.0188293457031, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6169475317001343, + "rewards/margins": 0.8839913010597229, + "rewards/rejected": -1.5009386539459229, + "step": 2120 + }, + { + "epoch": 0.25, + "learning_rate": 2.294201015708043e-07, + "logits/chosen": -2.456845998764038, + "logits/rejected": -2.316467761993408, + "logps/chosen": -273.5861511230469, + "logps/rejected": -349.6385498046875, + "loss": 0.2059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1065634489059448, + "rewards/margins": 1.7222654819488525, + "rewards/rejected": -2.828828811645508, + "step": 2121 + }, + { + "epoch": 0.25, + "learning_rate": 2.2938466989488602e-07, + "logits/chosen": -2.3535051345825195, + "logits/rejected": -2.4179320335388184, + "logps/chosen": -281.8887939453125, + "logps/rejected": -339.3114929199219, + "loss": 0.8535, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4515576362609863, + "rewards/margins": 0.47012099623680115, + "rewards/rejected": -1.9216787815093994, + "step": 2122 + }, + { + "epoch": 0.25, + "learning_rate": 2.2934923821896775e-07, + "logits/chosen": -2.4629600048065186, + "logits/rejected": -2.5789308547973633, + "logps/chosen": -364.0344543457031, + "logps/rejected": -281.83563232421875, + "loss": 0.2476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6804347038269043, + "rewards/margins": 2.183723211288452, + "rewards/rejected": -2.8641576766967773, + "step": 2123 + }, + { + "epoch": 0.25, + "learning_rate": 2.293138065430495e-07, + "logits/chosen": -2.014516830444336, + "logits/rejected": -2.265544891357422, + "logps/chosen": -426.75164794921875, + "logps/rejected": -272.9317626953125, + "loss": 0.4549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9944907426834106, + "rewards/margins": 0.7225480675697327, + "rewards/rejected": -1.7170387506484985, + "step": 2124 + }, + { + "epoch": 0.25, + "learning_rate": 2.2927837486713122e-07, + "logits/chosen": -1.9910438060760498, + "logits/rejected": -2.2715377807617188, + "logps/chosen": -165.43820190429688, + "logps/rejected": -150.1271209716797, + "loss": 0.276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31181901693344116, + "rewards/margins": 1.5795241594314575, + "rewards/rejected": -1.891343355178833, + "step": 2125 + }, + { + "epoch": 0.25, + "learning_rate": 2.2924294319121294e-07, + "logits/chosen": -2.6272196769714355, + "logits/rejected": -2.5384135246276855, + "logps/chosen": -145.94424438476562, + "logps/rejected": -256.4827575683594, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4750710725784302, + "rewards/margins": 1.81942880153656, + "rewards/rejected": -2.2944998741149902, + "step": 2126 + }, + { + "epoch": 0.25, + "learning_rate": 2.2920751151529466e-07, + "logits/chosen": -2.496472120285034, + "logits/rejected": -2.3808064460754395, + "logps/chosen": -330.91802978515625, + "logps/rejected": -295.9268798828125, + "loss": 0.4974, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1536391973495483, + "rewards/margins": 1.3214571475982666, + "rewards/rejected": -2.4750962257385254, + "step": 2127 + }, + { + "epoch": 0.25, + "learning_rate": 2.2917207983937638e-07, + "logits/chosen": -2.190812826156616, + "logits/rejected": -2.4244284629821777, + "logps/chosen": -338.1206359863281, + "logps/rejected": -233.25277709960938, + "loss": 0.4055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4596202075481415, + "rewards/margins": 1.4400889873504639, + "rewards/rejected": -1.8997092247009277, + "step": 2128 + }, + { + "epoch": 0.25, + "learning_rate": 2.291366481634581e-07, + "logits/chosen": -2.2277932167053223, + "logits/rejected": -2.407660484313965, + "logps/chosen": -304.5791015625, + "logps/rejected": -300.0853271484375, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43349504470825195, + "rewards/margins": 2.2916736602783203, + "rewards/rejected": -2.7251687049865723, + "step": 2129 + }, + { + "epoch": 0.25, + "learning_rate": 2.2910121648753983e-07, + "logits/chosen": -2.237928867340088, + "logits/rejected": -2.6818368434906006, + "logps/chosen": -375.8564758300781, + "logps/rejected": -240.56015014648438, + "loss": 0.2772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.835185170173645, + "rewards/margins": 2.009805202484131, + "rewards/rejected": -2.8449904918670654, + "step": 2130 + }, + { + "epoch": 0.25, + "learning_rate": 2.2906578481162158e-07, + "logits/chosen": -1.8794376850128174, + "logits/rejected": -2.2117056846618652, + "logps/chosen": -414.12359619140625, + "logps/rejected": -203.04776000976562, + "loss": 0.4977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44557681679725647, + "rewards/margins": 1.6291123628616333, + "rewards/rejected": -2.0746891498565674, + "step": 2131 + }, + { + "epoch": 0.25, + "learning_rate": 2.2903035313570332e-07, + "logits/chosen": -2.604048252105713, + "logits/rejected": -2.703619956970215, + "logps/chosen": -168.92449951171875, + "logps/rejected": -244.16683959960938, + "loss": 0.4155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7219383716583252, + "rewards/margins": 1.4182631969451904, + "rewards/rejected": -2.1402015686035156, + "step": 2132 + }, + { + "epoch": 0.25, + "learning_rate": 2.2899492145978505e-07, + "logits/chosen": -2.058744430541992, + "logits/rejected": -2.150193452835083, + "logps/chosen": -228.8919677734375, + "logps/rejected": -253.7061004638672, + "loss": 0.2413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9216313362121582, + "rewards/margins": 1.5907222032546997, + "rewards/rejected": -2.5123536586761475, + "step": 2133 + }, + { + "epoch": 0.25, + "learning_rate": 2.2895948978386677e-07, + "logits/chosen": -2.0887513160705566, + "logits/rejected": -1.9864083528518677, + "logps/chosen": -347.41241455078125, + "logps/rejected": -340.7961120605469, + "loss": 0.4651, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.927433967590332, + "rewards/margins": 2.149841547012329, + "rewards/rejected": -3.0772757530212402, + "step": 2134 + }, + { + "epoch": 0.25, + "learning_rate": 2.289240581079485e-07, + "logits/chosen": -2.3944852352142334, + "logits/rejected": -2.4618420600891113, + "logps/chosen": -307.84527587890625, + "logps/rejected": -362.5429992675781, + "loss": 0.2097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24313297867774963, + "rewards/margins": 3.026973009109497, + "rewards/rejected": -3.2701058387756348, + "step": 2135 + }, + { + "epoch": 0.25, + "learning_rate": 2.2888862643203024e-07, + "logits/chosen": -2.4910762310028076, + "logits/rejected": -2.3431849479675293, + "logps/chosen": -397.3056945800781, + "logps/rejected": -358.9815368652344, + "loss": 0.8569, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4302827715873718, + "rewards/margins": 1.3840219974517822, + "rewards/rejected": -1.8143047094345093, + "step": 2136 + }, + { + "epoch": 0.25, + "learning_rate": 2.2885319475611196e-07, + "logits/chosen": -2.7177534103393555, + "logits/rejected": -2.8236515522003174, + "logps/chosen": -166.70004272460938, + "logps/rejected": -251.80120849609375, + "loss": 0.3298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5478538274765015, + "rewards/margins": 2.876509666442871, + "rewards/rejected": -3.424363613128662, + "step": 2137 + }, + { + "epoch": 0.25, + "learning_rate": 2.2881776308019368e-07, + "logits/chosen": -2.3688271045684814, + "logits/rejected": -2.226384162902832, + "logps/chosen": -386.25531005859375, + "logps/rejected": -346.11669921875, + "loss": 0.4517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14075793325901031, + "rewards/margins": 2.106947183609009, + "rewards/rejected": -2.2477052211761475, + "step": 2138 + }, + { + "epoch": 0.25, + "learning_rate": 2.287823314042754e-07, + "logits/chosen": -2.571503162384033, + "logits/rejected": -2.8303749561309814, + "logps/chosen": -496.48614501953125, + "logps/rejected": -315.65814208984375, + "loss": 0.3743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15027162432670593, + "rewards/margins": 1.311789631843567, + "rewards/rejected": -1.4620612859725952, + "step": 2139 + }, + { + "epoch": 0.25, + "learning_rate": 2.2874689972835713e-07, + "logits/chosen": -2.4124035835266113, + "logits/rejected": -2.544478416442871, + "logps/chosen": -237.2946014404297, + "logps/rejected": -255.879638671875, + "loss": 0.6711, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.050889253616333, + "rewards/margins": 1.1060473918914795, + "rewards/rejected": -2.1569366455078125, + "step": 2140 + }, + { + "epoch": 0.25, + "learning_rate": 2.2871146805243885e-07, + "logits/chosen": -2.790017604827881, + "logits/rejected": -2.8436005115509033, + "logps/chosen": -129.85610961914062, + "logps/rejected": -243.6861572265625, + "loss": 0.3345, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.023372262716293335, + "rewards/margins": 2.127570629119873, + "rewards/rejected": -2.150942802429199, + "step": 2141 + }, + { + "epoch": 0.25, + "learning_rate": 2.286760363765206e-07, + "logits/chosen": -2.323880672454834, + "logits/rejected": -2.3414485454559326, + "logps/chosen": -150.3916015625, + "logps/rejected": -154.37591552734375, + "loss": 0.4648, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2354729175567627, + "rewards/margins": 1.809091567993164, + "rewards/rejected": -3.0445644855499268, + "step": 2142 + }, + { + "epoch": 0.25, + "learning_rate": 2.2864060470060232e-07, + "logits/chosen": -2.4641032218933105, + "logits/rejected": -2.1864349842071533, + "logps/chosen": -172.58273315429688, + "logps/rejected": -237.34593200683594, + "loss": 0.6329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.867960512638092, + "rewards/margins": 1.3109456300735474, + "rewards/rejected": -2.178906202316284, + "step": 2143 + }, + { + "epoch": 0.25, + "learning_rate": 2.2860517302468407e-07, + "logits/chosen": -2.2381465435028076, + "logits/rejected": -2.1028833389282227, + "logps/chosen": -515.9766845703125, + "logps/rejected": -384.5388488769531, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.264719009399414, + "rewards/margins": 3.6574082374572754, + "rewards/rejected": -4.922127723693848, + "step": 2144 + }, + { + "epoch": 0.25, + "learning_rate": 2.285697413487658e-07, + "logits/chosen": -2.3454036712646484, + "logits/rejected": -2.3121447563171387, + "logps/chosen": -355.50030517578125, + "logps/rejected": -258.6581726074219, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17693877220153809, + "rewards/margins": 1.9134660959243774, + "rewards/rejected": -2.090404987335205, + "step": 2145 + }, + { + "epoch": 0.25, + "learning_rate": 2.285343096728475e-07, + "logits/chosen": -2.735893726348877, + "logits/rejected": -2.8115148544311523, + "logps/chosen": -192.97540283203125, + "logps/rejected": -150.6136932373047, + "loss": 0.3932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5548878908157349, + "rewards/margins": 1.6299331188201904, + "rewards/rejected": -2.184821128845215, + "step": 2146 + }, + { + "epoch": 0.25, + "learning_rate": 2.2849887799692926e-07, + "logits/chosen": -2.314082622528076, + "logits/rejected": -2.6197214126586914, + "logps/chosen": -377.728759765625, + "logps/rejected": -231.7926025390625, + "loss": 0.2437, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8115788698196411, + "rewards/margins": 1.5890796184539795, + "rewards/rejected": -2.40065860748291, + "step": 2147 + }, + { + "epoch": 0.25, + "learning_rate": 2.2846344632101098e-07, + "logits/chosen": -1.670051097869873, + "logits/rejected": -2.135693073272705, + "logps/chosen": -306.4853515625, + "logps/rejected": -270.49725341796875, + "loss": 0.7444, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.326229453086853, + "rewards/margins": 0.1724797934293747, + "rewards/rejected": -1.4987092018127441, + "step": 2148 + }, + { + "epoch": 0.25, + "learning_rate": 2.284280146450927e-07, + "logits/chosen": -2.5106849670410156, + "logits/rejected": -2.512481451034546, + "logps/chosen": -239.30255126953125, + "logps/rejected": -186.81539916992188, + "loss": 0.2753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5068041086196899, + "rewards/margins": 1.5255060195922852, + "rewards/rejected": -2.0323102474212646, + "step": 2149 + }, + { + "epoch": 0.25, + "learning_rate": 2.2839258296917443e-07, + "logits/chosen": -2.034332752227783, + "logits/rejected": -2.2236948013305664, + "logps/chosen": -417.65570068359375, + "logps/rejected": -199.47605895996094, + "loss": 0.3308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6438850164413452, + "rewards/margins": 1.8943629264831543, + "rewards/rejected": -2.538248062133789, + "step": 2150 + }, + { + "epoch": 0.25, + "learning_rate": 2.2835715129325615e-07, + "logits/chosen": -2.150095224380493, + "logits/rejected": -2.4181201457977295, + "logps/chosen": -316.0111083984375, + "logps/rejected": -291.03912353515625, + "loss": 0.7188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8399404883384705, + "rewards/margins": 2.407923698425293, + "rewards/rejected": -3.247864246368408, + "step": 2151 + }, + { + "epoch": 0.25, + "learning_rate": 2.2832171961733787e-07, + "logits/chosen": -2.6252546310424805, + "logits/rejected": -2.8130970001220703, + "logps/chosen": -291.6003723144531, + "logps/rejected": -299.0618591308594, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3494405746459961, + "rewards/margins": 1.6680450439453125, + "rewards/rejected": -2.0174856185913086, + "step": 2152 + }, + { + "epoch": 0.25, + "learning_rate": 2.2828628794141962e-07, + "logits/chosen": -2.400312662124634, + "logits/rejected": -2.5134358406066895, + "logps/chosen": -344.0612487792969, + "logps/rejected": -244.72296142578125, + "loss": 1.0196, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1275907754898071, + "rewards/margins": 1.1151525974273682, + "rewards/rejected": -2.2427432537078857, + "step": 2153 + }, + { + "epoch": 0.25, + "learning_rate": 2.2825085626550134e-07, + "logits/chosen": -2.3547616004943848, + "logits/rejected": -2.119281053543091, + "logps/chosen": -331.4162292480469, + "logps/rejected": -289.97186279296875, + "loss": 0.5657, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0001826286315918, + "rewards/margins": 1.0805103778839111, + "rewards/rejected": -2.080693006515503, + "step": 2154 + }, + { + "epoch": 0.25, + "learning_rate": 2.2821542458958307e-07, + "logits/chosen": -2.3150649070739746, + "logits/rejected": -2.1082205772399902, + "logps/chosen": -232.9322052001953, + "logps/rejected": -260.71600341796875, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1300795078277588, + "rewards/margins": 2.1710898876190186, + "rewards/rejected": -3.3011693954467773, + "step": 2155 + }, + { + "epoch": 0.25, + "learning_rate": 2.2817999291366481e-07, + "logits/chosen": -2.336153507232666, + "logits/rejected": -2.715815544128418, + "logps/chosen": -723.7492065429688, + "logps/rejected": -231.21392822265625, + "loss": 0.5667, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0666197538375854, + "rewards/margins": 0.7664984464645386, + "rewards/rejected": -1.8331180810928345, + "step": 2156 + }, + { + "epoch": 0.25, + "learning_rate": 2.2814456123774654e-07, + "logits/chosen": -2.0084612369537354, + "logits/rejected": -1.59686279296875, + "logps/chosen": -285.2032165527344, + "logps/rejected": -362.973388671875, + "loss": 0.5674, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6827245354652405, + "rewards/margins": 0.9338477849960327, + "rewards/rejected": -1.6165724992752075, + "step": 2157 + }, + { + "epoch": 0.25, + "learning_rate": 2.2810912956182828e-07, + "logits/chosen": -2.693225383758545, + "logits/rejected": -2.6949892044067383, + "logps/chosen": -332.5247497558594, + "logps/rejected": -278.5345458984375, + "loss": 0.3053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3524531424045563, + "rewards/margins": 1.9375313520431519, + "rewards/rejected": -2.289984703063965, + "step": 2158 + }, + { + "epoch": 0.25, + "learning_rate": 2.2807369788591e-07, + "logits/chosen": -2.1743569374084473, + "logits/rejected": -2.186223030090332, + "logps/chosen": -249.9532470703125, + "logps/rejected": -356.5863037109375, + "loss": 0.2661, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1279939413070679, + "rewards/margins": 2.0470759868621826, + "rewards/rejected": -3.175069570541382, + "step": 2159 + }, + { + "epoch": 0.25, + "learning_rate": 2.2803826620999173e-07, + "logits/chosen": -2.0684783458709717, + "logits/rejected": -2.278337001800537, + "logps/chosen": -216.55099487304688, + "logps/rejected": -205.36614990234375, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8345708250999451, + "rewards/margins": 1.2827980518341064, + "rewards/rejected": -2.117368698120117, + "step": 2160 + }, + { + "epoch": 0.25, + "learning_rate": 2.2800283453407345e-07, + "logits/chosen": -1.887197732925415, + "logits/rejected": -2.0081796646118164, + "logps/chosen": -323.2264404296875, + "logps/rejected": -244.2742919921875, + "loss": 1.0444, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0651228427886963, + "rewards/margins": -0.26668572425842285, + "rewards/rejected": -0.7984371781349182, + "step": 2161 + }, + { + "epoch": 0.25, + "learning_rate": 2.2796740285815517e-07, + "logits/chosen": -2.2592294216156006, + "logits/rejected": -2.133533239364624, + "logps/chosen": -255.3863525390625, + "logps/rejected": -273.91229248046875, + "loss": 0.4766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7411010265350342, + "rewards/margins": 1.3481422662734985, + "rewards/rejected": -2.0892434120178223, + "step": 2162 + }, + { + "epoch": 0.25, + "learning_rate": 2.279319711822369e-07, + "logits/chosen": -2.39513897895813, + "logits/rejected": -2.6418890953063965, + "logps/chosen": -387.3426818847656, + "logps/rejected": -211.67279052734375, + "loss": 0.3396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7786517143249512, + "rewards/margins": 2.430246353149414, + "rewards/rejected": -3.2088980674743652, + "step": 2163 + }, + { + "epoch": 0.25, + "learning_rate": 2.2789653950631862e-07, + "logits/chosen": -2.5594708919525146, + "logits/rejected": -2.400991916656494, + "logps/chosen": -305.7545471191406, + "logps/rejected": -338.04656982421875, + "loss": 0.2144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8824601769447327, + "rewards/margins": 2.3318915367126465, + "rewards/rejected": -3.2143516540527344, + "step": 2164 + }, + { + "epoch": 0.25, + "learning_rate": 2.2786110783040037e-07, + "logits/chosen": -2.7051806449890137, + "logits/rejected": -2.6197474002838135, + "logps/chosen": -152.98471069335938, + "logps/rejected": -351.36248779296875, + "loss": 0.2955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.303146094083786, + "rewards/margins": 2.4990811347961426, + "rewards/rejected": -2.80222749710083, + "step": 2165 + }, + { + "epoch": 0.25, + "learning_rate": 2.278256761544821e-07, + "logits/chosen": -2.1872992515563965, + "logits/rejected": -2.317622184753418, + "logps/chosen": -378.1176452636719, + "logps/rejected": -418.0537414550781, + "loss": 0.5468, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20752966403961182, + "rewards/margins": 1.941022276878357, + "rewards/rejected": -2.1485519409179688, + "step": 2166 + }, + { + "epoch": 0.25, + "learning_rate": 2.2779024447856384e-07, + "logits/chosen": -2.4930427074432373, + "logits/rejected": -2.4171035289764404, + "logps/chosen": -235.86534118652344, + "logps/rejected": -290.5405578613281, + "loss": 0.2915, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18631824851036072, + "rewards/margins": 3.558704376220703, + "rewards/rejected": -3.7450222969055176, + "step": 2167 + }, + { + "epoch": 0.25, + "learning_rate": 2.2775481280264556e-07, + "logits/chosen": -2.1766223907470703, + "logits/rejected": -2.381653308868408, + "logps/chosen": -334.9700012207031, + "logps/rejected": -228.99057006835938, + "loss": 0.4758, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0367393493652344, + "rewards/margins": 0.8552526235580444, + "rewards/rejected": -1.8919920921325684, + "step": 2168 + }, + { + "epoch": 0.25, + "learning_rate": 2.277193811267273e-07, + "logits/chosen": -2.0178306102752686, + "logits/rejected": -2.189450740814209, + "logps/chosen": -275.6355895996094, + "logps/rejected": -281.29693603515625, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7972977757453918, + "rewards/margins": 1.1546599864959717, + "rewards/rejected": -1.9519578218460083, + "step": 2169 + }, + { + "epoch": 0.25, + "learning_rate": 2.2768394945080903e-07, + "logits/chosen": -2.2734668254852295, + "logits/rejected": -2.4732158184051514, + "logps/chosen": -187.61366271972656, + "logps/rejected": -185.02780151367188, + "loss": 0.9356, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1494472026824951, + "rewards/margins": 1.1912405490875244, + "rewards/rejected": -2.3406877517700195, + "step": 2170 + }, + { + "epoch": 0.25, + "learning_rate": 2.2764851777489075e-07, + "logits/chosen": -2.298065662384033, + "logits/rejected": -2.368726968765259, + "logps/chosen": -233.78732299804688, + "logps/rejected": -232.2427215576172, + "loss": 0.7904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7596050500869751, + "rewards/margins": 0.35640284419059753, + "rewards/rejected": -1.1160078048706055, + "step": 2171 + }, + { + "epoch": 0.25, + "learning_rate": 2.2761308609897247e-07, + "logits/chosen": -1.6020190715789795, + "logits/rejected": -2.079550266265869, + "logps/chosen": -403.8466796875, + "logps/rejected": -250.55337524414062, + "loss": 0.4163, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16269069910049438, + "rewards/margins": 0.8778970241546631, + "rewards/rejected": -1.0405877828598022, + "step": 2172 + }, + { + "epoch": 0.25, + "learning_rate": 2.275776544230542e-07, + "logits/chosen": -2.381019353866577, + "logits/rejected": -2.465379476547241, + "logps/chosen": -282.2665100097656, + "logps/rejected": -376.0032958984375, + "loss": 0.4417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12151315808296204, + "rewards/margins": 1.1937079429626465, + "rewards/rejected": -1.3152210712432861, + "step": 2173 + }, + { + "epoch": 0.25, + "learning_rate": 2.2754222274713592e-07, + "logits/chosen": -1.6910005807876587, + "logits/rejected": -1.7976192235946655, + "logps/chosen": -452.1587219238281, + "logps/rejected": -310.80059814453125, + "loss": 0.4499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7068763375282288, + "rewards/margins": 1.420177936553955, + "rewards/rejected": -2.127054214477539, + "step": 2174 + }, + { + "epoch": 0.25, + "learning_rate": 2.2750679107121764e-07, + "logits/chosen": -2.0479683876037598, + "logits/rejected": -1.8563112020492554, + "logps/chosen": -308.58099365234375, + "logps/rejected": -300.50927734375, + "loss": 0.3251, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0412843227386475, + "rewards/margins": 1.8914991617202759, + "rewards/rejected": -2.932783603668213, + "step": 2175 + }, + { + "epoch": 0.25, + "learning_rate": 2.274713593952994e-07, + "logits/chosen": -2.4977524280548096, + "logits/rejected": -2.725266695022583, + "logps/chosen": -316.5888366699219, + "logps/rejected": -230.71267700195312, + "loss": 0.4248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4561302661895752, + "rewards/margins": 1.786384105682373, + "rewards/rejected": -2.2425143718719482, + "step": 2176 + }, + { + "epoch": 0.25, + "learning_rate": 2.274359277193811e-07, + "logits/chosen": -2.2812459468841553, + "logits/rejected": -2.227173328399658, + "logps/chosen": -202.62049865722656, + "logps/rejected": -277.80438232421875, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9735349416732788, + "rewards/margins": 1.0427041053771973, + "rewards/rejected": -2.0162389278411865, + "step": 2177 + }, + { + "epoch": 0.25, + "learning_rate": 2.2740049604346283e-07, + "logits/chosen": -2.3988869190216064, + "logits/rejected": -2.5430619716644287, + "logps/chosen": -350.46612548828125, + "logps/rejected": -285.5378723144531, + "loss": 0.3341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5103209018707275, + "rewards/margins": 1.8445733785629272, + "rewards/rejected": -2.3548941612243652, + "step": 2178 + }, + { + "epoch": 0.25, + "learning_rate": 2.2736506436754458e-07, + "logits/chosen": -2.0520081520080566, + "logits/rejected": -2.3029398918151855, + "logps/chosen": -203.90997314453125, + "logps/rejected": -132.52655029296875, + "loss": 0.4222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7518880367279053, + "rewards/margins": 1.1252784729003906, + "rewards/rejected": -1.8771663904190063, + "step": 2179 + }, + { + "epoch": 0.25, + "learning_rate": 2.273296326916263e-07, + "logits/chosen": -2.256568670272827, + "logits/rejected": -2.358093500137329, + "logps/chosen": -279.9107971191406, + "logps/rejected": -406.9961853027344, + "loss": 0.1796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6932511925697327, + "rewards/margins": 2.5141732692718506, + "rewards/rejected": -3.2074244022369385, + "step": 2180 + }, + { + "epoch": 0.25, + "learning_rate": 2.2729420101570805e-07, + "logits/chosen": -1.9170485734939575, + "logits/rejected": -2.6212596893310547, + "logps/chosen": -416.2386169433594, + "logps/rejected": -299.912109375, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06405384838581085, + "rewards/margins": 4.170487880706787, + "rewards/rejected": -4.106434345245361, + "step": 2181 + }, + { + "epoch": 0.25, + "learning_rate": 2.2725876933978977e-07, + "logits/chosen": -2.1969571113586426, + "logits/rejected": -2.1868491172790527, + "logps/chosen": -204.36643981933594, + "logps/rejected": -231.0072021484375, + "loss": 0.8245, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7387568950653076, + "rewards/margins": 0.8371239900588989, + "rewards/rejected": -2.575880527496338, + "step": 2182 + }, + { + "epoch": 0.25, + "learning_rate": 2.272233376638715e-07, + "logits/chosen": -2.528383255004883, + "logits/rejected": -2.509833812713623, + "logps/chosen": -137.77626037597656, + "logps/rejected": -222.89300537109375, + "loss": 0.3118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1394088715314865, + "rewards/margins": 1.960296869277954, + "rewards/rejected": -2.099705696105957, + "step": 2183 + }, + { + "epoch": 0.25, + "learning_rate": 2.2718790598795322e-07, + "logits/chosen": -2.4688563346862793, + "logits/rejected": -2.467459201812744, + "logps/chosen": -192.42608642578125, + "logps/rejected": -174.33621215820312, + "loss": 0.4512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45962536334991455, + "rewards/margins": 1.3100085258483887, + "rewards/rejected": -1.7696338891983032, + "step": 2184 + }, + { + "epoch": 0.25, + "learning_rate": 2.2715247431203494e-07, + "logits/chosen": -2.521930456161499, + "logits/rejected": -2.4143190383911133, + "logps/chosen": -233.11837768554688, + "logps/rejected": -340.8379821777344, + "loss": 0.6535, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5772802829742432, + "rewards/margins": 0.8542445302009583, + "rewards/rejected": -2.4315247535705566, + "step": 2185 + }, + { + "epoch": 0.25, + "learning_rate": 2.2711704263611666e-07, + "logits/chosen": -2.4585909843444824, + "logits/rejected": -2.1747941970825195, + "logps/chosen": -180.0541534423828, + "logps/rejected": -399.7169189453125, + "loss": 0.4061, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4606156051158905, + "rewards/margins": 1.515753149986267, + "rewards/rejected": -1.9763686656951904, + "step": 2186 + }, + { + "epoch": 0.25, + "learning_rate": 2.270816109601984e-07, + "logits/chosen": -1.9671661853790283, + "logits/rejected": -2.0288028717041016, + "logps/chosen": -570.3203735351562, + "logps/rejected": -288.6346130371094, + "loss": 0.3564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48816007375717163, + "rewards/margins": 1.9157480001449585, + "rewards/rejected": -2.4039080142974854, + "step": 2187 + }, + { + "epoch": 0.25, + "learning_rate": 2.2704617928428013e-07, + "logits/chosen": -2.7198400497436523, + "logits/rejected": -2.52475905418396, + "logps/chosen": -285.28680419921875, + "logps/rejected": -218.46337890625, + "loss": 0.4109, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9212007522583008, + "rewards/margins": 1.336998701095581, + "rewards/rejected": -2.258199691772461, + "step": 2188 + }, + { + "epoch": 0.25, + "learning_rate": 2.2701074760836186e-07, + "logits/chosen": -2.2880024909973145, + "logits/rejected": -2.521951675415039, + "logps/chosen": -379.365234375, + "logps/rejected": -174.3182373046875, + "loss": 0.8191, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3549180030822754, + "rewards/margins": 0.5332018136978149, + "rewards/rejected": -0.8881198167800903, + "step": 2189 + }, + { + "epoch": 0.25, + "learning_rate": 2.2697531593244358e-07, + "logits/chosen": -2.1973929405212402, + "logits/rejected": -2.365079879760742, + "logps/chosen": -458.2642822265625, + "logps/rejected": -294.8658142089844, + "loss": 0.246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3687232732772827, + "rewards/margins": 1.8982001543045044, + "rewards/rejected": -2.266923427581787, + "step": 2190 + }, + { + "epoch": 0.25, + "learning_rate": 2.2693988425652533e-07, + "logits/chosen": -2.3547797203063965, + "logits/rejected": -2.307621479034424, + "logps/chosen": -322.23980712890625, + "logps/rejected": -333.84039306640625, + "loss": 0.4692, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.055474333465099335, + "rewards/margins": 2.1145195960998535, + "rewards/rejected": -2.1699936389923096, + "step": 2191 + }, + { + "epoch": 0.25, + "learning_rate": 2.2690445258060707e-07, + "logits/chosen": -2.077676773071289, + "logits/rejected": -2.034428596496582, + "logps/chosen": -186.52151489257812, + "logps/rejected": -370.5426025390625, + "loss": 0.588, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4094897508621216, + "rewards/margins": 1.5558617115020752, + "rewards/rejected": -1.9653514623641968, + "step": 2192 + }, + { + "epoch": 0.26, + "learning_rate": 2.268690209046888e-07, + "logits/chosen": -1.9660917520523071, + "logits/rejected": -2.412707805633545, + "logps/chosen": -303.42559814453125, + "logps/rejected": -375.7781982421875, + "loss": 0.8299, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0103862285614014, + "rewards/margins": 1.102613925933838, + "rewards/rejected": -2.11299991607666, + "step": 2193 + }, + { + "epoch": 0.26, + "learning_rate": 2.2683358922877052e-07, + "logits/chosen": -1.8946905136108398, + "logits/rejected": -1.6209267377853394, + "logps/chosen": -245.49798583984375, + "logps/rejected": -361.66046142578125, + "loss": 0.3415, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.96596097946167, + "rewards/margins": 2.207536458969116, + "rewards/rejected": -4.173497200012207, + "step": 2194 + }, + { + "epoch": 0.26, + "learning_rate": 2.2679815755285224e-07, + "logits/chosen": -2.6899492740631104, + "logits/rejected": -2.829401731491089, + "logps/chosen": -550.650390625, + "logps/rejected": -421.2415771484375, + "loss": 0.6853, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.443908452987671, + "rewards/margins": 1.0611375570297241, + "rewards/rejected": -2.5050458908081055, + "step": 2195 + }, + { + "epoch": 0.26, + "learning_rate": 2.2676272587693396e-07, + "logits/chosen": -2.27907395362854, + "logits/rejected": -2.3962414264678955, + "logps/chosen": -187.3654022216797, + "logps/rejected": -214.6670684814453, + "loss": 0.305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5460546612739563, + "rewards/margins": 1.57201087474823, + "rewards/rejected": -2.118065357208252, + "step": 2196 + }, + { + "epoch": 0.26, + "learning_rate": 2.2672729420101569e-07, + "logits/chosen": -2.7213053703308105, + "logits/rejected": -2.679159164428711, + "logps/chosen": -126.7208251953125, + "logps/rejected": -137.71084594726562, + "loss": 1.0016, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8497123718261719, + "rewards/margins": 1.409379243850708, + "rewards/rejected": -3.259091377258301, + "step": 2197 + }, + { + "epoch": 0.26, + "learning_rate": 2.2669186252509743e-07, + "logits/chosen": -2.4415602684020996, + "logits/rejected": -2.539029359817505, + "logps/chosen": -318.6957092285156, + "logps/rejected": -338.78985595703125, + "loss": 0.5346, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8570128679275513, + "rewards/margins": 0.7651805281639099, + "rewards/rejected": -1.622193455696106, + "step": 2198 + }, + { + "epoch": 0.26, + "learning_rate": 2.2665643084917916e-07, + "logits/chosen": -2.4161999225616455, + "logits/rejected": -2.4458882808685303, + "logps/chosen": -100.66584014892578, + "logps/rejected": -156.87957763671875, + "loss": 0.4165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7642151117324829, + "rewards/margins": 1.532168984413147, + "rewards/rejected": -2.29638409614563, + "step": 2199 + }, + { + "epoch": 0.26, + "learning_rate": 2.2662099917326088e-07, + "logits/chosen": -2.0434110164642334, + "logits/rejected": -2.3836755752563477, + "logps/chosen": -305.6235046386719, + "logps/rejected": -268.07110595703125, + "loss": 0.7506, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.291447639465332, + "rewards/margins": 0.857170581817627, + "rewards/rejected": -2.14861798286438, + "step": 2200 + }, + { + "epoch": 0.26, + "learning_rate": 2.265855674973426e-07, + "logits/chosen": -2.269157886505127, + "logits/rejected": -2.276824712753296, + "logps/chosen": -161.12884521484375, + "logps/rejected": -164.8707733154297, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3817580044269562, + "rewards/margins": 2.5294058322906494, + "rewards/rejected": -2.911163806915283, + "step": 2201 + }, + { + "epoch": 0.26, + "learning_rate": 2.2655013582142435e-07, + "logits/chosen": -2.4443743228912354, + "logits/rejected": -2.704706907272339, + "logps/chosen": -219.93496704101562, + "logps/rejected": -129.1767120361328, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4428876042366028, + "rewards/margins": 1.0850483179092407, + "rewards/rejected": -1.5279359817504883, + "step": 2202 + }, + { + "epoch": 0.26, + "learning_rate": 2.265147041455061e-07, + "logits/chosen": -1.9535980224609375, + "logits/rejected": -2.116917133331299, + "logps/chosen": -237.8778076171875, + "logps/rejected": -182.68408203125, + "loss": 0.4124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20374450087547302, + "rewards/margins": 1.2534356117248535, + "rewards/rejected": -1.4571800231933594, + "step": 2203 + }, + { + "epoch": 0.26, + "learning_rate": 2.2647927246958782e-07, + "logits/chosen": -2.682274580001831, + "logits/rejected": -2.4723587036132812, + "logps/chosen": -234.87371826171875, + "logps/rejected": -360.26568603515625, + "loss": 0.2897, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3790254592895508, + "rewards/margins": 2.2587177753448486, + "rewards/rejected": -3.6377432346343994, + "step": 2204 + }, + { + "epoch": 0.26, + "learning_rate": 2.2644384079366954e-07, + "logits/chosen": -2.148134469985962, + "logits/rejected": -2.590517282485962, + "logps/chosen": -296.2895812988281, + "logps/rejected": -252.99815368652344, + "loss": 0.8687, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8539435863494873, + "rewards/margins": 0.5031282305717468, + "rewards/rejected": -1.3570716381072998, + "step": 2205 + }, + { + "epoch": 0.26, + "learning_rate": 2.2640840911775126e-07, + "logits/chosen": -2.566002607345581, + "logits/rejected": -2.6361191272735596, + "logps/chosen": -194.16429138183594, + "logps/rejected": -253.6163330078125, + "loss": 0.1997, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12994100153446198, + "rewards/margins": 3.005037307739258, + "rewards/rejected": -2.875096321105957, + "step": 2206 + }, + { + "epoch": 0.26, + "learning_rate": 2.2637297744183299e-07, + "logits/chosen": -1.7851834297180176, + "logits/rejected": -2.045300006866455, + "logps/chosen": -405.2264709472656, + "logps/rejected": -397.5347900390625, + "loss": 0.6084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6460085511207581, + "rewards/margins": 0.49397343397140503, + "rewards/rejected": -1.139981985092163, + "step": 2207 + }, + { + "epoch": 0.26, + "learning_rate": 2.263375457659147e-07, + "logits/chosen": -2.2607016563415527, + "logits/rejected": -2.3870835304260254, + "logps/chosen": -317.15496826171875, + "logps/rejected": -346.7872314453125, + "loss": 0.2319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5566080808639526, + "rewards/margins": 2.465899705886841, + "rewards/rejected": -3.022507905960083, + "step": 2208 + }, + { + "epoch": 0.26, + "learning_rate": 2.2630211408999643e-07, + "logits/chosen": -2.788239002227783, + "logits/rejected": -2.683950901031494, + "logps/chosen": -297.99078369140625, + "logps/rejected": -248.7438507080078, + "loss": 0.3137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39687103033065796, + "rewards/margins": 2.641606092453003, + "rewards/rejected": -3.0384771823883057, + "step": 2209 + }, + { + "epoch": 0.26, + "learning_rate": 2.2626668241407818e-07, + "logits/chosen": -1.77592933177948, + "logits/rejected": -1.803506851196289, + "logps/chosen": -347.27459716796875, + "logps/rejected": -389.534423828125, + "loss": 0.3034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6428102254867554, + "rewards/margins": 1.9889980554580688, + "rewards/rejected": -2.631808280944824, + "step": 2210 + }, + { + "epoch": 0.26, + "learning_rate": 2.262312507381599e-07, + "logits/chosen": -2.326338768005371, + "logits/rejected": -2.5560641288757324, + "logps/chosen": -208.26580810546875, + "logps/rejected": -238.36288452148438, + "loss": 0.3188, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5791000127792358, + "rewards/margins": 1.9776796102523804, + "rewards/rejected": -3.556779384613037, + "step": 2211 + }, + { + "epoch": 0.26, + "learning_rate": 2.2619581906224162e-07, + "logits/chosen": -2.4291164875030518, + "logits/rejected": -2.549701690673828, + "logps/chosen": -301.35821533203125, + "logps/rejected": -339.755859375, + "loss": 0.1116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0733058825135231, + "rewards/margins": 3.8136773109436035, + "rewards/rejected": -3.7403712272644043, + "step": 2212 + }, + { + "epoch": 0.26, + "learning_rate": 2.2616038738632335e-07, + "logits/chosen": -2.618539810180664, + "logits/rejected": -2.4691076278686523, + "logps/chosen": -241.77700805664062, + "logps/rejected": -300.0421447753906, + "loss": 0.3953, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5121986865997314, + "rewards/margins": 1.188902735710144, + "rewards/rejected": -1.701101541519165, + "step": 2213 + }, + { + "epoch": 0.26, + "learning_rate": 2.2612495571040512e-07, + "logits/chosen": -2.6005430221557617, + "logits/rejected": -2.5233426094055176, + "logps/chosen": -286.72528076171875, + "logps/rejected": -203.00558471679688, + "loss": 0.5138, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6384978890419006, + "rewards/margins": 2.0146634578704834, + "rewards/rejected": -2.6531615257263184, + "step": 2214 + }, + { + "epoch": 0.26, + "learning_rate": 2.2608952403448684e-07, + "logits/chosen": -2.1191959381103516, + "logits/rejected": -2.606194257736206, + "logps/chosen": -351.64996337890625, + "logps/rejected": -199.28225708007812, + "loss": 0.8352, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.02115797996521, + "rewards/margins": 0.7858879566192627, + "rewards/rejected": -1.8070459365844727, + "step": 2215 + }, + { + "epoch": 0.26, + "learning_rate": 2.2605409235856856e-07, + "logits/chosen": -2.7401115894317627, + "logits/rejected": -2.407297134399414, + "logps/chosen": -238.26165771484375, + "logps/rejected": -239.33822631835938, + "loss": 0.4776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.819509744644165, + "rewards/margins": 1.4317800998687744, + "rewards/rejected": -2.2512898445129395, + "step": 2216 + }, + { + "epoch": 0.26, + "learning_rate": 2.260186606826503e-07, + "logits/chosen": -1.8954682350158691, + "logits/rejected": -2.068930149078369, + "logps/chosen": -226.3697509765625, + "logps/rejected": -277.2089538574219, + "loss": 0.4133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5585901737213135, + "rewards/margins": 1.5252166986465454, + "rewards/rejected": -2.0838069915771484, + "step": 2217 + }, + { + "epoch": 0.26, + "learning_rate": 2.25983229006732e-07, + "logits/chosen": -1.775301456451416, + "logits/rejected": -1.7959669828414917, + "logps/chosen": -329.22589111328125, + "logps/rejected": -392.1826171875, + "loss": 0.24, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7417924404144287, + "rewards/margins": 2.6951303482055664, + "rewards/rejected": -3.436922788619995, + "step": 2218 + }, + { + "epoch": 0.26, + "learning_rate": 2.2594779733081373e-07, + "logits/chosen": -2.3922863006591797, + "logits/rejected": -2.254333734512329, + "logps/chosen": -271.20068359375, + "logps/rejected": -290.08245849609375, + "loss": 0.6595, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.157062292098999, + "rewards/margins": 1.2331371307373047, + "rewards/rejected": -2.3901994228363037, + "step": 2219 + }, + { + "epoch": 0.26, + "learning_rate": 2.2591236565489545e-07, + "logits/chosen": -2.2101669311523438, + "logits/rejected": -2.2030749320983887, + "logps/chosen": -148.06199645996094, + "logps/rejected": -238.65240478515625, + "loss": 0.2563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7841014862060547, + "rewards/margins": 3.0515024662017822, + "rewards/rejected": -3.835604190826416, + "step": 2220 + }, + { + "epoch": 0.26, + "learning_rate": 2.258769339789772e-07, + "logits/chosen": -2.9097933769226074, + "logits/rejected": -2.8605053424835205, + "logps/chosen": -273.4096374511719, + "logps/rejected": -289.3459167480469, + "loss": 0.1587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5949381589889526, + "rewards/margins": 2.1197361946105957, + "rewards/rejected": -2.714674472808838, + "step": 2221 + }, + { + "epoch": 0.26, + "learning_rate": 2.2584150230305892e-07, + "logits/chosen": -2.0957529544830322, + "logits/rejected": -2.2686238288879395, + "logps/chosen": -495.4945068359375, + "logps/rejected": -324.6546630859375, + "loss": 0.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6759283542633057, + "rewards/margins": 1.2904428243637085, + "rewards/rejected": -2.9663710594177246, + "step": 2222 + }, + { + "epoch": 0.26, + "learning_rate": 2.2580607062714065e-07, + "logits/chosen": -2.495971202850342, + "logits/rejected": -2.5593466758728027, + "logps/chosen": -398.02996826171875, + "logps/rejected": -442.7575378417969, + "loss": 0.5096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28825056552886963, + "rewards/margins": 0.725037157535553, + "rewards/rejected": -1.0132877826690674, + "step": 2223 + }, + { + "epoch": 0.26, + "learning_rate": 2.2577063895122237e-07, + "logits/chosen": -1.965828537940979, + "logits/rejected": -2.0859873294830322, + "logps/chosen": -445.4443054199219, + "logps/rejected": -285.328369140625, + "loss": 0.3427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6924425363540649, + "rewards/margins": 2.374490261077881, + "rewards/rejected": -3.066932439804077, + "step": 2224 + }, + { + "epoch": 0.26, + "learning_rate": 2.257352072753041e-07, + "logits/chosen": -2.3780739307403564, + "logits/rejected": -2.2002902030944824, + "logps/chosen": -372.5032043457031, + "logps/rejected": -531.9457397460938, + "loss": 0.6927, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.838510274887085, + "rewards/margins": 1.7599385976791382, + "rewards/rejected": -2.5984487533569336, + "step": 2225 + }, + { + "epoch": 0.26, + "learning_rate": 2.2569977559938587e-07, + "logits/chosen": -2.2050116062164307, + "logits/rejected": -2.1608080863952637, + "logps/chosen": -272.6571350097656, + "logps/rejected": -260.0111083984375, + "loss": 0.4261, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35595569014549255, + "rewards/margins": 1.4814578294754028, + "rewards/rejected": -1.8374134302139282, + "step": 2226 + }, + { + "epoch": 0.26, + "learning_rate": 2.256643439234676e-07, + "logits/chosen": -1.8067381381988525, + "logits/rejected": -2.125800132751465, + "logps/chosen": -273.30633544921875, + "logps/rejected": -261.3936767578125, + "loss": 0.3367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7177894115447998, + "rewards/margins": 2.1182758808135986, + "rewards/rejected": -2.8360652923583984, + "step": 2227 + }, + { + "epoch": 0.26, + "learning_rate": 2.256289122475493e-07, + "logits/chosen": -2.0081374645233154, + "logits/rejected": -2.266482353210449, + "logps/chosen": -438.4326171875, + "logps/rejected": -410.3953857421875, + "loss": 0.3921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2940959334373474, + "rewards/margins": 1.4941316843032837, + "rewards/rejected": -1.7882274389266968, + "step": 2228 + }, + { + "epoch": 0.26, + "learning_rate": 2.2559348057163103e-07, + "logits/chosen": -2.166203498840332, + "logits/rejected": -2.0084404945373535, + "logps/chosen": -187.4442901611328, + "logps/rejected": -260.0781555175781, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20120257139205933, + "rewards/margins": 1.6559299230575562, + "rewards/rejected": -1.8571325540542603, + "step": 2229 + }, + { + "epoch": 0.26, + "learning_rate": 2.2555804889571275e-07, + "logits/chosen": -2.450007438659668, + "logits/rejected": -2.6020655632019043, + "logps/chosen": -329.8806457519531, + "logps/rejected": -344.6948547363281, + "loss": 0.5299, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9587228298187256, + "rewards/margins": 1.1871380805969238, + "rewards/rejected": -2.1458609104156494, + "step": 2230 + }, + { + "epoch": 0.26, + "learning_rate": 2.2552261721979448e-07, + "logits/chosen": -2.6287450790405273, + "logits/rejected": -2.438342332839966, + "logps/chosen": -298.167724609375, + "logps/rejected": -260.5318298339844, + "loss": 0.2594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7182356119155884, + "rewards/margins": 2.13051438331604, + "rewards/rejected": -2.848750114440918, + "step": 2231 + }, + { + "epoch": 0.26, + "learning_rate": 2.2548718554387622e-07, + "logits/chosen": -2.962860345840454, + "logits/rejected": -2.952655076980591, + "logps/chosen": -115.7410888671875, + "logps/rejected": -138.054931640625, + "loss": 0.2907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5736470222473145, + "rewards/margins": 2.3354883193969727, + "rewards/rejected": -2.909135103225708, + "step": 2232 + }, + { + "epoch": 0.26, + "learning_rate": 2.2545175386795795e-07, + "logits/chosen": -2.0351409912109375, + "logits/rejected": -2.147841215133667, + "logps/chosen": -378.5544738769531, + "logps/rejected": -290.6115417480469, + "loss": 0.5648, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2123260498046875, + "rewards/margins": 2.6956775188446045, + "rewards/rejected": -3.908003568649292, + "step": 2233 + }, + { + "epoch": 0.26, + "learning_rate": 2.2541632219203967e-07, + "logits/chosen": -2.39721941947937, + "logits/rejected": -2.4118902683258057, + "logps/chosen": -234.1080780029297, + "logps/rejected": -228.02670288085938, + "loss": 0.5131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.425875723361969, + "rewards/margins": 1.2953789234161377, + "rewards/rejected": -1.7212547063827515, + "step": 2234 + }, + { + "epoch": 0.26, + "learning_rate": 2.253808905161214e-07, + "logits/chosen": -1.7584824562072754, + "logits/rejected": -1.499441385269165, + "logps/chosen": -222.8790283203125, + "logps/rejected": -250.87045288085938, + "loss": 0.7079, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2404229640960693, + "rewards/margins": 2.239509344100952, + "rewards/rejected": -4.479931831359863, + "step": 2235 + }, + { + "epoch": 0.26, + "learning_rate": 2.253454588402031e-07, + "logits/chosen": -1.4448570013046265, + "logits/rejected": -1.9737565517425537, + "logps/chosen": -631.2334594726562, + "logps/rejected": -440.1190185546875, + "loss": 0.2786, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5250144004821777, + "rewards/margins": 1.8212811946868896, + "rewards/rejected": -1.296266794204712, + "step": 2236 + }, + { + "epoch": 0.26, + "learning_rate": 2.253100271642849e-07, + "logits/chosen": -2.993021011352539, + "logits/rejected": -2.9816830158233643, + "logps/chosen": -249.64254760742188, + "logps/rejected": -311.1166076660156, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5826046466827393, + "rewards/margins": 2.261411190032959, + "rewards/rejected": -2.844015598297119, + "step": 2237 + }, + { + "epoch": 0.26, + "learning_rate": 2.252745954883666e-07, + "logits/chosen": -2.2093067169189453, + "logits/rejected": -2.5027079582214355, + "logps/chosen": -303.1380615234375, + "logps/rejected": -216.9829559326172, + "loss": 0.2033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21893131732940674, + "rewards/margins": 2.499175786972046, + "rewards/rejected": -2.718106985092163, + "step": 2238 + }, + { + "epoch": 0.26, + "learning_rate": 2.2523916381244833e-07, + "logits/chosen": -2.306025743484497, + "logits/rejected": -2.240424156188965, + "logps/chosen": -127.5557861328125, + "logps/rejected": -136.601318359375, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8704906105995178, + "rewards/margins": 1.0905183553695679, + "rewards/rejected": -1.9610090255737305, + "step": 2239 + }, + { + "epoch": 0.26, + "learning_rate": 2.2520373213653005e-07, + "logits/chosen": -2.478996753692627, + "logits/rejected": -2.3323349952697754, + "logps/chosen": -264.19854736328125, + "logps/rejected": -284.23291015625, + "loss": 0.5327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6179327964782715, + "rewards/margins": 1.0518726110458374, + "rewards/rejected": -1.6698055267333984, + "step": 2240 + }, + { + "epoch": 0.26, + "learning_rate": 2.2516830046061178e-07, + "logits/chosen": -2.162780284881592, + "logits/rejected": -2.033653736114502, + "logps/chosen": -266.3164367675781, + "logps/rejected": -373.35113525390625, + "loss": 0.1597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4184514284133911, + "rewards/margins": 2.8973429203033447, + "rewards/rejected": -2.478891372680664, + "step": 2241 + }, + { + "epoch": 0.26, + "learning_rate": 2.251328687846935e-07, + "logits/chosen": -2.7889578342437744, + "logits/rejected": -2.7332167625427246, + "logps/chosen": -344.39300537109375, + "logps/rejected": -366.85821533203125, + "loss": 0.4666, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.529309630393982, + "rewards/margins": 2.1084322929382324, + "rewards/rejected": -3.637742042541504, + "step": 2242 + }, + { + "epoch": 0.26, + "learning_rate": 2.2509743710877525e-07, + "logits/chosen": -2.9781546592712402, + "logits/rejected": -2.9168171882629395, + "logps/chosen": -379.44952392578125, + "logps/rejected": -323.8695068359375, + "loss": 0.2293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23599907755851746, + "rewards/margins": 2.6260149478912354, + "rewards/rejected": -2.8620142936706543, + "step": 2243 + }, + { + "epoch": 0.26, + "learning_rate": 2.2506200543285697e-07, + "logits/chosen": -1.9151602983474731, + "logits/rejected": -2.1762866973876953, + "logps/chosen": -246.69979858398438, + "logps/rejected": -294.57879638671875, + "loss": 0.3734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8125112652778625, + "rewards/margins": 1.1405081748962402, + "rewards/rejected": -1.953019380569458, + "step": 2244 + }, + { + "epoch": 0.26, + "learning_rate": 2.250265737569387e-07, + "logits/chosen": -2.312774896621704, + "logits/rejected": -1.924754023551941, + "logps/chosen": -89.41317749023438, + "logps/rejected": -222.2435302734375, + "loss": 0.3719, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5856520533561707, + "rewards/margins": 2.412468910217285, + "rewards/rejected": -2.9981207847595215, + "step": 2245 + }, + { + "epoch": 0.26, + "learning_rate": 2.2499114208102041e-07, + "logits/chosen": -1.533930778503418, + "logits/rejected": -1.6927227973937988, + "logps/chosen": -314.80023193359375, + "logps/rejected": -337.54248046875, + "loss": 0.1374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36502864956855774, + "rewards/margins": 2.5815398693084717, + "rewards/rejected": -2.946568489074707, + "step": 2246 + }, + { + "epoch": 0.26, + "learning_rate": 2.2495571040510214e-07, + "logits/chosen": -2.655251979827881, + "logits/rejected": -2.5738022327423096, + "logps/chosen": -127.62313842773438, + "logps/rejected": -222.14505004882812, + "loss": 0.4241, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0949407815933228, + "rewards/margins": 2.801884174346924, + "rewards/rejected": -3.896825075149536, + "step": 2247 + }, + { + "epoch": 0.26, + "learning_rate": 2.2492027872918386e-07, + "logits/chosen": -2.5697529315948486, + "logits/rejected": -2.6344428062438965, + "logps/chosen": -344.04644775390625, + "logps/rejected": -284.2572021484375, + "loss": 0.5911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9132232069969177, + "rewards/margins": 1.0355985164642334, + "rewards/rejected": -1.9488215446472168, + "step": 2248 + }, + { + "epoch": 0.26, + "learning_rate": 2.2488484705326563e-07, + "logits/chosen": -2.3982791900634766, + "logits/rejected": -2.4744508266448975, + "logps/chosen": -248.84963989257812, + "logps/rejected": -342.4407958984375, + "loss": 0.172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4169912040233612, + "rewards/margins": 3.298081874847412, + "rewards/rejected": -3.715073347091675, + "step": 2249 + }, + { + "epoch": 0.26, + "learning_rate": 2.2484941537734736e-07, + "logits/chosen": -2.700608253479004, + "logits/rejected": -2.7071120738983154, + "logps/chosen": -244.4542694091797, + "logps/rejected": -264.46636962890625, + "loss": 0.5948, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0091311931610107, + "rewards/margins": 1.8171807527542114, + "rewards/rejected": -2.8263120651245117, + "step": 2250 + }, + { + "epoch": 0.26, + "learning_rate": 2.2481398370142908e-07, + "logits/chosen": -2.5963611602783203, + "logits/rejected": -2.4546470642089844, + "logps/chosen": -193.46160888671875, + "logps/rejected": -162.83229064941406, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1353093385696411, + "rewards/margins": 1.5132030248641968, + "rewards/rejected": -2.648512363433838, + "step": 2251 + }, + { + "epoch": 0.26, + "learning_rate": 2.247785520255108e-07, + "logits/chosen": -2.265582323074341, + "logits/rejected": -2.2264320850372314, + "logps/chosen": -172.97080993652344, + "logps/rejected": -187.93341064453125, + "loss": 0.4949, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1714026927947998, + "rewards/margins": 1.2106976509094238, + "rewards/rejected": -2.3821005821228027, + "step": 2252 + }, + { + "epoch": 0.26, + "learning_rate": 2.2474312034959252e-07, + "logits/chosen": -2.2259368896484375, + "logits/rejected": -2.253335475921631, + "logps/chosen": -165.5911102294922, + "logps/rejected": -397.0439453125, + "loss": 0.2284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3213706612586975, + "rewards/margins": 3.548773765563965, + "rewards/rejected": -3.870144844055176, + "step": 2253 + }, + { + "epoch": 0.26, + "learning_rate": 2.2470768867367424e-07, + "logits/chosen": -2.4262919425964355, + "logits/rejected": -2.0825517177581787, + "logps/chosen": -310.81219482421875, + "logps/rejected": -405.16412353515625, + "loss": 0.4189, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1101151704788208, + "rewards/margins": 3.710494041442871, + "rewards/rejected": -4.820609092712402, + "step": 2254 + }, + { + "epoch": 0.26, + "learning_rate": 2.24672256997756e-07, + "logits/chosen": -2.6745476722717285, + "logits/rejected": -2.8498430252075195, + "logps/chosen": -444.4673767089844, + "logps/rejected": -306.2907409667969, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46698835492134094, + "rewards/margins": 2.3085343837738037, + "rewards/rejected": -2.775522470474243, + "step": 2255 + }, + { + "epoch": 0.26, + "learning_rate": 2.2463682532183771e-07, + "logits/chosen": -2.300407648086548, + "logits/rejected": -2.2511019706726074, + "logps/chosen": -275.4330139160156, + "logps/rejected": -201.09078979492188, + "loss": 0.4219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49023908376693726, + "rewards/margins": 1.4753100872039795, + "rewards/rejected": -1.9655492305755615, + "step": 2256 + }, + { + "epoch": 0.26, + "learning_rate": 2.2460139364591944e-07, + "logits/chosen": -2.424161195755005, + "logits/rejected": -2.2094078063964844, + "logps/chosen": -200.67315673828125, + "logps/rejected": -300.233154296875, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6835014820098877, + "rewards/margins": 0.9579112529754639, + "rewards/rejected": -1.6414127349853516, + "step": 2257 + }, + { + "epoch": 0.26, + "learning_rate": 2.2456596197000116e-07, + "logits/chosen": -2.162682294845581, + "logits/rejected": -2.2718005180358887, + "logps/chosen": -366.96038818359375, + "logps/rejected": -309.9407043457031, + "loss": 0.4857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45616716146469116, + "rewards/margins": 0.8845772743225098, + "rewards/rejected": -1.3407444953918457, + "step": 2258 + }, + { + "epoch": 0.26, + "learning_rate": 2.2453053029408288e-07, + "logits/chosen": -1.717231273651123, + "logits/rejected": -1.729622483253479, + "logps/chosen": -304.7221374511719, + "logps/rejected": -224.7342071533203, + "loss": 0.4007, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0255309343338013, + "rewards/margins": 1.4945279359817505, + "rewards/rejected": -2.5200588703155518, + "step": 2259 + }, + { + "epoch": 0.26, + "learning_rate": 2.244950986181646e-07, + "logits/chosen": -2.903620481491089, + "logits/rejected": -3.0065786838531494, + "logps/chosen": -126.12030792236328, + "logps/rejected": -124.84638214111328, + "loss": 0.4099, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7668179869651794, + "rewards/margins": 1.0439938306808472, + "rewards/rejected": -1.8108117580413818, + "step": 2260 + }, + { + "epoch": 0.26, + "learning_rate": 2.2445966694224638e-07, + "logits/chosen": -1.4250168800354004, + "logits/rejected": -1.6482179164886475, + "logps/chosen": -446.3251647949219, + "logps/rejected": -386.45269775390625, + "loss": 0.844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.22188401222229, + "rewards/margins": 0.550480842590332, + "rewards/rejected": -1.772364616394043, + "step": 2261 + }, + { + "epoch": 0.26, + "learning_rate": 2.244242352663281e-07, + "logits/chosen": -2.417090892791748, + "logits/rejected": -2.3535163402557373, + "logps/chosen": -301.97283935546875, + "logps/rejected": -235.0905303955078, + "loss": 0.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9387972950935364, + "rewards/margins": 1.1422076225280762, + "rewards/rejected": -2.0810048580169678, + "step": 2262 + }, + { + "epoch": 0.26, + "learning_rate": 2.2438880359040982e-07, + "logits/chosen": -2.4417824745178223, + "logits/rejected": -2.3752763271331787, + "logps/chosen": -297.52081298828125, + "logps/rejected": -355.20562744140625, + "loss": 0.2223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40751081705093384, + "rewards/margins": 2.5277185440063477, + "rewards/rejected": -2.935229539871216, + "step": 2263 + }, + { + "epoch": 0.26, + "learning_rate": 2.2435337191449154e-07, + "logits/chosen": -2.4068915843963623, + "logits/rejected": -2.3153491020202637, + "logps/chosen": -164.78616333007812, + "logps/rejected": -276.5911865234375, + "loss": 0.3633, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7363844513893127, + "rewards/margins": 1.9175992012023926, + "rewards/rejected": -2.6539835929870605, + "step": 2264 + }, + { + "epoch": 0.26, + "learning_rate": 2.2431794023857327e-07, + "logits/chosen": -2.3043301105499268, + "logits/rejected": -2.311607837677002, + "logps/chosen": -217.195068359375, + "logps/rejected": -259.3789978027344, + "loss": 0.356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5587706565856934, + "rewards/margins": 1.8194465637207031, + "rewards/rejected": -2.3782172203063965, + "step": 2265 + }, + { + "epoch": 0.26, + "learning_rate": 2.2428250856265502e-07, + "logits/chosen": -2.1303837299346924, + "logits/rejected": -2.335303544998169, + "logps/chosen": -303.8712158203125, + "logps/rejected": -202.3148193359375, + "loss": 0.5488, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6040847301483154, + "rewards/margins": 0.8857783079147339, + "rewards/rejected": -2.4898629188537598, + "step": 2266 + }, + { + "epoch": 0.26, + "learning_rate": 2.2424707688673674e-07, + "logits/chosen": -2.758913993835449, + "logits/rejected": -2.69862699508667, + "logps/chosen": -222.27133178710938, + "logps/rejected": -203.4457550048828, + "loss": 0.2709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008799456059932709, + "rewards/margins": 1.7193584442138672, + "rewards/rejected": -1.7105588912963867, + "step": 2267 + }, + { + "epoch": 0.26, + "learning_rate": 2.2421164521081846e-07, + "logits/chosen": -2.3648898601531982, + "logits/rejected": -2.106973171234131, + "logps/chosen": -247.87933349609375, + "logps/rejected": -251.3011474609375, + "loss": 0.5586, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7966480255126953, + "rewards/margins": 1.0676133632659912, + "rewards/rejected": -1.8642613887786865, + "step": 2268 + }, + { + "epoch": 0.26, + "learning_rate": 2.2417621353490018e-07, + "logits/chosen": -2.3848726749420166, + "logits/rejected": -2.3342785835266113, + "logps/chosen": -170.41616821289062, + "logps/rejected": -354.4142761230469, + "loss": 0.2298, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7970758676528931, + "rewards/margins": 4.756030082702637, + "rewards/rejected": -5.553106307983398, + "step": 2269 + }, + { + "epoch": 0.26, + "learning_rate": 2.241407818589819e-07, + "logits/chosen": -2.512169599533081, + "logits/rejected": -2.5739758014678955, + "logps/chosen": -263.180908203125, + "logps/rejected": -251.89097595214844, + "loss": 1.0385, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8877309560775757, + "rewards/margins": 1.2428138256072998, + "rewards/rejected": -3.130544662475586, + "step": 2270 + }, + { + "epoch": 0.26, + "learning_rate": 2.2410535018306363e-07, + "logits/chosen": -2.392455816268921, + "logits/rejected": -2.6473910808563232, + "logps/chosen": -302.7626647949219, + "logps/rejected": -271.96136474609375, + "loss": 0.3789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6568416357040405, + "rewards/margins": 1.130077600479126, + "rewards/rejected": -1.7869189977645874, + "step": 2271 + }, + { + "epoch": 0.26, + "learning_rate": 2.2406991850714537e-07, + "logits/chosen": -2.5354835987091064, + "logits/rejected": -2.6908912658691406, + "logps/chosen": -331.61590576171875, + "logps/rejected": -454.5199890136719, + "loss": 0.1704, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1614624261856079, + "rewards/margins": 3.1160359382629395, + "rewards/rejected": -3.277498245239258, + "step": 2272 + }, + { + "epoch": 0.26, + "learning_rate": 2.2403448683122712e-07, + "logits/chosen": -2.0652294158935547, + "logits/rejected": -2.34183931350708, + "logps/chosen": -375.20361328125, + "logps/rejected": -317.8718566894531, + "loss": 0.3539, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06756527721881866, + "rewards/margins": 1.2261501550674438, + "rewards/rejected": -1.293715238571167, + "step": 2273 + }, + { + "epoch": 0.26, + "learning_rate": 2.2399905515530884e-07, + "logits/chosen": -2.223048210144043, + "logits/rejected": -2.210000514984131, + "logps/chosen": -414.92633056640625, + "logps/rejected": -394.0779113769531, + "loss": 0.4788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5493685007095337, + "rewards/margins": 1.5945743322372437, + "rewards/rejected": -2.1439428329467773, + "step": 2274 + }, + { + "epoch": 0.26, + "learning_rate": 2.2396362347939057e-07, + "logits/chosen": -2.528329849243164, + "logits/rejected": -2.5360496044158936, + "logps/chosen": -412.5983581542969, + "logps/rejected": -259.8576965332031, + "loss": 0.2826, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9218227863311768, + "rewards/margins": 2.051978826522827, + "rewards/rejected": -2.973801612854004, + "step": 2275 + }, + { + "epoch": 0.26, + "learning_rate": 2.239281918034723e-07, + "logits/chosen": -2.328158378601074, + "logits/rejected": -2.3716049194335938, + "logps/chosen": -320.9047546386719, + "logps/rejected": -324.80419921875, + "loss": 0.3286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.65273118019104, + "rewards/margins": 1.0278904438018799, + "rewards/rejected": -1.68062162399292, + "step": 2276 + }, + { + "epoch": 0.26, + "learning_rate": 2.2389276012755404e-07, + "logits/chosen": -2.9384543895721436, + "logits/rejected": -2.8506901264190674, + "logps/chosen": -428.9738464355469, + "logps/rejected": -195.60301208496094, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2437121570110321, + "rewards/margins": 2.480186939239502, + "rewards/rejected": -2.7238991260528564, + "step": 2277 + }, + { + "epoch": 0.26, + "learning_rate": 2.2385732845163576e-07, + "logits/chosen": -2.690838575363159, + "logits/rejected": -2.635251045227051, + "logps/chosen": -154.67420959472656, + "logps/rejected": -209.71510314941406, + "loss": 0.3206, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3113667666912079, + "rewards/margins": 2.510406494140625, + "rewards/rejected": -2.821773052215576, + "step": 2278 + }, + { + "epoch": 0.27, + "learning_rate": 2.2382189677571748e-07, + "logits/chosen": -2.4789392948150635, + "logits/rejected": -2.2035329341888428, + "logps/chosen": -291.5769958496094, + "logps/rejected": -308.45428466796875, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5610976219177246, + "rewards/margins": 2.3766348361968994, + "rewards/rejected": -3.937732219696045, + "step": 2279 + }, + { + "epoch": 0.27, + "learning_rate": 2.237864650997992e-07, + "logits/chosen": -2.6111907958984375, + "logits/rejected": -2.5697999000549316, + "logps/chosen": -260.03460693359375, + "logps/rejected": -225.35806274414062, + "loss": 0.7894, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3287396430969238, + "rewards/margins": 1.2374447584152222, + "rewards/rejected": -2.5661842823028564, + "step": 2280 + }, + { + "epoch": 0.27, + "learning_rate": 2.2375103342388093e-07, + "logits/chosen": -2.7989232540130615, + "logits/rejected": -2.6479830741882324, + "logps/chosen": -186.59169006347656, + "logps/rejected": -280.64556884765625, + "loss": 0.261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6203424334526062, + "rewards/margins": 2.5256166458129883, + "rewards/rejected": -3.1459593772888184, + "step": 2281 + }, + { + "epoch": 0.27, + "learning_rate": 2.2371560174796265e-07, + "logits/chosen": -2.494370460510254, + "logits/rejected": -2.5689845085144043, + "logps/chosen": -379.5869140625, + "logps/rejected": -355.1335144042969, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5111453533172607, + "rewards/margins": 2.257485866546631, + "rewards/rejected": -2.7686314582824707, + "step": 2282 + }, + { + "epoch": 0.27, + "learning_rate": 2.2368017007204437e-07, + "logits/chosen": -2.5488228797912598, + "logits/rejected": -2.722444534301758, + "logps/chosen": -433.7989501953125, + "logps/rejected": -203.48477172851562, + "loss": 0.3906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6474804878234863, + "rewards/margins": 1.3359726667404175, + "rewards/rejected": -1.9834531545639038, + "step": 2283 + }, + { + "epoch": 0.27, + "learning_rate": 2.2364473839612615e-07, + "logits/chosen": -1.9583320617675781, + "logits/rejected": -1.7748161554336548, + "logps/chosen": -261.85101318359375, + "logps/rejected": -303.9474182128906, + "loss": 0.3652, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4710744023323059, + "rewards/margins": 1.6275522708892822, + "rewards/rejected": -2.0986266136169434, + "step": 2284 + }, + { + "epoch": 0.27, + "learning_rate": 2.2360930672020787e-07, + "logits/chosen": -2.562701463699341, + "logits/rejected": -2.455134630203247, + "logps/chosen": -183.12767028808594, + "logps/rejected": -249.51431274414062, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1461396962404251, + "rewards/margins": 2.270303249359131, + "rewards/rejected": -2.41644287109375, + "step": 2285 + }, + { + "epoch": 0.27, + "learning_rate": 2.235738750442896e-07, + "logits/chosen": -2.279376983642578, + "logits/rejected": -1.9575154781341553, + "logps/chosen": -356.150390625, + "logps/rejected": -340.01007080078125, + "loss": 0.6355, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.763382911682129, + "rewards/margins": 0.6889535188674927, + "rewards/rejected": -2.452336549758911, + "step": 2286 + }, + { + "epoch": 0.27, + "learning_rate": 2.235384433683713e-07, + "logits/chosen": -2.110706329345703, + "logits/rejected": -2.1168668270111084, + "logps/chosen": -328.3638916015625, + "logps/rejected": -320.38287353515625, + "loss": 0.2239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8781938552856445, + "rewards/margins": 1.8796828985214233, + "rewards/rejected": -2.7578768730163574, + "step": 2287 + }, + { + "epoch": 0.27, + "learning_rate": 2.2350301169245303e-07, + "logits/chosen": -2.1837973594665527, + "logits/rejected": -2.6691465377807617, + "logps/chosen": -518.7034301757812, + "logps/rejected": -295.5169982910156, + "loss": 0.4145, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3184672594070435, + "rewards/margins": 1.7954461574554443, + "rewards/rejected": -3.1139135360717773, + "step": 2288 + }, + { + "epoch": 0.27, + "learning_rate": 2.2346758001653478e-07, + "logits/chosen": -2.2882866859436035, + "logits/rejected": -2.299179792404175, + "logps/chosen": -205.40994262695312, + "logps/rejected": -275.7272644042969, + "loss": 0.3276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1974257528781891, + "rewards/margins": 2.0034236907958984, + "rewards/rejected": -2.2008492946624756, + "step": 2289 + }, + { + "epoch": 0.27, + "learning_rate": 2.234321483406165e-07, + "logits/chosen": -2.1982667446136475, + "logits/rejected": -1.9386587142944336, + "logps/chosen": -253.54417419433594, + "logps/rejected": -290.667236328125, + "loss": 0.7659, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0781185626983643, + "rewards/margins": 0.5555004477500916, + "rewards/rejected": -1.6336190700531006, + "step": 2290 + }, + { + "epoch": 0.27, + "learning_rate": 2.2339671666469823e-07, + "logits/chosen": -1.9820151329040527, + "logits/rejected": -2.114626884460449, + "logps/chosen": -351.70703125, + "logps/rejected": -321.0057373046875, + "loss": 0.1089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07788477838039398, + "rewards/margins": 3.316633701324463, + "rewards/rejected": -3.3945186138153076, + "step": 2291 + }, + { + "epoch": 0.27, + "learning_rate": 2.2336128498877995e-07, + "logits/chosen": -2.333949089050293, + "logits/rejected": -2.6794896125793457, + "logps/chosen": -278.5251770019531, + "logps/rejected": -151.92031860351562, + "loss": 0.3445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.407683789730072, + "rewards/margins": 1.7040380239486694, + "rewards/rejected": -2.1117217540740967, + "step": 2292 + }, + { + "epoch": 0.27, + "learning_rate": 2.2332585331286167e-07, + "logits/chosen": -2.2470345497131348, + "logits/rejected": -2.0355703830718994, + "logps/chosen": -288.5887451171875, + "logps/rejected": -349.6793212890625, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.113924540579319, + "rewards/margins": 2.4374308586120605, + "rewards/rejected": -2.5513553619384766, + "step": 2293 + }, + { + "epoch": 0.27, + "learning_rate": 2.232904216369434e-07, + "logits/chosen": -2.0353424549102783, + "logits/rejected": -2.3459396362304688, + "logps/chosen": -246.17379760742188, + "logps/rejected": -159.01966857910156, + "loss": 0.6313, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5082567930221558, + "rewards/margins": 0.3887743353843689, + "rewards/rejected": -0.8970310688018799, + "step": 2294 + }, + { + "epoch": 0.27, + "learning_rate": 2.2325498996102514e-07, + "logits/chosen": -2.2025644779205322, + "logits/rejected": -2.180711269378662, + "logps/chosen": -232.543212890625, + "logps/rejected": -294.3747253417969, + "loss": 0.1346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7048602104187012, + "rewards/margins": 2.755133867263794, + "rewards/rejected": -3.459993839263916, + "step": 2295 + }, + { + "epoch": 0.27, + "learning_rate": 2.232195582851069e-07, + "logits/chosen": -2.6725873947143555, + "logits/rejected": -2.5025510787963867, + "logps/chosen": -247.4805908203125, + "logps/rejected": -295.7008361816406, + "loss": 0.1523, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31408771872520447, + "rewards/margins": 2.938622236251831, + "rewards/rejected": -3.2527101039886475, + "step": 2296 + }, + { + "epoch": 0.27, + "learning_rate": 2.231841266091886e-07, + "logits/chosen": -2.188382625579834, + "logits/rejected": -1.93808114528656, + "logps/chosen": -147.0696563720703, + "logps/rejected": -318.56365966796875, + "loss": 0.2794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16401195526123047, + "rewards/margins": 1.6380008459091187, + "rewards/rejected": -1.8020129203796387, + "step": 2297 + }, + { + "epoch": 0.27, + "learning_rate": 2.2314869493327033e-07, + "logits/chosen": -2.8099894523620605, + "logits/rejected": -2.8206005096435547, + "logps/chosen": -238.7169189453125, + "logps/rejected": -204.77975463867188, + "loss": 0.5088, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7920150756835938, + "rewards/margins": 1.0420554876327515, + "rewards/rejected": -1.8340705633163452, + "step": 2298 + }, + { + "epoch": 0.27, + "learning_rate": 2.2311326325735206e-07, + "logits/chosen": -2.043713092803955, + "logits/rejected": -2.1732397079467773, + "logps/chosen": -360.9541015625, + "logps/rejected": -370.5504150390625, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4997970759868622, + "rewards/margins": 1.2628229856491089, + "rewards/rejected": -1.762619972229004, + "step": 2299 + }, + { + "epoch": 0.27, + "learning_rate": 2.230778315814338e-07, + "logits/chosen": -2.302518606185913, + "logits/rejected": -2.5703089237213135, + "logps/chosen": -314.6614990234375, + "logps/rejected": -192.18283081054688, + "loss": 0.3133, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.416350781917572, + "rewards/margins": 1.4463359117507935, + "rewards/rejected": -1.8626867532730103, + "step": 2300 + }, + { + "epoch": 0.27, + "learning_rate": 2.2304239990551553e-07, + "logits/chosen": -2.2703709602355957, + "logits/rejected": -2.6228830814361572, + "logps/chosen": -381.0523681640625, + "logps/rejected": -279.66351318359375, + "loss": 0.4767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7822766304016113, + "rewards/margins": 2.0465314388275146, + "rewards/rejected": -2.828808069229126, + "step": 2301 + }, + { + "epoch": 0.27, + "learning_rate": 2.2300696822959725e-07, + "logits/chosen": -2.1241133213043213, + "logits/rejected": -1.9224019050598145, + "logps/chosen": -114.99288940429688, + "logps/rejected": -125.38633728027344, + "loss": 1.0262, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3740273714065552, + "rewards/margins": 0.45909762382507324, + "rewards/rejected": -1.8331249952316284, + "step": 2302 + }, + { + "epoch": 0.27, + "learning_rate": 2.2297153655367897e-07, + "logits/chosen": -2.4934065341949463, + "logits/rejected": -2.421907901763916, + "logps/chosen": -102.96295166015625, + "logps/rejected": -123.11402130126953, + "loss": 0.3169, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37276020646095276, + "rewards/margins": 1.272170066833496, + "rewards/rejected": -0.899409830570221, + "step": 2303 + }, + { + "epoch": 0.27, + "learning_rate": 2.229361048777607e-07, + "logits/chosen": -2.5545639991760254, + "logits/rejected": -2.7831151485443115, + "logps/chosen": -254.76004028320312, + "logps/rejected": -179.77658081054688, + "loss": 0.4214, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7192851901054382, + "rewards/margins": 1.2144399881362915, + "rewards/rejected": -1.9337252378463745, + "step": 2304 + }, + { + "epoch": 0.27, + "learning_rate": 2.2290067320184242e-07, + "logits/chosen": -2.4477362632751465, + "logits/rejected": -2.425642490386963, + "logps/chosen": -195.9151153564453, + "logps/rejected": -190.32839965820312, + "loss": 0.2373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9561547040939331, + "rewards/margins": 1.8870506286621094, + "rewards/rejected": -2.843205213546753, + "step": 2305 + }, + { + "epoch": 0.27, + "learning_rate": 2.2286524152592416e-07, + "logits/chosen": -2.405057191848755, + "logits/rejected": -2.4684462547302246, + "logps/chosen": -245.86767578125, + "logps/rejected": -297.3666687011719, + "loss": 0.2393, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3053625822067261, + "rewards/margins": 2.4061999320983887, + "rewards/rejected": -2.100837469100952, + "step": 2306 + }, + { + "epoch": 0.27, + "learning_rate": 2.228298098500059e-07, + "logits/chosen": -2.016921043395996, + "logits/rejected": -1.8138549327850342, + "logps/chosen": -230.95169067382812, + "logps/rejected": -330.4517822265625, + "loss": 0.7317, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9699602723121643, + "rewards/margins": 1.362838625907898, + "rewards/rejected": -2.332798957824707, + "step": 2307 + }, + { + "epoch": 0.27, + "learning_rate": 2.2279437817408764e-07, + "logits/chosen": -2.5597708225250244, + "logits/rejected": -2.5255303382873535, + "logps/chosen": -140.4930419921875, + "logps/rejected": -200.0936279296875, + "loss": 0.4475, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2884659767150879, + "rewards/margins": 0.9685728549957275, + "rewards/rejected": -1.2570388317108154, + "step": 2308 + }, + { + "epoch": 0.27, + "learning_rate": 2.2275894649816936e-07, + "logits/chosen": -2.2639126777648926, + "logits/rejected": -2.4815425872802734, + "logps/chosen": -336.80126953125, + "logps/rejected": -454.7950439453125, + "loss": 0.5802, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.48478364944458, + "rewards/margins": 1.5791040658950806, + "rewards/rejected": -3.06388783454895, + "step": 2309 + }, + { + "epoch": 0.27, + "learning_rate": 2.2272351482225108e-07, + "logits/chosen": -2.6225337982177734, + "logits/rejected": -2.893322467803955, + "logps/chosen": -371.2069091796875, + "logps/rejected": -313.2666931152344, + "loss": 0.1322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6648715734481812, + "rewards/margins": 2.5355825424194336, + "rewards/rejected": -3.2004542350769043, + "step": 2310 + }, + { + "epoch": 0.27, + "learning_rate": 2.2268808314633283e-07, + "logits/chosen": -2.0928092002868652, + "logits/rejected": -2.4475271701812744, + "logps/chosen": -313.5830078125, + "logps/rejected": -239.19961547851562, + "loss": 0.486, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6801308393478394, + "rewards/margins": 1.5475234985351562, + "rewards/rejected": -2.227654457092285, + "step": 2311 + }, + { + "epoch": 0.27, + "learning_rate": 2.2265265147041455e-07, + "logits/chosen": -2.45333194732666, + "logits/rejected": -2.30489444732666, + "logps/chosen": -253.33868408203125, + "logps/rejected": -345.7149353027344, + "loss": 0.3041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06258188188076019, + "rewards/margins": 1.9271495342254639, + "rewards/rejected": -1.9897315502166748, + "step": 2312 + }, + { + "epoch": 0.27, + "learning_rate": 2.2261721979449627e-07, + "logits/chosen": -2.0333449840545654, + "logits/rejected": -2.1790084838867188, + "logps/chosen": -188.17816162109375, + "logps/rejected": -148.2199249267578, + "loss": 1.1117, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4381242990493774, + "rewards/margins": -0.08943580090999603, + "rewards/rejected": -1.3486886024475098, + "step": 2313 + }, + { + "epoch": 0.27, + "learning_rate": 2.22581788118578e-07, + "logits/chosen": -2.2416234016418457, + "logits/rejected": -2.49039363861084, + "logps/chosen": -256.3288879394531, + "logps/rejected": -188.55014038085938, + "loss": 0.2808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9297614097595215, + "rewards/margins": 1.8489290475845337, + "rewards/rejected": -2.7786903381347656, + "step": 2314 + }, + { + "epoch": 0.27, + "learning_rate": 2.2254635644265972e-07, + "logits/chosen": -2.2613332271575928, + "logits/rejected": -2.568969249725342, + "logps/chosen": -525.091796875, + "logps/rejected": -268.9901123046875, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6298060417175293, + "rewards/margins": 2.542104721069336, + "rewards/rejected": -3.171910524368286, + "step": 2315 + }, + { + "epoch": 0.27, + "learning_rate": 2.2251092476674144e-07, + "logits/chosen": -2.0121235847473145, + "logits/rejected": -1.785934567451477, + "logps/chosen": -231.7498321533203, + "logps/rejected": -305.99676513671875, + "loss": 0.5045, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2733073234558105, + "rewards/margins": 1.1576037406921387, + "rewards/rejected": -2.430911064147949, + "step": 2316 + }, + { + "epoch": 0.27, + "learning_rate": 2.2247549309082316e-07, + "logits/chosen": -2.380159378051758, + "logits/rejected": -2.231494903564453, + "logps/chosen": -361.9001770019531, + "logps/rejected": -217.9887237548828, + "loss": 0.292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8085237145423889, + "rewards/margins": 3.277121067047119, + "rewards/rejected": -4.085644721984863, + "step": 2317 + }, + { + "epoch": 0.27, + "learning_rate": 2.224400614149049e-07, + "logits/chosen": -2.2861576080322266, + "logits/rejected": -2.219399929046631, + "logps/chosen": -216.46932983398438, + "logps/rejected": -239.6901397705078, + "loss": 0.9705, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3336951732635498, + "rewards/margins": -0.29036611318588257, + "rewards/rejected": -1.0433290004730225, + "step": 2318 + }, + { + "epoch": 0.27, + "learning_rate": 2.2240462973898666e-07, + "logits/chosen": -2.5508031845092773, + "logits/rejected": -2.1986300945281982, + "logps/chosen": -201.8201141357422, + "logps/rejected": -246.69459533691406, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6003473997116089, + "rewards/margins": 2.306387424468994, + "rewards/rejected": -2.9067347049713135, + "step": 2319 + }, + { + "epoch": 0.27, + "learning_rate": 2.2236919806306838e-07, + "logits/chosen": -2.575049877166748, + "logits/rejected": -2.343888282775879, + "logps/chosen": -295.1542053222656, + "logps/rejected": -309.12286376953125, + "loss": 0.4632, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.225027322769165, + "rewards/margins": 1.6371146440505981, + "rewards/rejected": -2.8621420860290527, + "step": 2320 + }, + { + "epoch": 0.27, + "learning_rate": 2.223337663871501e-07, + "logits/chosen": -2.4680352210998535, + "logits/rejected": -2.668046236038208, + "logps/chosen": -324.6842346191406, + "logps/rejected": -161.828857421875, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6518596410751343, + "rewards/margins": 1.8565640449523926, + "rewards/rejected": -2.5084238052368164, + "step": 2321 + }, + { + "epoch": 0.27, + "learning_rate": 2.2229833471123185e-07, + "logits/chosen": -2.314807176589966, + "logits/rejected": -2.6978659629821777, + "logps/chosen": -163.9160919189453, + "logps/rejected": -154.4246368408203, + "loss": 0.2758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8830258846282959, + "rewards/margins": 2.120959520339966, + "rewards/rejected": -3.0039854049682617, + "step": 2322 + }, + { + "epoch": 0.27, + "learning_rate": 2.2226290303531357e-07, + "logits/chosen": -1.9165089130401611, + "logits/rejected": -2.296776056289673, + "logps/chosen": -435.97540283203125, + "logps/rejected": -287.1107482910156, + "loss": 0.2322, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00701478123664856, + "rewards/margins": 2.663872480392456, + "rewards/rejected": -2.65685772895813, + "step": 2323 + }, + { + "epoch": 0.27, + "learning_rate": 2.222274713593953e-07, + "logits/chosen": -2.2731106281280518, + "logits/rejected": -2.1114699840545654, + "logps/chosen": -350.9409484863281, + "logps/rejected": -369.7868347167969, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1844133585691452, + "rewards/margins": 4.561666488647461, + "rewards/rejected": -4.746079444885254, + "step": 2324 + }, + { + "epoch": 0.27, + "learning_rate": 2.2219203968347702e-07, + "logits/chosen": -2.5088741779327393, + "logits/rejected": -2.3814172744750977, + "logps/chosen": -359.91192626953125, + "logps/rejected": -224.7606964111328, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1308521032333374, + "rewards/margins": 1.452314019203186, + "rewards/rejected": -2.5831661224365234, + "step": 2325 + }, + { + "epoch": 0.27, + "learning_rate": 2.2215660800755874e-07, + "logits/chosen": -2.120164155960083, + "logits/rejected": -2.250788927078247, + "logps/chosen": -340.92584228515625, + "logps/rejected": -314.28118896484375, + "loss": 0.2736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7423826456069946, + "rewards/margins": 2.693681001663208, + "rewards/rejected": -3.436063528060913, + "step": 2326 + }, + { + "epoch": 0.27, + "learning_rate": 2.2212117633164046e-07, + "logits/chosen": -2.0943052768707275, + "logits/rejected": -2.1481404304504395, + "logps/chosen": -244.93551635742188, + "logps/rejected": -202.8932342529297, + "loss": 0.6297, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7181135416030884, + "rewards/margins": 0.5158251523971558, + "rewards/rejected": -1.2339386940002441, + "step": 2327 + }, + { + "epoch": 0.27, + "learning_rate": 2.2208574465572218e-07, + "logits/chosen": -2.0263278484344482, + "logits/rejected": -2.355471611022949, + "logps/chosen": -289.506103515625, + "logps/rejected": -172.9910888671875, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9220860004425049, + "rewards/margins": 0.9235358238220215, + "rewards/rejected": -1.8456218242645264, + "step": 2328 + }, + { + "epoch": 0.27, + "learning_rate": 2.2205031297980393e-07, + "logits/chosen": -2.146834135055542, + "logits/rejected": -2.298522710800171, + "logps/chosen": -428.6302795410156, + "logps/rejected": -246.95297241210938, + "loss": 0.7919, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0847526788711548, + "rewards/margins": 0.7268729209899902, + "rewards/rejected": -1.8116254806518555, + "step": 2329 + }, + { + "epoch": 0.27, + "learning_rate": 2.2201488130388565e-07, + "logits/chosen": -1.863234043121338, + "logits/rejected": -1.9400355815887451, + "logps/chosen": -381.6712646484375, + "logps/rejected": -275.09674072265625, + "loss": 0.4671, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41852056980133057, + "rewards/margins": 1.6593189239501953, + "rewards/rejected": -2.0778396129608154, + "step": 2330 + }, + { + "epoch": 0.27, + "learning_rate": 2.219794496279674e-07, + "logits/chosen": -2.5288686752319336, + "logits/rejected": -2.535917043685913, + "logps/chosen": -251.35084533691406, + "logps/rejected": -236.79800415039062, + "loss": 0.3348, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18949905037879944, + "rewards/margins": 1.2721318006515503, + "rewards/rejected": -1.4616308212280273, + "step": 2331 + }, + { + "epoch": 0.27, + "learning_rate": 2.2194401795204913e-07, + "logits/chosen": -2.2104930877685547, + "logits/rejected": -2.3038458824157715, + "logps/chosen": -351.9674377441406, + "logps/rejected": -284.28363037109375, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1780307292938232, + "rewards/margins": 1.0954787731170654, + "rewards/rejected": -2.2735097408294678, + "step": 2332 + }, + { + "epoch": 0.27, + "learning_rate": 2.2190858627613085e-07, + "logits/chosen": -1.7868220806121826, + "logits/rejected": -2.118410587310791, + "logps/chosen": -478.75323486328125, + "logps/rejected": -293.51214599609375, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0436301231384277, + "rewards/margins": 2.0944817066192627, + "rewards/rejected": -3.1381120681762695, + "step": 2333 + }, + { + "epoch": 0.27, + "learning_rate": 2.218731546002126e-07, + "logits/chosen": -2.1012794971466064, + "logits/rejected": -2.005875825881958, + "logps/chosen": -353.8478698730469, + "logps/rejected": -201.50689697265625, + "loss": 0.8106, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.014651298522949, + "rewards/margins": 0.16474232077598572, + "rewards/rejected": -2.1793932914733887, + "step": 2334 + }, + { + "epoch": 0.27, + "learning_rate": 2.2183772292429432e-07, + "logits/chosen": -2.51424241065979, + "logits/rejected": -2.5801382064819336, + "logps/chosen": -346.51513671875, + "logps/rejected": -296.2989807128906, + "loss": 0.1794, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.116164207458496, + "rewards/margins": 2.0953683853149414, + "rewards/rejected": -3.2115323543548584, + "step": 2335 + }, + { + "epoch": 0.27, + "learning_rate": 2.2180229124837604e-07, + "logits/chosen": -2.6300530433654785, + "logits/rejected": -2.7171499729156494, + "logps/chosen": -303.7069396972656, + "logps/rejected": -283.9895324707031, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5895646810531616, + "rewards/margins": 1.8297762870788574, + "rewards/rejected": -2.4193408489227295, + "step": 2336 + }, + { + "epoch": 0.27, + "learning_rate": 2.2176685957245776e-07, + "logits/chosen": -2.509228467941284, + "logits/rejected": -2.6573312282562256, + "logps/chosen": -203.83352661132812, + "logps/rejected": -229.6935272216797, + "loss": 0.3059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.336578369140625, + "rewards/margins": 2.0018186569213867, + "rewards/rejected": -2.3383970260620117, + "step": 2337 + }, + { + "epoch": 0.27, + "learning_rate": 2.2173142789653948e-07, + "logits/chosen": -2.3799374103546143, + "logits/rejected": -2.5987653732299805, + "logps/chosen": -414.4248352050781, + "logps/rejected": -276.3588562011719, + "loss": 0.8842, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9455180168151855, + "rewards/margins": 0.35441192984580994, + "rewards/rejected": -2.2999300956726074, + "step": 2338 + }, + { + "epoch": 0.27, + "learning_rate": 2.216959962206212e-07, + "logits/chosen": -2.3306522369384766, + "logits/rejected": -2.086343288421631, + "logps/chosen": -285.0040588378906, + "logps/rejected": -334.10589599609375, + "loss": 0.1882, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6397737264633179, + "rewards/margins": 2.5017120838165283, + "rewards/rejected": -3.1414859294891357, + "step": 2339 + }, + { + "epoch": 0.27, + "learning_rate": 2.2166056454470296e-07, + "logits/chosen": -1.756518840789795, + "logits/rejected": -1.9333630800247192, + "logps/chosen": -219.66305541992188, + "logps/rejected": -216.55975341796875, + "loss": 0.2766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7006471753120422, + "rewards/margins": 1.4319958686828613, + "rewards/rejected": -2.132642984390259, + "step": 2340 + }, + { + "epoch": 0.27, + "learning_rate": 2.2162513286878468e-07, + "logits/chosen": -2.58113169670105, + "logits/rejected": -2.59798526763916, + "logps/chosen": -340.30902099609375, + "logps/rejected": -216.14337158203125, + "loss": 4.0131, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.506505012512207, + "rewards/margins": -1.5689713954925537, + "rewards/rejected": -2.9375336170196533, + "step": 2341 + }, + { + "epoch": 0.27, + "learning_rate": 2.215897011928664e-07, + "logits/chosen": -2.0126798152923584, + "logits/rejected": -2.52693772315979, + "logps/chosen": -388.7501220703125, + "logps/rejected": -222.02639770507812, + "loss": 0.8632, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2303935289382935, + "rewards/margins": 0.05533331632614136, + "rewards/rejected": -1.28572678565979, + "step": 2342 + }, + { + "epoch": 0.27, + "learning_rate": 2.2155426951694815e-07, + "logits/chosen": -2.1979267597198486, + "logits/rejected": -2.404275894165039, + "logps/chosen": -461.43670654296875, + "logps/rejected": -308.5429992675781, + "loss": 0.5917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5367857217788696, + "rewards/margins": 1.3757575750350952, + "rewards/rejected": -1.9125432968139648, + "step": 2343 + }, + { + "epoch": 0.27, + "learning_rate": 2.2151883784102987e-07, + "logits/chosen": -2.2873904705047607, + "logits/rejected": -2.533367395401001, + "logps/chosen": -501.81390380859375, + "logps/rejected": -293.88690185546875, + "loss": 0.2673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3701082170009613, + "rewards/margins": 1.9717859029769897, + "rewards/rejected": -2.3418939113616943, + "step": 2344 + }, + { + "epoch": 0.27, + "learning_rate": 2.2148340616511162e-07, + "logits/chosen": -1.9484496116638184, + "logits/rejected": -2.5285189151763916, + "logps/chosen": -539.4857788085938, + "logps/rejected": -249.08535766601562, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45619314908981323, + "rewards/margins": 1.8701670169830322, + "rewards/rejected": -1.4139738082885742, + "step": 2345 + }, + { + "epoch": 0.27, + "learning_rate": 2.2144797448919334e-07, + "logits/chosen": -2.2651586532592773, + "logits/rejected": -1.8783873319625854, + "logps/chosen": -135.37623596191406, + "logps/rejected": -343.5937194824219, + "loss": 0.1053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24080611765384674, + "rewards/margins": 3.577331304550171, + "rewards/rejected": -3.8181374073028564, + "step": 2346 + }, + { + "epoch": 0.27, + "learning_rate": 2.2141254281327506e-07, + "logits/chosen": -2.309349298477173, + "logits/rejected": -2.1380529403686523, + "logps/chosen": -304.93145751953125, + "logps/rejected": -270.6235656738281, + "loss": 0.8768, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6129963397979736, + "rewards/margins": 1.0161304473876953, + "rewards/rejected": -2.62912654876709, + "step": 2347 + }, + { + "epoch": 0.27, + "learning_rate": 2.2137711113735679e-07, + "logits/chosen": -1.7736966609954834, + "logits/rejected": -2.343919038772583, + "logps/chosen": -294.3101501464844, + "logps/rejected": -209.73196411132812, + "loss": 0.2748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4887973666191101, + "rewards/margins": 2.028449058532715, + "rewards/rejected": -2.5172462463378906, + "step": 2348 + }, + { + "epoch": 0.27, + "learning_rate": 2.213416794614385e-07, + "logits/chosen": -1.7552624940872192, + "logits/rejected": -1.8386038541793823, + "logps/chosen": -238.33078002929688, + "logps/rejected": -265.6388854980469, + "loss": 0.482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25377997756004333, + "rewards/margins": 1.4614430665969849, + "rewards/rejected": -1.7152231931686401, + "step": 2349 + }, + { + "epoch": 0.27, + "learning_rate": 2.2130624778552023e-07, + "logits/chosen": -2.4830029010772705, + "logits/rejected": -2.403174877166748, + "logps/chosen": -267.6897888183594, + "logps/rejected": -218.0553741455078, + "loss": 1.1432, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.9087505340576172, + "rewards/margins": 0.1288871467113495, + "rewards/rejected": -2.03763747215271, + "step": 2350 + }, + { + "epoch": 0.27, + "learning_rate": 2.2127081610960198e-07, + "logits/chosen": -2.089923143386841, + "logits/rejected": -2.131441593170166, + "logps/chosen": -349.5634460449219, + "logps/rejected": -408.6036071777344, + "loss": 0.9031, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.50191330909729, + "rewards/margins": 0.676645040512085, + "rewards/rejected": -2.178558349609375, + "step": 2351 + }, + { + "epoch": 0.27, + "learning_rate": 2.212353844336837e-07, + "logits/chosen": -1.6187704801559448, + "logits/rejected": -2.0322279930114746, + "logps/chosen": -638.4826049804688, + "logps/rejected": -365.4383239746094, + "loss": 0.2936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6537527441978455, + "rewards/margins": 2.2139840126037598, + "rewards/rejected": -2.86773681640625, + "step": 2352 + }, + { + "epoch": 0.27, + "learning_rate": 2.2119995275776542e-07, + "logits/chosen": -2.3770034313201904, + "logits/rejected": -2.2488796710968018, + "logps/chosen": -197.54150390625, + "logps/rejected": -279.10321044921875, + "loss": 0.3072, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4318734407424927, + "rewards/margins": 2.443850040435791, + "rewards/rejected": -3.8757236003875732, + "step": 2353 + }, + { + "epoch": 0.27, + "learning_rate": 2.2116452108184717e-07, + "logits/chosen": -2.7970478534698486, + "logits/rejected": -2.759305238723755, + "logps/chosen": -244.17694091796875, + "logps/rejected": -193.90567016601562, + "loss": 0.3423, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.149188756942749, + "rewards/margins": 1.456141710281372, + "rewards/rejected": -2.605330467224121, + "step": 2354 + }, + { + "epoch": 0.27, + "learning_rate": 2.211290894059289e-07, + "logits/chosen": -2.320312261581421, + "logits/rejected": -2.180847406387329, + "logps/chosen": -213.18814086914062, + "logps/rejected": -255.67953491210938, + "loss": 0.3538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7515394687652588, + "rewards/margins": 1.5918781757354736, + "rewards/rejected": -2.3434176445007324, + "step": 2355 + }, + { + "epoch": 0.27, + "learning_rate": 2.2109365773001064e-07, + "logits/chosen": -2.280881643295288, + "logits/rejected": -2.292393922805786, + "logps/chosen": -159.19058227539062, + "logps/rejected": -260.7165832519531, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06148223578929901, + "rewards/margins": 3.5465002059936523, + "rewards/rejected": -3.6079823970794678, + "step": 2356 + }, + { + "epoch": 0.27, + "learning_rate": 2.2105822605409236e-07, + "logits/chosen": -2.5721921920776367, + "logits/rejected": -2.495333671569824, + "logps/chosen": -316.4535827636719, + "logps/rejected": -347.248291015625, + "loss": 0.4416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9478765726089478, + "rewards/margins": 2.292473793029785, + "rewards/rejected": -3.2403502464294434, + "step": 2357 + }, + { + "epoch": 0.27, + "learning_rate": 2.2102279437817409e-07, + "logits/chosen": -2.196408987045288, + "logits/rejected": -2.2882773876190186, + "logps/chosen": -261.9180603027344, + "logps/rejected": -188.11013793945312, + "loss": 0.465, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1776363849639893, + "rewards/margins": 1.186735987663269, + "rewards/rejected": -2.3643722534179688, + "step": 2358 + }, + { + "epoch": 0.27, + "learning_rate": 2.209873627022558e-07, + "logits/chosen": -2.1748299598693848, + "logits/rejected": -2.372422218322754, + "logps/chosen": -279.1041259765625, + "logps/rejected": -361.0118408203125, + "loss": 0.2135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.683375895023346, + "rewards/margins": 2.211364507675171, + "rewards/rejected": -2.894740343093872, + "step": 2359 + }, + { + "epoch": 0.27, + "learning_rate": 2.2095193102633753e-07, + "logits/chosen": -2.4749372005462646, + "logits/rejected": -2.365464210510254, + "logps/chosen": -361.41766357421875, + "logps/rejected": -424.8029479980469, + "loss": 0.5578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9799612760543823, + "rewards/margins": 2.590235710144043, + "rewards/rejected": -3.570197105407715, + "step": 2360 + }, + { + "epoch": 0.27, + "learning_rate": 2.2091649935041925e-07, + "logits/chosen": -2.3847100734710693, + "logits/rejected": -2.5419487953186035, + "logps/chosen": -239.45211791992188, + "logps/rejected": -195.13766479492188, + "loss": 3.1153, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1817626953125, + "rewards/margins": -2.178901433944702, + "rewards/rejected": -1.0028612613677979, + "step": 2361 + }, + { + "epoch": 0.27, + "learning_rate": 2.2088106767450097e-07, + "logits/chosen": -1.8170108795166016, + "logits/rejected": -1.419071912765503, + "logps/chosen": -461.3736572265625, + "logps/rejected": -471.66107177734375, + "loss": 0.3664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22786402702331543, + "rewards/margins": 1.3093289136886597, + "rewards/rejected": -1.537192940711975, + "step": 2362 + }, + { + "epoch": 0.27, + "learning_rate": 2.2084563599858272e-07, + "logits/chosen": -2.3640308380126953, + "logits/rejected": -2.26580548286438, + "logps/chosen": -276.943603515625, + "logps/rejected": -315.0652770996094, + "loss": 10.3895, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.030027389526367, + "rewards/margins": -7.436212539672852, + "rewards/rejected": -3.5938143730163574, + "step": 2363 + }, + { + "epoch": 0.28, + "learning_rate": 2.2081020432266444e-07, + "logits/chosen": -1.7471156120300293, + "logits/rejected": -1.7002453804016113, + "logps/chosen": -232.1814727783203, + "logps/rejected": -287.02362060546875, + "loss": 0.5856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.297282338142395, + "rewards/margins": 1.506813406944275, + "rewards/rejected": -1.8040958642959595, + "step": 2364 + }, + { + "epoch": 0.28, + "learning_rate": 2.2077477264674617e-07, + "logits/chosen": -2.257291793823242, + "logits/rejected": -2.2081761360168457, + "logps/chosen": -190.15274047851562, + "logps/rejected": -215.10765075683594, + "loss": 0.2746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7106113433837891, + "rewards/margins": 2.6461777687072754, + "rewards/rejected": -3.3567891120910645, + "step": 2365 + }, + { + "epoch": 0.28, + "learning_rate": 2.2073934097082792e-07, + "logits/chosen": -2.0045714378356934, + "logits/rejected": -2.0284788608551025, + "logps/chosen": -155.90640258789062, + "logps/rejected": -326.88470458984375, + "loss": 0.6921, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0469673871994019, + "rewards/margins": 1.4492459297180176, + "rewards/rejected": -2.496213436126709, + "step": 2366 + }, + { + "epoch": 0.28, + "learning_rate": 2.2070390929490966e-07, + "logits/chosen": -2.3875529766082764, + "logits/rejected": -2.1453328132629395, + "logps/chosen": -341.0448913574219, + "logps/rejected": -323.91290283203125, + "loss": 0.343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2551132142543793, + "rewards/margins": 2.9822468757629395, + "rewards/rejected": -3.2373600006103516, + "step": 2367 + }, + { + "epoch": 0.28, + "learning_rate": 2.2066847761899139e-07, + "logits/chosen": -2.1235127449035645, + "logits/rejected": -2.106849193572998, + "logps/chosen": -242.9930419921875, + "logps/rejected": -218.91946411132812, + "loss": 0.5839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.625690221786499, + "rewards/margins": 2.3765556812286377, + "rewards/rejected": -3.002246141433716, + "step": 2368 + }, + { + "epoch": 0.28, + "learning_rate": 2.206330459430731e-07, + "logits/chosen": -2.2457399368286133, + "logits/rejected": -2.063079357147217, + "logps/chosen": -308.23699951171875, + "logps/rejected": -342.2718200683594, + "loss": 0.384, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5508841276168823, + "rewards/margins": 1.452219009399414, + "rewards/rejected": -2.003103256225586, + "step": 2369 + }, + { + "epoch": 0.28, + "learning_rate": 2.2059761426715483e-07, + "logits/chosen": -2.6948163509368896, + "logits/rejected": -2.5903496742248535, + "logps/chosen": -116.08805084228516, + "logps/rejected": -467.1485595703125, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3716392517089844, + "rewards/margins": 2.226377010345459, + "rewards/rejected": -2.5980162620544434, + "step": 2370 + }, + { + "epoch": 0.28, + "learning_rate": 2.2056218259123655e-07, + "logits/chosen": -2.7818942070007324, + "logits/rejected": -2.7699718475341797, + "logps/chosen": -123.5443115234375, + "logps/rejected": -178.60928344726562, + "loss": 0.2879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6243726015090942, + "rewards/margins": 2.520885705947876, + "rewards/rejected": -3.1452584266662598, + "step": 2371 + }, + { + "epoch": 0.28, + "learning_rate": 2.2052675091531827e-07, + "logits/chosen": -1.8245190382003784, + "logits/rejected": -1.9161250591278076, + "logps/chosen": -231.40274047851562, + "logps/rejected": -218.56686401367188, + "loss": 0.839, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2886322736740112, + "rewards/margins": 1.046738862991333, + "rewards/rejected": -2.3353710174560547, + "step": 2372 + }, + { + "epoch": 0.28, + "learning_rate": 2.204913192394e-07, + "logits/chosen": -2.285767078399658, + "logits/rejected": -2.2660446166992188, + "logps/chosen": -189.80606079101562, + "logps/rejected": -190.5797882080078, + "loss": 0.5488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6608383655548096, + "rewards/margins": 1.2405519485473633, + "rewards/rejected": -1.9013901948928833, + "step": 2373 + }, + { + "epoch": 0.28, + "learning_rate": 2.2045588756348175e-07, + "logits/chosen": -2.4185690879821777, + "logits/rejected": -2.5796079635620117, + "logps/chosen": -412.8140563964844, + "logps/rejected": -181.40756225585938, + "loss": 0.5848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3630870580673218, + "rewards/margins": 0.6978697776794434, + "rewards/rejected": -1.0609568357467651, + "step": 2374 + }, + { + "epoch": 0.28, + "learning_rate": 2.2042045588756347e-07, + "logits/chosen": -2.2554476261138916, + "logits/rejected": -2.082094192504883, + "logps/chosen": -209.9491424560547, + "logps/rejected": -324.65032958984375, + "loss": 0.7433, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5812917947769165, + "rewards/margins": 0.6294167041778564, + "rewards/rejected": -2.2107086181640625, + "step": 2375 + }, + { + "epoch": 0.28, + "learning_rate": 2.203850242116452e-07, + "logits/chosen": -2.269076347351074, + "logits/rejected": -2.480708122253418, + "logps/chosen": -359.80169677734375, + "logps/rejected": -191.16073608398438, + "loss": 0.4795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3389315605163574, + "rewards/margins": 1.5065524578094482, + "rewards/rejected": -2.8454840183258057, + "step": 2376 + }, + { + "epoch": 0.28, + "learning_rate": 2.203495925357269e-07, + "logits/chosen": -2.2147653102874756, + "logits/rejected": -2.2513058185577393, + "logps/chosen": -312.72283935546875, + "logps/rejected": -269.85943603515625, + "loss": 0.281, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7431227564811707, + "rewards/margins": 2.270219326019287, + "rewards/rejected": -3.0133419036865234, + "step": 2377 + }, + { + "epoch": 0.28, + "learning_rate": 2.2031416085980866e-07, + "logits/chosen": -2.449279308319092, + "logits/rejected": -2.4968695640563965, + "logps/chosen": -327.5238037109375, + "logps/rejected": -295.8417053222656, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38361847400665283, + "rewards/margins": 2.9238224029541016, + "rewards/rejected": -2.5402040481567383, + "step": 2378 + }, + { + "epoch": 0.28, + "learning_rate": 2.202787291838904e-07, + "logits/chosen": -2.1470117568969727, + "logits/rejected": -2.165759563446045, + "logps/chosen": -149.8660125732422, + "logps/rejected": -246.30050659179688, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30056631565093994, + "rewards/margins": 2.982776641845703, + "rewards/rejected": -3.2833428382873535, + "step": 2379 + }, + { + "epoch": 0.28, + "learning_rate": 2.2024329750797213e-07, + "logits/chosen": -2.230952739715576, + "logits/rejected": -2.0624520778656006, + "logps/chosen": -278.165283203125, + "logps/rejected": -283.8473815917969, + "loss": 0.3396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3423001766204834, + "rewards/margins": 1.7550339698791504, + "rewards/rejected": -2.097334146499634, + "step": 2380 + }, + { + "epoch": 0.28, + "learning_rate": 2.2020786583205385e-07, + "logits/chosen": -2.8514208793640137, + "logits/rejected": -2.626286268234253, + "logps/chosen": -272.5777282714844, + "logps/rejected": -347.6559143066406, + "loss": 0.1595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4610629677772522, + "rewards/margins": 2.96547794342041, + "rewards/rejected": -3.4265408515930176, + "step": 2381 + }, + { + "epoch": 0.28, + "learning_rate": 2.2017243415613558e-07, + "logits/chosen": -2.578042984008789, + "logits/rejected": -2.269885540008545, + "logps/chosen": -233.7438201904297, + "logps/rejected": -315.58843994140625, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0502796173095703, + "rewards/margins": 2.48714542388916, + "rewards/rejected": -3.5374250411987305, + "step": 2382 + }, + { + "epoch": 0.28, + "learning_rate": 2.201370024802173e-07, + "logits/chosen": -2.6171793937683105, + "logits/rejected": -2.82112455368042, + "logps/chosen": -263.806396484375, + "logps/rejected": -194.5598907470703, + "loss": 0.6574, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1325020790100098, + "rewards/margins": 0.3997999429702759, + "rewards/rejected": -1.5323021411895752, + "step": 2383 + }, + { + "epoch": 0.28, + "learning_rate": 2.2010157080429902e-07, + "logits/chosen": -2.57999587059021, + "logits/rejected": -2.7716307640075684, + "logps/chosen": -250.9347686767578, + "logps/rejected": -237.021240234375, + "loss": 0.4301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6953539848327637, + "rewards/margins": 2.4878132343292236, + "rewards/rejected": -3.1831672191619873, + "step": 2384 + }, + { + "epoch": 0.28, + "learning_rate": 2.2006613912838077e-07, + "logits/chosen": -2.7224910259246826, + "logits/rejected": -2.7879536151885986, + "logps/chosen": -306.70684814453125, + "logps/rejected": -228.5615692138672, + "loss": 0.3768, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5941888093948364, + "rewards/margins": 2.301236391067505, + "rewards/rejected": -3.8954248428344727, + "step": 2385 + }, + { + "epoch": 0.28, + "learning_rate": 2.200307074524625e-07, + "logits/chosen": -2.0413150787353516, + "logits/rejected": -1.8335734605789185, + "logps/chosen": -253.45407104492188, + "logps/rejected": -311.700927734375, + "loss": 0.7677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8027268648147583, + "rewards/margins": 1.2913553714752197, + "rewards/rejected": -2.0940823554992676, + "step": 2386 + }, + { + "epoch": 0.28, + "learning_rate": 2.199952757765442e-07, + "logits/chosen": -2.5803537368774414, + "logits/rejected": -2.466660976409912, + "logps/chosen": -281.01568603515625, + "logps/rejected": -214.20652770996094, + "loss": 0.4697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8788562417030334, + "rewards/margins": 1.4742250442504883, + "rewards/rejected": -2.353081226348877, + "step": 2387 + }, + { + "epoch": 0.28, + "learning_rate": 2.1995984410062593e-07, + "logits/chosen": -2.6785192489624023, + "logits/rejected": -2.4651811122894287, + "logps/chosen": -242.13633728027344, + "logps/rejected": -266.4056091308594, + "loss": 0.3374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21231329441070557, + "rewards/margins": 1.938291311264038, + "rewards/rejected": -2.150604724884033, + "step": 2388 + }, + { + "epoch": 0.28, + "learning_rate": 2.1992441242470768e-07, + "logits/chosen": -2.461221218109131, + "logits/rejected": -2.525752305984497, + "logps/chosen": -242.9658966064453, + "logps/rejected": -209.76199340820312, + "loss": 0.2382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43106257915496826, + "rewards/margins": 2.6564924716949463, + "rewards/rejected": -3.087555408477783, + "step": 2389 + }, + { + "epoch": 0.28, + "learning_rate": 2.1988898074878943e-07, + "logits/chosen": -2.5665929317474365, + "logits/rejected": -2.625302314758301, + "logps/chosen": -474.7854919433594, + "logps/rejected": -259.02313232421875, + "loss": 0.6549, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2976677417755127, + "rewards/margins": 0.7028109431266785, + "rewards/rejected": -2.000478506088257, + "step": 2390 + }, + { + "epoch": 0.28, + "learning_rate": 2.1985354907287115e-07, + "logits/chosen": -2.4631879329681396, + "logits/rejected": -2.540917158126831, + "logps/chosen": -251.0959014892578, + "logps/rejected": -281.427001953125, + "loss": 0.2746, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5039739608764648, + "rewards/margins": 2.23482346534729, + "rewards/rejected": -2.738797426223755, + "step": 2391 + }, + { + "epoch": 0.28, + "learning_rate": 2.1981811739695288e-07, + "logits/chosen": -2.4236018657684326, + "logits/rejected": -2.431947708129883, + "logps/chosen": -357.7934265136719, + "logps/rejected": -281.3615417480469, + "loss": 0.3079, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9061816930770874, + "rewards/margins": 1.9344557523727417, + "rewards/rejected": -2.840637445449829, + "step": 2392 + }, + { + "epoch": 0.28, + "learning_rate": 2.197826857210346e-07, + "logits/chosen": -2.051589250564575, + "logits/rejected": -2.0300133228302, + "logps/chosen": -299.17828369140625, + "logps/rejected": -335.50091552734375, + "loss": 0.3633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6886791586875916, + "rewards/margins": 2.1329829692840576, + "rewards/rejected": -2.821661949157715, + "step": 2393 + }, + { + "epoch": 0.28, + "learning_rate": 2.1974725404511632e-07, + "logits/chosen": -1.6360447406768799, + "logits/rejected": -2.1216464042663574, + "logps/chosen": -257.3417053222656, + "logps/rejected": -188.1143035888672, + "loss": 1.2082, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.342543840408325, + "rewards/margins": 0.494704008102417, + "rewards/rejected": -2.837247848510742, + "step": 2394 + }, + { + "epoch": 0.28, + "learning_rate": 2.1971182236919804e-07, + "logits/chosen": -2.3122386932373047, + "logits/rejected": -2.334972381591797, + "logps/chosen": -315.799072265625, + "logps/rejected": -329.4121398925781, + "loss": 0.2584, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36763039231300354, + "rewards/margins": 2.2250094413757324, + "rewards/rejected": -2.592639923095703, + "step": 2395 + }, + { + "epoch": 0.28, + "learning_rate": 2.196763906932798e-07, + "logits/chosen": -1.7906460762023926, + "logits/rejected": -1.7463388442993164, + "logps/chosen": -499.3748779296875, + "logps/rejected": -486.61767578125, + "loss": 0.6052, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2680559158325195, + "rewards/margins": 1.432629942893982, + "rewards/rejected": -2.700685977935791, + "step": 2396 + }, + { + "epoch": 0.28, + "learning_rate": 2.1964095901736151e-07, + "logits/chosen": -2.1510205268859863, + "logits/rejected": -2.5302445888519287, + "logps/chosen": -237.18040466308594, + "logps/rejected": -148.45404052734375, + "loss": 1.0678, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.429471492767334, + "rewards/margins": 0.029442042112350464, + "rewards/rejected": -1.4589135646820068, + "step": 2397 + }, + { + "epoch": 0.28, + "learning_rate": 2.1960552734144324e-07, + "logits/chosen": -2.531017780303955, + "logits/rejected": -2.3079235553741455, + "logps/chosen": -184.8892822265625, + "logps/rejected": -338.419921875, + "loss": 0.3147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6930748224258423, + "rewards/margins": 2.5709385871887207, + "rewards/rejected": -3.2640132904052734, + "step": 2398 + }, + { + "epoch": 0.28, + "learning_rate": 2.1957009566552496e-07, + "logits/chosen": -2.1965672969818115, + "logits/rejected": -2.2260260581970215, + "logps/chosen": -244.67416381835938, + "logps/rejected": -206.92391967773438, + "loss": 2.0456, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.6251540184020996, + "rewards/margins": -0.9866768717765808, + "rewards/rejected": -2.638477087020874, + "step": 2399 + }, + { + "epoch": 0.28, + "learning_rate": 2.1953466398960668e-07, + "logits/chosen": -2.5782532691955566, + "logits/rejected": -2.642744779586792, + "logps/chosen": -184.93133544921875, + "logps/rejected": -288.57537841796875, + "loss": 0.829, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7896428108215332, + "rewards/margins": 1.7281076908111572, + "rewards/rejected": -2.5177507400512695, + "step": 2400 + }, + { + "epoch": 0.28, + "learning_rate": 2.1949923231368845e-07, + "logits/chosen": -2.104445219039917, + "logits/rejected": -2.114363193511963, + "logps/chosen": -250.6646728515625, + "logps/rejected": -288.03546142578125, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8317999839782715, + "rewards/margins": 1.4594800472259521, + "rewards/rejected": -2.2912800312042236, + "step": 2401 + }, + { + "epoch": 0.28, + "learning_rate": 2.1946380063777018e-07, + "logits/chosen": -2.529813766479492, + "logits/rejected": -2.702843427658081, + "logps/chosen": -374.92523193359375, + "logps/rejected": -218.46713256835938, + "loss": 0.42, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9739036560058594, + "rewards/margins": 1.1215391159057617, + "rewards/rejected": -2.095442771911621, + "step": 2402 + }, + { + "epoch": 0.28, + "learning_rate": 2.194283689618519e-07, + "logits/chosen": -2.7392899990081787, + "logits/rejected": -2.7655205726623535, + "logps/chosen": -237.51446533203125, + "logps/rejected": -239.36331176757812, + "loss": 0.5113, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1517144441604614, + "rewards/margins": 2.620999336242676, + "rewards/rejected": -3.7727136611938477, + "step": 2403 + }, + { + "epoch": 0.28, + "learning_rate": 2.1939293728593362e-07, + "logits/chosen": -1.9733822345733643, + "logits/rejected": -1.9166345596313477, + "logps/chosen": -537.4116821289062, + "logps/rejected": -406.8906555175781, + "loss": 0.4769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8158719539642334, + "rewards/margins": 1.1274051666259766, + "rewards/rejected": -1.94327712059021, + "step": 2404 + }, + { + "epoch": 0.28, + "learning_rate": 2.1935750561001534e-07, + "logits/chosen": -2.221069812774658, + "logits/rejected": -2.236828565597534, + "logps/chosen": -273.2884826660156, + "logps/rejected": -276.7657470703125, + "loss": 0.5332, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9262239933013916, + "rewards/margins": 1.9250211715698242, + "rewards/rejected": -4.851245403289795, + "step": 2405 + }, + { + "epoch": 0.28, + "learning_rate": 2.1932207393409707e-07, + "logits/chosen": -2.3161580562591553, + "logits/rejected": -2.1704442501068115, + "logps/chosen": -261.67535400390625, + "logps/rejected": -207.7584228515625, + "loss": 0.4286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4458218812942505, + "rewards/margins": 0.7641977667808533, + "rewards/rejected": -1.210019826889038, + "step": 2406 + }, + { + "epoch": 0.28, + "learning_rate": 2.192866422581788e-07, + "logits/chosen": -2.5317957401275635, + "logits/rejected": -2.573197364807129, + "logps/chosen": -396.7981872558594, + "logps/rejected": -340.38177490234375, + "loss": 0.1522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43957602977752686, + "rewards/margins": 3.1361522674560547, + "rewards/rejected": -2.6965763568878174, + "step": 2407 + }, + { + "epoch": 0.28, + "learning_rate": 2.1925121058226054e-07, + "logits/chosen": -2.3984756469726562, + "logits/rejected": -2.5308644771575928, + "logps/chosen": -319.2395935058594, + "logps/rejected": -283.64202880859375, + "loss": 0.5217, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22181940078735352, + "rewards/margins": 1.814698576927185, + "rewards/rejected": -2.036517858505249, + "step": 2408 + }, + { + "epoch": 0.28, + "learning_rate": 2.1921577890634226e-07, + "logits/chosen": -2.2337279319763184, + "logits/rejected": -2.0121798515319824, + "logps/chosen": -207.4919891357422, + "logps/rejected": -289.39202880859375, + "loss": 0.7214, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4415642023086548, + "rewards/margins": 1.1816262006759644, + "rewards/rejected": -2.623190402984619, + "step": 2409 + }, + { + "epoch": 0.28, + "learning_rate": 2.1918034723042398e-07, + "logits/chosen": -2.802725315093994, + "logits/rejected": -2.9099221229553223, + "logps/chosen": -175.3132781982422, + "logps/rejected": -273.21148681640625, + "loss": 0.1209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4772554039955139, + "rewards/margins": 4.014412879943848, + "rewards/rejected": -4.491668701171875, + "step": 2410 + }, + { + "epoch": 0.28, + "learning_rate": 2.191449155545057e-07, + "logits/chosen": -2.5199801921844482, + "logits/rejected": -2.580404043197632, + "logps/chosen": -315.50579833984375, + "logps/rejected": -261.89520263671875, + "loss": 0.4488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6685268878936768, + "rewards/margins": 1.4561865329742432, + "rewards/rejected": -2.12471342086792, + "step": 2411 + }, + { + "epoch": 0.28, + "learning_rate": 2.1910948387858742e-07, + "logits/chosen": -2.7414276599884033, + "logits/rejected": -2.4749064445495605, + "logps/chosen": -224.6689453125, + "logps/rejected": -227.08041381835938, + "loss": 0.5519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6149680018424988, + "rewards/margins": 1.4762042760849, + "rewards/rejected": -2.091172218322754, + "step": 2412 + }, + { + "epoch": 0.28, + "learning_rate": 2.190740522026692e-07, + "logits/chosen": -2.2181923389434814, + "logits/rejected": -1.867035150527954, + "logps/chosen": -279.40887451171875, + "logps/rejected": -302.70233154296875, + "loss": 0.4825, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0305511951446533, + "rewards/margins": 1.1736646890640259, + "rewards/rejected": -2.2042160034179688, + "step": 2413 + }, + { + "epoch": 0.28, + "learning_rate": 2.1903862052675092e-07, + "logits/chosen": -1.7038986682891846, + "logits/rejected": -1.6517492532730103, + "logps/chosen": -280.27886962890625, + "logps/rejected": -228.50279235839844, + "loss": 0.3665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5821016430854797, + "rewards/margins": 1.2603877782821655, + "rewards/rejected": -1.84248948097229, + "step": 2414 + }, + { + "epoch": 0.28, + "learning_rate": 2.1900318885083264e-07, + "logits/chosen": -2.334750175476074, + "logits/rejected": -2.5466737747192383, + "logps/chosen": -315.8623046875, + "logps/rejected": -155.6282501220703, + "loss": 0.2642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9079563617706299, + "rewards/margins": 1.986535668373108, + "rewards/rejected": -2.8944919109344482, + "step": 2415 + }, + { + "epoch": 0.28, + "learning_rate": 2.1896775717491437e-07, + "logits/chosen": -2.305742025375366, + "logits/rejected": -2.345782995223999, + "logps/chosen": -159.86029052734375, + "logps/rejected": -216.63575744628906, + "loss": 0.2029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7539782524108887, + "rewards/margins": 2.9820733070373535, + "rewards/rejected": -4.736051559448242, + "step": 2416 + }, + { + "epoch": 0.28, + "learning_rate": 2.189323254989961e-07, + "logits/chosen": -2.807830333709717, + "logits/rejected": -2.591386556625366, + "logps/chosen": -178.17306518554688, + "logps/rejected": -254.59375, + "loss": 0.2194, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8814403414726257, + "rewards/margins": 3.3203353881835938, + "rewards/rejected": -4.201775550842285, + "step": 2417 + }, + { + "epoch": 0.28, + "learning_rate": 2.188968938230778e-07, + "logits/chosen": -2.1598758697509766, + "logits/rejected": -1.9119510650634766, + "logps/chosen": -217.4944305419922, + "logps/rejected": -222.904541015625, + "loss": 0.3358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28720033168792725, + "rewards/margins": 1.7462971210479736, + "rewards/rejected": -2.0334973335266113, + "step": 2418 + }, + { + "epoch": 0.28, + "learning_rate": 2.1886146214715956e-07, + "logits/chosen": -2.9726126194000244, + "logits/rejected": -3.056459426879883, + "logps/chosen": -216.31887817382812, + "logps/rejected": -242.21856689453125, + "loss": 0.3292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.806330680847168, + "rewards/margins": 2.829763889312744, + "rewards/rejected": -3.636094331741333, + "step": 2419 + }, + { + "epoch": 0.28, + "learning_rate": 2.1882603047124128e-07, + "logits/chosen": -2.5238051414489746, + "logits/rejected": -2.6433095932006836, + "logps/chosen": -289.5738830566406, + "logps/rejected": -186.95626831054688, + "loss": 1.3483, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0523738861083984, + "rewards/margins": -0.24874144792556763, + "rewards/rejected": -1.8036324977874756, + "step": 2420 + }, + { + "epoch": 0.28, + "learning_rate": 2.18790598795323e-07, + "logits/chosen": -1.9263896942138672, + "logits/rejected": -1.8871774673461914, + "logps/chosen": -304.6511535644531, + "logps/rejected": -362.1975402832031, + "loss": 0.3025, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3768489360809326, + "rewards/margins": 2.2314929962158203, + "rewards/rejected": -3.608342170715332, + "step": 2421 + }, + { + "epoch": 0.28, + "learning_rate": 2.1875516711940473e-07, + "logits/chosen": -2.5491456985473633, + "logits/rejected": -2.365755081176758, + "logps/chosen": -291.5038757324219, + "logps/rejected": -242.99124145507812, + "loss": 0.347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5266618728637695, + "rewards/margins": 2.365705728530884, + "rewards/rejected": -2.8923676013946533, + "step": 2422 + }, + { + "epoch": 0.28, + "learning_rate": 2.1871973544348645e-07, + "logits/chosen": -1.9858808517456055, + "logits/rejected": -2.0882153511047363, + "logps/chosen": -172.44886779785156, + "logps/rejected": -227.1353302001953, + "loss": 0.9635, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.679159164428711, + "rewards/margins": 0.7912605404853821, + "rewards/rejected": -2.4704198837280273, + "step": 2423 + }, + { + "epoch": 0.28, + "learning_rate": 2.1868430376756822e-07, + "logits/chosen": -2.4140841960906982, + "logits/rejected": -2.4430532455444336, + "logps/chosen": -247.08200073242188, + "logps/rejected": -238.9581298828125, + "loss": 0.4527, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3172849416732788, + "rewards/margins": 1.7958452701568604, + "rewards/rejected": -3.1131303310394287, + "step": 2424 + }, + { + "epoch": 0.28, + "learning_rate": 2.1864887209164994e-07, + "logits/chosen": -2.4403512477874756, + "logits/rejected": -2.635002851486206, + "logps/chosen": -295.29437255859375, + "logps/rejected": -305.2942810058594, + "loss": 0.5551, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5474229454994202, + "rewards/margins": 2.4311656951904297, + "rewards/rejected": -2.978588581085205, + "step": 2425 + }, + { + "epoch": 0.28, + "learning_rate": 2.1861344041573167e-07, + "logits/chosen": -2.9091224670410156, + "logits/rejected": -2.7866063117980957, + "logps/chosen": -261.61065673828125, + "logps/rejected": -243.71136474609375, + "loss": 0.8067, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3093852996826172, + "rewards/margins": 0.6743326187133789, + "rewards/rejected": -1.983717918395996, + "step": 2426 + }, + { + "epoch": 0.28, + "learning_rate": 2.185780087398134e-07, + "logits/chosen": -2.7849247455596924, + "logits/rejected": -2.722393274307251, + "logps/chosen": -170.15000915527344, + "logps/rejected": -235.87515258789062, + "loss": 0.5568, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0051884651184082, + "rewards/margins": 3.331677198410034, + "rewards/rejected": -4.3368659019470215, + "step": 2427 + }, + { + "epoch": 0.28, + "learning_rate": 2.185425770638951e-07, + "logits/chosen": -2.3067376613616943, + "logits/rejected": -2.4732773303985596, + "logps/chosen": -509.762939453125, + "logps/rejected": -441.5346984863281, + "loss": 0.3159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4289066195487976, + "rewards/margins": 1.9739067554473877, + "rewards/rejected": -2.40281343460083, + "step": 2428 + }, + { + "epoch": 0.28, + "learning_rate": 2.1850714538797683e-07, + "logits/chosen": -2.3931708335876465, + "logits/rejected": -2.4984872341156006, + "logps/chosen": -341.8184509277344, + "logps/rejected": -249.65969848632812, + "loss": 0.5713, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7307844758033752, + "rewards/margins": 2.2252461910247803, + "rewards/rejected": -2.9560303688049316, + "step": 2429 + }, + { + "epoch": 0.28, + "learning_rate": 2.1847171371205858e-07, + "logits/chosen": -2.297938585281372, + "logits/rejected": -1.8681225776672363, + "logps/chosen": -270.20556640625, + "logps/rejected": -484.920166015625, + "loss": 0.5677, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7687557339668274, + "rewards/margins": 0.6492458581924438, + "rewards/rejected": -1.4180015325546265, + "step": 2430 + }, + { + "epoch": 0.28, + "learning_rate": 2.184362820361403e-07, + "logits/chosen": -2.5370614528656006, + "logits/rejected": -2.7370262145996094, + "logps/chosen": -277.70782470703125, + "logps/rejected": -237.05938720703125, + "loss": 0.3329, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1228300333023071, + "rewards/margins": 1.885025978088379, + "rewards/rejected": -3.0078558921813965, + "step": 2431 + }, + { + "epoch": 0.28, + "learning_rate": 2.1840085036022203e-07, + "logits/chosen": -2.383835792541504, + "logits/rejected": -2.333524703979492, + "logps/chosen": -371.65740966796875, + "logps/rejected": -396.67791748046875, + "loss": 0.4606, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09138723462820053, + "rewards/margins": 0.7897689938545227, + "rewards/rejected": -0.8811562061309814, + "step": 2432 + }, + { + "epoch": 0.28, + "learning_rate": 2.1836541868430375e-07, + "logits/chosen": -2.4804301261901855, + "logits/rejected": -2.508697032928467, + "logps/chosen": -203.05255126953125, + "logps/rejected": -232.43853759765625, + "loss": 0.2683, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6086359024047852, + "rewards/margins": 3.016047716140747, + "rewards/rejected": -3.6246838569641113, + "step": 2433 + }, + { + "epoch": 0.28, + "learning_rate": 2.1832998700838547e-07, + "logits/chosen": -2.3846709728240967, + "logits/rejected": -2.4511735439300537, + "logps/chosen": -240.42608642578125, + "logps/rejected": -211.8994140625, + "loss": 0.2795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14571093022823334, + "rewards/margins": 1.9337424039840698, + "rewards/rejected": -2.079453229904175, + "step": 2434 + }, + { + "epoch": 0.28, + "learning_rate": 2.182945553324672e-07, + "logits/chosen": -2.2005419731140137, + "logits/rejected": -2.347635507583618, + "logps/chosen": -246.1650848388672, + "logps/rejected": -159.45718383789062, + "loss": 0.5205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6720130443572998, + "rewards/margins": 1.2505114078521729, + "rewards/rejected": -1.9225245714187622, + "step": 2435 + }, + { + "epoch": 0.28, + "learning_rate": 2.1825912365654897e-07, + "logits/chosen": -2.744433879852295, + "logits/rejected": -2.3822758197784424, + "logps/chosen": -194.48477172851562, + "logps/rejected": -225.1651153564453, + "loss": 0.6012, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0521517992019653, + "rewards/margins": 0.6601595878601074, + "rewards/rejected": -1.7123115062713623, + "step": 2436 + }, + { + "epoch": 0.28, + "learning_rate": 2.182236919806307e-07, + "logits/chosen": -3.090705394744873, + "logits/rejected": -3.0250086784362793, + "logps/chosen": -374.5745849609375, + "logps/rejected": -321.883056640625, + "loss": 0.6359, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4832537174224854, + "rewards/margins": 1.7162953615188599, + "rewards/rejected": -3.199549436569214, + "step": 2437 + }, + { + "epoch": 0.28, + "learning_rate": 2.181882603047124e-07, + "logits/chosen": -1.6695842742919922, + "logits/rejected": -1.5733023881912231, + "logps/chosen": -350.4315185546875, + "logps/rejected": -323.7000732421875, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9580378532409668, + "rewards/margins": 1.8412737846374512, + "rewards/rejected": -2.799311637878418, + "step": 2438 + }, + { + "epoch": 0.28, + "learning_rate": 2.1815282862879413e-07, + "logits/chosen": -2.780118942260742, + "logits/rejected": -2.6690480709075928, + "logps/chosen": -287.4557800292969, + "logps/rejected": -227.68028259277344, + "loss": 0.6141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9995822906494141, + "rewards/margins": 1.172029733657837, + "rewards/rejected": -2.171612024307251, + "step": 2439 + }, + { + "epoch": 0.28, + "learning_rate": 2.1811739695287586e-07, + "logits/chosen": -2.0758941173553467, + "logits/rejected": -2.0042366981506348, + "logps/chosen": -245.48233032226562, + "logps/rejected": -256.89459228515625, + "loss": 0.2882, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5923284292221069, + "rewards/margins": 2.0743322372436523, + "rewards/rejected": -2.6666603088378906, + "step": 2440 + }, + { + "epoch": 0.28, + "learning_rate": 2.180819652769576e-07, + "logits/chosen": -2.3016905784606934, + "logits/rejected": -2.6276965141296387, + "logps/chosen": -284.684326171875, + "logps/rejected": -199.1748809814453, + "loss": 0.6392, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1366122961044312, + "rewards/margins": 1.3160499334335327, + "rewards/rejected": -2.452662229537964, + "step": 2441 + }, + { + "epoch": 0.28, + "learning_rate": 2.1804653360103933e-07, + "logits/chosen": -2.300863027572632, + "logits/rejected": -2.23675274848938, + "logps/chosen": -289.3321533203125, + "logps/rejected": -247.20803833007812, + "loss": 0.2806, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6414758563041687, + "rewards/margins": 2.3768720626831055, + "rewards/rejected": -3.018347978591919, + "step": 2442 + }, + { + "epoch": 0.28, + "learning_rate": 2.1801110192512105e-07, + "logits/chosen": -2.0366744995117188, + "logits/rejected": -2.172194004058838, + "logps/chosen": -188.126953125, + "logps/rejected": -220.412353515625, + "loss": 0.4973, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6541728973388672, + "rewards/margins": 1.6980571746826172, + "rewards/rejected": -3.3522300720214844, + "step": 2443 + }, + { + "epoch": 0.28, + "learning_rate": 2.1797567024920277e-07, + "logits/chosen": -2.7356224060058594, + "logits/rejected": -2.7738306522369385, + "logps/chosen": -213.9744110107422, + "logps/rejected": -205.75306701660156, + "loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09099578857421875, + "rewards/margins": 2.4257750511169434, + "rewards/rejected": -2.516770839691162, + "step": 2444 + }, + { + "epoch": 0.28, + "learning_rate": 2.179402385732845e-07, + "logits/chosen": -2.0045621395111084, + "logits/rejected": -2.075671672821045, + "logps/chosen": -198.0613250732422, + "logps/rejected": -236.8986053466797, + "loss": 1.1645, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.5709025859832764, + "rewards/margins": 0.6127287149429321, + "rewards/rejected": -3.183631420135498, + "step": 2445 + }, + { + "epoch": 0.28, + "learning_rate": 2.1790480689736621e-07, + "logits/chosen": -1.7411086559295654, + "logits/rejected": -1.7757494449615479, + "logps/chosen": -210.12625122070312, + "logps/rejected": -238.86343383789062, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44968381524086, + "rewards/margins": 1.8220546245574951, + "rewards/rejected": -2.2717385292053223, + "step": 2446 + }, + { + "epoch": 0.28, + "learning_rate": 2.1786937522144794e-07, + "logits/chosen": -2.5625712871551514, + "logits/rejected": -2.319204330444336, + "logps/chosen": -348.5757751464844, + "logps/rejected": -310.1686096191406, + "loss": 0.3593, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1385153532028198, + "rewards/margins": 1.6224730014801025, + "rewards/rejected": -2.760988235473633, + "step": 2447 + }, + { + "epoch": 0.28, + "learning_rate": 2.178339435455297e-07, + "logits/chosen": -2.5249807834625244, + "logits/rejected": -2.6661505699157715, + "logps/chosen": -223.2581329345703, + "logps/rejected": -208.005615234375, + "loss": 0.4646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9694516658782959, + "rewards/margins": 1.5931706428527832, + "rewards/rejected": -2.5626220703125, + "step": 2448 + }, + { + "epoch": 0.28, + "learning_rate": 2.1779851186961143e-07, + "logits/chosen": -2.069368839263916, + "logits/rejected": -2.3304343223571777, + "logps/chosen": -337.6851806640625, + "logps/rejected": -242.470703125, + "loss": 0.6183, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1734493970870972, + "rewards/margins": 0.8337082862854004, + "rewards/rejected": -2.007157802581787, + "step": 2449 + }, + { + "epoch": 0.29, + "learning_rate": 2.1776308019369316e-07, + "logits/chosen": -2.1143674850463867, + "logits/rejected": -2.4560136795043945, + "logps/chosen": -358.7385559082031, + "logps/rejected": -209.5023193359375, + "loss": 0.4394, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7461340427398682, + "rewards/margins": 1.8844082355499268, + "rewards/rejected": -2.630542278289795, + "step": 2450 + }, + { + "epoch": 0.29, + "learning_rate": 2.1772764851777488e-07, + "logits/chosen": -2.0776731967926025, + "logits/rejected": -2.16764235496521, + "logps/chosen": -329.9303283691406, + "logps/rejected": -298.8729553222656, + "loss": 0.4509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5396146774291992, + "rewards/margins": 1.087437629699707, + "rewards/rejected": -1.6270523071289062, + "step": 2451 + }, + { + "epoch": 0.29, + "learning_rate": 2.176922168418566e-07, + "logits/chosen": -1.3730180263519287, + "logits/rejected": -1.8702495098114014, + "logps/chosen": -399.27484130859375, + "logps/rejected": -274.1457214355469, + "loss": 0.4645, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10476456582546234, + "rewards/margins": 1.2702255249023438, + "rewards/rejected": -1.3749902248382568, + "step": 2452 + }, + { + "epoch": 0.29, + "learning_rate": 2.1765678516593835e-07, + "logits/chosen": -2.0756473541259766, + "logits/rejected": -2.388780117034912, + "logps/chosen": -239.2199249267578, + "logps/rejected": -245.57345581054688, + "loss": 0.4867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44959378242492676, + "rewards/margins": 1.2436915636062622, + "rewards/rejected": -1.6932854652404785, + "step": 2453 + }, + { + "epoch": 0.29, + "learning_rate": 2.1762135349002007e-07, + "logits/chosen": -2.3898582458496094, + "logits/rejected": -2.6965701580047607, + "logps/chosen": -300.75341796875, + "logps/rejected": -198.84890747070312, + "loss": 0.3955, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3583180010318756, + "rewards/margins": 2.208394765853882, + "rewards/rejected": -2.5667128562927246, + "step": 2454 + }, + { + "epoch": 0.29, + "learning_rate": 2.175859218141018e-07, + "logits/chosen": -2.2460758686065674, + "logits/rejected": -2.34220290184021, + "logps/chosen": -275.5306396484375, + "logps/rejected": -297.3830871582031, + "loss": 0.3654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40517866611480713, + "rewards/margins": 2.767184019088745, + "rewards/rejected": -3.172362804412842, + "step": 2455 + }, + { + "epoch": 0.29, + "learning_rate": 2.1755049013818352e-07, + "logits/chosen": -2.265929698944092, + "logits/rejected": -2.5448100566864014, + "logps/chosen": -231.16973876953125, + "logps/rejected": -220.73806762695312, + "loss": 0.4614, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9303874373435974, + "rewards/margins": 1.3694424629211426, + "rewards/rejected": -2.2998297214508057, + "step": 2456 + }, + { + "epoch": 0.29, + "learning_rate": 2.1751505846226524e-07, + "logits/chosen": -2.6133482456207275, + "logits/rejected": -2.9053165912628174, + "logps/chosen": -202.8801727294922, + "logps/rejected": -163.5523681640625, + "loss": 0.7751, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3435837030410767, + "rewards/margins": 1.4021862745285034, + "rewards/rejected": -2.74576997756958, + "step": 2457 + }, + { + "epoch": 0.29, + "learning_rate": 2.1747962678634696e-07, + "logits/chosen": -2.572540283203125, + "logits/rejected": -2.4240710735321045, + "logps/chosen": -125.78656768798828, + "logps/rejected": -201.97930908203125, + "loss": 0.3686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2883782982826233, + "rewards/margins": 2.0042266845703125, + "rewards/rejected": -2.292604923248291, + "step": 2458 + }, + { + "epoch": 0.29, + "learning_rate": 2.1744419511042873e-07, + "logits/chosen": -2.3581531047821045, + "logits/rejected": -2.2826406955718994, + "logps/chosen": -263.19293212890625, + "logps/rejected": -193.87155151367188, + "loss": 0.6241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8815513849258423, + "rewards/margins": 2.085815668106079, + "rewards/rejected": -2.967366933822632, + "step": 2459 + }, + { + "epoch": 0.29, + "learning_rate": 2.1740876343451046e-07, + "logits/chosen": -2.1588902473449707, + "logits/rejected": -2.4780352115631104, + "logps/chosen": -412.3316650390625, + "logps/rejected": -343.46563720703125, + "loss": 0.7801, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3211928606033325, + "rewards/margins": 0.3646429777145386, + "rewards/rejected": -1.685835838317871, + "step": 2460 + }, + { + "epoch": 0.29, + "learning_rate": 2.1737333175859218e-07, + "logits/chosen": -1.9822461605072021, + "logits/rejected": -2.0525383949279785, + "logps/chosen": -271.55340576171875, + "logps/rejected": -235.89541625976562, + "loss": 0.6805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2293037176132202, + "rewards/margins": 0.8565818071365356, + "rewards/rejected": -2.085885524749756, + "step": 2461 + }, + { + "epoch": 0.29, + "learning_rate": 2.173379000826739e-07, + "logits/chosen": -2.732396364212036, + "logits/rejected": -2.691169500350952, + "logps/chosen": -317.80517578125, + "logps/rejected": -358.82281494140625, + "loss": 0.4811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2289564609527588, + "rewards/margins": 1.2388205528259277, + "rewards/rejected": -2.4677772521972656, + "step": 2462 + }, + { + "epoch": 0.29, + "learning_rate": 2.1730246840675562e-07, + "logits/chosen": -2.7565176486968994, + "logits/rejected": -2.8796520233154297, + "logps/chosen": -182.7015380859375, + "logps/rejected": -238.11822509765625, + "loss": 0.2345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6201177835464478, + "rewards/margins": 1.957884430885315, + "rewards/rejected": -2.5780019760131836, + "step": 2463 + }, + { + "epoch": 0.29, + "learning_rate": 2.1726703673083737e-07, + "logits/chosen": -2.0676345825195312, + "logits/rejected": -2.018420696258545, + "logps/chosen": -313.51361083984375, + "logps/rejected": -237.87725830078125, + "loss": 0.4599, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5916812419891357, + "rewards/margins": 1.7116608619689941, + "rewards/rejected": -2.30334210395813, + "step": 2464 + }, + { + "epoch": 0.29, + "learning_rate": 2.172316050549191e-07, + "logits/chosen": -2.1458559036254883, + "logits/rejected": -2.1581954956054688, + "logps/chosen": -374.22467041015625, + "logps/rejected": -360.7703552246094, + "loss": 0.254, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.053375005722046, + "rewards/margins": 2.296565055847168, + "rewards/rejected": -3.349940061569214, + "step": 2465 + }, + { + "epoch": 0.29, + "learning_rate": 2.1719617337900082e-07, + "logits/chosen": -2.6527764797210693, + "logits/rejected": -2.5191380977630615, + "logps/chosen": -176.8474578857422, + "logps/rejected": -186.81121826171875, + "loss": 0.5223, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2767152786254883, + "rewards/margins": 0.7830303907394409, + "rewards/rejected": -2.0597455501556396, + "step": 2466 + }, + { + "epoch": 0.29, + "learning_rate": 2.1716074170308254e-07, + "logits/chosen": -2.6270358562469482, + "logits/rejected": -2.847661018371582, + "logps/chosen": -246.0839080810547, + "logps/rejected": -214.92115783691406, + "loss": 0.2277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3325130939483643, + "rewards/margins": 3.010699987411499, + "rewards/rejected": -4.343213081359863, + "step": 2467 + }, + { + "epoch": 0.29, + "learning_rate": 2.1712531002716426e-07, + "logits/chosen": -2.0580027103424072, + "logits/rejected": -2.246978759765625, + "logps/chosen": -363.367919921875, + "logps/rejected": -255.94229125976562, + "loss": 0.5123, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3784147500991821, + "rewards/margins": 1.2304003238677979, + "rewards/rejected": -2.6088151931762695, + "step": 2468 + }, + { + "epoch": 0.29, + "learning_rate": 2.1708987835124598e-07, + "logits/chosen": -2.6273584365844727, + "logits/rejected": -2.421530246734619, + "logps/chosen": -163.42869567871094, + "logps/rejected": -272.5882263183594, + "loss": 0.557, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6079937815666199, + "rewards/margins": 1.618372917175293, + "rewards/rejected": -2.2263667583465576, + "step": 2469 + }, + { + "epoch": 0.29, + "learning_rate": 2.1705444667532773e-07, + "logits/chosen": -2.3506693840026855, + "logits/rejected": -2.5059287548065186, + "logps/chosen": -387.5794677734375, + "logps/rejected": -245.97079467773438, + "loss": 0.652, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0010559558868408, + "rewards/margins": 1.6933295726776123, + "rewards/rejected": -2.694385528564453, + "step": 2470 + }, + { + "epoch": 0.29, + "learning_rate": 2.1701901499940948e-07, + "logits/chosen": -2.1610360145568848, + "logits/rejected": -2.7579829692840576, + "logps/chosen": -515.5169067382812, + "logps/rejected": -274.25726318359375, + "loss": 0.487, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6520326137542725, + "rewards/margins": 0.898408830165863, + "rewards/rejected": -2.5504415035247803, + "step": 2471 + }, + { + "epoch": 0.29, + "learning_rate": 2.169835833234912e-07, + "logits/chosen": -1.7973394393920898, + "logits/rejected": -2.4238595962524414, + "logps/chosen": -265.7392272949219, + "logps/rejected": -174.10629272460938, + "loss": 0.5527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5772750377655029, + "rewards/margins": 1.0169744491577148, + "rewards/rejected": -1.5942493677139282, + "step": 2472 + }, + { + "epoch": 0.29, + "learning_rate": 2.1694815164757292e-07, + "logits/chosen": -1.8389298915863037, + "logits/rejected": -1.9321054220199585, + "logps/chosen": -456.0957336425781, + "logps/rejected": -377.3522033691406, + "loss": 0.4853, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5206893682479858, + "rewards/margins": 1.5758976936340332, + "rewards/rejected": -2.0965871810913086, + "step": 2473 + }, + { + "epoch": 0.29, + "learning_rate": 2.1691271997165465e-07, + "logits/chosen": -2.7043190002441406, + "logits/rejected": -2.602125644683838, + "logps/chosen": -263.0576171875, + "logps/rejected": -299.27313232421875, + "loss": 0.1125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5646453499794006, + "rewards/margins": 3.171477794647217, + "rewards/rejected": -3.7361230850219727, + "step": 2474 + }, + { + "epoch": 0.29, + "learning_rate": 2.168772882957364e-07, + "logits/chosen": -2.5873637199401855, + "logits/rejected": -2.472294330596924, + "logps/chosen": -337.7218322753906, + "logps/rejected": -253.92446899414062, + "loss": 0.2148, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1485633850097656, + "rewards/margins": 1.8845783472061157, + "rewards/rejected": -3.033141613006592, + "step": 2475 + }, + { + "epoch": 0.29, + "learning_rate": 2.1684185661981812e-07, + "logits/chosen": -2.471859931945801, + "logits/rejected": -2.494781017303467, + "logps/chosen": -452.29425048828125, + "logps/rejected": -615.3822631835938, + "loss": 0.3725, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2650425434112549, + "rewards/margins": 2.8933329582214355, + "rewards/rejected": -4.1583757400512695, + "step": 2476 + }, + { + "epoch": 0.29, + "learning_rate": 2.1680642494389984e-07, + "logits/chosen": -1.9317779541015625, + "logits/rejected": -1.91277015209198, + "logps/chosen": -344.041259765625, + "logps/rejected": -370.9241943359375, + "loss": 0.3656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8470520377159119, + "rewards/margins": 1.7892144918441772, + "rewards/rejected": -2.6362664699554443, + "step": 2477 + }, + { + "epoch": 0.29, + "learning_rate": 2.1677099326798156e-07, + "logits/chosen": -2.6003546714782715, + "logits/rejected": -2.4782469272613525, + "logps/chosen": -155.3167724609375, + "logps/rejected": -244.5566864013672, + "loss": 0.6015, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0782259702682495, + "rewards/margins": 1.5402922630310059, + "rewards/rejected": -2.618518352508545, + "step": 2478 + }, + { + "epoch": 0.29, + "learning_rate": 2.1673556159206328e-07, + "logits/chosen": -2.764953851699829, + "logits/rejected": -2.8564746379852295, + "logps/chosen": -153.8671417236328, + "logps/rejected": -182.783203125, + "loss": 0.512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.924048125743866, + "rewards/margins": 2.099832773208618, + "rewards/rejected": -3.023880958557129, + "step": 2479 + }, + { + "epoch": 0.29, + "learning_rate": 2.16700129916145e-07, + "logits/chosen": -2.386533260345459, + "logits/rejected": -2.425724744796753, + "logps/chosen": -310.36151123046875, + "logps/rejected": -195.2545166015625, + "loss": 0.1532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4305097758769989, + "rewards/margins": 2.4000132083892822, + "rewards/rejected": -2.8305230140686035, + "step": 2480 + }, + { + "epoch": 0.29, + "learning_rate": 2.1666469824022673e-07, + "logits/chosen": -1.8042891025543213, + "logits/rejected": -2.167426586151123, + "logps/chosen": -554.564208984375, + "logps/rejected": -370.48065185546875, + "loss": 0.5978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6072208881378174, + "rewards/margins": 2.131959915161133, + "rewards/rejected": -2.7391810417175293, + "step": 2481 + }, + { + "epoch": 0.29, + "learning_rate": 2.1662926656430848e-07, + "logits/chosen": -2.7623744010925293, + "logits/rejected": -2.661491632461548, + "logps/chosen": -126.27397155761719, + "logps/rejected": -233.30946350097656, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4357917606830597, + "rewards/margins": 1.8856812715530396, + "rewards/rejected": -2.3214731216430664, + "step": 2482 + }, + { + "epoch": 0.29, + "learning_rate": 2.1659383488839022e-07, + "logits/chosen": -2.355668067932129, + "logits/rejected": -2.486140012741089, + "logps/chosen": -304.5206604003906, + "logps/rejected": -226.6120147705078, + "loss": 0.5104, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3285385370254517, + "rewards/margins": 1.1340433359146118, + "rewards/rejected": -2.4625821113586426, + "step": 2483 + }, + { + "epoch": 0.29, + "learning_rate": 2.1655840321247195e-07, + "logits/chosen": -2.8706371784210205, + "logits/rejected": -2.7139811515808105, + "logps/chosen": -175.02418518066406, + "logps/rejected": -207.94557189941406, + "loss": 0.4015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0142889022827148, + "rewards/margins": 1.9969303607940674, + "rewards/rejected": -3.0112192630767822, + "step": 2484 + }, + { + "epoch": 0.29, + "learning_rate": 2.1652297153655367e-07, + "logits/chosen": -2.4295926094055176, + "logits/rejected": -2.457249402999878, + "logps/chosen": -196.5600128173828, + "logps/rejected": -344.6933898925781, + "loss": 0.4395, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6401822566986084, + "rewards/margins": 1.4477496147155762, + "rewards/rejected": -2.0879318714141846, + "step": 2485 + }, + { + "epoch": 0.29, + "learning_rate": 2.164875398606354e-07, + "logits/chosen": -2.3014252185821533, + "logits/rejected": -2.210580825805664, + "logps/chosen": -96.48078155517578, + "logps/rejected": -224.66656494140625, + "loss": 0.2997, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6605328917503357, + "rewards/margins": 1.7184669971466064, + "rewards/rejected": -2.378999948501587, + "step": 2486 + }, + { + "epoch": 0.29, + "learning_rate": 2.1645210818471714e-07, + "logits/chosen": -2.3552684783935547, + "logits/rejected": -2.480907440185547, + "logps/chosen": -288.65447998046875, + "logps/rejected": -217.87808227539062, + "loss": 0.7457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4416617155075073, + "rewards/margins": 0.9657660722732544, + "rewards/rejected": -2.4074275493621826, + "step": 2487 + }, + { + "epoch": 0.29, + "learning_rate": 2.1641667650879886e-07, + "logits/chosen": -1.9270848035812378, + "logits/rejected": -2.0156235694885254, + "logps/chosen": -677.5250244140625, + "logps/rejected": -304.284423828125, + "loss": 0.5192, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0662102699279785, + "rewards/margins": 1.6201300621032715, + "rewards/rejected": -2.68634033203125, + "step": 2488 + }, + { + "epoch": 0.29, + "learning_rate": 2.1638124483288058e-07, + "logits/chosen": -2.279860258102417, + "logits/rejected": -2.399411201477051, + "logps/chosen": -314.2524108886719, + "logps/rejected": -306.31207275390625, + "loss": 0.3357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.221620112657547, + "rewards/margins": 1.6651103496551514, + "rewards/rejected": -1.8867303133010864, + "step": 2489 + }, + { + "epoch": 0.29, + "learning_rate": 2.163458131569623e-07, + "logits/chosen": -2.265770673751831, + "logits/rejected": -2.7091500759124756, + "logps/chosen": -337.3496398925781, + "logps/rejected": -216.61634826660156, + "loss": 0.2304, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5104184746742249, + "rewards/margins": 2.345304250717163, + "rewards/rejected": -2.8557229042053223, + "step": 2490 + }, + { + "epoch": 0.29, + "learning_rate": 2.1631038148104403e-07, + "logits/chosen": -2.1074585914611816, + "logits/rejected": -2.240861415863037, + "logps/chosen": -140.7238311767578, + "logps/rejected": -159.63108825683594, + "loss": 0.5311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.02963387966156, + "rewards/margins": 0.892888069152832, + "rewards/rejected": -1.922521948814392, + "step": 2491 + }, + { + "epoch": 0.29, + "learning_rate": 2.1627494980512575e-07, + "logits/chosen": -2.4363150596618652, + "logits/rejected": -2.4098405838012695, + "logps/chosen": -240.70071411132812, + "logps/rejected": -224.8988800048828, + "loss": 0.4626, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2235594987869263, + "rewards/margins": 0.8425126075744629, + "rewards/rejected": -2.0660722255706787, + "step": 2492 + }, + { + "epoch": 0.29, + "learning_rate": 2.162395181292075e-07, + "logits/chosen": -1.9511940479278564, + "logits/rejected": -1.9594807624816895, + "logps/chosen": -296.89794921875, + "logps/rejected": -339.48968505859375, + "loss": 0.4048, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7253051400184631, + "rewards/margins": 0.954999566078186, + "rewards/rejected": -1.6803046464920044, + "step": 2493 + }, + { + "epoch": 0.29, + "learning_rate": 2.1620408645328925e-07, + "logits/chosen": -2.4320504665374756, + "logits/rejected": -2.227735996246338, + "logps/chosen": -194.30369567871094, + "logps/rejected": -286.7164611816406, + "loss": 0.4027, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5044612288475037, + "rewards/margins": 1.7659118175506592, + "rewards/rejected": -2.2703728675842285, + "step": 2494 + }, + { + "epoch": 0.29, + "learning_rate": 2.1616865477737097e-07, + "logits/chosen": -2.683600664138794, + "logits/rejected": -2.6873273849487305, + "logps/chosen": -200.62213134765625, + "logps/rejected": -260.86944580078125, + "loss": 0.4168, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0549192428588867, + "rewards/margins": 1.512897253036499, + "rewards/rejected": -2.5678164958953857, + "step": 2495 + }, + { + "epoch": 0.29, + "learning_rate": 2.161332231014527e-07, + "logits/chosen": -2.5147647857666016, + "logits/rejected": -2.6125638484954834, + "logps/chosen": -185.51768493652344, + "logps/rejected": -264.8766784667969, + "loss": 0.4466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3316652476787567, + "rewards/margins": 1.7436048984527588, + "rewards/rejected": -2.075270175933838, + "step": 2496 + }, + { + "epoch": 0.29, + "learning_rate": 2.1609779142553441e-07, + "logits/chosen": -1.6731147766113281, + "logits/rejected": -2.253133773803711, + "logps/chosen": -611.4571533203125, + "logps/rejected": -290.9269104003906, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4358718395233154, + "rewards/margins": 0.5214658379554749, + "rewards/rejected": -1.9573378562927246, + "step": 2497 + }, + { + "epoch": 0.29, + "learning_rate": 2.1606235974961616e-07, + "logits/chosen": -2.7119803428649902, + "logits/rejected": -2.7804431915283203, + "logps/chosen": -262.18011474609375, + "logps/rejected": -197.81143188476562, + "loss": 0.4504, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2761025428771973, + "rewards/margins": 1.1145788431167603, + "rewards/rejected": -2.390681266784668, + "step": 2498 + }, + { + "epoch": 0.29, + "learning_rate": 2.1602692807369788e-07, + "logits/chosen": -2.087944507598877, + "logits/rejected": -2.3218302726745605, + "logps/chosen": -421.219970703125, + "logps/rejected": -236.00625610351562, + "loss": 0.3775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9269934892654419, + "rewards/margins": 1.710361361503601, + "rewards/rejected": -2.637354850769043, + "step": 2499 + }, + { + "epoch": 0.29, + "learning_rate": 2.159914963977796e-07, + "logits/chosen": -2.364070415496826, + "logits/rejected": -2.37058162689209, + "logps/chosen": -194.7274169921875, + "logps/rejected": -223.50689697265625, + "loss": 0.1926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4434488117694855, + "rewards/margins": 2.6976234912872314, + "rewards/rejected": -3.1410725116729736, + "step": 2500 + }, + { + "epoch": 0.29, + "learning_rate": 2.1595606472186133e-07, + "logits/chosen": -2.66389799118042, + "logits/rejected": -2.8655638694763184, + "logps/chosen": -226.45138549804688, + "logps/rejected": -163.21151733398438, + "loss": 0.6713, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8911166191101074, + "rewards/margins": 1.7618833780288696, + "rewards/rejected": -2.6529998779296875, + "step": 2501 + }, + { + "epoch": 0.29, + "learning_rate": 2.1592063304594305e-07, + "logits/chosen": -2.665041923522949, + "logits/rejected": -2.6689534187316895, + "logps/chosen": -177.0982666015625, + "logps/rejected": -261.6306457519531, + "loss": 0.2848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6877730488777161, + "rewards/margins": 2.144991159439087, + "rewards/rejected": -2.8327643871307373, + "step": 2502 + }, + { + "epoch": 0.29, + "learning_rate": 2.1588520137002477e-07, + "logits/chosen": -1.9570120573043823, + "logits/rejected": -2.0342416763305664, + "logps/chosen": -416.959228515625, + "logps/rejected": -306.8493347167969, + "loss": 0.2026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36147573590278625, + "rewards/margins": 2.408031940460205, + "rewards/rejected": -2.769507646560669, + "step": 2503 + }, + { + "epoch": 0.29, + "learning_rate": 2.1584976969410652e-07, + "logits/chosen": -2.486175537109375, + "logits/rejected": -2.26587176322937, + "logps/chosen": -251.4013671875, + "logps/rejected": -255.628173828125, + "loss": 0.2594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4538498520851135, + "rewards/margins": 2.3155696392059326, + "rewards/rejected": -2.7694191932678223, + "step": 2504 + }, + { + "epoch": 0.29, + "learning_rate": 2.1581433801818824e-07, + "logits/chosen": -2.3222920894622803, + "logits/rejected": -2.5238311290740967, + "logps/chosen": -318.052490234375, + "logps/rejected": -163.18344116210938, + "loss": 0.7829, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.386183738708496, + "rewards/margins": 0.9369373321533203, + "rewards/rejected": -3.3231213092803955, + "step": 2505 + }, + { + "epoch": 0.29, + "learning_rate": 2.1577890634227e-07, + "logits/chosen": -2.333968162536621, + "logits/rejected": -2.5672144889831543, + "logps/chosen": -362.73565673828125, + "logps/rejected": -255.62030029296875, + "loss": 0.2109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2562384605407715, + "rewards/margins": 2.2481024265289307, + "rewards/rejected": -2.504340887069702, + "step": 2506 + }, + { + "epoch": 0.29, + "learning_rate": 2.1574347466635171e-07, + "logits/chosen": -2.6349728107452393, + "logits/rejected": -2.8988699913024902, + "logps/chosen": -259.6604919433594, + "logps/rejected": -227.67808532714844, + "loss": 0.5244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6094520092010498, + "rewards/margins": 2.4679994583129883, + "rewards/rejected": -3.077451705932617, + "step": 2507 + }, + { + "epoch": 0.29, + "learning_rate": 2.1570804299043344e-07, + "logits/chosen": -2.30950927734375, + "logits/rejected": -2.539088726043701, + "logps/chosen": -246.06527709960938, + "logps/rejected": -311.15618896484375, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18298441171646118, + "rewards/margins": 3.3109629154205322, + "rewards/rejected": -3.4939475059509277, + "step": 2508 + }, + { + "epoch": 0.29, + "learning_rate": 2.1567261131451519e-07, + "logits/chosen": -2.332838535308838, + "logits/rejected": -2.3300657272338867, + "logps/chosen": -250.5757598876953, + "logps/rejected": -286.112060546875, + "loss": 0.76, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7139772176742554, + "rewards/margins": 0.8995258212089539, + "rewards/rejected": -1.613503098487854, + "step": 2509 + }, + { + "epoch": 0.29, + "learning_rate": 2.156371796385969e-07, + "logits/chosen": -1.726469874382019, + "logits/rejected": -1.7293589115142822, + "logps/chosen": -387.61981201171875, + "logps/rejected": -441.3700256347656, + "loss": 0.3401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9473698139190674, + "rewards/margins": 2.4158239364624023, + "rewards/rejected": -3.3631937503814697, + "step": 2510 + }, + { + "epoch": 0.29, + "learning_rate": 2.1560174796267863e-07, + "logits/chosen": -2.4357662200927734, + "logits/rejected": -2.3817741870880127, + "logps/chosen": -292.2746276855469, + "logps/rejected": -248.77415466308594, + "loss": 0.5239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35337433218955994, + "rewards/margins": 1.4764848947525024, + "rewards/rejected": -1.8298592567443848, + "step": 2511 + }, + { + "epoch": 0.29, + "learning_rate": 2.1556631628676035e-07, + "logits/chosen": -2.5331079959869385, + "logits/rejected": -2.249842643737793, + "logps/chosen": -268.59783935546875, + "logps/rejected": -375.6451721191406, + "loss": 0.3666, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42586958408355713, + "rewards/margins": 1.9522696733474731, + "rewards/rejected": -2.3781392574310303, + "step": 2512 + }, + { + "epoch": 0.29, + "learning_rate": 2.1553088461084207e-07, + "logits/chosen": -2.6065847873687744, + "logits/rejected": -2.7714946269989014, + "logps/chosen": -303.1990966796875, + "logps/rejected": -311.88543701171875, + "loss": 0.1076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5122393369674683, + "rewards/margins": 3.1735033988952637, + "rewards/rejected": -3.6857428550720215, + "step": 2513 + }, + { + "epoch": 0.29, + "learning_rate": 2.154954529349238e-07, + "logits/chosen": -2.1436097621917725, + "logits/rejected": -2.0793073177337646, + "logps/chosen": -257.1372375488281, + "logps/rejected": -320.68505859375, + "loss": 0.7532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5653723478317261, + "rewards/margins": 0.30032941699028015, + "rewards/rejected": -0.8657017946243286, + "step": 2514 + }, + { + "epoch": 0.29, + "learning_rate": 2.1546002125900552e-07, + "logits/chosen": -2.2719881534576416, + "logits/rejected": -2.0516462326049805, + "logps/chosen": -153.1759796142578, + "logps/rejected": -162.70884704589844, + "loss": 1.0015, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0997684001922607, + "rewards/margins": 0.6520240902900696, + "rewards/rejected": -1.7517926692962646, + "step": 2515 + }, + { + "epoch": 0.29, + "learning_rate": 2.1542458958308727e-07, + "logits/chosen": -2.474311590194702, + "logits/rejected": -2.2612688541412354, + "logps/chosen": -174.1598663330078, + "logps/rejected": -204.6775665283203, + "loss": 0.4369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5761568546295166, + "rewards/margins": 1.2398627996444702, + "rewards/rejected": -1.8160197734832764, + "step": 2516 + }, + { + "epoch": 0.29, + "learning_rate": 2.15389157907169e-07, + "logits/chosen": -2.4809908866882324, + "logits/rejected": -2.289766311645508, + "logps/chosen": -191.56077575683594, + "logps/rejected": -215.5283966064453, + "loss": 0.3109, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8735411167144775, + "rewards/margins": 2.218721866607666, + "rewards/rejected": -3.0922629833221436, + "step": 2517 + }, + { + "epoch": 0.29, + "learning_rate": 2.1535372623125074e-07, + "logits/chosen": -2.823366641998291, + "logits/rejected": -2.7455403804779053, + "logps/chosen": -126.89311218261719, + "logps/rejected": -183.48045349121094, + "loss": 0.2034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.913655161857605, + "rewards/margins": 1.9730548858642578, + "rewards/rejected": -2.8867099285125732, + "step": 2518 + }, + { + "epoch": 0.29, + "learning_rate": 2.1531829455533246e-07, + "logits/chosen": -1.9960289001464844, + "logits/rejected": -1.8561956882476807, + "logps/chosen": -192.32167053222656, + "logps/rejected": -235.8656005859375, + "loss": 0.6836, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.452646017074585, + "rewards/margins": 1.8187305927276611, + "rewards/rejected": -3.271376609802246, + "step": 2519 + }, + { + "epoch": 0.29, + "learning_rate": 2.152828628794142e-07, + "logits/chosen": -1.544995665550232, + "logits/rejected": -1.4306167364120483, + "logps/chosen": -439.45416259765625, + "logps/rejected": -314.9580078125, + "loss": 0.5009, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8839030265808105, + "rewards/margins": 1.0862598419189453, + "rewards/rejected": -1.9701627492904663, + "step": 2520 + }, + { + "epoch": 0.29, + "learning_rate": 2.1524743120349593e-07, + "logits/chosen": -2.0658252239227295, + "logits/rejected": -2.366868019104004, + "logps/chosen": -335.5693359375, + "logps/rejected": -245.95172119140625, + "loss": 0.4255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9966591596603394, + "rewards/margins": 1.659735083580017, + "rewards/rejected": -2.6563942432403564, + "step": 2521 + }, + { + "epoch": 0.29, + "learning_rate": 2.1521199952757765e-07, + "logits/chosen": -2.5534114837646484, + "logits/rejected": -2.722569465637207, + "logps/chosen": -212.35562133789062, + "logps/rejected": -186.20565795898438, + "loss": 0.5153, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45241692662239075, + "rewards/margins": 1.912498950958252, + "rewards/rejected": -2.3649158477783203, + "step": 2522 + }, + { + "epoch": 0.29, + "learning_rate": 2.1517656785165937e-07, + "logits/chosen": -2.3214805126190186, + "logits/rejected": -2.4388954639434814, + "logps/chosen": -344.68121337890625, + "logps/rejected": -243.51193237304688, + "loss": 0.3391, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5915799736976624, + "rewards/margins": 1.4331473112106323, + "rewards/rejected": -2.0247273445129395, + "step": 2523 + }, + { + "epoch": 0.29, + "learning_rate": 2.151411361757411e-07, + "logits/chosen": -2.653506278991699, + "logits/rejected": -2.872809886932373, + "logps/chosen": -206.9505157470703, + "logps/rejected": -147.49998474121094, + "loss": 0.5155, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.15928316116333, + "rewards/margins": 0.840441882610321, + "rewards/rejected": -1.999725103378296, + "step": 2524 + }, + { + "epoch": 0.29, + "learning_rate": 2.1510570449982282e-07, + "logits/chosen": -2.4350903034210205, + "logits/rejected": -2.1825990676879883, + "logps/chosen": -215.20053100585938, + "logps/rejected": -268.059814453125, + "loss": 0.2701, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02213093638420105, + "rewards/margins": 1.729360818862915, + "rewards/rejected": -1.7072298526763916, + "step": 2525 + }, + { + "epoch": 0.29, + "learning_rate": 2.1507027282390454e-07, + "logits/chosen": -2.6398324966430664, + "logits/rejected": -2.4200289249420166, + "logps/chosen": -258.4054870605469, + "logps/rejected": -370.25726318359375, + "loss": 0.6648, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8303380012512207, + "rewards/margins": 5.022956848144531, + "rewards/rejected": -5.85329532623291, + "step": 2526 + }, + { + "epoch": 0.29, + "learning_rate": 2.150348411479863e-07, + "logits/chosen": -2.467426061630249, + "logits/rejected": -2.579580783843994, + "logps/chosen": -403.6004943847656, + "logps/rejected": -295.8081970214844, + "loss": 0.0776, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35209962725639343, + "rewards/margins": 3.722083330154419, + "rewards/rejected": -3.369983434677124, + "step": 2527 + }, + { + "epoch": 0.29, + "learning_rate": 2.14999409472068e-07, + "logits/chosen": -2.321190118789673, + "logits/rejected": -2.3155312538146973, + "logps/chosen": -175.9431610107422, + "logps/rejected": -226.6474609375, + "loss": 0.4849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9819499254226685, + "rewards/margins": 2.3070194721221924, + "rewards/rejected": -3.2889695167541504, + "step": 2528 + }, + { + "epoch": 0.29, + "learning_rate": 2.1496397779614976e-07, + "logits/chosen": -2.2119476795196533, + "logits/rejected": -2.2493815422058105, + "logps/chosen": -234.21539306640625, + "logps/rejected": -300.6205749511719, + "loss": 0.562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33483225107192993, + "rewards/margins": 1.8870439529418945, + "rewards/rejected": -2.2218761444091797, + "step": 2529 + }, + { + "epoch": 0.29, + "learning_rate": 2.1492854612023148e-07, + "logits/chosen": -1.8799748420715332, + "logits/rejected": -2.1158785820007324, + "logps/chosen": -222.13890075683594, + "logps/rejected": -223.2462158203125, + "loss": 0.4094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.435774564743042, + "rewards/margins": 2.016730546951294, + "rewards/rejected": -2.452505350112915, + "step": 2530 + }, + { + "epoch": 0.29, + "learning_rate": 2.148931144443132e-07, + "logits/chosen": -1.658432126045227, + "logits/rejected": -1.5726585388183594, + "logps/chosen": -280.7524719238281, + "logps/rejected": -298.454833984375, + "loss": 0.7576, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5815993547439575, + "rewards/margins": 2.131830930709839, + "rewards/rejected": -2.713430166244507, + "step": 2531 + }, + { + "epoch": 0.29, + "learning_rate": 2.1485768276839495e-07, + "logits/chosen": -2.9714138507843018, + "logits/rejected": -2.919175386428833, + "logps/chosen": -178.443115234375, + "logps/rejected": -133.97642517089844, + "loss": 0.2917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05928654968738556, + "rewards/margins": 1.7179152965545654, + "rewards/rejected": -1.7772016525268555, + "step": 2532 + }, + { + "epoch": 0.29, + "learning_rate": 2.1482225109247668e-07, + "logits/chosen": -2.0244479179382324, + "logits/rejected": -2.351897954940796, + "logps/chosen": -540.7979125976562, + "logps/rejected": -338.91217041015625, + "loss": 0.2739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48705923557281494, + "rewards/margins": 1.8355441093444824, + "rewards/rejected": -2.322603225708008, + "step": 2533 + }, + { + "epoch": 0.29, + "learning_rate": 2.147868194165584e-07, + "logits/chosen": -2.5561256408691406, + "logits/rejected": -2.6778125762939453, + "logps/chosen": -229.99276733398438, + "logps/rejected": -217.60507202148438, + "loss": 0.5552, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0051968097686768, + "rewards/margins": 1.162032127380371, + "rewards/rejected": -2.167228937149048, + "step": 2534 + }, + { + "epoch": 0.29, + "learning_rate": 2.1475138774064012e-07, + "logits/chosen": -2.0245018005371094, + "logits/rejected": -2.349809408187866, + "logps/chosen": -464.1607971191406, + "logps/rejected": -365.2149353027344, + "loss": 0.5502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44163796305656433, + "rewards/margins": 1.8277925252914429, + "rewards/rejected": -2.26943039894104, + "step": 2535 + }, + { + "epoch": 0.3, + "learning_rate": 2.1471595606472184e-07, + "logits/chosen": -1.7524765729904175, + "logits/rejected": -2.0873425006866455, + "logps/chosen": -285.0338439941406, + "logps/rejected": -227.56915283203125, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5647150278091431, + "rewards/margins": 2.530390977859497, + "rewards/rejected": -3.0951058864593506, + "step": 2536 + }, + { + "epoch": 0.3, + "learning_rate": 2.1468052438880356e-07, + "logits/chosen": -2.296182155609131, + "logits/rejected": -2.254962921142578, + "logps/chosen": -180.7959747314453, + "logps/rejected": -242.15760803222656, + "loss": 0.2681, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45223239064216614, + "rewards/margins": 2.9311883449554443, + "rewards/rejected": -3.383420467376709, + "step": 2537 + }, + { + "epoch": 0.3, + "learning_rate": 2.146450927128853e-07, + "logits/chosen": -2.780153751373291, + "logits/rejected": -2.7498185634613037, + "logps/chosen": -212.97930908203125, + "logps/rejected": -251.3601531982422, + "loss": 0.4615, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6916762590408325, + "rewards/margins": 1.893078327178955, + "rewards/rejected": -2.584754705429077, + "step": 2538 + }, + { + "epoch": 0.3, + "learning_rate": 2.1460966103696703e-07, + "logits/chosen": -2.0870018005371094, + "logits/rejected": -2.2417819499969482, + "logps/chosen": -311.5880126953125, + "logps/rejected": -256.0032958984375, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9865826368331909, + "rewards/margins": 1.67692232131958, + "rewards/rejected": -2.6635048389434814, + "step": 2539 + }, + { + "epoch": 0.3, + "learning_rate": 2.1457422936104876e-07, + "logits/chosen": -2.2921738624572754, + "logits/rejected": -2.4124789237976074, + "logps/chosen": -463.2216796875, + "logps/rejected": -340.97308349609375, + "loss": 0.6251, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7892606258392334, + "rewards/margins": 0.5807223320007324, + "rewards/rejected": -1.3699829578399658, + "step": 2540 + }, + { + "epoch": 0.3, + "learning_rate": 2.145387976851305e-07, + "logits/chosen": -2.0969619750976562, + "logits/rejected": -1.9209352731704712, + "logps/chosen": -240.2244110107422, + "logps/rejected": -381.3532409667969, + "loss": 0.2895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9568729996681213, + "rewards/margins": 3.6741714477539062, + "rewards/rejected": -4.631044387817383, + "step": 2541 + }, + { + "epoch": 0.3, + "learning_rate": 2.1450336600921223e-07, + "logits/chosen": -2.419430732727051, + "logits/rejected": -2.457300901412964, + "logps/chosen": -368.83441162109375, + "logps/rejected": -304.298095703125, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8944985866546631, + "rewards/margins": 1.3708322048187256, + "rewards/rejected": -2.2653307914733887, + "step": 2542 + }, + { + "epoch": 0.3, + "learning_rate": 2.1446793433329398e-07, + "logits/chosen": -2.091369390487671, + "logits/rejected": -2.6405375003814697, + "logps/chosen": -369.7547607421875, + "logps/rejected": -279.75848388671875, + "loss": 0.6114, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4008021354675293, + "rewards/margins": 0.83387291431427, + "rewards/rejected": -2.234675168991089, + "step": 2543 + }, + { + "epoch": 0.3, + "learning_rate": 2.144325026573757e-07, + "logits/chosen": -1.6898483037948608, + "logits/rejected": -2.121739625930786, + "logps/chosen": -256.3876037597656, + "logps/rejected": -224.85952758789062, + "loss": 0.1921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3262261152267456, + "rewards/margins": 2.2378976345062256, + "rewards/rejected": -1.9116718769073486, + "step": 2544 + }, + { + "epoch": 0.3, + "learning_rate": 2.1439707098145742e-07, + "logits/chosen": -1.8569974899291992, + "logits/rejected": -2.192429542541504, + "logps/chosen": -338.24591064453125, + "logps/rejected": -253.52133178710938, + "loss": 1.13, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.58491849899292, + "rewards/margins": 1.2095301151275635, + "rewards/rejected": -2.7944488525390625, + "step": 2545 + }, + { + "epoch": 0.3, + "learning_rate": 2.1436163930553914e-07, + "logits/chosen": -2.5827949047088623, + "logits/rejected": -2.631491184234619, + "logps/chosen": -389.8240051269531, + "logps/rejected": -330.2645263671875, + "loss": 0.2693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7319276332855225, + "rewards/margins": 1.927665114402771, + "rewards/rejected": -2.659592628479004, + "step": 2546 + }, + { + "epoch": 0.3, + "learning_rate": 2.1432620762962086e-07, + "logits/chosen": -2.4921927452087402, + "logits/rejected": -2.3757214546203613, + "logps/chosen": -156.644775390625, + "logps/rejected": -196.5872039794922, + "loss": 0.1795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22755427658557892, + "rewards/margins": 2.889796018600464, + "rewards/rejected": -3.1173503398895264, + "step": 2547 + }, + { + "epoch": 0.3, + "learning_rate": 2.1429077595370259e-07, + "logits/chosen": -2.2522928714752197, + "logits/rejected": -2.046821355819702, + "logps/chosen": -264.7816467285156, + "logps/rejected": -368.62884521484375, + "loss": 0.5165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6784238815307617, + "rewards/margins": 2.8429083824157715, + "rewards/rejected": -3.521332263946533, + "step": 2548 + }, + { + "epoch": 0.3, + "learning_rate": 2.1425534427778433e-07, + "logits/chosen": -2.337648391723633, + "logits/rejected": -2.6032400131225586, + "logps/chosen": -299.40032958984375, + "logps/rejected": -211.0352020263672, + "loss": 0.4828, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5232434272766113, + "rewards/margins": 0.9209034442901611, + "rewards/rejected": -1.4441468715667725, + "step": 2549 + }, + { + "epoch": 0.3, + "learning_rate": 2.1421991260186606e-07, + "logits/chosen": -2.329113483428955, + "logits/rejected": -2.244281530380249, + "logps/chosen": -206.89669799804688, + "logps/rejected": -184.121337890625, + "loss": 0.5675, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7923282384872437, + "rewards/margins": 0.7351560592651367, + "rewards/rejected": -1.5274842977523804, + "step": 2550 + }, + { + "epoch": 0.3, + "learning_rate": 2.1418448092594778e-07, + "logits/chosen": -2.805490016937256, + "logits/rejected": -2.6878297328948975, + "logps/chosen": -328.20654296875, + "logps/rejected": -184.1094207763672, + "loss": 0.5501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3256984949111938, + "rewards/margins": 0.5649362802505493, + "rewards/rejected": -1.8906347751617432, + "step": 2551 + }, + { + "epoch": 0.3, + "learning_rate": 2.141490492500295e-07, + "logits/chosen": -1.3871078491210938, + "logits/rejected": -1.4252395629882812, + "logps/chosen": -533.41845703125, + "logps/rejected": -483.2685852050781, + "loss": 0.4536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8303223252296448, + "rewards/margins": 1.6787357330322266, + "rewards/rejected": -2.5090579986572266, + "step": 2552 + }, + { + "epoch": 0.3, + "learning_rate": 2.1411361757411125e-07, + "logits/chosen": -2.475013017654419, + "logits/rejected": -2.5368683338165283, + "logps/chosen": -255.7469024658203, + "logps/rejected": -211.8311767578125, + "loss": 0.3921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1779918670654297, + "rewards/margins": 1.5918350219726562, + "rewards/rejected": -2.769826889038086, + "step": 2553 + }, + { + "epoch": 0.3, + "learning_rate": 2.14078185898193e-07, + "logits/chosen": -2.0530736446380615, + "logits/rejected": -1.8021992444992065, + "logps/chosen": -266.2743835449219, + "logps/rejected": -434.33892822265625, + "loss": 0.3758, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9634113907814026, + "rewards/margins": 2.9322547912597656, + "rewards/rejected": -3.8956663608551025, + "step": 2554 + }, + { + "epoch": 0.3, + "learning_rate": 2.1404275422227472e-07, + "logits/chosen": -2.6773080825805664, + "logits/rejected": -2.6919801235198975, + "logps/chosen": -330.18109130859375, + "logps/rejected": -390.2763671875, + "loss": 0.8071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4889535903930664, + "rewards/margins": 0.6570836305618286, + "rewards/rejected": -2.1460371017456055, + "step": 2555 + }, + { + "epoch": 0.3, + "learning_rate": 2.1400732254635644e-07, + "logits/chosen": -2.375849723815918, + "logits/rejected": -2.170619010925293, + "logps/chosen": -193.95196533203125, + "logps/rejected": -259.37646484375, + "loss": 0.2699, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8537381291389465, + "rewards/margins": 2.6344709396362305, + "rewards/rejected": -3.4882090091705322, + "step": 2556 + }, + { + "epoch": 0.3, + "learning_rate": 2.1397189087043816e-07, + "logits/chosen": -2.4741876125335693, + "logits/rejected": -2.3509905338287354, + "logps/chosen": -300.0631103515625, + "logps/rejected": -183.92515563964844, + "loss": 0.8448, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.315861701965332, + "rewards/margins": 0.8787112832069397, + "rewards/rejected": -2.194572925567627, + "step": 2557 + }, + { + "epoch": 0.3, + "learning_rate": 2.139364591945199e-07, + "logits/chosen": -2.6255922317504883, + "logits/rejected": -2.8591160774230957, + "logps/chosen": -206.8905487060547, + "logps/rejected": -174.03048706054688, + "loss": 0.3497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6024445295333862, + "rewards/margins": 2.7891364097595215, + "rewards/rejected": -3.3915810585021973, + "step": 2558 + }, + { + "epoch": 0.3, + "learning_rate": 2.139010275186016e-07, + "logits/chosen": -2.8973851203918457, + "logits/rejected": -2.902285575866699, + "logps/chosen": -208.32025146484375, + "logps/rejected": -223.06077575683594, + "loss": 0.3292, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.139334797859192, + "rewards/margins": 1.4390631914138794, + "rewards/rejected": -2.5783979892730713, + "step": 2559 + }, + { + "epoch": 0.3, + "learning_rate": 2.1386559584268333e-07, + "logits/chosen": -1.680161476135254, + "logits/rejected": -1.5693259239196777, + "logps/chosen": -493.3486022949219, + "logps/rejected": -508.0200500488281, + "loss": 0.7089, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.347231388092041, + "rewards/margins": 1.0592546463012695, + "rewards/rejected": -2.4064860343933105, + "step": 2560 + }, + { + "epoch": 0.3, + "learning_rate": 2.1383016416676508e-07, + "logits/chosen": -2.554274797439575, + "logits/rejected": -2.710440158843994, + "logps/chosen": -290.7469482421875, + "logps/rejected": -256.419677734375, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41870439052581787, + "rewards/margins": 3.2795321941375732, + "rewards/rejected": -2.860827922821045, + "step": 2561 + }, + { + "epoch": 0.3, + "learning_rate": 2.137947324908468e-07, + "logits/chosen": -2.5328361988067627, + "logits/rejected": -2.5581235885620117, + "logps/chosen": -307.789306640625, + "logps/rejected": -327.694091796875, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.216713547706604, + "rewards/margins": 2.4931373596191406, + "rewards/rejected": -3.7098512649536133, + "step": 2562 + }, + { + "epoch": 0.3, + "learning_rate": 2.1375930081492852e-07, + "logits/chosen": -1.9491183757781982, + "logits/rejected": -2.1379191875457764, + "logps/chosen": -283.219482421875, + "logps/rejected": -313.8524169921875, + "loss": 0.8224, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1927485466003418, + "rewards/margins": 1.3765716552734375, + "rewards/rejected": -2.5693204402923584, + "step": 2563 + }, + { + "epoch": 0.3, + "learning_rate": 2.1372386913901027e-07, + "logits/chosen": -2.441416025161743, + "logits/rejected": -2.434452533721924, + "logps/chosen": -174.22471618652344, + "logps/rejected": -221.33143615722656, + "loss": 0.3138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0940719842910767, + "rewards/margins": 1.596376895904541, + "rewards/rejected": -2.6904489994049072, + "step": 2564 + }, + { + "epoch": 0.3, + "learning_rate": 2.1368843746309202e-07, + "logits/chosen": -3.0159478187561035, + "logits/rejected": -3.010021686553955, + "logps/chosen": -212.3440704345703, + "logps/rejected": -238.00233459472656, + "loss": 0.2541, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1313419342041016, + "rewards/margins": 2.14462947845459, + "rewards/rejected": -3.2759711742401123, + "step": 2565 + }, + { + "epoch": 0.3, + "learning_rate": 2.1365300578717374e-07, + "logits/chosen": -2.8007259368896484, + "logits/rejected": -2.437293529510498, + "logps/chosen": -197.3070068359375, + "logps/rejected": -234.79383850097656, + "loss": 0.6553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9126080274581909, + "rewards/margins": 2.3788390159606934, + "rewards/rejected": -3.2914469242095947, + "step": 2566 + }, + { + "epoch": 0.3, + "learning_rate": 2.1361757411125547e-07, + "logits/chosen": -2.4267590045928955, + "logits/rejected": -2.5445449352264404, + "logps/chosen": -273.1348876953125, + "logps/rejected": -282.7674560546875, + "loss": 0.1588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6609268188476562, + "rewards/margins": 3.722398042678833, + "rewards/rejected": -4.383325099945068, + "step": 2567 + }, + { + "epoch": 0.3, + "learning_rate": 2.135821424353372e-07, + "logits/chosen": -2.3227498531341553, + "logits/rejected": -2.548868179321289, + "logps/chosen": -305.441650390625, + "logps/rejected": -246.95455932617188, + "loss": 0.392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39701545238494873, + "rewards/margins": 1.585936427116394, + "rewards/rejected": -1.9829518795013428, + "step": 2568 + }, + { + "epoch": 0.3, + "learning_rate": 2.135467107594189e-07, + "logits/chosen": -2.3718113899230957, + "logits/rejected": -2.1741511821746826, + "logps/chosen": -348.6911315917969, + "logps/rejected": -266.3795471191406, + "loss": 0.2698, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05877622216939926, + "rewards/margins": 1.889040470123291, + "rewards/rejected": -1.8302642107009888, + "step": 2569 + }, + { + "epoch": 0.3, + "learning_rate": 2.1351127908350063e-07, + "logits/chosen": -1.788395643234253, + "logits/rejected": -2.0295250415802, + "logps/chosen": -254.83863830566406, + "logps/rejected": -252.07754516601562, + "loss": 0.7222, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.093961000442505, + "rewards/margins": 1.495567798614502, + "rewards/rejected": -3.589528799057007, + "step": 2570 + }, + { + "epoch": 0.3, + "learning_rate": 2.1347584740758235e-07, + "logits/chosen": -2.507138252258301, + "logits/rejected": -2.6624112129211426, + "logps/chosen": -167.6116943359375, + "logps/rejected": -154.7735595703125, + "loss": 0.4048, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8135145902633667, + "rewards/margins": 0.9753490686416626, + "rewards/rejected": -1.7888636589050293, + "step": 2571 + }, + { + "epoch": 0.3, + "learning_rate": 2.134404157316641e-07, + "logits/chosen": -2.4818649291992188, + "logits/rejected": -2.5543031692504883, + "logps/chosen": -192.1553192138672, + "logps/rejected": -242.59072875976562, + "loss": 0.3473, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3355586528778076, + "rewards/margins": 2.013864040374756, + "rewards/rejected": -2.3494229316711426, + "step": 2572 + }, + { + "epoch": 0.3, + "learning_rate": 2.1340498405574582e-07, + "logits/chosen": -2.338322401046753, + "logits/rejected": -2.352806568145752, + "logps/chosen": -233.18942260742188, + "logps/rejected": -284.0077209472656, + "loss": 0.1236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9903063774108887, + "rewards/margins": 3.2890801429748535, + "rewards/rejected": -4.279386520385742, + "step": 2573 + }, + { + "epoch": 0.3, + "learning_rate": 2.1336955237982755e-07, + "logits/chosen": -2.3057117462158203, + "logits/rejected": -2.434682846069336, + "logps/chosen": -170.9749298095703, + "logps/rejected": -154.5333709716797, + "loss": 0.5823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9230182766914368, + "rewards/margins": 0.6151262521743774, + "rewards/rejected": -1.5381447076797485, + "step": 2574 + }, + { + "epoch": 0.3, + "learning_rate": 2.1333412070390927e-07, + "logits/chosen": -2.5503361225128174, + "logits/rejected": -2.627332925796509, + "logps/chosen": -370.56854248046875, + "logps/rejected": -287.74993896484375, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.945339024066925, + "rewards/margins": 1.933487892150879, + "rewards/rejected": -2.8788270950317383, + "step": 2575 + }, + { + "epoch": 0.3, + "learning_rate": 2.1329868902799102e-07, + "logits/chosen": -1.8307360410690308, + "logits/rejected": -2.1788668632507324, + "logps/chosen": -420.8900146484375, + "logps/rejected": -287.46881103515625, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46200910210609436, + "rewards/margins": 1.8660531044006348, + "rewards/rejected": -2.328062057495117, + "step": 2576 + }, + { + "epoch": 0.3, + "learning_rate": 2.1326325735207277e-07, + "logits/chosen": -2.641584873199463, + "logits/rejected": -2.4264044761657715, + "logps/chosen": -233.10919189453125, + "logps/rejected": -300.5948181152344, + "loss": 0.2567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9543836116790771, + "rewards/margins": 1.8592082262039185, + "rewards/rejected": -2.813591718673706, + "step": 2577 + }, + { + "epoch": 0.3, + "learning_rate": 2.132278256761545e-07, + "logits/chosen": -2.2594223022460938, + "logits/rejected": -1.9873428344726562, + "logps/chosen": -161.418212890625, + "logps/rejected": -249.96804809570312, + "loss": 0.5935, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07782843708992004, + "rewards/margins": 1.0898891687393188, + "rewards/rejected": -1.1677175760269165, + "step": 2578 + }, + { + "epoch": 0.3, + "learning_rate": 2.131923940002362e-07, + "logits/chosen": -2.7860255241394043, + "logits/rejected": -2.6397600173950195, + "logps/chosen": -354.02996826171875, + "logps/rejected": -356.10198974609375, + "loss": 0.2757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8101099729537964, + "rewards/margins": 1.934465765953064, + "rewards/rejected": -2.7445757389068604, + "step": 2579 + }, + { + "epoch": 0.3, + "learning_rate": 2.1315696232431793e-07, + "logits/chosen": -2.0987277030944824, + "logits/rejected": -2.3405189514160156, + "logps/chosen": -461.44110107421875, + "logps/rejected": -289.266357421875, + "loss": 0.2576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5622744560241699, + "rewards/margins": 1.505702257156372, + "rewards/rejected": -2.067976713180542, + "step": 2580 + }, + { + "epoch": 0.3, + "learning_rate": 2.1312153064839965e-07, + "logits/chosen": -2.2263922691345215, + "logits/rejected": -2.131243944168091, + "logps/chosen": -363.37567138671875, + "logps/rejected": -444.2206115722656, + "loss": 0.5159, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.322045087814331, + "rewards/margins": 1.8033368587493896, + "rewards/rejected": -3.1253819465637207, + "step": 2581 + }, + { + "epoch": 0.3, + "learning_rate": 2.1308609897248138e-07, + "logits/chosen": -1.8517756462097168, + "logits/rejected": -2.052490472793579, + "logps/chosen": -287.9496765136719, + "logps/rejected": -332.45440673828125, + "loss": 0.7047, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8589866161346436, + "rewards/margins": 0.7502407431602478, + "rewards/rejected": -1.609227180480957, + "step": 2582 + }, + { + "epoch": 0.3, + "learning_rate": 2.1305066729656313e-07, + "logits/chosen": -2.078005790710449, + "logits/rejected": -2.3747832775115967, + "logps/chosen": -313.2320861816406, + "logps/rejected": -243.12908935546875, + "loss": 0.4894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7399553060531616, + "rewards/margins": 0.8784913420677185, + "rewards/rejected": -1.618446707725525, + "step": 2583 + }, + { + "epoch": 0.3, + "learning_rate": 2.1301523562064485e-07, + "logits/chosen": -1.7394474744796753, + "logits/rejected": -1.9447870254516602, + "logps/chosen": -236.136962890625, + "logps/rejected": -213.13027954101562, + "loss": 0.5938, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4992235898971558, + "rewards/margins": 1.3519543409347534, + "rewards/rejected": -2.851177930831909, + "step": 2584 + }, + { + "epoch": 0.3, + "learning_rate": 2.1297980394472657e-07, + "logits/chosen": -2.3467164039611816, + "logits/rejected": -2.3690361976623535, + "logps/chosen": -200.31387329101562, + "logps/rejected": -558.9495239257812, + "loss": 0.1757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5098056793212891, + "rewards/margins": 2.6347427368164062, + "rewards/rejected": -3.1445486545562744, + "step": 2585 + }, + { + "epoch": 0.3, + "learning_rate": 2.129443722688083e-07, + "logits/chosen": -2.170457363128662, + "logits/rejected": -2.2061541080474854, + "logps/chosen": -348.7015075683594, + "logps/rejected": -282.8475646972656, + "loss": 0.6392, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8532479405403137, + "rewards/margins": 1.0584313869476318, + "rewards/rejected": -1.9116792678833008, + "step": 2586 + }, + { + "epoch": 0.3, + "learning_rate": 2.1290894059289001e-07, + "logits/chosen": -2.223428249359131, + "logits/rejected": -2.410503387451172, + "logps/chosen": -194.19691467285156, + "logps/rejected": -187.31814575195312, + "loss": 1.2347, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2361310720443726, + "rewards/margins": 0.6413974761962891, + "rewards/rejected": -1.8775286674499512, + "step": 2587 + }, + { + "epoch": 0.3, + "learning_rate": 2.128735089169718e-07, + "logits/chosen": -2.449988842010498, + "logits/rejected": -2.3299474716186523, + "logps/chosen": -365.9095153808594, + "logps/rejected": -390.0362854003906, + "loss": 0.5369, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9473435878753662, + "rewards/margins": 2.1307365894317627, + "rewards/rejected": -4.078080654144287, + "step": 2588 + }, + { + "epoch": 0.3, + "learning_rate": 2.128380772410535e-07, + "logits/chosen": -2.0173096656799316, + "logits/rejected": -1.8287954330444336, + "logps/chosen": -278.3974304199219, + "logps/rejected": -304.689453125, + "loss": 0.5405, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5452982187271118, + "rewards/margins": 1.493009090423584, + "rewards/rejected": -2.0383071899414062, + "step": 2589 + }, + { + "epoch": 0.3, + "learning_rate": 2.1280264556513523e-07, + "logits/chosen": -2.9732067584991455, + "logits/rejected": -3.045532703399658, + "logps/chosen": -260.14666748046875, + "logps/rejected": -250.9874725341797, + "loss": 0.3417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8266110420227051, + "rewards/margins": 2.058135986328125, + "rewards/rejected": -2.88474702835083, + "step": 2590 + }, + { + "epoch": 0.3, + "learning_rate": 2.1276721388921696e-07, + "logits/chosen": -2.44769287109375, + "logits/rejected": -2.013193368911743, + "logps/chosen": -383.69525146484375, + "logps/rejected": -332.4159851074219, + "loss": 0.4063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6982718110084534, + "rewards/margins": 2.229570150375366, + "rewards/rejected": -2.927841901779175, + "step": 2591 + }, + { + "epoch": 0.3, + "learning_rate": 2.1273178221329868e-07, + "logits/chosen": -2.5451931953430176, + "logits/rejected": -2.4386518001556396, + "logps/chosen": -311.6807861328125, + "logps/rejected": -398.2003173828125, + "loss": 0.2306, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3065221309661865, + "rewards/margins": 2.2609922885894775, + "rewards/rejected": -3.567514181137085, + "step": 2592 + }, + { + "epoch": 0.3, + "learning_rate": 2.126963505373804e-07, + "logits/chosen": -2.0170228481292725, + "logits/rejected": -2.314340353012085, + "logps/chosen": -659.23486328125, + "logps/rejected": -409.014404296875, + "loss": 0.1879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17520523071289062, + "rewards/margins": 2.439131736755371, + "rewards/rejected": -2.6143369674682617, + "step": 2593 + }, + { + "epoch": 0.3, + "learning_rate": 2.1266091886146215e-07, + "logits/chosen": -2.5408551692962646, + "logits/rejected": -2.7623209953308105, + "logps/chosen": -419.5908203125, + "logps/rejected": -262.07952880859375, + "loss": 0.4076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42643341422080994, + "rewards/margins": 1.9627695083618164, + "rewards/rejected": -2.3892030715942383, + "step": 2594 + }, + { + "epoch": 0.3, + "learning_rate": 2.1262548718554387e-07, + "logits/chosen": -2.130004405975342, + "logits/rejected": -2.1696572303771973, + "logps/chosen": -237.37547302246094, + "logps/rejected": -297.6939392089844, + "loss": 0.6062, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8029007911682129, + "rewards/margins": 0.5593338012695312, + "rewards/rejected": -1.3622345924377441, + "step": 2595 + }, + { + "epoch": 0.3, + "learning_rate": 2.125900555096256e-07, + "logits/chosen": -2.3736417293548584, + "logits/rejected": -2.410658359527588, + "logps/chosen": -264.06817626953125, + "logps/rejected": -280.2198486328125, + "loss": 0.4736, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.343987226486206, + "rewards/margins": 1.5078290700912476, + "rewards/rejected": -2.851816177368164, + "step": 2596 + }, + { + "epoch": 0.3, + "learning_rate": 2.1255462383370731e-07, + "logits/chosen": -2.7688183784484863, + "logits/rejected": -2.6200129985809326, + "logps/chosen": -276.15789794921875, + "logps/rejected": -315.2568359375, + "loss": 0.2082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.957351565361023, + "rewards/margins": 2.4698352813720703, + "rewards/rejected": -3.427186965942383, + "step": 2597 + }, + { + "epoch": 0.3, + "learning_rate": 2.1251919215778904e-07, + "logits/chosen": -2.7673873901367188, + "logits/rejected": -2.792937994003296, + "logps/chosen": -253.6224365234375, + "logps/rejected": -289.9352722167969, + "loss": 0.4511, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6798582077026367, + "rewards/margins": 0.9627512693405151, + "rewards/rejected": -1.6426095962524414, + "step": 2598 + }, + { + "epoch": 0.3, + "learning_rate": 2.1248376048187076e-07, + "logits/chosen": -2.4799113273620605, + "logits/rejected": -2.410592794418335, + "logps/chosen": -215.07684326171875, + "logps/rejected": -262.2229919433594, + "loss": 0.3251, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20404258370399475, + "rewards/margins": 1.8311035633087158, + "rewards/rejected": -2.0351459980010986, + "step": 2599 + }, + { + "epoch": 0.3, + "learning_rate": 2.1244832880595253e-07, + "logits/chosen": -2.0321807861328125, + "logits/rejected": -2.256634473800659, + "logps/chosen": -370.8642883300781, + "logps/rejected": -295.35736083984375, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.708146333694458, + "rewards/margins": 2.2296202182769775, + "rewards/rejected": -2.9377665519714355, + "step": 2600 + }, + { + "epoch": 0.3, + "learning_rate": 2.1241289713003426e-07, + "logits/chosen": -1.658935308456421, + "logits/rejected": -1.6585564613342285, + "logps/chosen": -491.05328369140625, + "logps/rejected": -448.1021728515625, + "loss": 1.0592, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8309261798858643, + "rewards/margins": -0.4665265679359436, + "rewards/rejected": -0.36439961194992065, + "step": 2601 + }, + { + "epoch": 0.3, + "learning_rate": 2.1237746545411598e-07, + "logits/chosen": -2.0054657459259033, + "logits/rejected": -1.9933688640594482, + "logps/chosen": -201.70379638671875, + "logps/rejected": -203.50924682617188, + "loss": 0.4371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14479874074459076, + "rewards/margins": 1.36676025390625, + "rewards/rejected": -1.511559009552002, + "step": 2602 + }, + { + "epoch": 0.3, + "learning_rate": 2.123420337781977e-07, + "logits/chosen": -2.478367328643799, + "logits/rejected": -2.4522769451141357, + "logps/chosen": -268.9043273925781, + "logps/rejected": -254.5853271484375, + "loss": 0.5811, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5560452938079834, + "rewards/margins": 1.874540090560913, + "rewards/rejected": -2.4305853843688965, + "step": 2603 + }, + { + "epoch": 0.3, + "learning_rate": 2.1230660210227942e-07, + "logits/chosen": -1.834705114364624, + "logits/rejected": -2.2807557582855225, + "logps/chosen": -406.5901794433594, + "logps/rejected": -355.1419677734375, + "loss": 0.5577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0850335359573364, + "rewards/margins": 1.8589515686035156, + "rewards/rejected": -2.9439849853515625, + "step": 2604 + }, + { + "epoch": 0.3, + "learning_rate": 2.1227117042636114e-07, + "logits/chosen": -2.3453547954559326, + "logits/rejected": -2.3312788009643555, + "logps/chosen": -313.15283203125, + "logps/rejected": -301.3948669433594, + "loss": 0.4597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9407507181167603, + "rewards/margins": 1.9643663167953491, + "rewards/rejected": -2.9051170349121094, + "step": 2605 + }, + { + "epoch": 0.3, + "learning_rate": 2.122357387504429e-07, + "logits/chosen": -2.81205415725708, + "logits/rejected": -2.751915454864502, + "logps/chosen": -242.0782470703125, + "logps/rejected": -187.39796447753906, + "loss": 0.307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3091857433319092, + "rewards/margins": 1.7634137868881226, + "rewards/rejected": -2.072599411010742, + "step": 2606 + }, + { + "epoch": 0.3, + "learning_rate": 2.1220030707452462e-07, + "logits/chosen": -1.9987821578979492, + "logits/rejected": -2.2674148082733154, + "logps/chosen": -418.02392578125, + "logps/rejected": -334.96258544921875, + "loss": 0.5651, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7907872200012207, + "rewards/margins": 0.9457281231880188, + "rewards/rejected": -2.736515522003174, + "step": 2607 + }, + { + "epoch": 0.3, + "learning_rate": 2.1216487539860634e-07, + "logits/chosen": -2.056626319885254, + "logits/rejected": -2.3265323638916016, + "logps/chosen": -327.2454833984375, + "logps/rejected": -355.4508361816406, + "loss": 0.4877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8456428050994873, + "rewards/margins": 1.4092652797698975, + "rewards/rejected": -2.2549080848693848, + "step": 2608 + }, + { + "epoch": 0.3, + "learning_rate": 2.1212944372268806e-07, + "logits/chosen": -2.259206533432007, + "logits/rejected": -2.403496742248535, + "logps/chosen": -307.18438720703125, + "logps/rejected": -312.2645263671875, + "loss": 0.2615, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0662508010864258, + "rewards/margins": 2.3492517471313477, + "rewards/rejected": -3.4155025482177734, + "step": 2609 + }, + { + "epoch": 0.3, + "learning_rate": 2.1209401204676978e-07, + "logits/chosen": -1.7066807746887207, + "logits/rejected": -1.8605637550354004, + "logps/chosen": -164.90879821777344, + "logps/rejected": -251.44285583496094, + "loss": 0.6079, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0301826000213623, + "rewards/margins": 2.6616854667663574, + "rewards/rejected": -3.691868305206299, + "step": 2610 + }, + { + "epoch": 0.3, + "learning_rate": 2.1205858037085156e-07, + "logits/chosen": -2.483349084854126, + "logits/rejected": -2.168569326400757, + "logps/chosen": -351.0950927734375, + "logps/rejected": -362.4216613769531, + "loss": 0.9923, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3393306732177734, + "rewards/margins": 0.3561866283416748, + "rewards/rejected": -1.6955173015594482, + "step": 2611 + }, + { + "epoch": 0.3, + "learning_rate": 2.1202314869493328e-07, + "logits/chosen": -2.4357848167419434, + "logits/rejected": -2.5168371200561523, + "logps/chosen": -315.2315673828125, + "logps/rejected": -248.7649688720703, + "loss": 0.1825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6943668127059937, + "rewards/margins": 2.862358331680298, + "rewards/rejected": -3.556725025177002, + "step": 2612 + }, + { + "epoch": 0.3, + "learning_rate": 2.11987717019015e-07, + "logits/chosen": -2.2405080795288086, + "logits/rejected": -2.593627691268921, + "logps/chosen": -508.3279113769531, + "logps/rejected": -391.7936706542969, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29705363512039185, + "rewards/margins": 3.110368251800537, + "rewards/rejected": -3.407421827316284, + "step": 2613 + }, + { + "epoch": 0.3, + "learning_rate": 2.1195228534309672e-07, + "logits/chosen": -2.561253786087036, + "logits/rejected": -2.840947151184082, + "logps/chosen": -419.335205078125, + "logps/rejected": -248.33193969726562, + "loss": 0.6347, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.216674566268921, + "rewards/margins": 0.8248121738433838, + "rewards/rejected": -2.0414867401123047, + "step": 2614 + }, + { + "epoch": 0.3, + "learning_rate": 2.1191685366717845e-07, + "logits/chosen": -2.5178534984588623, + "logits/rejected": -2.454564332962036, + "logps/chosen": -452.730224609375, + "logps/rejected": -263.9150390625, + "loss": 0.4406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9623321890830994, + "rewards/margins": 1.2375288009643555, + "rewards/rejected": -2.1998610496520996, + "step": 2615 + }, + { + "epoch": 0.3, + "learning_rate": 2.1188142199126017e-07, + "logits/chosen": -1.4523088932037354, + "logits/rejected": -1.9552210569381714, + "logps/chosen": -401.03717041015625, + "logps/rejected": -356.2298278808594, + "loss": 0.2712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.967836320400238, + "rewards/margins": 1.5908634662628174, + "rewards/rejected": -2.5586998462677, + "step": 2616 + }, + { + "epoch": 0.3, + "learning_rate": 2.1184599031534192e-07, + "logits/chosen": -2.603978395462036, + "logits/rejected": -2.7571914196014404, + "logps/chosen": -218.9181365966797, + "logps/rejected": -296.4081115722656, + "loss": 0.3591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8356574177742004, + "rewards/margins": 2.6117098331451416, + "rewards/rejected": -3.4473671913146973, + "step": 2617 + }, + { + "epoch": 0.3, + "learning_rate": 2.1181055863942364e-07, + "logits/chosen": -2.5307421684265137, + "logits/rejected": -2.3866899013519287, + "logps/chosen": -213.90304565429688, + "logps/rejected": -289.0001220703125, + "loss": 0.3718, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.394941806793213, + "rewards/margins": 1.557161569595337, + "rewards/rejected": -2.95210337638855, + "step": 2618 + }, + { + "epoch": 0.3, + "learning_rate": 2.1177512696350536e-07, + "logits/chosen": -2.2501115798950195, + "logits/rejected": -2.0865461826324463, + "logps/chosen": -253.7670440673828, + "logps/rejected": -287.92144775390625, + "loss": 0.2165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3729434907436371, + "rewards/margins": 2.7201356887817383, + "rewards/rejected": -3.0930793285369873, + "step": 2619 + }, + { + "epoch": 0.3, + "learning_rate": 2.1173969528758708e-07, + "logits/chosen": -2.152005910873413, + "logits/rejected": -2.4854536056518555, + "logps/chosen": -419.6391906738281, + "logps/rejected": -291.58441162109375, + "loss": 0.4318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8422344923019409, + "rewards/margins": 0.9933696389198303, + "rewards/rejected": -1.8356040716171265, + "step": 2620 + }, + { + "epoch": 0.3, + "learning_rate": 2.117042636116688e-07, + "logits/chosen": -2.2342865467071533, + "logits/rejected": -1.9242534637451172, + "logps/chosen": -320.97076416015625, + "logps/rejected": -397.93792724609375, + "loss": 0.6453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5702366828918457, + "rewards/margins": 1.847440242767334, + "rewards/rejected": -2.4176766872406006, + "step": 2621 + }, + { + "epoch": 0.31, + "learning_rate": 2.1166883193575053e-07, + "logits/chosen": -2.6408190727233887, + "logits/rejected": -2.531914710998535, + "logps/chosen": -359.55718994140625, + "logps/rejected": -267.2164306640625, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2366304397583008, + "rewards/margins": 2.831247091293335, + "rewards/rejected": -4.067877769470215, + "step": 2622 + }, + { + "epoch": 0.31, + "learning_rate": 2.116334002598323e-07, + "logits/chosen": -2.1648788452148438, + "logits/rejected": -2.4661946296691895, + "logps/chosen": -283.6842041015625, + "logps/rejected": -239.6512908935547, + "loss": 0.1733, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0294183492660522, + "rewards/margins": 2.941720485687256, + "rewards/rejected": -3.9711389541625977, + "step": 2623 + }, + { + "epoch": 0.31, + "learning_rate": 2.1159796858391402e-07, + "logits/chosen": -2.277092218399048, + "logits/rejected": -2.169022560119629, + "logps/chosen": -244.0896453857422, + "logps/rejected": -260.3732604980469, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4678685963153839, + "rewards/margins": 2.222628593444824, + "rewards/rejected": -2.690497398376465, + "step": 2624 + }, + { + "epoch": 0.31, + "learning_rate": 2.1156253690799575e-07, + "logits/chosen": -2.20923113822937, + "logits/rejected": -2.0786266326904297, + "logps/chosen": -376.7230224609375, + "logps/rejected": -361.3587341308594, + "loss": 0.2817, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9471611976623535, + "rewards/margins": 2.585561513900757, + "rewards/rejected": -3.5327227115631104, + "step": 2625 + }, + { + "epoch": 0.31, + "learning_rate": 2.1152710523207747e-07, + "logits/chosen": -2.4404876232147217, + "logits/rejected": -2.4123077392578125, + "logps/chosen": -148.29144287109375, + "logps/rejected": -177.520263671875, + "loss": 0.2516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7547823190689087, + "rewards/margins": 2.3257863521575928, + "rewards/rejected": -3.080568790435791, + "step": 2626 + }, + { + "epoch": 0.31, + "learning_rate": 2.114916735561592e-07, + "logits/chosen": -2.4032299518585205, + "logits/rejected": -2.618673324584961, + "logps/chosen": -286.8197021484375, + "logps/rejected": -238.5795135498047, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1468677818775177, + "rewards/margins": 1.5315043926239014, + "rewards/rejected": -1.6783721446990967, + "step": 2627 + }, + { + "epoch": 0.31, + "learning_rate": 2.1145624188024094e-07, + "logits/chosen": -2.320826530456543, + "logits/rejected": -2.5137197971343994, + "logps/chosen": -221.326904296875, + "logps/rejected": -145.04627990722656, + "loss": 0.3531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5586292147636414, + "rewards/margins": 1.2979543209075928, + "rewards/rejected": -1.856583595275879, + "step": 2628 + }, + { + "epoch": 0.31, + "learning_rate": 2.1142081020432266e-07, + "logits/chosen": -2.181326150894165, + "logits/rejected": -1.8664754629135132, + "logps/chosen": -261.5431213378906, + "logps/rejected": -301.7285461425781, + "loss": 0.265, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22348226606845856, + "rewards/margins": 1.9520633220672607, + "rewards/rejected": -1.7285809516906738, + "step": 2629 + }, + { + "epoch": 0.31, + "learning_rate": 2.1138537852840438e-07, + "logits/chosen": -2.8733761310577393, + "logits/rejected": -2.8309526443481445, + "logps/chosen": -167.49317932128906, + "logps/rejected": -230.95228576660156, + "loss": 0.2215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7066681981086731, + "rewards/margins": 2.5904173851013184, + "rewards/rejected": -3.2970852851867676, + "step": 2630 + }, + { + "epoch": 0.31, + "learning_rate": 2.113499468524861e-07, + "logits/chosen": -2.6493706703186035, + "logits/rejected": -2.5405726432800293, + "logps/chosen": -265.2339172363281, + "logps/rejected": -317.2952880859375, + "loss": 0.3547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37690725922584534, + "rewards/margins": 1.6584751605987549, + "rewards/rejected": -2.0353822708129883, + "step": 2631 + }, + { + "epoch": 0.31, + "learning_rate": 2.1131451517656783e-07, + "logits/chosen": -2.4469263553619385, + "logits/rejected": -2.388017416000366, + "logps/chosen": -284.89300537109375, + "logps/rejected": -309.20916748046875, + "loss": 0.2181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35175448656082153, + "rewards/margins": 2.2137906551361084, + "rewards/rejected": -2.565545082092285, + "step": 2632 + }, + { + "epoch": 0.31, + "learning_rate": 2.1127908350064955e-07, + "logits/chosen": -1.7910642623901367, + "logits/rejected": -1.699021577835083, + "logps/chosen": -198.5264892578125, + "logps/rejected": -294.5920104980469, + "loss": 0.452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37074053287506104, + "rewards/margins": 1.4371693134307861, + "rewards/rejected": -1.8079099655151367, + "step": 2633 + }, + { + "epoch": 0.31, + "learning_rate": 2.1124365182473127e-07, + "logits/chosen": -2.6893982887268066, + "logits/rejected": -2.8123655319213867, + "logps/chosen": -166.62506103515625, + "logps/rejected": -201.0951690673828, + "loss": 0.1793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3479629158973694, + "rewards/margins": 3.9292595386505127, + "rewards/rejected": -4.277222156524658, + "step": 2634 + }, + { + "epoch": 0.31, + "learning_rate": 2.1120822014881305e-07, + "logits/chosen": -2.5012810230255127, + "logits/rejected": -2.5813379287719727, + "logps/chosen": -350.53759765625, + "logps/rejected": -286.6799011230469, + "loss": 0.1268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13475248217582703, + "rewards/margins": 2.1292128562927246, + "rewards/rejected": -1.9944602251052856, + "step": 2635 + }, + { + "epoch": 0.31, + "learning_rate": 2.1117278847289477e-07, + "logits/chosen": -2.7884576320648193, + "logits/rejected": -2.741788148880005, + "logps/chosen": -288.32318115234375, + "logps/rejected": -191.75872802734375, + "loss": 0.4837, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4506542682647705, + "rewards/margins": 1.6181352138519287, + "rewards/rejected": -3.068789482116699, + "step": 2636 + }, + { + "epoch": 0.31, + "learning_rate": 2.111373567969765e-07, + "logits/chosen": -2.429070472717285, + "logits/rejected": -2.6503121852874756, + "logps/chosen": -242.72433471679688, + "logps/rejected": -191.52037048339844, + "loss": 0.2607, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1555165946483612, + "rewards/margins": 2.1063499450683594, + "rewards/rejected": -2.261866807937622, + "step": 2637 + }, + { + "epoch": 0.31, + "learning_rate": 2.111019251210582e-07, + "logits/chosen": -2.692335844039917, + "logits/rejected": -2.626697540283203, + "logps/chosen": -160.56411743164062, + "logps/rejected": -208.75839233398438, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.734602689743042, + "rewards/margins": 1.7118511199951172, + "rewards/rejected": -2.44645357131958, + "step": 2638 + }, + { + "epoch": 0.31, + "learning_rate": 2.1106649344513996e-07, + "logits/chosen": -2.485778570175171, + "logits/rejected": -2.694890022277832, + "logps/chosen": -298.801025390625, + "logps/rejected": -253.38330078125, + "loss": 0.6193, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5510451197624207, + "rewards/margins": 0.774277925491333, + "rewards/rejected": -1.3253231048583984, + "step": 2639 + }, + { + "epoch": 0.31, + "learning_rate": 2.1103106176922168e-07, + "logits/chosen": -3.069822311401367, + "logits/rejected": -2.938406467437744, + "logps/chosen": -349.7483825683594, + "logps/rejected": -244.55052185058594, + "loss": 0.1686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9716681241989136, + "rewards/margins": 2.650838613510132, + "rewards/rejected": -3.622506618499756, + "step": 2640 + }, + { + "epoch": 0.31, + "learning_rate": 2.109956300933034e-07, + "logits/chosen": -2.6175661087036133, + "logits/rejected": -2.672595500946045, + "logps/chosen": -249.59304809570312, + "logps/rejected": -345.5322265625, + "loss": 0.4742, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7195810079574585, + "rewards/margins": 2.117192029953003, + "rewards/rejected": -2.836772918701172, + "step": 2641 + }, + { + "epoch": 0.31, + "learning_rate": 2.1096019841738513e-07, + "logits/chosen": -2.5615475177764893, + "logits/rejected": -2.543086051940918, + "logps/chosen": -142.9766082763672, + "logps/rejected": -213.3986358642578, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7747042179107666, + "rewards/margins": 1.8799471855163574, + "rewards/rejected": -2.654651641845703, + "step": 2642 + }, + { + "epoch": 0.31, + "learning_rate": 2.1092476674146685e-07, + "logits/chosen": -2.264913558959961, + "logits/rejected": -2.3459906578063965, + "logps/chosen": -265.44952392578125, + "logps/rejected": -232.56129455566406, + "loss": 0.6207, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2776224613189697, + "rewards/margins": 2.1406726837158203, + "rewards/rejected": -3.418294906616211, + "step": 2643 + }, + { + "epoch": 0.31, + "learning_rate": 2.1088933506554857e-07, + "logits/chosen": -2.516901731491089, + "logits/rejected": -2.360185146331787, + "logps/chosen": -138.0449981689453, + "logps/rejected": -264.96563720703125, + "loss": 0.3667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4896734356880188, + "rewards/margins": 2.5722780227661133, + "rewards/rejected": -3.0619516372680664, + "step": 2644 + }, + { + "epoch": 0.31, + "learning_rate": 2.108539033896303e-07, + "logits/chosen": -2.2688114643096924, + "logits/rejected": -2.28415846824646, + "logps/chosen": -288.4903259277344, + "logps/rejected": -285.13177490234375, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3402937650680542, + "rewards/margins": 2.1977367401123047, + "rewards/rejected": -3.5380303859710693, + "step": 2645 + }, + { + "epoch": 0.31, + "learning_rate": 2.1081847171371207e-07, + "logits/chosen": -2.0191867351531982, + "logits/rejected": -2.1728403568267822, + "logps/chosen": -180.56874084472656, + "logps/rejected": -200.29214477539062, + "loss": 1.0942, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5283387899398804, + "rewards/margins": 0.2724747657775879, + "rewards/rejected": -1.8008134365081787, + "step": 2646 + }, + { + "epoch": 0.31, + "learning_rate": 2.107830400377938e-07, + "logits/chosen": -2.132140874862671, + "logits/rejected": -2.0301506519317627, + "logps/chosen": -305.08868408203125, + "logps/rejected": -300.6380920410156, + "loss": 0.515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41881027817726135, + "rewards/margins": 1.301442265510559, + "rewards/rejected": -1.720252513885498, + "step": 2647 + }, + { + "epoch": 0.31, + "learning_rate": 2.1074760836187551e-07, + "logits/chosen": -2.510190010070801, + "logits/rejected": -2.2151806354522705, + "logps/chosen": -121.96829223632812, + "logps/rejected": -272.3302001953125, + "loss": 0.3341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8673459887504578, + "rewards/margins": 2.559882640838623, + "rewards/rejected": -3.4272284507751465, + "step": 2648 + }, + { + "epoch": 0.31, + "learning_rate": 2.1071217668595724e-07, + "logits/chosen": -2.226126194000244, + "logits/rejected": -2.487330913543701, + "logps/chosen": -552.5927124023438, + "logps/rejected": -419.29327392578125, + "loss": 0.4172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8817245364189148, + "rewards/margins": 1.954500675201416, + "rewards/rejected": -2.8362247943878174, + "step": 2649 + }, + { + "epoch": 0.31, + "learning_rate": 2.1067674501003896e-07, + "logits/chosen": -2.533182144165039, + "logits/rejected": -2.476315975189209, + "logps/chosen": -121.79317474365234, + "logps/rejected": -167.78134155273438, + "loss": 0.3491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2469920814037323, + "rewards/margins": 1.272566556930542, + "rewards/rejected": -1.5195586681365967, + "step": 2650 + }, + { + "epoch": 0.31, + "learning_rate": 2.106413133341207e-07, + "logits/chosen": -1.6353230476379395, + "logits/rejected": -2.031093120574951, + "logps/chosen": -260.50286865234375, + "logps/rejected": -161.6191864013672, + "loss": 0.5465, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1875946521759033, + "rewards/margins": 1.0488122701644897, + "rewards/rejected": -2.2364070415496826, + "step": 2651 + }, + { + "epoch": 0.31, + "learning_rate": 2.1060588165820243e-07, + "logits/chosen": -2.794382095336914, + "logits/rejected": -2.7597005367279053, + "logps/chosen": -355.0013427734375, + "logps/rejected": -300.3617858886719, + "loss": 0.5312, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4043614864349365, + "rewards/margins": 1.6262657642364502, + "rewards/rejected": -3.0306272506713867, + "step": 2652 + }, + { + "epoch": 0.31, + "learning_rate": 2.1057044998228415e-07, + "logits/chosen": -1.9108713865280151, + "logits/rejected": -1.9917467832565308, + "logps/chosen": -326.210205078125, + "logps/rejected": -346.9455871582031, + "loss": 1.0825, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.988628625869751, + "rewards/margins": 0.284631609916687, + "rewards/rejected": -2.2732601165771484, + "step": 2653 + }, + { + "epoch": 0.31, + "learning_rate": 2.1053501830636587e-07, + "logits/chosen": -2.1646206378936768, + "logits/rejected": -2.3150386810302734, + "logps/chosen": -393.072021484375, + "logps/rejected": -283.27362060546875, + "loss": 0.8263, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.992642879486084, + "rewards/margins": 0.6271450519561768, + "rewards/rejected": -1.6197879314422607, + "step": 2654 + }, + { + "epoch": 0.31, + "learning_rate": 2.104995866304476e-07, + "logits/chosen": -2.418151617050171, + "logits/rejected": -2.595637083053589, + "logps/chosen": -255.8325958251953, + "logps/rejected": -356.89703369140625, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7023605704307556, + "rewards/margins": 2.6243202686309814, + "rewards/rejected": -3.3266806602478027, + "step": 2655 + }, + { + "epoch": 0.31, + "learning_rate": 2.1046415495452932e-07, + "logits/chosen": -2.9892048835754395, + "logits/rejected": -2.9474711418151855, + "logps/chosen": -129.03915405273438, + "logps/rejected": -187.4677734375, + "loss": 0.3802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7479711771011353, + "rewards/margins": 2.073326826095581, + "rewards/rejected": -2.821298122406006, + "step": 2656 + }, + { + "epoch": 0.31, + "learning_rate": 2.1042872327861107e-07, + "logits/chosen": -2.084246873855591, + "logits/rejected": -2.46764874458313, + "logps/chosen": -341.95489501953125, + "logps/rejected": -180.57235717773438, + "loss": 1.0312, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3535133600234985, + "rewards/margins": 0.8197640180587769, + "rewards/rejected": -2.1732773780822754, + "step": 2657 + }, + { + "epoch": 0.31, + "learning_rate": 2.1039329160269281e-07, + "logits/chosen": -2.0597596168518066, + "logits/rejected": -2.4004149436950684, + "logps/chosen": -312.1173095703125, + "logps/rejected": -341.5392150878906, + "loss": 0.374, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.524691104888916, + "rewards/margins": 1.22756028175354, + "rewards/rejected": -1.7522512674331665, + "step": 2658 + }, + { + "epoch": 0.31, + "learning_rate": 2.1035785992677454e-07, + "logits/chosen": -2.1929712295532227, + "logits/rejected": -2.287137508392334, + "logps/chosen": -353.84649658203125, + "logps/rejected": -291.94720458984375, + "loss": 0.4838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9188960194587708, + "rewards/margins": 1.7596720457077026, + "rewards/rejected": -2.678568124771118, + "step": 2659 + }, + { + "epoch": 0.31, + "learning_rate": 2.1032242825085626e-07, + "logits/chosen": -2.627694845199585, + "logits/rejected": -2.267904281616211, + "logps/chosen": -388.19171142578125, + "logps/rejected": -336.8194580078125, + "loss": 0.2208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1722581386566162, + "rewards/margins": 3.7775230407714844, + "rewards/rejected": -4.94978141784668, + "step": 2660 + }, + { + "epoch": 0.31, + "learning_rate": 2.1028699657493798e-07, + "logits/chosen": -2.4802045822143555, + "logits/rejected": -2.457271099090576, + "logps/chosen": -237.4795379638672, + "logps/rejected": -229.6590576171875, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7305752038955688, + "rewards/margins": 2.229184627532959, + "rewards/rejected": -2.959759473800659, + "step": 2661 + }, + { + "epoch": 0.31, + "learning_rate": 2.1025156489901973e-07, + "logits/chosen": -1.8240361213684082, + "logits/rejected": -2.0376675128936768, + "logps/chosen": -337.7503662109375, + "logps/rejected": -283.7057800292969, + "loss": 0.3674, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1554460525512695, + "rewards/margins": 1.886474609375, + "rewards/rejected": -3.0419209003448486, + "step": 2662 + }, + { + "epoch": 0.31, + "learning_rate": 2.1021613322310145e-07, + "logits/chosen": -1.9533578157424927, + "logits/rejected": -2.1146039962768555, + "logps/chosen": -329.7105712890625, + "logps/rejected": -403.6001892089844, + "loss": 0.4593, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4846744537353516, + "rewards/margins": 2.088966131210327, + "rewards/rejected": -3.5736405849456787, + "step": 2663 + }, + { + "epoch": 0.31, + "learning_rate": 2.1018070154718317e-07, + "logits/chosen": -2.3854057788848877, + "logits/rejected": -2.1195895671844482, + "logps/chosen": -131.17222595214844, + "logps/rejected": -204.53121948242188, + "loss": 0.4014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6077719330787659, + "rewards/margins": 1.0128673315048218, + "rewards/rejected": -1.6206393241882324, + "step": 2664 + }, + { + "epoch": 0.31, + "learning_rate": 2.101452698712649e-07, + "logits/chosen": -2.4261093139648438, + "logits/rejected": -2.5097408294677734, + "logps/chosen": -544.8653564453125, + "logps/rejected": -475.90863037109375, + "loss": 0.2474, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0649033784866333, + "rewards/margins": 3.06026029586792, + "rewards/rejected": -4.125163555145264, + "step": 2665 + }, + { + "epoch": 0.31, + "learning_rate": 2.1010983819534662e-07, + "logits/chosen": -1.6681721210479736, + "logits/rejected": -1.7541205883026123, + "logps/chosen": -257.0186767578125, + "logps/rejected": -267.51123046875, + "loss": 0.5477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.934929370880127, + "rewards/margins": 0.515464723110199, + "rewards/rejected": -1.4503940343856812, + "step": 2666 + }, + { + "epoch": 0.31, + "learning_rate": 2.1007440651942834e-07, + "logits/chosen": -2.3892014026641846, + "logits/rejected": -2.4401612281799316, + "logps/chosen": -295.4156494140625, + "logps/rejected": -244.27566528320312, + "loss": 0.4327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43629157543182373, + "rewards/margins": 1.6572957038879395, + "rewards/rejected": -2.0935871601104736, + "step": 2667 + }, + { + "epoch": 0.31, + "learning_rate": 2.100389748435101e-07, + "logits/chosen": -1.9924544095993042, + "logits/rejected": -2.3507463932037354, + "logps/chosen": -267.5882873535156, + "logps/rejected": -141.91348266601562, + "loss": 0.6673, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9390842914581299, + "rewards/margins": 0.2679375112056732, + "rewards/rejected": -1.207021713256836, + "step": 2668 + }, + { + "epoch": 0.31, + "learning_rate": 2.100035431675918e-07, + "logits/chosen": -1.8421306610107422, + "logits/rejected": -2.098737955093384, + "logps/chosen": -352.4469909667969, + "logps/rejected": -266.88385009765625, + "loss": 0.8283, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.178695559501648, + "rewards/margins": 0.7573772668838501, + "rewards/rejected": -1.9360729455947876, + "step": 2669 + }, + { + "epoch": 0.31, + "learning_rate": 2.0996811149167356e-07, + "logits/chosen": -2.316805124282837, + "logits/rejected": -2.484732151031494, + "logps/chosen": -119.25689697265625, + "logps/rejected": -137.76181030273438, + "loss": 0.5212, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.074318766593933, + "rewards/margins": 1.1764211654663086, + "rewards/rejected": -2.2507400512695312, + "step": 2670 + }, + { + "epoch": 0.31, + "learning_rate": 2.0993267981575528e-07, + "logits/chosen": -2.205911636352539, + "logits/rejected": -2.101837158203125, + "logps/chosen": -257.53692626953125, + "logps/rejected": -252.5493621826172, + "loss": 0.3506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9378831386566162, + "rewards/margins": 1.1028310060501099, + "rewards/rejected": -2.0407142639160156, + "step": 2671 + }, + { + "epoch": 0.31, + "learning_rate": 2.09897248139837e-07, + "logits/chosen": -2.536623954772949, + "logits/rejected": -2.7729992866516113, + "logps/chosen": -268.724853515625, + "logps/rejected": -196.9244384765625, + "loss": 0.5163, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.916095495223999, + "rewards/margins": 1.095945119857788, + "rewards/rejected": -2.012040615081787, + "step": 2672 + }, + { + "epoch": 0.31, + "learning_rate": 2.0986181646391875e-07, + "logits/chosen": -2.053227663040161, + "logits/rejected": -2.5156712532043457, + "logps/chosen": -478.115966796875, + "logps/rejected": -146.27442932128906, + "loss": 0.3886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06146063655614853, + "rewards/margins": 1.7046363353729248, + "rewards/rejected": -1.7660969495773315, + "step": 2673 + }, + { + "epoch": 0.31, + "learning_rate": 2.0982638478800047e-07, + "logits/chosen": -2.0902929306030273, + "logits/rejected": -2.1604156494140625, + "logps/chosen": -273.1072082519531, + "logps/rejected": -357.2471618652344, + "loss": 0.1423, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8819839954376221, + "rewards/margins": 3.795983076095581, + "rewards/rejected": -4.677967071533203, + "step": 2674 + }, + { + "epoch": 0.31, + "learning_rate": 2.097909531120822e-07, + "logits/chosen": -2.2265102863311768, + "logits/rejected": -2.407501697540283, + "logps/chosen": -422.2948913574219, + "logps/rejected": -307.6490478515625, + "loss": 0.7164, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0199847221374512, + "rewards/margins": 0.3709573447704315, + "rewards/rejected": -1.390942096710205, + "step": 2675 + }, + { + "epoch": 0.31, + "learning_rate": 2.0975552143616392e-07, + "logits/chosen": -2.083221912384033, + "logits/rejected": -2.174692392349243, + "logps/chosen": -330.82879638671875, + "logps/rejected": -361.12249755859375, + "loss": 0.6136, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5711886882781982, + "rewards/margins": 2.419903516769409, + "rewards/rejected": -2.9910922050476074, + "step": 2676 + }, + { + "epoch": 0.31, + "learning_rate": 2.0972008976024564e-07, + "logits/chosen": -2.1168875694274902, + "logits/rejected": -2.265673875808716, + "logps/chosen": -430.12750244140625, + "logps/rejected": -279.1639404296875, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16879981756210327, + "rewards/margins": 2.085088014602661, + "rewards/rejected": -2.253887891769409, + "step": 2677 + }, + { + "epoch": 0.31, + "learning_rate": 2.0968465808432736e-07, + "logits/chosen": -1.954486608505249, + "logits/rejected": -2.3209309577941895, + "logps/chosen": -299.5252380371094, + "logps/rejected": -189.27455139160156, + "loss": 0.4292, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3502020835876465, + "rewards/margins": 1.8267571926116943, + "rewards/rejected": -3.176959276199341, + "step": 2678 + }, + { + "epoch": 0.31, + "learning_rate": 2.0964922640840908e-07, + "logits/chosen": -1.9220571517944336, + "logits/rejected": -1.844224214553833, + "logps/chosen": -197.9112091064453, + "logps/rejected": -230.99183654785156, + "loss": 0.3335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12954463064670563, + "rewards/margins": 1.3945575952529907, + "rewards/rejected": -1.5241022109985352, + "step": 2679 + }, + { + "epoch": 0.31, + "learning_rate": 2.0961379473249083e-07, + "logits/chosen": -2.385979175567627, + "logits/rejected": -2.2437386512756348, + "logps/chosen": -131.93894958496094, + "logps/rejected": -265.19842529296875, + "loss": 0.896, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.920721411705017, + "rewards/margins": 3.388429880142212, + "rewards/rejected": -5.3091511726379395, + "step": 2680 + }, + { + "epoch": 0.31, + "learning_rate": 2.0957836305657258e-07, + "logits/chosen": -1.9039777517318726, + "logits/rejected": -2.2027769088745117, + "logps/chosen": -328.58544921875, + "logps/rejected": -245.66094970703125, + "loss": 0.4788, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1873950958251953, + "rewards/margins": 1.8677208423614502, + "rewards/rejected": -3.0551156997680664, + "step": 2681 + }, + { + "epoch": 0.31, + "learning_rate": 2.095429313806543e-07, + "logits/chosen": -2.410505771636963, + "logits/rejected": -2.3592982292175293, + "logps/chosen": -177.11376953125, + "logps/rejected": -260.02154541015625, + "loss": 0.3227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8805637955665588, + "rewards/margins": 1.5039234161376953, + "rewards/rejected": -2.3844871520996094, + "step": 2682 + }, + { + "epoch": 0.31, + "learning_rate": 2.0950749970473603e-07, + "logits/chosen": -2.1531481742858887, + "logits/rejected": -2.119595527648926, + "logps/chosen": -197.77114868164062, + "logps/rejected": -195.6953582763672, + "loss": 0.4999, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.339576780796051, + "rewards/margins": 1.182603359222412, + "rewards/rejected": -1.5221800804138184, + "step": 2683 + }, + { + "epoch": 0.31, + "learning_rate": 2.0947206802881775e-07, + "logits/chosen": -2.149996757507324, + "logits/rejected": -2.546684503555298, + "logps/chosen": -553.7098388671875, + "logps/rejected": -258.75604248046875, + "loss": 0.345, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43140238523483276, + "rewards/margins": 1.6851290464401245, + "rewards/rejected": -2.1165313720703125, + "step": 2684 + }, + { + "epoch": 0.31, + "learning_rate": 2.094366363528995e-07, + "logits/chosen": -2.21504545211792, + "logits/rejected": -2.173283815383911, + "logps/chosen": -350.2112731933594, + "logps/rejected": -243.73985290527344, + "loss": 0.4231, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1309080123901367, + "rewards/margins": 1.1104423999786377, + "rewards/rejected": -3.2413504123687744, + "step": 2685 + }, + { + "epoch": 0.31, + "learning_rate": 2.0940120467698122e-07, + "logits/chosen": -2.6654253005981445, + "logits/rejected": -2.7541329860687256, + "logps/chosen": -250.3563232421875, + "logps/rejected": -333.6627197265625, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4679511487483978, + "rewards/margins": 2.7770838737487793, + "rewards/rejected": -3.24503493309021, + "step": 2686 + }, + { + "epoch": 0.31, + "learning_rate": 2.0936577300106294e-07, + "logits/chosen": -2.023287773132324, + "logits/rejected": -2.2012529373168945, + "logps/chosen": -243.79791259765625, + "logps/rejected": -205.6609344482422, + "loss": 0.4791, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0264650583267212, + "rewards/margins": 1.7309491634368896, + "rewards/rejected": -2.7574143409729004, + "step": 2687 + }, + { + "epoch": 0.31, + "learning_rate": 2.0933034132514466e-07, + "logits/chosen": -2.376368999481201, + "logits/rejected": -2.343724250793457, + "logps/chosen": -245.82496643066406, + "logps/rejected": -144.9889373779297, + "loss": 0.4882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03749767690896988, + "rewards/margins": 1.1064523458480835, + "rewards/rejected": -1.1439499855041504, + "step": 2688 + }, + { + "epoch": 0.31, + "learning_rate": 2.0929490964922639e-07, + "logits/chosen": -2.7336738109588623, + "logits/rejected": -2.6613545417785645, + "logps/chosen": -164.3998565673828, + "logps/rejected": -251.1111602783203, + "loss": 0.5477, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.567257285118103, + "rewards/margins": 1.6258279085159302, + "rewards/rejected": -2.193085193634033, + "step": 2689 + }, + { + "epoch": 0.31, + "learning_rate": 2.092594779733081e-07, + "logits/chosen": -2.2934212684631348, + "logits/rejected": -2.358381509780884, + "logps/chosen": -247.85089111328125, + "logps/rejected": -300.5628356933594, + "loss": 0.4263, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1860499382019043, + "rewards/margins": 2.0033295154571533, + "rewards/rejected": -3.1893796920776367, + "step": 2690 + }, + { + "epoch": 0.31, + "learning_rate": 2.0922404629738986e-07, + "logits/chosen": -2.809638023376465, + "logits/rejected": -2.571241855621338, + "logps/chosen": -334.3759460449219, + "logps/rejected": -274.49468994140625, + "loss": 0.8282, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1718063354492188, + "rewards/margins": 1.063801646232605, + "rewards/rejected": -2.235607862472534, + "step": 2691 + }, + { + "epoch": 0.31, + "learning_rate": 2.0918861462147158e-07, + "logits/chosen": -2.001131057739258, + "logits/rejected": -1.92683744430542, + "logps/chosen": -192.08355712890625, + "logps/rejected": -182.40234375, + "loss": 0.2652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2393803745508194, + "rewards/margins": 2.53415584564209, + "rewards/rejected": -2.773536443710327, + "step": 2692 + }, + { + "epoch": 0.31, + "learning_rate": 2.0915318294555333e-07, + "logits/chosen": -2.032057523727417, + "logits/rejected": -2.245615005493164, + "logps/chosen": -290.7298583984375, + "logps/rejected": -283.6589050292969, + "loss": 0.4533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.117310643196106, + "rewards/margins": 1.4772664308547974, + "rewards/rejected": -2.5945773124694824, + "step": 2693 + }, + { + "epoch": 0.31, + "learning_rate": 2.0911775126963505e-07, + "logits/chosen": -2.108515977859497, + "logits/rejected": -2.051095962524414, + "logps/chosen": -155.84579467773438, + "logps/rejected": -194.0198211669922, + "loss": 0.6813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2425987422466278, + "rewards/margins": 0.7405811548233032, + "rewards/rejected": -0.9831799268722534, + "step": 2694 + }, + { + "epoch": 0.31, + "learning_rate": 2.0908231959371677e-07, + "logits/chosen": -1.584932804107666, + "logits/rejected": -1.5097304582595825, + "logps/chosen": -445.1934814453125, + "logps/rejected": -479.97479248046875, + "loss": 0.7303, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3746416568756104, + "rewards/margins": 0.7369695901870728, + "rewards/rejected": -2.1116113662719727, + "step": 2695 + }, + { + "epoch": 0.31, + "learning_rate": 2.0904688791779852e-07, + "logits/chosen": -2.2612454891204834, + "logits/rejected": -2.4997611045837402, + "logps/chosen": -299.16070556640625, + "logps/rejected": -266.1734313964844, + "loss": 0.1038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023641347885131836, + "rewards/margins": 3.937661647796631, + "rewards/rejected": -3.914020538330078, + "step": 2696 + }, + { + "epoch": 0.31, + "learning_rate": 2.0901145624188024e-07, + "logits/chosen": -2.5422918796539307, + "logits/rejected": -2.656534194946289, + "logps/chosen": -257.2203674316406, + "logps/rejected": -330.8985595703125, + "loss": 0.2619, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.693057119846344, + "rewards/margins": 2.447880983352661, + "rewards/rejected": -3.1409378051757812, + "step": 2697 + }, + { + "epoch": 0.31, + "learning_rate": 2.0897602456596196e-07, + "logits/chosen": -2.4870247840881348, + "logits/rejected": -2.641413688659668, + "logps/chosen": -310.0843200683594, + "logps/rejected": -296.08013916015625, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7178309559822083, + "rewards/margins": 2.392054557800293, + "rewards/rejected": -3.1098852157592773, + "step": 2698 + }, + { + "epoch": 0.31, + "learning_rate": 2.0894059289004369e-07, + "logits/chosen": -2.3552918434143066, + "logits/rejected": -2.371882677078247, + "logps/chosen": -306.52587890625, + "logps/rejected": -253.155029296875, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.046555757522583, + "rewards/margins": 1.6710288524627686, + "rewards/rejected": -2.7175846099853516, + "step": 2699 + }, + { + "epoch": 0.31, + "learning_rate": 2.089051612141254e-07, + "logits/chosen": -2.275146722793579, + "logits/rejected": -2.556574821472168, + "logps/chosen": -197.9202880859375, + "logps/rejected": -232.50062561035156, + "loss": 0.4161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47690507769584656, + "rewards/margins": 1.6315109729766846, + "rewards/rejected": -2.1084160804748535, + "step": 2700 + }, + { + "epoch": 0.31, + "learning_rate": 2.0886972953820713e-07, + "logits/chosen": -2.44958758354187, + "logits/rejected": -2.1986613273620605, + "logps/chosen": -172.91000366210938, + "logps/rejected": -170.13551330566406, + "loss": 0.3125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7574644684791565, + "rewards/margins": 1.9040602445602417, + "rewards/rejected": -2.661524534225464, + "step": 2701 + }, + { + "epoch": 0.31, + "learning_rate": 2.0883429786228888e-07, + "logits/chosen": -2.2510390281677246, + "logits/rejected": -2.4591660499572754, + "logps/chosen": -324.08685302734375, + "logps/rejected": -215.64913940429688, + "loss": 1.0262, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.139427661895752, + "rewards/margins": 0.7493161559104919, + "rewards/rejected": -4.8887434005737305, + "step": 2702 + }, + { + "epoch": 0.31, + "learning_rate": 2.087988661863706e-07, + "logits/chosen": -2.3729872703552246, + "logits/rejected": -2.431762933731079, + "logps/chosen": -332.2828674316406, + "logps/rejected": -244.47764587402344, + "loss": 1.055, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0413732528686523, + "rewards/margins": 0.15132497251033783, + "rewards/rejected": -1.1926981210708618, + "step": 2703 + }, + { + "epoch": 0.31, + "learning_rate": 2.0876343451045232e-07, + "logits/chosen": -2.578282356262207, + "logits/rejected": -2.8098061084747314, + "logps/chosen": -289.2070617675781, + "logps/rejected": -318.4175109863281, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3002718985080719, + "rewards/margins": 2.323092460632324, + "rewards/rejected": -2.623364210128784, + "step": 2704 + }, + { + "epoch": 0.31, + "learning_rate": 2.0872800283453407e-07, + "logits/chosen": -2.8158648014068604, + "logits/rejected": -2.7035233974456787, + "logps/chosen": -261.217529296875, + "logps/rejected": -415.8542785644531, + "loss": 0.2243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2730704545974731, + "rewards/margins": 2.550398349761963, + "rewards/rejected": -3.8234689235687256, + "step": 2705 + }, + { + "epoch": 0.31, + "learning_rate": 2.086925711586158e-07, + "logits/chosen": -2.313232421875, + "logits/rejected": -2.3152084350585938, + "logps/chosen": -245.49098205566406, + "logps/rejected": -287.53326416015625, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11041027307510376, + "rewards/margins": 3.080228090286255, + "rewards/rejected": -3.1906380653381348, + "step": 2706 + }, + { + "epoch": 0.31, + "learning_rate": 2.0865713948269754e-07, + "logits/chosen": -2.3331079483032227, + "logits/rejected": -1.9809876680374146, + "logps/chosen": -251.12557983398438, + "logps/rejected": -396.9845886230469, + "loss": 0.3684, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1850337982177734, + "rewards/margins": 1.868536353111267, + "rewards/rejected": -3.053570032119751, + "step": 2707 + }, + { + "epoch": 0.32, + "learning_rate": 2.0862170780677926e-07, + "logits/chosen": -2.043034315109253, + "logits/rejected": -2.257098436355591, + "logps/chosen": -399.12982177734375, + "logps/rejected": -302.3027038574219, + "loss": 0.2456, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.435766577720642, + "rewards/margins": 1.4610176086425781, + "rewards/rejected": -2.8967843055725098, + "step": 2708 + }, + { + "epoch": 0.32, + "learning_rate": 2.0858627613086099e-07, + "logits/chosen": -2.543705463409424, + "logits/rejected": -2.572354793548584, + "logps/chosen": -318.005859375, + "logps/rejected": -205.0018310546875, + "loss": 0.485, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0689947605133057, + "rewards/margins": 1.9310626983642578, + "rewards/rejected": -3.0000574588775635, + "step": 2709 + }, + { + "epoch": 0.32, + "learning_rate": 2.085508444549427e-07, + "logits/chosen": -2.081629753112793, + "logits/rejected": -2.4560599327087402, + "logps/chosen": -302.975341796875, + "logps/rejected": -231.81283569335938, + "loss": 0.617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6423460245132446, + "rewards/margins": 1.2739678621292114, + "rewards/rejected": -1.9163137674331665, + "step": 2710 + }, + { + "epoch": 0.32, + "learning_rate": 2.0851541277902443e-07, + "logits/chosen": -2.493652105331421, + "logits/rejected": -2.3145036697387695, + "logps/chosen": -203.880126953125, + "logps/rejected": -186.07920837402344, + "loss": 0.3767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28596559166908264, + "rewards/margins": 1.7772178649902344, + "rewards/rejected": -2.063183546066284, + "step": 2711 + }, + { + "epoch": 0.32, + "learning_rate": 2.0847998110310615e-07, + "logits/chosen": -2.4053444862365723, + "logits/rejected": -2.243661403656006, + "logps/chosen": -244.26467895507812, + "logps/rejected": -341.4519348144531, + "loss": 0.3238, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45404720306396484, + "rewards/margins": 2.7348456382751465, + "rewards/rejected": -3.1888930797576904, + "step": 2712 + }, + { + "epoch": 0.32, + "learning_rate": 2.0844454942718787e-07, + "logits/chosen": -2.0521459579467773, + "logits/rejected": -1.850496768951416, + "logps/chosen": -461.64825439453125, + "logps/rejected": -508.4150695800781, + "loss": 0.5861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4270686209201813, + "rewards/margins": 1.003695011138916, + "rewards/rejected": -1.4307637214660645, + "step": 2713 + }, + { + "epoch": 0.32, + "learning_rate": 2.0840911775126962e-07, + "logits/chosen": -2.57513427734375, + "logits/rejected": -2.2692623138427734, + "logps/chosen": -336.09906005859375, + "logps/rejected": -319.16558837890625, + "loss": 0.4216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.892180323600769, + "rewards/margins": 2.566850185394287, + "rewards/rejected": -3.4590303897857666, + "step": 2714 + }, + { + "epoch": 0.32, + "learning_rate": 2.0837368607535135e-07, + "logits/chosen": -2.70721435546875, + "logits/rejected": -2.7745985984802246, + "logps/chosen": -172.1348114013672, + "logps/rejected": -229.09019470214844, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4388311803340912, + "rewards/margins": 2.2791738510131836, + "rewards/rejected": -2.7180051803588867, + "step": 2715 + }, + { + "epoch": 0.32, + "learning_rate": 2.083382543994331e-07, + "logits/chosen": -2.6381447315216064, + "logits/rejected": -2.6967344284057617, + "logps/chosen": -243.62779235839844, + "logps/rejected": -175.01564025878906, + "loss": 0.503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.009652681648731232, + "rewards/margins": 1.187166690826416, + "rewards/rejected": -1.1968194246292114, + "step": 2716 + }, + { + "epoch": 0.32, + "learning_rate": 2.0830282272351482e-07, + "logits/chosen": -2.1260969638824463, + "logits/rejected": -2.3022420406341553, + "logps/chosen": -318.4859619140625, + "logps/rejected": -182.4466094970703, + "loss": 0.3821, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3226085305213928, + "rewards/margins": 1.228325605392456, + "rewards/rejected": -1.5509341955184937, + "step": 2717 + }, + { + "epoch": 0.32, + "learning_rate": 2.0826739104759656e-07, + "logits/chosen": -2.1855757236480713, + "logits/rejected": -2.2827584743499756, + "logps/chosen": -332.7135314941406, + "logps/rejected": -479.521728515625, + "loss": 0.1845, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.54310941696167, + "rewards/margins": 3.481614112854004, + "rewards/rejected": -5.024723052978516, + "step": 2718 + }, + { + "epoch": 0.32, + "learning_rate": 2.082319593716783e-07, + "logits/chosen": -2.354111671447754, + "logits/rejected": -2.3428194522857666, + "logps/chosen": -177.3536834716797, + "logps/rejected": -270.4571228027344, + "loss": 0.1437, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0465937852859497, + "rewards/margins": 4.1524434089660645, + "rewards/rejected": -5.199037075042725, + "step": 2719 + }, + { + "epoch": 0.32, + "learning_rate": 2.0819652769576e-07, + "logits/chosen": -1.8852524757385254, + "logits/rejected": -1.9846627712249756, + "logps/chosen": -243.71365356445312, + "logps/rejected": -282.68231201171875, + "loss": 0.5994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5771859884262085, + "rewards/margins": 1.673588752746582, + "rewards/rejected": -2.25077486038208, + "step": 2720 + }, + { + "epoch": 0.32, + "learning_rate": 2.0816109601984173e-07, + "logits/chosen": -2.1657636165618896, + "logits/rejected": -2.3458809852600098, + "logps/chosen": -267.2022705078125, + "logps/rejected": -286.8110046386719, + "loss": 0.394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4018622636795044, + "rewards/margins": 1.750438928604126, + "rewards/rejected": -2.152301073074341, + "step": 2721 + }, + { + "epoch": 0.32, + "learning_rate": 2.0812566434392345e-07, + "logits/chosen": -1.8264944553375244, + "logits/rejected": -1.5642446279525757, + "logps/chosen": -263.8738098144531, + "logps/rejected": -304.9481201171875, + "loss": 0.4248, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3707170784473419, + "rewards/margins": 1.8187414407730103, + "rewards/rejected": -2.1894583702087402, + "step": 2722 + }, + { + "epoch": 0.32, + "learning_rate": 2.0809023266800518e-07, + "logits/chosen": -2.591073513031006, + "logits/rejected": -2.356257438659668, + "logps/chosen": -153.2386016845703, + "logps/rejected": -352.4542236328125, + "loss": 0.4566, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.449241042137146, + "rewards/margins": 3.144718647003174, + "rewards/rejected": -3.5939598083496094, + "step": 2723 + }, + { + "epoch": 0.32, + "learning_rate": 2.080548009920869e-07, + "logits/chosen": -2.091245174407959, + "logits/rejected": -1.8745932579040527, + "logps/chosen": -257.42578125, + "logps/rejected": -289.25762939453125, + "loss": 0.3799, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4190809726715088, + "rewards/margins": 1.846193790435791, + "rewards/rejected": -3.265275001525879, + "step": 2724 + }, + { + "epoch": 0.32, + "learning_rate": 2.0801936931616865e-07, + "logits/chosen": -2.501713275909424, + "logits/rejected": -2.440138101577759, + "logps/chosen": -119.82244873046875, + "logps/rejected": -169.54965209960938, + "loss": 0.3586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7004234194755554, + "rewards/margins": 2.619542121887207, + "rewards/rejected": -3.3199656009674072, + "step": 2725 + }, + { + "epoch": 0.32, + "learning_rate": 2.0798393764025037e-07, + "logits/chosen": -2.308194160461426, + "logits/rejected": -2.2186479568481445, + "logps/chosen": -251.18609619140625, + "logps/rejected": -376.8612365722656, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36350494623184204, + "rewards/margins": 3.757767677307129, + "rewards/rejected": -4.121273040771484, + "step": 2726 + }, + { + "epoch": 0.32, + "learning_rate": 2.079485059643321e-07, + "logits/chosen": -2.215871810913086, + "logits/rejected": -2.5223546028137207, + "logps/chosen": -319.2067565917969, + "logps/rejected": -202.1268310546875, + "loss": 0.7024, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.575373649597168, + "rewards/margins": 0.24656054377555847, + "rewards/rejected": -1.8219343423843384, + "step": 2727 + }, + { + "epoch": 0.32, + "learning_rate": 2.0791307428841384e-07, + "logits/chosen": -2.6945254802703857, + "logits/rejected": -2.711095094680786, + "logps/chosen": -224.7584991455078, + "logps/rejected": -178.49807739257812, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5578495860099792, + "rewards/margins": 1.756978988647461, + "rewards/rejected": -2.314828634262085, + "step": 2728 + }, + { + "epoch": 0.32, + "learning_rate": 2.0787764261249556e-07, + "logits/chosen": -1.7340097427368164, + "logits/rejected": -1.6804218292236328, + "logps/chosen": -231.2611083984375, + "logps/rejected": -205.69174194335938, + "loss": 0.3476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8761190176010132, + "rewards/margins": 1.3855739831924438, + "rewards/rejected": -2.261693000793457, + "step": 2729 + }, + { + "epoch": 0.32, + "learning_rate": 2.078422109365773e-07, + "logits/chosen": -2.211613655090332, + "logits/rejected": -2.573498249053955, + "logps/chosen": -315.66827392578125, + "logps/rejected": -300.2453308105469, + "loss": 0.4093, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6856505870819092, + "rewards/margins": 1.6242314577102661, + "rewards/rejected": -2.309882164001465, + "step": 2730 + }, + { + "epoch": 0.32, + "learning_rate": 2.0780677926065903e-07, + "logits/chosen": -2.530533790588379, + "logits/rejected": -2.6519880294799805, + "logps/chosen": -455.1575927734375, + "logps/rejected": -481.04278564453125, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1108334064483643, + "rewards/margins": 2.2547214031219482, + "rewards/rejected": -3.3655548095703125, + "step": 2731 + }, + { + "epoch": 0.32, + "learning_rate": 2.0777134758474075e-07, + "logits/chosen": -2.007115364074707, + "logits/rejected": -1.9796065092086792, + "logps/chosen": -339.02227783203125, + "logps/rejected": -325.2618103027344, + "loss": 0.199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5551353693008423, + "rewards/margins": 2.371004819869995, + "rewards/rejected": -2.926140069961548, + "step": 2732 + }, + { + "epoch": 0.32, + "learning_rate": 2.0773591590882248e-07, + "logits/chosen": -2.073641300201416, + "logits/rejected": -2.2942140102386475, + "logps/chosen": -322.10284423828125, + "logps/rejected": -201.0215606689453, + "loss": 0.8298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.557479202747345, + "rewards/margins": 0.8951711058616638, + "rewards/rejected": -1.4526503086090088, + "step": 2733 + }, + { + "epoch": 0.32, + "learning_rate": 2.077004842329042e-07, + "logits/chosen": -2.0518383979797363, + "logits/rejected": -2.151930570602417, + "logps/chosen": -202.91928100585938, + "logps/rejected": -146.01695251464844, + "loss": 0.5492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34734249114990234, + "rewards/margins": 1.4260258674621582, + "rewards/rejected": -1.7733683586120605, + "step": 2734 + }, + { + "epoch": 0.32, + "learning_rate": 2.0766505255698592e-07, + "logits/chosen": -2.3419580459594727, + "logits/rejected": -2.4661552906036377, + "logps/chosen": -257.7551574707031, + "logps/rejected": -190.0042724609375, + "loss": 0.8978, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8303265571594238, + "rewards/margins": 0.545998215675354, + "rewards/rejected": -2.3763246536254883, + "step": 2735 + }, + { + "epoch": 0.32, + "learning_rate": 2.0762962088106767e-07, + "logits/chosen": -2.167544364929199, + "logits/rejected": -2.2882955074310303, + "logps/chosen": -207.08541870117188, + "logps/rejected": -243.2821044921875, + "loss": 0.2834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2470426708459854, + "rewards/margins": 2.240060329437256, + "rewards/rejected": -2.48710298538208, + "step": 2736 + }, + { + "epoch": 0.32, + "learning_rate": 2.075941892051494e-07, + "logits/chosen": -2.061751127243042, + "logits/rejected": -2.0869791507720947, + "logps/chosen": -217.41952514648438, + "logps/rejected": -295.2725524902344, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.870985209941864, + "rewards/margins": 2.4654653072357178, + "rewards/rejected": -3.3364505767822266, + "step": 2737 + }, + { + "epoch": 0.32, + "learning_rate": 2.0755875752923111e-07, + "logits/chosen": -2.6105051040649414, + "logits/rejected": -2.3733105659484863, + "logps/chosen": -119.23661804199219, + "logps/rejected": -206.68031311035156, + "loss": 0.4588, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5620625019073486, + "rewards/margins": 1.7952492237091064, + "rewards/rejected": -2.357311725616455, + "step": 2738 + }, + { + "epoch": 0.32, + "learning_rate": 2.0752332585331284e-07, + "logits/chosen": -2.3295812606811523, + "logits/rejected": -2.262988805770874, + "logps/chosen": -258.0665588378906, + "logps/rejected": -293.92681884765625, + "loss": 0.3361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41324377059936523, + "rewards/margins": 2.177206039428711, + "rewards/rejected": -2.590449571609497, + "step": 2739 + }, + { + "epoch": 0.32, + "learning_rate": 2.0748789417739458e-07, + "logits/chosen": -2.2079854011535645, + "logits/rejected": -2.301755905151367, + "logps/chosen": -282.29510498046875, + "logps/rejected": -346.2251892089844, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5780449509620667, + "rewards/margins": 4.109785079956055, + "rewards/rejected": -4.687829971313477, + "step": 2740 + }, + { + "epoch": 0.32, + "learning_rate": 2.0745246250147633e-07, + "logits/chosen": -2.4354004859924316, + "logits/rejected": -2.5989863872528076, + "logps/chosen": -281.60809326171875, + "logps/rejected": -188.22213745117188, + "loss": 0.3364, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2605706453323364, + "rewards/margins": 1.468400239944458, + "rewards/rejected": -2.728970766067505, + "step": 2741 + }, + { + "epoch": 0.32, + "learning_rate": 2.0741703082555805e-07, + "logits/chosen": -2.324098587036133, + "logits/rejected": -2.496450424194336, + "logps/chosen": -266.6272888183594, + "logps/rejected": -270.54107666015625, + "loss": 1.1013, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8765517473220825, + "rewards/margins": 0.3214974105358124, + "rewards/rejected": -2.1980490684509277, + "step": 2742 + }, + { + "epoch": 0.32, + "learning_rate": 2.0738159914963978e-07, + "logits/chosen": -2.2760794162750244, + "logits/rejected": -1.904067039489746, + "logps/chosen": -197.61044311523438, + "logps/rejected": -232.00567626953125, + "loss": 1.3495, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.0899405479431152, + "rewards/margins": 0.24331754446029663, + "rewards/rejected": -3.3332581520080566, + "step": 2743 + }, + { + "epoch": 0.32, + "learning_rate": 2.073461674737215e-07, + "logits/chosen": -2.0641071796417236, + "logits/rejected": -2.1095101833343506, + "logps/chosen": -158.85879516601562, + "logps/rejected": -372.7569580078125, + "loss": 0.7306, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2892751693725586, + "rewards/margins": 0.744792640209198, + "rewards/rejected": -2.0340676307678223, + "step": 2744 + }, + { + "epoch": 0.32, + "learning_rate": 2.0731073579780322e-07, + "logits/chosen": -2.890855312347412, + "logits/rejected": -2.6605968475341797, + "logps/chosen": -197.75306701660156, + "logps/rejected": -188.14488220214844, + "loss": 0.2276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28597861528396606, + "rewards/margins": 3.1753170490264893, + "rewards/rejected": -3.4612953662872314, + "step": 2745 + }, + { + "epoch": 0.32, + "learning_rate": 2.0727530412188494e-07, + "logits/chosen": -2.375332832336426, + "logits/rejected": -2.5051138401031494, + "logps/chosen": -171.81069946289062, + "logps/rejected": -258.024658203125, + "loss": 0.5519, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6839407086372375, + "rewards/margins": 1.333158016204834, + "rewards/rejected": -2.017098903656006, + "step": 2746 + }, + { + "epoch": 0.32, + "learning_rate": 2.072398724459667e-07, + "logits/chosen": -1.9678460359573364, + "logits/rejected": -2.075413942337036, + "logps/chosen": -292.64447021484375, + "logps/rejected": -345.1872253417969, + "loss": 0.8388, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7227127552032471, + "rewards/margins": 1.054764747619629, + "rewards/rejected": -1.7774773836135864, + "step": 2747 + }, + { + "epoch": 0.32, + "learning_rate": 2.0720444077004841e-07, + "logits/chosen": -2.1775062084198, + "logits/rejected": -2.182426929473877, + "logps/chosen": -160.1633758544922, + "logps/rejected": -211.48031616210938, + "loss": 0.7152, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.440686583518982, + "rewards/margins": 1.395626187324524, + "rewards/rejected": -2.836312770843506, + "step": 2748 + }, + { + "epoch": 0.32, + "learning_rate": 2.0716900909413014e-07, + "logits/chosen": -1.7111473083496094, + "logits/rejected": -1.7333407402038574, + "logps/chosen": -482.4383544921875, + "logps/rejected": -424.7276916503906, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5618476867675781, + "rewards/margins": 1.6746454238891602, + "rewards/rejected": -2.2364931106567383, + "step": 2749 + }, + { + "epoch": 0.32, + "learning_rate": 2.0713357741821186e-07, + "logits/chosen": -2.2246415615081787, + "logits/rejected": -2.442307949066162, + "logps/chosen": -297.3554992675781, + "logps/rejected": -177.75241088867188, + "loss": 0.4014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.666824996471405, + "rewards/margins": 1.2845443487167358, + "rewards/rejected": -1.951369285583496, + "step": 2750 + }, + { + "epoch": 0.32, + "learning_rate": 2.070981457422936e-07, + "logits/chosen": -1.4929125308990479, + "logits/rejected": -1.8814928531646729, + "logps/chosen": -488.4658203125, + "logps/rejected": -376.7021179199219, + "loss": 0.7114, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.417024850845337, + "rewards/margins": 0.3771205544471741, + "rewards/rejected": -1.7941455841064453, + "step": 2751 + }, + { + "epoch": 0.32, + "learning_rate": 2.0706271406637536e-07, + "logits/chosen": -1.990221619606018, + "logits/rejected": -2.194573402404785, + "logps/chosen": -323.70196533203125, + "logps/rejected": -292.50250244140625, + "loss": 0.3849, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6472268104553223, + "rewards/margins": 2.5692431926727295, + "rewards/rejected": -3.216470241546631, + "step": 2752 + }, + { + "epoch": 0.32, + "learning_rate": 2.0702728239045708e-07, + "logits/chosen": -2.5495829582214355, + "logits/rejected": -2.5197689533233643, + "logps/chosen": -282.1540222167969, + "logps/rejected": -298.8853759765625, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5023384690284729, + "rewards/margins": 3.5250680446624756, + "rewards/rejected": -4.027406692504883, + "step": 2753 + }, + { + "epoch": 0.32, + "learning_rate": 2.069918507145388e-07, + "logits/chosen": -2.1546311378479004, + "logits/rejected": -2.3572661876678467, + "logps/chosen": -345.59613037109375, + "logps/rejected": -296.26458740234375, + "loss": 0.2234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8814011216163635, + "rewards/margins": 2.0716118812561035, + "rewards/rejected": -2.9530129432678223, + "step": 2754 + }, + { + "epoch": 0.32, + "learning_rate": 2.0695641903862052e-07, + "logits/chosen": -2.4767515659332275, + "logits/rejected": -2.5641064643859863, + "logps/chosen": -307.09490966796875, + "logps/rejected": -207.67019653320312, + "loss": 0.3301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3820151388645172, + "rewards/margins": 1.6839958429336548, + "rewards/rejected": -2.0660109519958496, + "step": 2755 + }, + { + "epoch": 0.32, + "learning_rate": 2.0692098736270224e-07, + "logits/chosen": -1.636419653892517, + "logits/rejected": -2.002798080444336, + "logps/chosen": -384.4627990722656, + "logps/rejected": -309.7705383300781, + "loss": 0.8475, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.529336452484131, + "rewards/margins": 0.9278169274330139, + "rewards/rejected": -3.4571533203125, + "step": 2756 + }, + { + "epoch": 0.32, + "learning_rate": 2.0688555568678397e-07, + "logits/chosen": -2.4896345138549805, + "logits/rejected": -2.5010228157043457, + "logps/chosen": -323.11871337890625, + "logps/rejected": -345.5707092285156, + "loss": 0.3174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8689936995506287, + "rewards/margins": 2.8992867469787598, + "rewards/rejected": -3.768280506134033, + "step": 2757 + }, + { + "epoch": 0.32, + "learning_rate": 2.068501240108657e-07, + "logits/chosen": -2.0251681804656982, + "logits/rejected": -2.2202882766723633, + "logps/chosen": -533.9995727539062, + "logps/rejected": -328.071044921875, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8631742000579834, + "rewards/margins": 1.4737939834594727, + "rewards/rejected": -2.336968183517456, + "step": 2758 + }, + { + "epoch": 0.32, + "learning_rate": 2.0681469233494744e-07, + "logits/chosen": -2.1772360801696777, + "logits/rejected": -1.9749560356140137, + "logps/chosen": -172.00509643554688, + "logps/rejected": -267.8743896484375, + "loss": 0.541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8997894525527954, + "rewards/margins": 2.082317352294922, + "rewards/rejected": -2.9821066856384277, + "step": 2759 + }, + { + "epoch": 0.32, + "learning_rate": 2.0677926065902916e-07, + "logits/chosen": -2.3416895866394043, + "logits/rejected": -2.430999994277954, + "logps/chosen": -242.75958251953125, + "logps/rejected": -223.500732421875, + "loss": 0.4602, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1338908672332764, + "rewards/margins": 1.7552207708358765, + "rewards/rejected": -2.8891115188598633, + "step": 2760 + }, + { + "epoch": 0.32, + "learning_rate": 2.0674382898311088e-07, + "logits/chosen": -2.363949775695801, + "logits/rejected": -2.4174118041992188, + "logps/chosen": -227.4774932861328, + "logps/rejected": -272.5215759277344, + "loss": 0.0916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8144614100456238, + "rewards/margins": 3.817065954208374, + "rewards/rejected": -4.631526947021484, + "step": 2761 + }, + { + "epoch": 0.32, + "learning_rate": 2.067083973071926e-07, + "logits/chosen": -2.1846981048583984, + "logits/rejected": -2.5316524505615234, + "logps/chosen": -268.4424743652344, + "logps/rejected": -185.2261199951172, + "loss": 1.1532, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.9046132564544678, + "rewards/margins": -0.2842203378677368, + "rewards/rejected": -1.6203927993774414, + "step": 2762 + }, + { + "epoch": 0.32, + "learning_rate": 2.0667296563127438e-07, + "logits/chosen": -2.5707197189331055, + "logits/rejected": -2.623444080352783, + "logps/chosen": -248.19915771484375, + "logps/rejected": -155.9779052734375, + "loss": 0.3997, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7524929046630859, + "rewards/margins": 1.272731900215149, + "rewards/rejected": -2.0252246856689453, + "step": 2763 + }, + { + "epoch": 0.32, + "learning_rate": 2.066375339553561e-07, + "logits/chosen": -2.3865294456481934, + "logits/rejected": -2.6887707710266113, + "logps/chosen": -349.3714294433594, + "logps/rejected": -177.7042694091797, + "loss": 0.446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.781042218208313, + "rewards/margins": 1.9484821557998657, + "rewards/rejected": -2.7295243740081787, + "step": 2764 + }, + { + "epoch": 0.32, + "learning_rate": 2.0660210227943782e-07, + "logits/chosen": -2.0423269271850586, + "logits/rejected": -1.8252992630004883, + "logps/chosen": -337.62451171875, + "logps/rejected": -408.57843017578125, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02934664487838745, + "rewards/margins": 3.8525209426879883, + "rewards/rejected": -3.8818678855895996, + "step": 2765 + }, + { + "epoch": 0.32, + "learning_rate": 2.0656667060351954e-07, + "logits/chosen": -2.1039938926696777, + "logits/rejected": -2.0415587425231934, + "logps/chosen": -387.4490661621094, + "logps/rejected": -350.63916015625, + "loss": 0.2306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.681992769241333, + "rewards/margins": 2.5637311935424805, + "rewards/rejected": -3.2457237243652344, + "step": 2766 + }, + { + "epoch": 0.32, + "learning_rate": 2.0653123892760127e-07, + "logits/chosen": -2.2397561073303223, + "logits/rejected": -2.482811689376831, + "logps/chosen": -271.80615234375, + "logps/rejected": -229.54083251953125, + "loss": 0.534, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.632980465888977, + "rewards/margins": 1.8278834819793701, + "rewards/rejected": -3.4608640670776367, + "step": 2767 + }, + { + "epoch": 0.32, + "learning_rate": 2.06495807251683e-07, + "logits/chosen": -2.835266351699829, + "logits/rejected": -2.8360862731933594, + "logps/chosen": -269.29150390625, + "logps/rejected": -247.71299743652344, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2972280979156494, + "rewards/margins": 4.1515116691589355, + "rewards/rejected": -5.448740005493164, + "step": 2768 + }, + { + "epoch": 0.32, + "learning_rate": 2.064603755757647e-07, + "logits/chosen": -2.455646514892578, + "logits/rejected": -2.509162187576294, + "logps/chosen": -260.1095886230469, + "logps/rejected": -268.59326171875, + "loss": 0.3223, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0549540519714355, + "rewards/margins": 2.536766767501831, + "rewards/rejected": -3.5917210578918457, + "step": 2769 + }, + { + "epoch": 0.32, + "learning_rate": 2.0642494389984646e-07, + "logits/chosen": -1.972563624382019, + "logits/rejected": -2.487394332885742, + "logps/chosen": -384.8781433105469, + "logps/rejected": -289.3961486816406, + "loss": 0.1449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1388077288866043, + "rewards/margins": 2.5734758377075195, + "rewards/rejected": -2.7122836112976074, + "step": 2770 + }, + { + "epoch": 0.32, + "learning_rate": 2.0638951222392818e-07, + "logits/chosen": -2.3750717639923096, + "logits/rejected": -2.4007115364074707, + "logps/chosen": -364.71380615234375, + "logps/rejected": -486.5599670410156, + "loss": 0.5689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6389650702476501, + "rewards/margins": 1.2544959783554077, + "rewards/rejected": -1.8934611082077026, + "step": 2771 + }, + { + "epoch": 0.32, + "learning_rate": 2.063540805480099e-07, + "logits/chosen": -1.9437891244888306, + "logits/rejected": -2.008641004562378, + "logps/chosen": -425.13885498046875, + "logps/rejected": -367.20001220703125, + "loss": 0.4111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9009430408477783, + "rewards/margins": 1.0747218132019043, + "rewards/rejected": -1.975664734840393, + "step": 2772 + }, + { + "epoch": 0.32, + "learning_rate": 2.0631864887209163e-07, + "logits/chosen": -1.707619309425354, + "logits/rejected": -2.017993927001953, + "logps/chosen": -443.898193359375, + "logps/rejected": -235.83226013183594, + "loss": 0.3726, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.061083436012268066, + "rewards/margins": 3.158172607421875, + "rewards/rejected": -3.2192559242248535, + "step": 2773 + }, + { + "epoch": 0.32, + "learning_rate": 2.0628321719617335e-07, + "logits/chosen": -2.4310951232910156, + "logits/rejected": -2.6068644523620605, + "logps/chosen": -210.737548828125, + "logps/rejected": -211.23036193847656, + "loss": 0.3731, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43631404638290405, + "rewards/margins": 3.100959300994873, + "rewards/rejected": -3.5372731685638428, + "step": 2774 + }, + { + "epoch": 0.32, + "learning_rate": 2.0624778552025512e-07, + "logits/chosen": -1.755044937133789, + "logits/rejected": -1.5662319660186768, + "logps/chosen": -141.99420166015625, + "logps/rejected": -269.23297119140625, + "loss": 0.417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7886494994163513, + "rewards/margins": 1.9704394340515137, + "rewards/rejected": -2.759089231491089, + "step": 2775 + }, + { + "epoch": 0.32, + "learning_rate": 2.0621235384433685e-07, + "logits/chosen": -2.406850814819336, + "logits/rejected": -2.6088953018188477, + "logps/chosen": -334.05059814453125, + "logps/rejected": -249.40940856933594, + "loss": 0.2336, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00031469762325286865, + "rewards/margins": 2.3747613430023193, + "rewards/rejected": -2.3744466304779053, + "step": 2776 + }, + { + "epoch": 0.32, + "learning_rate": 2.0617692216841857e-07, + "logits/chosen": -2.20857572555542, + "logits/rejected": -2.3470346927642822, + "logps/chosen": -298.4207763671875, + "logps/rejected": -250.31565856933594, + "loss": 0.4144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9045414328575134, + "rewards/margins": 0.8793571591377258, + "rewards/rejected": -1.7838985919952393, + "step": 2777 + }, + { + "epoch": 0.32, + "learning_rate": 2.061414904925003e-07, + "logits/chosen": -1.9985195398330688, + "logits/rejected": -2.314100980758667, + "logps/chosen": -179.6425323486328, + "logps/rejected": -180.563720703125, + "loss": 0.3423, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8461536765098572, + "rewards/margins": 1.5938369035720825, + "rewards/rejected": -2.439990520477295, + "step": 2778 + }, + { + "epoch": 0.32, + "learning_rate": 2.06106058816582e-07, + "logits/chosen": -2.5613293647766113, + "logits/rejected": -2.5643248558044434, + "logps/chosen": -519.8988037109375, + "logps/rejected": -323.7275085449219, + "loss": 0.1304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25258561968803406, + "rewards/margins": 3.1322360038757324, + "rewards/rejected": -2.879650354385376, + "step": 2779 + }, + { + "epoch": 0.32, + "learning_rate": 2.0607062714066373e-07, + "logits/chosen": -2.276991128921509, + "logits/rejected": -2.284942150115967, + "logps/chosen": -184.548095703125, + "logps/rejected": -228.9742431640625, + "loss": 0.1879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6611593961715698, + "rewards/margins": 2.3809316158294678, + "rewards/rejected": -3.042090892791748, + "step": 2780 + }, + { + "epoch": 0.32, + "learning_rate": 2.0603519546474548e-07, + "logits/chosen": -1.5570497512817383, + "logits/rejected": -1.7196389436721802, + "logps/chosen": -420.92437744140625, + "logps/rejected": -375.7477111816406, + "loss": 0.2125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.031127259135246277, + "rewards/margins": 1.7547438144683838, + "rewards/rejected": -1.723616600036621, + "step": 2781 + }, + { + "epoch": 0.32, + "learning_rate": 2.059997637888272e-07, + "logits/chosen": -2.448502540588379, + "logits/rejected": -2.5700931549072266, + "logps/chosen": -304.7576904296875, + "logps/rejected": -231.25494384765625, + "loss": 0.5484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9735724329948425, + "rewards/margins": 1.3316367864608765, + "rewards/rejected": -2.305209159851074, + "step": 2782 + }, + { + "epoch": 0.32, + "learning_rate": 2.0596433211290893e-07, + "logits/chosen": -2.560392379760742, + "logits/rejected": -2.583927869796753, + "logps/chosen": -177.2637939453125, + "logps/rejected": -331.18511962890625, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15544834733009338, + "rewards/margins": 4.33696174621582, + "rewards/rejected": -4.492409706115723, + "step": 2783 + }, + { + "epoch": 0.32, + "learning_rate": 2.0592890043699065e-07, + "logits/chosen": -2.3378896713256836, + "logits/rejected": -2.2976109981536865, + "logps/chosen": -318.4089050292969, + "logps/rejected": -385.070556640625, + "loss": 0.5601, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9627291560173035, + "rewards/margins": 1.2610429525375366, + "rewards/rejected": -2.2237720489501953, + "step": 2784 + }, + { + "epoch": 0.32, + "learning_rate": 2.0589346876107237e-07, + "logits/chosen": -2.465488910675049, + "logits/rejected": -2.543954372406006, + "logps/chosen": -218.9886932373047, + "logps/rejected": -352.46820068359375, + "loss": 0.2741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8004039525985718, + "rewards/margins": 2.812516927719116, + "rewards/rejected": -3.6129207611083984, + "step": 2785 + }, + { + "epoch": 0.32, + "learning_rate": 2.0585803708515415e-07, + "logits/chosen": -2.5342752933502197, + "logits/rejected": -2.416834592819214, + "logps/chosen": -362.8609619140625, + "logps/rejected": -349.4075622558594, + "loss": 0.1443, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4146467447280884, + "rewards/margins": 2.964639902114868, + "rewards/rejected": -3.379286527633667, + "step": 2786 + }, + { + "epoch": 0.32, + "learning_rate": 2.0582260540923587e-07, + "logits/chosen": -2.4081315994262695, + "logits/rejected": -2.337503433227539, + "logps/chosen": -403.8183288574219, + "logps/rejected": -514.3018798828125, + "loss": 0.3147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6671752333641052, + "rewards/margins": 1.7555726766586304, + "rewards/rejected": -2.422747850418091, + "step": 2787 + }, + { + "epoch": 0.32, + "learning_rate": 2.057871737333176e-07, + "logits/chosen": -2.1663217544555664, + "logits/rejected": -2.2671895027160645, + "logps/chosen": -195.31683349609375, + "logps/rejected": -233.54225158691406, + "loss": 0.5943, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3199719190597534, + "rewards/margins": 0.9029352068901062, + "rewards/rejected": -2.222907066345215, + "step": 2788 + }, + { + "epoch": 0.32, + "learning_rate": 2.057517420573993e-07, + "logits/chosen": -2.0933797359466553, + "logits/rejected": -2.1743855476379395, + "logps/chosen": -317.94073486328125, + "logps/rejected": -377.95111083984375, + "loss": 0.1873, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6250758767127991, + "rewards/margins": 4.745172500610352, + "rewards/rejected": -5.370248317718506, + "step": 2789 + }, + { + "epoch": 0.32, + "learning_rate": 2.0571631038148103e-07, + "logits/chosen": -2.0753180980682373, + "logits/rejected": -2.053215742111206, + "logps/chosen": -197.126220703125, + "logps/rejected": -256.1136169433594, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0177252292633057, + "rewards/margins": 3.7494266033172607, + "rewards/rejected": -4.767152309417725, + "step": 2790 + }, + { + "epoch": 0.32, + "learning_rate": 2.0568087870556276e-07, + "logits/chosen": -2.7155985832214355, + "logits/rejected": -2.6023130416870117, + "logps/chosen": -126.25444030761719, + "logps/rejected": -196.45538330078125, + "loss": 0.2156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6991429328918457, + "rewards/margins": 2.856741189956665, + "rewards/rejected": -3.5558838844299316, + "step": 2791 + }, + { + "epoch": 0.32, + "learning_rate": 2.056454470296445e-07, + "logits/chosen": -1.5817039012908936, + "logits/rejected": -1.6766940355300903, + "logps/chosen": -414.9866638183594, + "logps/rejected": -340.78143310546875, + "loss": 0.9663, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6030627489089966, + "rewards/margins": 0.3543718457221985, + "rewards/rejected": -1.9574344158172607, + "step": 2792 + }, + { + "epoch": 0.32, + "learning_rate": 2.0561001535372623e-07, + "logits/chosen": -2.886582136154175, + "logits/rejected": -2.7930896282196045, + "logps/chosen": -125.51559448242188, + "logps/rejected": -134.61605834960938, + "loss": 0.5199, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2961407899856567, + "rewards/margins": 2.088855743408203, + "rewards/rejected": -3.3849964141845703, + "step": 2793 + }, + { + "epoch": 0.33, + "learning_rate": 2.0557458367780795e-07, + "logits/chosen": -2.455160140991211, + "logits/rejected": -2.613832473754883, + "logps/chosen": -512.1253662109375, + "logps/rejected": -237.97140502929688, + "loss": 0.3089, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8840157985687256, + "rewards/margins": 1.7954705953598022, + "rewards/rejected": -3.6794862747192383, + "step": 2794 + }, + { + "epoch": 0.33, + "learning_rate": 2.0553915200188967e-07, + "logits/chosen": -2.0982892513275146, + "logits/rejected": -2.1001107692718506, + "logps/chosen": -274.06158447265625, + "logps/rejected": -276.936279296875, + "loss": 0.7712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2053558826446533, + "rewards/margins": 0.3067867159843445, + "rewards/rejected": -1.512142539024353, + "step": 2795 + }, + { + "epoch": 0.33, + "learning_rate": 2.055037203259714e-07, + "logits/chosen": -1.6699740886688232, + "logits/rejected": -1.6630700826644897, + "logps/chosen": -292.50103759765625, + "logps/rejected": -260.097412109375, + "loss": 0.3754, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3645244538784027, + "rewards/margins": 2.2247323989868164, + "rewards/rejected": -2.589256763458252, + "step": 2796 + }, + { + "epoch": 0.33, + "learning_rate": 2.0546828865005312e-07, + "logits/chosen": -2.236677646636963, + "logits/rejected": -2.4373927116394043, + "logps/chosen": -346.0108642578125, + "logps/rejected": -258.8670349121094, + "loss": 0.2163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5777016878128052, + "rewards/margins": 2.119042158126831, + "rewards/rejected": -2.6967437267303467, + "step": 2797 + }, + { + "epoch": 0.33, + "learning_rate": 2.054328569741349e-07, + "logits/chosen": -2.675575017929077, + "logits/rejected": -2.6351897716522217, + "logps/chosen": -400.10516357421875, + "logps/rejected": -325.06365966796875, + "loss": 0.9688, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2481026649475098, + "rewards/margins": 0.4926161468029022, + "rewards/rejected": -2.7407186031341553, + "step": 2798 + }, + { + "epoch": 0.33, + "learning_rate": 2.053974252982166e-07, + "logits/chosen": -2.7971432209014893, + "logits/rejected": -2.633223533630371, + "logps/chosen": -302.6767578125, + "logps/rejected": -245.10604858398438, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9614919424057007, + "rewards/margins": 3.4177117347717285, + "rewards/rejected": -4.379203796386719, + "step": 2799 + }, + { + "epoch": 0.33, + "learning_rate": 2.0536199362229834e-07, + "logits/chosen": -2.0296876430511475, + "logits/rejected": -1.885248064994812, + "logps/chosen": -254.11224365234375, + "logps/rejected": -303.008544921875, + "loss": 0.3742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0010836124420166, + "rewards/margins": 1.263413429260254, + "rewards/rejected": -2.2644970417022705, + "step": 2800 + }, + { + "epoch": 0.33, + "learning_rate": 2.0532656194638006e-07, + "logits/chosen": -1.991680383682251, + "logits/rejected": -1.502970814704895, + "logps/chosen": -155.11618041992188, + "logps/rejected": -236.58041381835938, + "loss": 0.4252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8260632753372192, + "rewards/margins": 1.3820829391479492, + "rewards/rejected": -2.208146095275879, + "step": 2801 + }, + { + "epoch": 0.33, + "learning_rate": 2.0529113027046178e-07, + "logits/chosen": -1.6984056234359741, + "logits/rejected": -1.3114551305770874, + "logps/chosen": -86.82627868652344, + "logps/rejected": -152.1896209716797, + "loss": 0.4359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04985235258936882, + "rewards/margins": 1.2690937519073486, + "rewards/rejected": -1.318946123123169, + "step": 2802 + }, + { + "epoch": 0.33, + "learning_rate": 2.052556985945435e-07, + "logits/chosen": -2.5094528198242188, + "logits/rejected": -2.4720582962036133, + "logps/chosen": -176.70758056640625, + "logps/rejected": -283.5103759765625, + "loss": 0.1812, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.026608943939209, + "rewards/margins": 3.0588245391845703, + "rewards/rejected": -4.085433483123779, + "step": 2803 + }, + { + "epoch": 0.33, + "learning_rate": 2.0522026691862525e-07, + "logits/chosen": -1.6189991235733032, + "logits/rejected": -2.006326198577881, + "logps/chosen": -656.6763916015625, + "logps/rejected": -392.430419921875, + "loss": 0.6457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0681871175765991, + "rewards/margins": 0.5803056359291077, + "rewards/rejected": -1.6484928131103516, + "step": 2804 + }, + { + "epoch": 0.33, + "learning_rate": 2.0518483524270697e-07, + "logits/chosen": -2.323451280593872, + "logits/rejected": -2.3354034423828125, + "logps/chosen": -469.4316711425781, + "logps/rejected": -366.966064453125, + "loss": 0.3055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05852103233337402, + "rewards/margins": 2.029176712036133, + "rewards/rejected": -2.087697982788086, + "step": 2805 + }, + { + "epoch": 0.33, + "learning_rate": 2.051494035667887e-07, + "logits/chosen": -2.3163020610809326, + "logits/rejected": -2.150282859802246, + "logps/chosen": -344.0404052734375, + "logps/rejected": -274.2154235839844, + "loss": 0.3899, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0138919353485107, + "rewards/margins": 1.303979516029358, + "rewards/rejected": -2.317871570587158, + "step": 2806 + }, + { + "epoch": 0.33, + "learning_rate": 2.0511397189087042e-07, + "logits/chosen": -2.7944467067718506, + "logits/rejected": -2.772958755493164, + "logps/chosen": -312.7858581542969, + "logps/rejected": -386.0821228027344, + "loss": 0.1438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.231122225522995, + "rewards/margins": 4.467634677886963, + "rewards/rejected": -4.698757171630859, + "step": 2807 + }, + { + "epoch": 0.33, + "learning_rate": 2.0507854021495214e-07, + "logits/chosen": -2.3214612007141113, + "logits/rejected": -1.9573845863342285, + "logps/chosen": -60.63727951049805, + "logps/rejected": -171.14071655273438, + "loss": 0.2813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3563816249370575, + "rewards/margins": 1.8481109142303467, + "rewards/rejected": -2.2044925689697266, + "step": 2808 + }, + { + "epoch": 0.33, + "learning_rate": 2.0504310853903386e-07, + "logits/chosen": -2.7508983612060547, + "logits/rejected": -2.7914950847625732, + "logps/chosen": -256.6297607421875, + "logps/rejected": -307.9344177246094, + "loss": 0.3206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0767958164215088, + "rewards/margins": 1.325225591659546, + "rewards/rejected": -2.4020214080810547, + "step": 2809 + }, + { + "epoch": 0.33, + "learning_rate": 2.0500767686311564e-07, + "logits/chosen": -1.5481828451156616, + "logits/rejected": -1.5910831689834595, + "logps/chosen": -252.73226928710938, + "logps/rejected": -312.66729736328125, + "loss": 0.5177, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2445430755615234, + "rewards/margins": 0.6077486872673035, + "rewards/rejected": -1.8522920608520508, + "step": 2810 + }, + { + "epoch": 0.33, + "learning_rate": 2.0497224518719736e-07, + "logits/chosen": -1.960991382598877, + "logits/rejected": -2.089397430419922, + "logps/chosen": -152.9217071533203, + "logps/rejected": -163.07533264160156, + "loss": 0.6286, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.081068515777588, + "rewards/margins": 0.28588971495628357, + "rewards/rejected": -1.3669581413269043, + "step": 2811 + }, + { + "epoch": 0.33, + "learning_rate": 2.0493681351127908e-07, + "logits/chosen": -2.4788684844970703, + "logits/rejected": -2.6726245880126953, + "logps/chosen": -402.6098327636719, + "logps/rejected": -265.6576232910156, + "loss": 0.6706, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5419917106628418, + "rewards/margins": 1.1017179489135742, + "rewards/rejected": -2.643709659576416, + "step": 2812 + }, + { + "epoch": 0.33, + "learning_rate": 2.049013818353608e-07, + "logits/chosen": -2.468499183654785, + "logits/rejected": -2.6753218173980713, + "logps/chosen": -219.60073852539062, + "logps/rejected": -196.6517333984375, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.676396906375885, + "rewards/margins": 2.1995468139648438, + "rewards/rejected": -2.875943660736084, + "step": 2813 + }, + { + "epoch": 0.33, + "learning_rate": 2.0486595015944252e-07, + "logits/chosen": -2.416999101638794, + "logits/rejected": -2.546029567718506, + "logps/chosen": -217.00042724609375, + "logps/rejected": -193.41323852539062, + "loss": 0.2883, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7034333944320679, + "rewards/margins": 2.724259853363037, + "rewards/rejected": -3.4276933670043945, + "step": 2814 + }, + { + "epoch": 0.33, + "learning_rate": 2.0483051848352427e-07, + "logits/chosen": -1.8129630088806152, + "logits/rejected": -2.2402396202087402, + "logps/chosen": -397.4700927734375, + "logps/rejected": -268.7874450683594, + "loss": 0.4886, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2502238750457764, + "rewards/margins": 1.4893684387207031, + "rewards/rejected": -2.7395923137664795, + "step": 2815 + }, + { + "epoch": 0.33, + "learning_rate": 2.04795086807606e-07, + "logits/chosen": -2.5891482830047607, + "logits/rejected": -2.6199758052825928, + "logps/chosen": -276.3951721191406, + "logps/rejected": -261.1891174316406, + "loss": 0.7248, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.683912992477417, + "rewards/margins": 1.5360163450241089, + "rewards/rejected": -3.2199292182922363, + "step": 2816 + }, + { + "epoch": 0.33, + "learning_rate": 2.0475965513168772e-07, + "logits/chosen": -2.5540971755981445, + "logits/rejected": -2.3981118202209473, + "logps/chosen": -209.75234985351562, + "logps/rejected": -212.50405883789062, + "loss": 0.8078, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8089838027954102, + "rewards/margins": 0.9001773595809937, + "rewards/rejected": -2.7091610431671143, + "step": 2817 + }, + { + "epoch": 0.33, + "learning_rate": 2.0472422345576944e-07, + "logits/chosen": -1.6472505331039429, + "logits/rejected": -2.2328805923461914, + "logps/chosen": -484.7900695800781, + "logps/rejected": -172.57437133789062, + "loss": 0.5016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7868074178695679, + "rewards/margins": 1.4236958026885986, + "rewards/rejected": -2.210503101348877, + "step": 2818 + }, + { + "epoch": 0.33, + "learning_rate": 2.0468879177985116e-07, + "logits/chosen": -2.3393406867980957, + "logits/rejected": -2.351262092590332, + "logps/chosen": -449.6119384765625, + "logps/rejected": -239.43972778320312, + "loss": 0.8984, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8348033428192139, + "rewards/margins": 1.2704559564590454, + "rewards/rejected": -2.105259418487549, + "step": 2819 + }, + { + "epoch": 0.33, + "learning_rate": 2.0465336010393288e-07, + "logits/chosen": -2.749523162841797, + "logits/rejected": -2.5502002239227295, + "logps/chosen": -119.17343139648438, + "logps/rejected": -257.31591796875, + "loss": 0.223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45282799005508423, + "rewards/margins": 2.287104606628418, + "rewards/rejected": -2.7399325370788574, + "step": 2820 + }, + { + "epoch": 0.33, + "learning_rate": 2.0461792842801466e-07, + "logits/chosen": -2.3165204524993896, + "logits/rejected": -2.5828311443328857, + "logps/chosen": -272.6784973144531, + "logps/rejected": -144.60081481933594, + "loss": 0.3326, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14727984368801117, + "rewards/margins": 1.8323566913604736, + "rewards/rejected": -1.979636549949646, + "step": 2821 + }, + { + "epoch": 0.33, + "learning_rate": 2.0458249675209638e-07, + "logits/chosen": -2.792811155319214, + "logits/rejected": -2.8511414527893066, + "logps/chosen": -133.80726623535156, + "logps/rejected": -203.33470153808594, + "loss": 0.3303, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6973686218261719, + "rewards/margins": 2.3489441871643066, + "rewards/rejected": -3.0463125705718994, + "step": 2822 + }, + { + "epoch": 0.33, + "learning_rate": 2.045470650761781e-07, + "logits/chosen": -2.4337236881256104, + "logits/rejected": -2.5470268726348877, + "logps/chosen": -308.79425048828125, + "logps/rejected": -519.998291015625, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42340192198753357, + "rewards/margins": 5.131959915161133, + "rewards/rejected": -5.555361747741699, + "step": 2823 + }, + { + "epoch": 0.33, + "learning_rate": 2.0451163340025982e-07, + "logits/chosen": -2.36165189743042, + "logits/rejected": -2.2468252182006836, + "logps/chosen": -166.033447265625, + "logps/rejected": -243.00094604492188, + "loss": 0.6086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7419983148574829, + "rewards/margins": 1.414143443107605, + "rewards/rejected": -2.156141757965088, + "step": 2824 + }, + { + "epoch": 0.33, + "learning_rate": 2.0447620172434155e-07, + "logits/chosen": -2.1709892749786377, + "logits/rejected": -2.123112678527832, + "logps/chosen": -115.22529602050781, + "logps/rejected": -203.98980712890625, + "loss": 0.1664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22637619078159332, + "rewards/margins": 2.349916934967041, + "rewards/rejected": -2.5762932300567627, + "step": 2825 + }, + { + "epoch": 0.33, + "learning_rate": 2.044407700484233e-07, + "logits/chosen": -1.9990406036376953, + "logits/rejected": -2.1865406036376953, + "logps/chosen": -360.7684020996094, + "logps/rejected": -319.9784240722656, + "loss": 0.5165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6421569585800171, + "rewards/margins": 0.8304758071899414, + "rewards/rejected": -1.4726327657699585, + "step": 2826 + }, + { + "epoch": 0.33, + "learning_rate": 2.0440533837250502e-07, + "logits/chosen": -2.164041519165039, + "logits/rejected": -2.4912188053131104, + "logps/chosen": -379.3470458984375, + "logps/rejected": -220.86074829101562, + "loss": 0.3036, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2399139404296875, + "rewards/margins": 1.163254737854004, + "rewards/rejected": -2.4031686782836914, + "step": 2827 + }, + { + "epoch": 0.33, + "learning_rate": 2.0436990669658674e-07, + "logits/chosen": -2.855140209197998, + "logits/rejected": -2.8357858657836914, + "logps/chosen": -207.58799743652344, + "logps/rejected": -247.04696655273438, + "loss": 0.5711, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0696563720703125, + "rewards/margins": 1.5588760375976562, + "rewards/rejected": -2.6285324096679688, + "step": 2828 + }, + { + "epoch": 0.33, + "learning_rate": 2.0433447502066846e-07, + "logits/chosen": -2.6308095455169678, + "logits/rejected": -2.6607749462127686, + "logps/chosen": -109.74212646484375, + "logps/rejected": -153.10687255859375, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5706846117973328, + "rewards/margins": 1.284409761428833, + "rewards/rejected": -1.855094313621521, + "step": 2829 + }, + { + "epoch": 0.33, + "learning_rate": 2.0429904334475018e-07, + "logits/chosen": -2.5287153720855713, + "logits/rejected": -2.578961133956909, + "logps/chosen": -386.8229064941406, + "logps/rejected": -355.1982727050781, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38310733437538147, + "rewards/margins": 2.1144466400146484, + "rewards/rejected": -2.497554063796997, + "step": 2830 + }, + { + "epoch": 0.33, + "learning_rate": 2.042636116688319e-07, + "logits/chosen": -1.7102911472320557, + "logits/rejected": -1.9979699850082397, + "logps/chosen": -318.42352294921875, + "logps/rejected": -279.9345703125, + "loss": 0.4946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7330102920532227, + "rewards/margins": 0.9977298974990845, + "rewards/rejected": -1.7307403087615967, + "step": 2831 + }, + { + "epoch": 0.33, + "learning_rate": 2.0422817999291363e-07, + "logits/chosen": -2.863314628601074, + "logits/rejected": -2.8775007724761963, + "logps/chosen": -186.6444091796875, + "logps/rejected": -206.51780700683594, + "loss": 0.7769, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4505507946014404, + "rewards/margins": 0.6711548566818237, + "rewards/rejected": -2.1217057704925537, + "step": 2832 + }, + { + "epoch": 0.33, + "learning_rate": 2.041927483169954e-07, + "logits/chosen": -2.596134662628174, + "logits/rejected": -2.3948166370391846, + "logps/chosen": -211.3447265625, + "logps/rejected": -213.29898071289062, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14425039291381836, + "rewards/margins": 2.322404384613037, + "rewards/rejected": -2.4666545391082764, + "step": 2833 + }, + { + "epoch": 0.33, + "learning_rate": 2.0415731664107713e-07, + "logits/chosen": -2.1336591243743896, + "logits/rejected": -1.9621479511260986, + "logps/chosen": -105.7725830078125, + "logps/rejected": -165.7503662109375, + "loss": 0.8965, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.604356288909912, + "rewards/margins": 0.2732434868812561, + "rewards/rejected": -1.8775997161865234, + "step": 2834 + }, + { + "epoch": 0.33, + "learning_rate": 2.0412188496515885e-07, + "logits/chosen": -2.74996018409729, + "logits/rejected": -2.406066417694092, + "logps/chosen": -226.12258911132812, + "logps/rejected": -270.3167724609375, + "loss": 0.5797, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6943837404251099, + "rewards/margins": 0.9944208860397339, + "rewards/rejected": -1.6888046264648438, + "step": 2835 + }, + { + "epoch": 0.33, + "learning_rate": 2.0408645328924057e-07, + "logits/chosen": -2.2756476402282715, + "logits/rejected": -2.1289124488830566, + "logps/chosen": -135.25123596191406, + "logps/rejected": -201.6737823486328, + "loss": 0.4717, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3982540369033813, + "rewards/margins": 1.5935182571411133, + "rewards/rejected": -2.991771936416626, + "step": 2836 + }, + { + "epoch": 0.33, + "learning_rate": 2.040510216133223e-07, + "logits/chosen": -2.5038020610809326, + "logits/rejected": -1.7901339530944824, + "logps/chosen": -111.13200378417969, + "logps/rejected": -408.293212890625, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.059613630175590515, + "rewards/margins": 2.4257407188415527, + "rewards/rejected": -2.3661270141601562, + "step": 2837 + }, + { + "epoch": 0.33, + "learning_rate": 2.0401558993740404e-07, + "logits/chosen": -2.749030590057373, + "logits/rejected": -2.726857900619507, + "logps/chosen": -176.64962768554688, + "logps/rejected": -178.43772888183594, + "loss": 0.7638, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6310076713562012, + "rewards/margins": 1.1016085147857666, + "rewards/rejected": -2.732616424560547, + "step": 2838 + }, + { + "epoch": 0.33, + "learning_rate": 2.0398015826148576e-07, + "logits/chosen": -2.220405101776123, + "logits/rejected": -2.0455281734466553, + "logps/chosen": -123.35054016113281, + "logps/rejected": -140.5076904296875, + "loss": 0.5718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5483088493347168, + "rewards/margins": 1.0419373512268066, + "rewards/rejected": -1.5902462005615234, + "step": 2839 + }, + { + "epoch": 0.33, + "learning_rate": 2.0394472658556748e-07, + "logits/chosen": -2.772846221923828, + "logits/rejected": -2.603695869445801, + "logps/chosen": -219.9719696044922, + "logps/rejected": -273.76690673828125, + "loss": 0.406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5464353561401367, + "rewards/margins": 1.6724764108657837, + "rewards/rejected": -2.218911647796631, + "step": 2840 + }, + { + "epoch": 0.33, + "learning_rate": 2.039092949096492e-07, + "logits/chosen": -2.092108726501465, + "logits/rejected": -2.126579761505127, + "logps/chosen": -429.8004150390625, + "logps/rejected": -406.6106262207031, + "loss": 0.2536, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9370561838150024, + "rewards/margins": 1.758062481880188, + "rewards/rejected": -2.6951186656951904, + "step": 2841 + }, + { + "epoch": 0.33, + "learning_rate": 2.0387386323373093e-07, + "logits/chosen": -2.3389503955841064, + "logits/rejected": -2.499793529510498, + "logps/chosen": -249.271728515625, + "logps/rejected": -225.81951904296875, + "loss": 0.2141, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0997467041015625, + "rewards/margins": 2.8356688022613525, + "rewards/rejected": -3.935415744781494, + "step": 2842 + }, + { + "epoch": 0.33, + "learning_rate": 2.0383843155781265e-07, + "logits/chosen": -2.9714183807373047, + "logits/rejected": -2.9895739555358887, + "logps/chosen": -265.3147888183594, + "logps/rejected": -221.22161865234375, + "loss": 0.285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7265208959579468, + "rewards/margins": 2.158388137817383, + "rewards/rejected": -2.884908676147461, + "step": 2843 + }, + { + "epoch": 0.33, + "learning_rate": 2.038029998818944e-07, + "logits/chosen": -2.8175811767578125, + "logits/rejected": -2.8435721397399902, + "logps/chosen": -253.998046875, + "logps/rejected": -244.614013671875, + "loss": 0.2087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14495407044887543, + "rewards/margins": 2.406085729598999, + "rewards/rejected": -2.551039695739746, + "step": 2844 + }, + { + "epoch": 0.33, + "learning_rate": 2.0376756820597615e-07, + "logits/chosen": -1.9524171352386475, + "logits/rejected": -2.0545637607574463, + "logps/chosen": -187.60845947265625, + "logps/rejected": -196.5067596435547, + "loss": 0.3161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8776882290840149, + "rewards/margins": 1.4847383499145508, + "rewards/rejected": -2.362426519393921, + "step": 2845 + }, + { + "epoch": 0.33, + "learning_rate": 2.0373213653005787e-07, + "logits/chosen": -2.3409676551818848, + "logits/rejected": -2.6742193698883057, + "logps/chosen": -554.197509765625, + "logps/rejected": -411.0762634277344, + "loss": 0.4378, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.222048044204712, + "rewards/margins": 3.0703349113464355, + "rewards/rejected": -4.292383193969727, + "step": 2846 + }, + { + "epoch": 0.33, + "learning_rate": 2.036967048541396e-07, + "logits/chosen": -2.534383535385132, + "logits/rejected": -2.6092355251312256, + "logps/chosen": -197.86746215820312, + "logps/rejected": -141.77825927734375, + "loss": 0.2369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38461869955062866, + "rewards/margins": 1.4713630676269531, + "rewards/rejected": -1.8559815883636475, + "step": 2847 + }, + { + "epoch": 0.33, + "learning_rate": 2.0366127317822131e-07, + "logits/chosen": -2.293240547180176, + "logits/rejected": -2.2658278942108154, + "logps/chosen": -311.14373779296875, + "logps/rejected": -248.43307495117188, + "loss": 0.2915, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6427315473556519, + "rewards/margins": 2.1426873207092285, + "rewards/rejected": -2.78541898727417, + "step": 2848 + }, + { + "epoch": 0.33, + "learning_rate": 2.0362584150230306e-07, + "logits/chosen": -2.338634490966797, + "logits/rejected": -2.0143356323242188, + "logps/chosen": -107.28548431396484, + "logps/rejected": -286.7553405761719, + "loss": 0.2444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08572103083133698, + "rewards/margins": 2.2003350257873535, + "rewards/rejected": -2.2860560417175293, + "step": 2849 + }, + { + "epoch": 0.33, + "learning_rate": 2.0359040982638479e-07, + "logits/chosen": -2.2955589294433594, + "logits/rejected": -2.47743558883667, + "logps/chosen": -239.28292846679688, + "logps/rejected": -159.82424926757812, + "loss": 0.7208, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6456551551818848, + "rewards/margins": 0.5764782428741455, + "rewards/rejected": -2.222133159637451, + "step": 2850 + }, + { + "epoch": 0.33, + "learning_rate": 2.035549781504665e-07, + "logits/chosen": -2.3325648307800293, + "logits/rejected": -2.3824925422668457, + "logps/chosen": -183.08778381347656, + "logps/rejected": -194.589111328125, + "loss": 0.4246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2112860679626465, + "rewards/margins": 1.254469394683838, + "rewards/rejected": -2.4657554626464844, + "step": 2851 + }, + { + "epoch": 0.33, + "learning_rate": 2.0351954647454823e-07, + "logits/chosen": -2.3954105377197266, + "logits/rejected": -2.534482717514038, + "logps/chosen": -326.9853820800781, + "logps/rejected": -411.5298767089844, + "loss": 0.4292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6472383737564087, + "rewards/margins": 1.7075189352035522, + "rewards/rejected": -2.35475754737854, + "step": 2852 + }, + { + "epoch": 0.33, + "learning_rate": 2.0348411479862995e-07, + "logits/chosen": -2.3276333808898926, + "logits/rejected": -2.65321683883667, + "logps/chosen": -382.74755859375, + "logps/rejected": -254.12014770507812, + "loss": 0.457, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8999982476234436, + "rewards/margins": 0.901712954044342, + "rewards/rejected": -1.8017112016677856, + "step": 2853 + }, + { + "epoch": 0.33, + "learning_rate": 2.0344868312271167e-07, + "logits/chosen": -2.7296299934387207, + "logits/rejected": -2.7550432682037354, + "logps/chosen": -528.6714477539062, + "logps/rejected": -346.8750305175781, + "loss": 0.5692, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.018274188041687, + "rewards/margins": 0.6785012483596802, + "rewards/rejected": -1.6967754364013672, + "step": 2854 + }, + { + "epoch": 0.33, + "learning_rate": 2.0341325144679342e-07, + "logits/chosen": -2.86840558052063, + "logits/rejected": -2.946042537689209, + "logps/chosen": -202.4578094482422, + "logps/rejected": -223.56314086914062, + "loss": 0.2114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6463537812232971, + "rewards/margins": 2.3079893589019775, + "rewards/rejected": -2.954343318939209, + "step": 2855 + }, + { + "epoch": 0.33, + "learning_rate": 2.0337781977087517e-07, + "logits/chosen": -2.5411508083343506, + "logits/rejected": -2.691070079803467, + "logps/chosen": -256.4216613769531, + "logps/rejected": -236.79119873046875, + "loss": 0.4356, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.609972357749939, + "rewards/margins": 1.3840147256851196, + "rewards/rejected": -2.9939870834350586, + "step": 2856 + }, + { + "epoch": 0.33, + "learning_rate": 2.033423880949569e-07, + "logits/chosen": -2.832761287689209, + "logits/rejected": -2.796764612197876, + "logps/chosen": -190.47116088867188, + "logps/rejected": -258.75421142578125, + "loss": 0.177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5369760394096375, + "rewards/margins": 3.00662899017334, + "rewards/rejected": -3.543605089187622, + "step": 2857 + }, + { + "epoch": 0.33, + "learning_rate": 2.0330695641903862e-07, + "logits/chosen": -2.257845640182495, + "logits/rejected": -2.3125150203704834, + "logps/chosen": -230.78269958496094, + "logps/rejected": -216.63568115234375, + "loss": 0.6072, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7062742710113525, + "rewards/margins": 1.691977620124817, + "rewards/rejected": -2.398252010345459, + "step": 2858 + }, + { + "epoch": 0.33, + "learning_rate": 2.0327152474312034e-07, + "logits/chosen": -2.4231739044189453, + "logits/rejected": -2.214143753051758, + "logps/chosen": -364.149169921875, + "logps/rejected": -352.7958679199219, + "loss": 0.4091, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2891604900360107, + "rewards/margins": 1.647066593170166, + "rewards/rejected": -2.9362268447875977, + "step": 2859 + }, + { + "epoch": 0.33, + "learning_rate": 2.0323609306720209e-07, + "logits/chosen": -2.1073765754699707, + "logits/rejected": -2.4159884452819824, + "logps/chosen": -441.6005859375, + "logps/rejected": -346.96142578125, + "loss": 0.485, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0235151052474976, + "rewards/margins": 1.7792872190475464, + "rewards/rejected": -2.802802324295044, + "step": 2860 + }, + { + "epoch": 0.33, + "learning_rate": 2.032006613912838e-07, + "logits/chosen": -2.1934328079223633, + "logits/rejected": -2.5753846168518066, + "logps/chosen": -289.9516906738281, + "logps/rejected": -208.5700225830078, + "loss": 0.5458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8048868179321289, + "rewards/margins": 1.155310034751892, + "rewards/rejected": -1.9601967334747314, + "step": 2861 + }, + { + "epoch": 0.33, + "learning_rate": 2.0316522971536553e-07, + "logits/chosen": -2.6112372875213623, + "logits/rejected": -2.5233471393585205, + "logps/chosen": -353.9142150878906, + "logps/rejected": -322.8084411621094, + "loss": 0.7719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3388844728469849, + "rewards/margins": 0.8053247928619385, + "rewards/rejected": -2.144209146499634, + "step": 2862 + }, + { + "epoch": 0.33, + "learning_rate": 2.0312979803944725e-07, + "logits/chosen": -2.28244686126709, + "logits/rejected": -2.1591668128967285, + "logps/chosen": -218.95614624023438, + "logps/rejected": -375.42645263671875, + "loss": 0.2096, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0893088579177856, + "rewards/margins": 4.839071273803711, + "rewards/rejected": -5.928380012512207, + "step": 2863 + }, + { + "epoch": 0.33, + "learning_rate": 2.0309436636352897e-07, + "logits/chosen": -2.4412362575531006, + "logits/rejected": -2.451605796813965, + "logps/chosen": -157.55386352539062, + "logps/rejected": -170.01113891601562, + "loss": 0.4649, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5210105180740356, + "rewards/margins": 1.0127930641174316, + "rewards/rejected": -1.5338035821914673, + "step": 2864 + }, + { + "epoch": 0.33, + "learning_rate": 2.030589346876107e-07, + "logits/chosen": -2.1625285148620605, + "logits/rejected": -1.6983957290649414, + "logps/chosen": -277.6005859375, + "logps/rejected": -385.6598205566406, + "loss": 0.656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1222116947174072, + "rewards/margins": 1.2989028692245483, + "rewards/rejected": -2.421114683151245, + "step": 2865 + }, + { + "epoch": 0.33, + "learning_rate": 2.0302350301169242e-07, + "logits/chosen": -1.9634594917297363, + "logits/rejected": -2.1003565788269043, + "logps/chosen": -321.10919189453125, + "logps/rejected": -198.8200225830078, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6732341647148132, + "rewards/margins": 0.20739053189754486, + "rewards/rejected": -0.8806246519088745, + "step": 2866 + }, + { + "epoch": 0.33, + "learning_rate": 2.0298807133577417e-07, + "logits/chosen": -2.4625420570373535, + "logits/rejected": -2.706028461456299, + "logps/chosen": -299.9356689453125, + "logps/rejected": -322.8956298828125, + "loss": 0.3407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7983559966087341, + "rewards/margins": 2.469330310821533, + "rewards/rejected": -3.267686367034912, + "step": 2867 + }, + { + "epoch": 0.33, + "learning_rate": 2.0295263965985592e-07, + "logits/chosen": -2.326526641845703, + "logits/rejected": -2.367274761199951, + "logps/chosen": -111.6295166015625, + "logps/rejected": -231.0255126953125, + "loss": 0.3975, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2228307723999023, + "rewards/margins": 1.820825219154358, + "rewards/rejected": -3.0436558723449707, + "step": 2868 + }, + { + "epoch": 0.33, + "learning_rate": 2.0291720798393764e-07, + "logits/chosen": -2.430966377258301, + "logits/rejected": -2.1374354362487793, + "logps/chosen": -226.77301025390625, + "logps/rejected": -285.22515869140625, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2108231782913208, + "rewards/margins": 2.0325894355773926, + "rewards/rejected": -3.243412971496582, + "step": 2869 + }, + { + "epoch": 0.33, + "learning_rate": 2.0288177630801936e-07, + "logits/chosen": -1.837465763092041, + "logits/rejected": -2.1744298934936523, + "logps/chosen": -383.08575439453125, + "logps/rejected": -252.12258911132812, + "loss": 0.3452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5019859671592712, + "rewards/margins": 1.609582781791687, + "rewards/rejected": -2.1115686893463135, + "step": 2870 + }, + { + "epoch": 0.33, + "learning_rate": 2.028463446321011e-07, + "logits/chosen": -2.3352088928222656, + "logits/rejected": -2.360283613204956, + "logps/chosen": -201.75726318359375, + "logps/rejected": -309.3847351074219, + "loss": 0.4806, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.734175205230713, + "rewards/margins": 1.2834196090698242, + "rewards/rejected": -3.017594814300537, + "step": 2871 + }, + { + "epoch": 0.33, + "learning_rate": 2.0281091295618283e-07, + "logits/chosen": -1.9165136814117432, + "logits/rejected": -2.10412335395813, + "logps/chosen": -372.65545654296875, + "logps/rejected": -407.44317626953125, + "loss": 0.2476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9890196323394775, + "rewards/margins": 2.9019620418548584, + "rewards/rejected": -3.890981674194336, + "step": 2872 + }, + { + "epoch": 0.33, + "learning_rate": 2.0277548128026455e-07, + "logits/chosen": -2.246105194091797, + "logits/rejected": -2.0619866847991943, + "logps/chosen": -247.62705993652344, + "logps/rejected": -339.67572021484375, + "loss": 0.2168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4305148124694824, + "rewards/margins": 2.4015254974365234, + "rewards/rejected": -2.832040309906006, + "step": 2873 + }, + { + "epoch": 0.33, + "learning_rate": 2.0274004960434628e-07, + "logits/chosen": -2.075566291809082, + "logits/rejected": -2.5348029136657715, + "logps/chosen": -299.35845947265625, + "logps/rejected": -232.8660430908203, + "loss": 0.3858, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4475215971469879, + "rewards/margins": 2.4609344005584717, + "rewards/rejected": -2.9084560871124268, + "step": 2874 + }, + { + "epoch": 0.33, + "learning_rate": 2.02704617928428e-07, + "logits/chosen": -2.7401082515716553, + "logits/rejected": -2.295564651489258, + "logps/chosen": -241.01943969726562, + "logps/rejected": -370.2752380371094, + "loss": 0.0822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7928420901298523, + "rewards/margins": 3.9431591033935547, + "rewards/rejected": -4.736001014709473, + "step": 2875 + }, + { + "epoch": 0.33, + "learning_rate": 2.0266918625250972e-07, + "logits/chosen": -1.4570530652999878, + "logits/rejected": -1.7534987926483154, + "logps/chosen": -264.896240234375, + "logps/rejected": -202.8114471435547, + "loss": 0.8372, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6122134923934937, + "rewards/margins": 1.0992285013198853, + "rewards/rejected": -2.711441993713379, + "step": 2876 + }, + { + "epoch": 0.33, + "learning_rate": 2.0263375457659144e-07, + "logits/chosen": -2.8308329582214355, + "logits/rejected": -2.811244010925293, + "logps/chosen": -200.76812744140625, + "logps/rejected": -228.46072387695312, + "loss": 0.3615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5914900302886963, + "rewards/margins": 1.6404783725738525, + "rewards/rejected": -2.231968402862549, + "step": 2877 + }, + { + "epoch": 0.33, + "learning_rate": 2.025983229006732e-07, + "logits/chosen": -2.3264904022216797, + "logits/rejected": -2.418992519378662, + "logps/chosen": -341.9829406738281, + "logps/rejected": -294.7510681152344, + "loss": 1.2145, + "rewards/accuracies": 0.25, + "rewards/chosen": -2.2792155742645264, + "rewards/margins": -0.3553204834461212, + "rewards/rejected": -1.923895001411438, + "step": 2878 + }, + { + "epoch": 0.33, + "learning_rate": 2.025628912247549e-07, + "logits/chosen": -2.5910556316375732, + "logits/rejected": -2.6425607204437256, + "logps/chosen": -427.37554931640625, + "logps/rejected": -372.8592529296875, + "loss": 0.1674, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1722436249256134, + "rewards/margins": 2.4149796962738037, + "rewards/rejected": -2.2427358627319336, + "step": 2879 + }, + { + "epoch": 0.34, + "learning_rate": 2.0252745954883666e-07, + "logits/chosen": -2.256223440170288, + "logits/rejected": -1.9176206588745117, + "logps/chosen": -281.580078125, + "logps/rejected": -357.53619384765625, + "loss": 0.3053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24218136072158813, + "rewards/margins": 2.945889472961426, + "rewards/rejected": -3.188070774078369, + "step": 2880 + }, + { + "epoch": 0.34, + "learning_rate": 2.0249202787291838e-07, + "logits/chosen": -2.393981695175171, + "logits/rejected": -2.6840949058532715, + "logps/chosen": -617.1217041015625, + "logps/rejected": -177.08956909179688, + "loss": 0.7975, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5384984016418457, + "rewards/margins": 1.6234798431396484, + "rewards/rejected": -3.161978244781494, + "step": 2881 + }, + { + "epoch": 0.34, + "learning_rate": 2.024565961970001e-07, + "logits/chosen": -1.8459445238113403, + "logits/rejected": -2.3582472801208496, + "logps/chosen": -456.4781799316406, + "logps/rejected": -187.4498291015625, + "loss": 0.4169, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3145253658294678, + "rewards/margins": 0.9809325337409973, + "rewards/rejected": -2.2954578399658203, + "step": 2882 + }, + { + "epoch": 0.34, + "learning_rate": 2.0242116452108185e-07, + "logits/chosen": -2.0384066104888916, + "logits/rejected": -2.3625717163085938, + "logps/chosen": -217.86029052734375, + "logps/rejected": -158.98114013671875, + "loss": 1.1107, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.6349494457244873, + "rewards/margins": -0.5352143049240112, + "rewards/rejected": -1.099735140800476, + "step": 2883 + }, + { + "epoch": 0.34, + "learning_rate": 2.0238573284516358e-07, + "logits/chosen": -2.0060477256774902, + "logits/rejected": -2.1232669353485107, + "logps/chosen": -206.22161865234375, + "logps/rejected": -219.52394104003906, + "loss": 0.2985, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.124667763710022, + "rewards/margins": 2.266490936279297, + "rewards/rejected": -3.3911588191986084, + "step": 2884 + }, + { + "epoch": 0.34, + "learning_rate": 2.023503011692453e-07, + "logits/chosen": -1.9929039478302002, + "logits/rejected": -2.0601019859313965, + "logps/chosen": -403.5986328125, + "logps/rejected": -498.83563232421875, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9665526151657104, + "rewards/margins": 1.8448909521102905, + "rewards/rejected": -2.811443328857422, + "step": 2885 + }, + { + "epoch": 0.34, + "learning_rate": 2.0231486949332702e-07, + "logits/chosen": -2.6422231197357178, + "logits/rejected": -2.5143351554870605, + "logps/chosen": -212.97222900390625, + "logps/rejected": -285.6768493652344, + "loss": 0.3461, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18182986974716187, + "rewards/margins": 1.9619641304016113, + "rewards/rejected": -2.143794059753418, + "step": 2886 + }, + { + "epoch": 0.34, + "learning_rate": 2.0227943781740874e-07, + "logits/chosen": -2.1844730377197266, + "logits/rejected": -2.6466782093048096, + "logps/chosen": -282.8886413574219, + "logps/rejected": -215.51840209960938, + "loss": 0.7105, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9629329442977905, + "rewards/margins": 2.287843704223633, + "rewards/rejected": -3.250776767730713, + "step": 2887 + }, + { + "epoch": 0.34, + "learning_rate": 2.0224400614149046e-07, + "logits/chosen": -2.2526354789733887, + "logits/rejected": -2.3379318714141846, + "logps/chosen": -263.161376953125, + "logps/rejected": -217.009033203125, + "loss": 0.7185, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2396337985992432, + "rewards/margins": 0.9094007015228271, + "rewards/rejected": -2.1490345001220703, + "step": 2888 + }, + { + "epoch": 0.34, + "learning_rate": 2.022085744655722e-07, + "logits/chosen": -2.1689679622650146, + "logits/rejected": -2.242253065109253, + "logps/chosen": -290.3797607421875, + "logps/rejected": -266.25079345703125, + "loss": 0.4546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.680088222026825, + "rewards/margins": 1.7658886909484863, + "rewards/rejected": -2.445976972579956, + "step": 2889 + }, + { + "epoch": 0.34, + "learning_rate": 2.0217314278965394e-07, + "logits/chosen": -1.8021471500396729, + "logits/rejected": -1.776775598526001, + "logps/chosen": -194.62965393066406, + "logps/rejected": -318.5040588378906, + "loss": 0.6408, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.107653021812439, + "rewards/margins": 1.904958963394165, + "rewards/rejected": -3.0126121044158936, + "step": 2890 + }, + { + "epoch": 0.34, + "learning_rate": 2.0213771111373568e-07, + "logits/chosen": -1.9805021286010742, + "logits/rejected": -1.8746891021728516, + "logps/chosen": -252.8421630859375, + "logps/rejected": -274.3184814453125, + "loss": 0.1827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8497920036315918, + "rewards/margins": 1.9227381944656372, + "rewards/rejected": -2.7725300788879395, + "step": 2891 + }, + { + "epoch": 0.34, + "learning_rate": 2.021022794378174e-07, + "logits/chosen": -2.0213210582733154, + "logits/rejected": -1.8614336252212524, + "logps/chosen": -296.9938659667969, + "logps/rejected": -351.732421875, + "loss": 0.7181, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0767945051193237, + "rewards/margins": 0.8482465744018555, + "rewards/rejected": -1.9250409603118896, + "step": 2892 + }, + { + "epoch": 0.34, + "learning_rate": 2.0206684776189913e-07, + "logits/chosen": -2.6949777603149414, + "logits/rejected": -2.6188080310821533, + "logps/chosen": -188.3943634033203, + "logps/rejected": -182.68670654296875, + "loss": 0.2588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23387223482131958, + "rewards/margins": 1.9993082284927368, + "rewards/rejected": -2.233180522918701, + "step": 2893 + }, + { + "epoch": 0.34, + "learning_rate": 2.0203141608598088e-07, + "logits/chosen": -2.3466105461120605, + "logits/rejected": -2.556361675262451, + "logps/chosen": -115.06883239746094, + "logps/rejected": -175.20767211914062, + "loss": 0.2328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49210256338119507, + "rewards/margins": 2.1394600868225098, + "rewards/rejected": -2.6315627098083496, + "step": 2894 + }, + { + "epoch": 0.34, + "learning_rate": 2.019959844100626e-07, + "logits/chosen": -2.686100959777832, + "logits/rejected": -2.531803846359253, + "logps/chosen": -104.9197998046875, + "logps/rejected": -143.34988403320312, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0272703170776367, + "rewards/margins": 0.6143026947975159, + "rewards/rejected": -1.6415729522705078, + "step": 2895 + }, + { + "epoch": 0.34, + "learning_rate": 2.0196055273414432e-07, + "logits/chosen": -2.579378843307495, + "logits/rejected": -2.589851140975952, + "logps/chosen": -288.3555908203125, + "logps/rejected": -211.2042694091797, + "loss": 0.3076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5928442478179932, + "rewards/margins": 1.6573816537857056, + "rewards/rejected": -2.2502260208129883, + "step": 2896 + }, + { + "epoch": 0.34, + "learning_rate": 2.0192512105822604e-07, + "logits/chosen": -2.2085208892822266, + "logits/rejected": -2.3226630687713623, + "logps/chosen": -173.33340454101562, + "logps/rejected": -156.54083251953125, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.801984429359436, + "rewards/margins": 1.124724268913269, + "rewards/rejected": -1.926708698272705, + "step": 2897 + }, + { + "epoch": 0.34, + "learning_rate": 2.0188968938230776e-07, + "logits/chosen": -1.993521809577942, + "logits/rejected": -1.9770526885986328, + "logps/chosen": -437.001708984375, + "logps/rejected": -418.07904052734375, + "loss": 0.2164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.704768180847168, + "rewards/margins": 2.854259967803955, + "rewards/rejected": -3.559028148651123, + "step": 2898 + }, + { + "epoch": 0.34, + "learning_rate": 2.018542577063895e-07, + "logits/chosen": -2.6230273246765137, + "logits/rejected": -2.598602294921875, + "logps/chosen": -258.31561279296875, + "logps/rejected": -341.5511779785156, + "loss": 0.2956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4791499674320221, + "rewards/margins": 2.113140344619751, + "rewards/rejected": -2.5922904014587402, + "step": 2899 + }, + { + "epoch": 0.34, + "learning_rate": 2.0181882603047124e-07, + "logits/chosen": -2.226626396179199, + "logits/rejected": -2.5421509742736816, + "logps/chosen": -349.17041015625, + "logps/rejected": -221.76226806640625, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.514342188835144, + "rewards/margins": 1.2088603973388672, + "rewards/rejected": -1.7232025861740112, + "step": 2900 + }, + { + "epoch": 0.34, + "learning_rate": 2.0178339435455296e-07, + "logits/chosen": -2.86385440826416, + "logits/rejected": -2.7421200275421143, + "logps/chosen": -247.175537109375, + "logps/rejected": -198.66641235351562, + "loss": 0.2221, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3531253337860107, + "rewards/margins": 2.4901657104492188, + "rewards/rejected": -3.8432910442352295, + "step": 2901 + }, + { + "epoch": 0.34, + "learning_rate": 2.0174796267863468e-07, + "logits/chosen": -2.6131203174591064, + "logits/rejected": -2.6613430976867676, + "logps/chosen": -119.65361785888672, + "logps/rejected": -235.96148681640625, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23277662694454193, + "rewards/margins": 3.518507480621338, + "rewards/rejected": -3.751284122467041, + "step": 2902 + }, + { + "epoch": 0.34, + "learning_rate": 2.0171253100271643e-07, + "logits/chosen": -1.866573691368103, + "logits/rejected": -2.312160015106201, + "logps/chosen": -359.2745361328125, + "logps/rejected": -280.340576171875, + "loss": 0.488, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5938076376914978, + "rewards/margins": 0.7951335310935974, + "rewards/rejected": -1.3889411687850952, + "step": 2903 + }, + { + "epoch": 0.34, + "learning_rate": 2.0167709932679815e-07, + "logits/chosen": -2.5010504722595215, + "logits/rejected": -2.3570709228515625, + "logps/chosen": -308.05621337890625, + "logps/rejected": -354.69866943359375, + "loss": 0.2587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6857733726501465, + "rewards/margins": 3.434004306793213, + "rewards/rejected": -4.119777679443359, + "step": 2904 + }, + { + "epoch": 0.34, + "learning_rate": 2.016416676508799e-07, + "logits/chosen": -2.238051652908325, + "logits/rejected": -2.1889498233795166, + "logps/chosen": -442.6329040527344, + "logps/rejected": -322.31597900390625, + "loss": 0.3949, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7156566977500916, + "rewards/margins": 1.6037371158599854, + "rewards/rejected": -2.3193938732147217, + "step": 2905 + }, + { + "epoch": 0.34, + "learning_rate": 2.0160623597496162e-07, + "logits/chosen": -1.8123810291290283, + "logits/rejected": -1.8424222469329834, + "logps/chosen": -346.43878173828125, + "logps/rejected": -297.72308349609375, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5600947737693787, + "rewards/margins": 3.2305641174316406, + "rewards/rejected": -3.790658712387085, + "step": 2906 + }, + { + "epoch": 0.34, + "learning_rate": 2.0157080429904334e-07, + "logits/chosen": -2.413424015045166, + "logits/rejected": -2.764202833175659, + "logps/chosen": -294.4854736328125, + "logps/rejected": -249.68624877929688, + "loss": 0.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2591441571712494, + "rewards/margins": 3.248051166534424, + "rewards/rejected": -3.507195472717285, + "step": 2907 + }, + { + "epoch": 0.34, + "learning_rate": 2.0153537262312507e-07, + "logits/chosen": -2.763993978500366, + "logits/rejected": -2.573305368423462, + "logps/chosen": -98.67894744873047, + "logps/rejected": -141.6231231689453, + "loss": 0.3705, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.029970407485962, + "rewards/margins": 2.1616175174713135, + "rewards/rejected": -3.1915879249572754, + "step": 2908 + }, + { + "epoch": 0.34, + "learning_rate": 2.014999409472068e-07, + "logits/chosen": -1.715468168258667, + "logits/rejected": -1.9786796569824219, + "logps/chosen": -553.8857421875, + "logps/rejected": -351.9848327636719, + "loss": 0.2811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21551834046840668, + "rewards/margins": 2.5704092979431152, + "rewards/rejected": -2.7859277725219727, + "step": 2909 + }, + { + "epoch": 0.34, + "learning_rate": 2.014645092712885e-07, + "logits/chosen": -2.671719551086426, + "logits/rejected": -2.6838431358337402, + "logps/chosen": -339.44317626953125, + "logps/rejected": -447.43310546875, + "loss": 0.7095, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.504263162612915, + "rewards/margins": 2.1330599784851074, + "rewards/rejected": -3.6373231410980225, + "step": 2910 + }, + { + "epoch": 0.34, + "learning_rate": 2.0142907759537023e-07, + "logits/chosen": -2.1355679035186768, + "logits/rejected": -2.1122989654541016, + "logps/chosen": -379.2438049316406, + "logps/rejected": -341.314208984375, + "loss": 0.5223, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8497755527496338, + "rewards/margins": 1.79927396774292, + "rewards/rejected": -3.6490495204925537, + "step": 2911 + }, + { + "epoch": 0.34, + "learning_rate": 2.0139364591945198e-07, + "logits/chosen": -2.6578526496887207, + "logits/rejected": -2.619096040725708, + "logps/chosen": -170.84283447265625, + "logps/rejected": -255.70046997070312, + "loss": 0.685, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.555381178855896, + "rewards/margins": 3.028592109680176, + "rewards/rejected": -4.583973407745361, + "step": 2912 + }, + { + "epoch": 0.34, + "learning_rate": 2.013582142435337e-07, + "logits/chosen": -2.1168463230133057, + "logits/rejected": -2.243605613708496, + "logps/chosen": -367.1270751953125, + "logps/rejected": -220.95384216308594, + "loss": 0.596, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.028884768486023, + "rewards/margins": 1.0163989067077637, + "rewards/rejected": -2.045283794403076, + "step": 2913 + }, + { + "epoch": 0.34, + "learning_rate": 2.0132278256761542e-07, + "logits/chosen": -2.322432041168213, + "logits/rejected": -2.6980199813842773, + "logps/chosen": -235.40992736816406, + "logps/rejected": -144.21420288085938, + "loss": 0.7743, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5636762380599976, + "rewards/margins": 0.9845834970474243, + "rewards/rejected": -2.548259973526001, + "step": 2914 + }, + { + "epoch": 0.34, + "learning_rate": 2.0128735089169717e-07, + "logits/chosen": -2.9559946060180664, + "logits/rejected": -2.9896111488342285, + "logps/chosen": -194.0387725830078, + "logps/rejected": -194.22564697265625, + "loss": 0.3681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4741092920303345, + "rewards/margins": 2.248908519744873, + "rewards/rejected": -2.723017930984497, + "step": 2915 + }, + { + "epoch": 0.34, + "learning_rate": 2.0125191921577892e-07, + "logits/chosen": -2.7808947563171387, + "logits/rejected": -2.914289951324463, + "logps/chosen": -108.57732391357422, + "logps/rejected": -297.12030029296875, + "loss": 0.4967, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6685969829559326, + "rewards/margins": 1.3955144882202148, + "rewards/rejected": -2.0641114711761475, + "step": 2916 + }, + { + "epoch": 0.34, + "learning_rate": 2.0121648753986064e-07, + "logits/chosen": -2.30912184715271, + "logits/rejected": -2.3399477005004883, + "logps/chosen": -272.7632141113281, + "logps/rejected": -271.8263854980469, + "loss": 0.5932, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5316908955574036, + "rewards/margins": 1.1266906261444092, + "rewards/rejected": -1.658381462097168, + "step": 2917 + }, + { + "epoch": 0.34, + "learning_rate": 2.0118105586394237e-07, + "logits/chosen": -2.4315483570098877, + "logits/rejected": -2.2851662635803223, + "logps/chosen": -182.47215270996094, + "logps/rejected": -257.54913330078125, + "loss": 0.336, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5161490440368652, + "rewards/margins": 1.561649203300476, + "rewards/rejected": -2.077798366546631, + "step": 2918 + }, + { + "epoch": 0.34, + "learning_rate": 2.011456241880241e-07, + "logits/chosen": -2.2006661891937256, + "logits/rejected": -2.1646039485931396, + "logps/chosen": -434.67877197265625, + "logps/rejected": -341.927978515625, + "loss": 0.9661, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.740044116973877, + "rewards/margins": 0.9853518605232239, + "rewards/rejected": -2.725396156311035, + "step": 2919 + }, + { + "epoch": 0.34, + "learning_rate": 2.011101925121058e-07, + "logits/chosen": -1.6322543621063232, + "logits/rejected": -1.8881782293319702, + "logps/chosen": -292.7653503417969, + "logps/rejected": -219.0809326171875, + "loss": 0.6202, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6112018823623657, + "rewards/margins": 0.7212937474250793, + "rewards/rejected": -1.3324956893920898, + "step": 2920 + }, + { + "epoch": 0.34, + "learning_rate": 2.0107476083618753e-07, + "logits/chosen": -2.7338504791259766, + "logits/rejected": -2.4025659561157227, + "logps/chosen": -177.26974487304688, + "logps/rejected": -221.90487670898438, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1996528059244156, + "rewards/margins": 1.690220832824707, + "rewards/rejected": -1.8898735046386719, + "step": 2921 + }, + { + "epoch": 0.34, + "learning_rate": 2.0103932916026925e-07, + "logits/chosen": -2.3635764122009277, + "logits/rejected": -2.472235918045044, + "logps/chosen": -213.44801330566406, + "logps/rejected": -218.5472869873047, + "loss": 0.5757, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4223453998565674, + "rewards/margins": 0.8540315628051758, + "rewards/rejected": -2.2763772010803223, + "step": 2922 + }, + { + "epoch": 0.34, + "learning_rate": 2.01003897484351e-07, + "logits/chosen": -2.14583158493042, + "logits/rejected": -2.2695555686950684, + "logps/chosen": -135.1510467529297, + "logps/rejected": -154.6786651611328, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7779644131660461, + "rewards/margins": 2.2833235263824463, + "rewards/rejected": -3.0612878799438477, + "step": 2923 + }, + { + "epoch": 0.34, + "learning_rate": 2.0096846580843273e-07, + "logits/chosen": -2.3814899921417236, + "logits/rejected": -2.5100526809692383, + "logps/chosen": -195.1119384765625, + "logps/rejected": -177.22975158691406, + "loss": 0.2506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22154103219509125, + "rewards/margins": 2.2221519947052, + "rewards/rejected": -2.443693161010742, + "step": 2924 + }, + { + "epoch": 0.34, + "learning_rate": 2.0093303413251445e-07, + "logits/chosen": -2.037078857421875, + "logits/rejected": -2.2519540786743164, + "logps/chosen": -361.8112487792969, + "logps/rejected": -324.4891357421875, + "loss": 0.3512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2261950969696045, + "rewards/margins": 1.9505302906036377, + "rewards/rejected": -2.176725387573242, + "step": 2925 + }, + { + "epoch": 0.34, + "learning_rate": 2.0089760245659617e-07, + "logits/chosen": -2.271433115005493, + "logits/rejected": -2.143216133117676, + "logps/chosen": -140.87429809570312, + "logps/rejected": -243.0975799560547, + "loss": 0.9611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7680010795593262, + "rewards/margins": 0.9985706210136414, + "rewards/rejected": -2.766571521759033, + "step": 2926 + }, + { + "epoch": 0.34, + "learning_rate": 2.0086217078067792e-07, + "logits/chosen": -2.230926275253296, + "logits/rejected": -2.198648452758789, + "logps/chosen": -176.80764770507812, + "logps/rejected": -225.64303588867188, + "loss": 0.3351, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.251363754272461, + "rewards/margins": 2.4190735816955566, + "rewards/rejected": -3.6704373359680176, + "step": 2927 + }, + { + "epoch": 0.34, + "learning_rate": 2.0082673910475967e-07, + "logits/chosen": -2.1258020401000977, + "logits/rejected": -1.8229312896728516, + "logps/chosen": -202.33834838867188, + "logps/rejected": -347.025146484375, + "loss": 0.5939, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6505422592163086, + "rewards/margins": 2.076451063156128, + "rewards/rejected": -3.7269930839538574, + "step": 2928 + }, + { + "epoch": 0.34, + "learning_rate": 2.007913074288414e-07, + "logits/chosen": -2.3995542526245117, + "logits/rejected": -2.0580923557281494, + "logps/chosen": -135.10195922851562, + "logps/rejected": -277.869873046875, + "loss": 0.3794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5674827098846436, + "rewards/margins": 2.2271230220794678, + "rewards/rejected": -2.7946054935455322, + "step": 2929 + }, + { + "epoch": 0.34, + "learning_rate": 2.007558757529231e-07, + "logits/chosen": -2.7765636444091797, + "logits/rejected": -2.593801975250244, + "logps/chosen": -235.21005249023438, + "logps/rejected": -215.4985809326172, + "loss": 0.2078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1597038209438324, + "rewards/margins": 2.3615593910217285, + "rewards/rejected": -2.521263360977173, + "step": 2930 + }, + { + "epoch": 0.34, + "learning_rate": 2.0072044407700483e-07, + "logits/chosen": -2.7641549110412598, + "logits/rejected": -2.7341067790985107, + "logps/chosen": -358.2259521484375, + "logps/rejected": -305.59783935546875, + "loss": 0.6213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9237785339355469, + "rewards/margins": 1.4694404602050781, + "rewards/rejected": -2.393218994140625, + "step": 2931 + }, + { + "epoch": 0.34, + "learning_rate": 2.0068501240108656e-07, + "logits/chosen": -2.1385064125061035, + "logits/rejected": -2.5001134872436523, + "logps/chosen": -262.65655517578125, + "logps/rejected": -193.37716674804688, + "loss": 0.5703, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5864198207855225, + "rewards/margins": 0.9017243385314941, + "rewards/rejected": -1.4881441593170166, + "step": 2932 + }, + { + "epoch": 0.34, + "learning_rate": 2.0064958072516828e-07, + "logits/chosen": -1.7771201133728027, + "logits/rejected": -1.6606181859970093, + "logps/chosen": -401.1876525878906, + "logps/rejected": -445.3562927246094, + "loss": 0.4001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4578157663345337, + "rewards/margins": 2.6788392066955566, + "rewards/rejected": -3.136654853820801, + "step": 2933 + }, + { + "epoch": 0.34, + "learning_rate": 2.0061414904925003e-07, + "logits/chosen": -1.9131063222885132, + "logits/rejected": -1.6544075012207031, + "logps/chosen": -344.4603271484375, + "logps/rejected": -343.1708984375, + "loss": 0.155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08929747343063354, + "rewards/margins": 2.488907814025879, + "rewards/rejected": -2.3996105194091797, + "step": 2934 + }, + { + "epoch": 0.34, + "learning_rate": 2.0057871737333175e-07, + "logits/chosen": -2.7798643112182617, + "logits/rejected": -2.7114267349243164, + "logps/chosen": -185.43434143066406, + "logps/rejected": -160.83795166015625, + "loss": 0.0914, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5058056116104126, + "rewards/margins": 2.873469352722168, + "rewards/rejected": -3.379274845123291, + "step": 2935 + }, + { + "epoch": 0.34, + "learning_rate": 2.0054328569741347e-07, + "logits/chosen": -1.8365588188171387, + "logits/rejected": -1.977575421333313, + "logps/chosen": -504.61358642578125, + "logps/rejected": -423.5311279296875, + "loss": 0.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5996456742286682, + "rewards/margins": 1.2731050252914429, + "rewards/rejected": -1.8727507591247559, + "step": 2936 + }, + { + "epoch": 0.34, + "learning_rate": 2.005078540214952e-07, + "logits/chosen": -2.2681212425231934, + "logits/rejected": -2.564570426940918, + "logps/chosen": -298.9365234375, + "logps/rejected": -216.32440185546875, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.857894241809845, + "rewards/margins": 0.7612103223800659, + "rewards/rejected": -1.6191045045852661, + "step": 2937 + }, + { + "epoch": 0.34, + "learning_rate": 2.0047242234557694e-07, + "logits/chosen": -2.101719379425049, + "logits/rejected": -2.5418734550476074, + "logps/chosen": -510.6044921875, + "logps/rejected": -247.27154541015625, + "loss": 0.224, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6818119287490845, + "rewards/margins": 2.496598243713379, + "rewards/rejected": -3.178410530090332, + "step": 2938 + }, + { + "epoch": 0.34, + "learning_rate": 2.004369906696587e-07, + "logits/chosen": -2.2151851654052734, + "logits/rejected": -2.372112989425659, + "logps/chosen": -379.4731750488281, + "logps/rejected": -304.09332275390625, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45288822054862976, + "rewards/margins": 2.0562357902526855, + "rewards/rejected": -2.5091238021850586, + "step": 2939 + }, + { + "epoch": 0.34, + "learning_rate": 2.004015589937404e-07, + "logits/chosen": -2.6408915519714355, + "logits/rejected": -2.4832851886749268, + "logps/chosen": -209.3400421142578, + "logps/rejected": -239.52781677246094, + "loss": 0.3537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10983119904994965, + "rewards/margins": 2.682030439376831, + "rewards/rejected": -2.7918617725372314, + "step": 2940 + }, + { + "epoch": 0.34, + "learning_rate": 2.0036612731782213e-07, + "logits/chosen": -2.320194721221924, + "logits/rejected": -2.442559242248535, + "logps/chosen": -301.04241943359375, + "logps/rejected": -242.84500122070312, + "loss": 0.2474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4987078309059143, + "rewards/margins": 1.8824102878570557, + "rewards/rejected": -2.381118059158325, + "step": 2941 + }, + { + "epoch": 0.34, + "learning_rate": 2.0033069564190386e-07, + "logits/chosen": -2.288820743560791, + "logits/rejected": -2.0482044219970703, + "logps/chosen": -232.35494995117188, + "logps/rejected": -275.17254638671875, + "loss": 0.1896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.962920606136322, + "rewards/margins": 3.1037437915802, + "rewards/rejected": -4.066664218902588, + "step": 2942 + }, + { + "epoch": 0.34, + "learning_rate": 2.0029526396598558e-07, + "logits/chosen": -1.9667783975601196, + "logits/rejected": -1.9268335103988647, + "logps/chosen": -437.59912109375, + "logps/rejected": -294.917236328125, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7919634580612183, + "rewards/margins": 1.086709976196289, + "rewards/rejected": -1.8786734342575073, + "step": 2943 + }, + { + "epoch": 0.34, + "learning_rate": 2.002598322900673e-07, + "logits/chosen": -2.3284685611724854, + "logits/rejected": -2.206052780151367, + "logps/chosen": -277.4950866699219, + "logps/rejected": -312.2777099609375, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.789352297782898, + "rewards/margins": 2.4773261547088623, + "rewards/rejected": -3.26667857170105, + "step": 2944 + }, + { + "epoch": 0.34, + "learning_rate": 2.0022440061414905e-07, + "logits/chosen": -2.353238582611084, + "logits/rejected": -2.2482492923736572, + "logps/chosen": -333.7976989746094, + "logps/rejected": -321.90814208984375, + "loss": 0.4478, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0109210014343262, + "rewards/margins": 0.8346867561340332, + "rewards/rejected": -1.8456077575683594, + "step": 2945 + }, + { + "epoch": 0.34, + "learning_rate": 2.0018896893823077e-07, + "logits/chosen": -2.375850200653076, + "logits/rejected": -2.362276554107666, + "logps/chosen": -344.5463562011719, + "logps/rejected": -316.58587646484375, + "loss": 0.3865, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4915316104888916, + "rewards/margins": 1.0389384031295776, + "rewards/rejected": -2.530470132827759, + "step": 2946 + }, + { + "epoch": 0.34, + "learning_rate": 2.001535372623125e-07, + "logits/chosen": -2.4104995727539062, + "logits/rejected": -2.292480945587158, + "logps/chosen": -226.97662353515625, + "logps/rejected": -293.36663818359375, + "loss": 0.8654, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2003618478775024, + "rewards/margins": 0.32490360736846924, + "rewards/rejected": -1.5252654552459717, + "step": 2947 + }, + { + "epoch": 0.34, + "learning_rate": 2.0011810558639422e-07, + "logits/chosen": -2.283973217010498, + "logits/rejected": -2.1675024032592773, + "logps/chosen": -236.4239501953125, + "logps/rejected": -301.725341796875, + "loss": 0.5205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6969221830368042, + "rewards/margins": 1.0932577848434448, + "rewards/rejected": -1.790179967880249, + "step": 2948 + }, + { + "epoch": 0.34, + "learning_rate": 2.0008267391047594e-07, + "logits/chosen": -2.4770517349243164, + "logits/rejected": -2.0775909423828125, + "logps/chosen": -217.0772247314453, + "logps/rejected": -337.2941589355469, + "loss": 0.6725, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.282008171081543, + "rewards/margins": 0.7198567390441895, + "rewards/rejected": -2.0018649101257324, + "step": 2949 + }, + { + "epoch": 0.34, + "learning_rate": 2.000472422345577e-07, + "logits/chosen": -2.556140422821045, + "logits/rejected": -2.5009443759918213, + "logps/chosen": -173.9896697998047, + "logps/rejected": -282.7094421386719, + "loss": 0.3951, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0036544799804688, + "rewards/margins": 1.163169503211975, + "rewards/rejected": -2.1668238639831543, + "step": 2950 + }, + { + "epoch": 0.34, + "learning_rate": 2.0001181055863943e-07, + "logits/chosen": -2.332852363586426, + "logits/rejected": -2.428933620452881, + "logps/chosen": -424.38568115234375, + "logps/rejected": -472.03759765625, + "loss": 0.2362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6782013177871704, + "rewards/margins": 3.6055824756622314, + "rewards/rejected": -4.283783435821533, + "step": 2951 + }, + { + "epoch": 0.34, + "learning_rate": 1.9997637888272116e-07, + "logits/chosen": -2.506739616394043, + "logits/rejected": -2.450920820236206, + "logps/chosen": -264.32586669921875, + "logps/rejected": -227.83897399902344, + "loss": 0.3261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7826262712478638, + "rewards/margins": 2.484111785888672, + "rewards/rejected": -3.266737937927246, + "step": 2952 + }, + { + "epoch": 0.34, + "learning_rate": 1.9994094720680288e-07, + "logits/chosen": -2.051487922668457, + "logits/rejected": -2.0303730964660645, + "logps/chosen": -291.3682861328125, + "logps/rejected": -314.61407470703125, + "loss": 0.4538, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7440359592437744, + "rewards/margins": 2.156759262084961, + "rewards/rejected": -2.9007952213287354, + "step": 2953 + }, + { + "epoch": 0.34, + "learning_rate": 1.999055155308846e-07, + "logits/chosen": -2.425264596939087, + "logits/rejected": -2.3210527896881104, + "logps/chosen": -269.73736572265625, + "logps/rejected": -356.1817626953125, + "loss": 0.3489, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.511172354221344, + "rewards/margins": 1.8551009893417358, + "rewards/rejected": -2.3662734031677246, + "step": 2954 + }, + { + "epoch": 0.34, + "learning_rate": 1.9987008385496632e-07, + "logits/chosen": -2.1973328590393066, + "logits/rejected": -2.434110164642334, + "logps/chosen": -263.4280700683594, + "logps/rejected": -203.09645080566406, + "loss": 0.4527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8146154880523682, + "rewards/margins": 0.8889662027359009, + "rewards/rejected": -1.703581690788269, + "step": 2955 + }, + { + "epoch": 0.34, + "learning_rate": 1.9983465217904805e-07, + "logits/chosen": -2.2124061584472656, + "logits/rejected": -2.2912731170654297, + "logps/chosen": -259.53277587890625, + "logps/rejected": -230.1304473876953, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5238584280014038, + "rewards/margins": 2.341963768005371, + "rewards/rejected": -2.8658220767974854, + "step": 2956 + }, + { + "epoch": 0.34, + "learning_rate": 1.997992205031298e-07, + "logits/chosen": -2.4036221504211426, + "logits/rejected": -2.5151474475860596, + "logps/chosen": -422.882568359375, + "logps/rejected": -258.9435729980469, + "loss": 0.9013, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4087369441986084, + "rewards/margins": 1.2083147764205933, + "rewards/rejected": -2.617051839828491, + "step": 2957 + }, + { + "epoch": 0.34, + "learning_rate": 1.9976378882721152e-07, + "logits/chosen": -2.221648693084717, + "logits/rejected": -2.2349069118499756, + "logps/chosen": -322.3822021484375, + "logps/rejected": -365.86627197265625, + "loss": 0.1644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25471287965774536, + "rewards/margins": 4.34344482421875, + "rewards/rejected": -4.59815788269043, + "step": 2958 + }, + { + "epoch": 0.34, + "learning_rate": 1.9972835715129324e-07, + "logits/chosen": -1.7634639739990234, + "logits/rejected": -2.3269524574279785, + "logps/chosen": -409.5461730957031, + "logps/rejected": -182.36782836914062, + "loss": 1.1843, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8923242092132568, + "rewards/margins": -0.4478663504123688, + "rewards/rejected": -1.4444578886032104, + "step": 2959 + }, + { + "epoch": 0.34, + "learning_rate": 1.9969292547537496e-07, + "logits/chosen": -1.9270144701004028, + "logits/rejected": -1.9363837242126465, + "logps/chosen": -251.80111694335938, + "logps/rejected": -229.285888671875, + "loss": 0.3353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.795459508895874, + "rewards/margins": 2.083198070526123, + "rewards/rejected": -2.878657341003418, + "step": 2960 + }, + { + "epoch": 0.34, + "learning_rate": 1.9965749379945668e-07, + "logits/chosen": -3.1551668643951416, + "logits/rejected": -2.959796190261841, + "logps/chosen": -365.0365905761719, + "logps/rejected": -304.3221435546875, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9896816611289978, + "rewards/margins": 0.9685075283050537, + "rewards/rejected": -1.9581892490386963, + "step": 2961 + }, + { + "epoch": 0.34, + "learning_rate": 1.9962206212353846e-07, + "logits/chosen": -1.998806118965149, + "logits/rejected": -1.9266431331634521, + "logps/chosen": -198.1507110595703, + "logps/rejected": -261.8472900390625, + "loss": 0.5192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4290041923522949, + "rewards/margins": 1.997981071472168, + "rewards/rejected": -2.426985502243042, + "step": 2962 + }, + { + "epoch": 0.34, + "learning_rate": 1.9958663044762018e-07, + "logits/chosen": -1.930488109588623, + "logits/rejected": -2.0079345703125, + "logps/chosen": -333.8968505859375, + "logps/rejected": -328.39569091796875, + "loss": 0.9569, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.3199591636657715, + "rewards/margins": 0.017900601029396057, + "rewards/rejected": -1.3378597497940063, + "step": 2963 + }, + { + "epoch": 0.34, + "learning_rate": 1.995511987717019e-07, + "logits/chosen": -1.8392246961593628, + "logits/rejected": -2.2603702545166016, + "logps/chosen": -282.7964172363281, + "logps/rejected": -164.8245086669922, + "loss": 0.2697, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16705051064491272, + "rewards/margins": 2.112429618835449, + "rewards/rejected": -2.279480457305908, + "step": 2964 + }, + { + "epoch": 0.34, + "learning_rate": 1.9951576709578362e-07, + "logits/chosen": -1.9577205181121826, + "logits/rejected": -1.6914575099945068, + "logps/chosen": -113.56126403808594, + "logps/rejected": -139.3248748779297, + "loss": 0.5022, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21151790022850037, + "rewards/margins": 0.7962382435798645, + "rewards/rejected": -1.007756233215332, + "step": 2965 + }, + { + "epoch": 0.35, + "learning_rate": 1.9948033541986535e-07, + "logits/chosen": -2.312349319458008, + "logits/rejected": -2.349256992340088, + "logps/chosen": -247.89352416992188, + "logps/rejected": -332.27764892578125, + "loss": 0.4856, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.57518470287323, + "rewards/margins": 1.1079691648483276, + "rewards/rejected": -1.6831538677215576, + "step": 2966 + }, + { + "epoch": 0.35, + "learning_rate": 1.9944490374394707e-07, + "logits/chosen": -2.3599276542663574, + "logits/rejected": -2.5052852630615234, + "logps/chosen": -306.83685302734375, + "logps/rejected": -230.5961151123047, + "loss": 0.3585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8077543377876282, + "rewards/margins": 1.8203766345977783, + "rewards/rejected": -2.6281309127807617, + "step": 2967 + }, + { + "epoch": 0.35, + "learning_rate": 1.9940947206802882e-07, + "logits/chosen": -2.6003191471099854, + "logits/rejected": -2.269643783569336, + "logps/chosen": -183.5027313232422, + "logps/rejected": -316.7831726074219, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4048264026641846, + "rewards/margins": 2.2455625534057617, + "rewards/rejected": -3.6503889560699463, + "step": 2968 + }, + { + "epoch": 0.35, + "learning_rate": 1.9937404039211054e-07, + "logits/chosen": -2.4002928733825684, + "logits/rejected": -2.388449192047119, + "logps/chosen": -149.33644104003906, + "logps/rejected": -318.544677734375, + "loss": 0.4518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7640703320503235, + "rewards/margins": 2.808077573776245, + "rewards/rejected": -3.572147846221924, + "step": 2969 + }, + { + "epoch": 0.35, + "learning_rate": 1.9933860871619226e-07, + "logits/chosen": -2.581054210662842, + "logits/rejected": -2.480968475341797, + "logps/chosen": -262.0176696777344, + "logps/rejected": -350.72088623046875, + "loss": 0.4046, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5113088488578796, + "rewards/margins": 1.4705942869186401, + "rewards/rejected": -1.981903314590454, + "step": 2970 + }, + { + "epoch": 0.35, + "learning_rate": 1.9930317704027398e-07, + "logits/chosen": -2.44429349899292, + "logits/rejected": -2.1890335083007812, + "logps/chosen": -192.13336181640625, + "logps/rejected": -292.06060791015625, + "loss": 0.3554, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1395573616027832, + "rewards/margins": 2.0346007347106934, + "rewards/rejected": -3.1741578578948975, + "step": 2971 + }, + { + "epoch": 0.35, + "learning_rate": 1.992677453643557e-07, + "logits/chosen": -2.1442832946777344, + "logits/rejected": -2.402881383895874, + "logps/chosen": -211.551025390625, + "logps/rejected": -234.384033203125, + "loss": 0.5705, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8243160247802734, + "rewards/margins": 3.032723903656006, + "rewards/rejected": -3.8570399284362793, + "step": 2972 + }, + { + "epoch": 0.35, + "learning_rate": 1.9923231368843748e-07, + "logits/chosen": -2.887645721435547, + "logits/rejected": -2.9233834743499756, + "logps/chosen": -242.0677490234375, + "logps/rejected": -214.32501220703125, + "loss": 0.3434, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8676060438156128, + "rewards/margins": 2.1376421451568604, + "rewards/rejected": -3.0052480697631836, + "step": 2973 + }, + { + "epoch": 0.35, + "learning_rate": 1.991968820125192e-07, + "logits/chosen": -2.344966411590576, + "logits/rejected": -2.4604201316833496, + "logps/chosen": -235.11480712890625, + "logps/rejected": -355.4427795410156, + "loss": 0.2285, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1107521057128906, + "rewards/margins": 4.45008659362793, + "rewards/rejected": -5.56083869934082, + "step": 2974 + }, + { + "epoch": 0.35, + "learning_rate": 1.9916145033660092e-07, + "logits/chosen": -2.7818405628204346, + "logits/rejected": -2.8401758670806885, + "logps/chosen": -139.1961212158203, + "logps/rejected": -249.43685913085938, + "loss": 0.3002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6239289045333862, + "rewards/margins": 3.0030181407928467, + "rewards/rejected": -3.6269471645355225, + "step": 2975 + }, + { + "epoch": 0.35, + "learning_rate": 1.9912601866068265e-07, + "logits/chosen": -2.375119686126709, + "logits/rejected": -2.4892711639404297, + "logps/chosen": -237.21255493164062, + "logps/rejected": -246.2691650390625, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.83076012134552, + "rewards/margins": 2.461059331893921, + "rewards/rejected": -3.2918195724487305, + "step": 2976 + }, + { + "epoch": 0.35, + "learning_rate": 1.9909058698476437e-07, + "logits/chosen": -1.9953749179840088, + "logits/rejected": -1.4665166139602661, + "logps/chosen": -271.1525573730469, + "logps/rejected": -423.68304443359375, + "loss": 0.2447, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7756668925285339, + "rewards/margins": 3.527169942855835, + "rewards/rejected": -4.302836894989014, + "step": 2977 + }, + { + "epoch": 0.35, + "learning_rate": 1.990551553088461e-07, + "logits/chosen": -2.419403076171875, + "logits/rejected": -2.393122911453247, + "logps/chosen": -389.3238525390625, + "logps/rejected": -435.58233642578125, + "loss": 0.4571, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9680401682853699, + "rewards/margins": 1.2539844512939453, + "rewards/rejected": -2.22202467918396, + "step": 2978 + }, + { + "epoch": 0.35, + "learning_rate": 1.9901972363292784e-07, + "logits/chosen": -2.7056260108947754, + "logits/rejected": -2.8224802017211914, + "logps/chosen": -189.83680725097656, + "logps/rejected": -271.62164306640625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09028299152851105, + "rewards/margins": 5.211880683898926, + "rewards/rejected": -5.121597766876221, + "step": 2979 + }, + { + "epoch": 0.35, + "learning_rate": 1.9898429195700956e-07, + "logits/chosen": -1.7996091842651367, + "logits/rejected": -1.9412338733673096, + "logps/chosen": -325.79541015625, + "logps/rejected": -261.46051025390625, + "loss": 0.341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17956455051898956, + "rewards/margins": 1.185703992843628, + "rewards/rejected": -1.0061395168304443, + "step": 2980 + }, + { + "epoch": 0.35, + "learning_rate": 1.9894886028109128e-07, + "logits/chosen": -2.3512072563171387, + "logits/rejected": -2.5131142139434814, + "logps/chosen": -207.00772094726562, + "logps/rejected": -189.58924865722656, + "loss": 0.3772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39496469497680664, + "rewards/margins": 1.1836988925933838, + "rewards/rejected": -1.5786635875701904, + "step": 2981 + }, + { + "epoch": 0.35, + "learning_rate": 1.98913428605173e-07, + "logits/chosen": -2.139326333999634, + "logits/rejected": -2.3618686199188232, + "logps/chosen": -270.9585266113281, + "logps/rejected": -274.06103515625, + "loss": 0.4571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9525195360183716, + "rewards/margins": 2.381098747253418, + "rewards/rejected": -3.333617925643921, + "step": 2982 + }, + { + "epoch": 0.35, + "learning_rate": 1.9887799692925473e-07, + "logits/chosen": -2.7527027130126953, + "logits/rejected": -2.67350697517395, + "logps/chosen": -127.47023010253906, + "logps/rejected": -275.6497802734375, + "loss": 0.7784, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3970443606376648, + "rewards/margins": 1.5528340339660645, + "rewards/rejected": -1.949878454208374, + "step": 2983 + }, + { + "epoch": 0.35, + "learning_rate": 1.9884256525333645e-07, + "logits/chosen": -2.7597310543060303, + "logits/rejected": -2.3768808841705322, + "logps/chosen": -372.4945068359375, + "logps/rejected": -386.5562438964844, + "loss": 0.0799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7409670948982239, + "rewards/margins": 2.8956496715545654, + "rewards/rejected": -3.6366167068481445, + "step": 2984 + }, + { + "epoch": 0.35, + "learning_rate": 1.9880713357741822e-07, + "logits/chosen": -2.453596591949463, + "logits/rejected": -2.5824103355407715, + "logps/chosen": -281.9296875, + "logps/rejected": -169.03256225585938, + "loss": 0.7276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3892822265625, + "rewards/margins": 1.9185659885406494, + "rewards/rejected": -2.3078482151031494, + "step": 2985 + }, + { + "epoch": 0.35, + "learning_rate": 1.9877170190149995e-07, + "logits/chosen": -1.853387475013733, + "logits/rejected": -2.404853343963623, + "logps/chosen": -329.145263671875, + "logps/rejected": -164.2345733642578, + "loss": 1.0488, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0530712604522705, + "rewards/margins": 0.32960045337677, + "rewards/rejected": -1.382671594619751, + "step": 2986 + }, + { + "epoch": 0.35, + "learning_rate": 1.9873627022558167e-07, + "logits/chosen": -2.7191615104675293, + "logits/rejected": -2.728963613510132, + "logps/chosen": -184.5883026123047, + "logps/rejected": -182.47152709960938, + "loss": 0.2529, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13931691646575928, + "rewards/margins": 2.1777701377868652, + "rewards/rejected": -2.317087173461914, + "step": 2987 + }, + { + "epoch": 0.35, + "learning_rate": 1.987008385496634e-07, + "logits/chosen": -1.8460450172424316, + "logits/rejected": -2.049100399017334, + "logps/chosen": -280.8143310546875, + "logps/rejected": -231.23255920410156, + "loss": 0.5839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7711240649223328, + "rewards/margins": 1.2123451232910156, + "rewards/rejected": -1.9834691286087036, + "step": 2988 + }, + { + "epoch": 0.35, + "learning_rate": 1.9866540687374511e-07, + "logits/chosen": -2.3131420612335205, + "logits/rejected": -2.54781436920166, + "logps/chosen": -345.4716796875, + "logps/rejected": -226.30291748046875, + "loss": 0.4623, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7816973328590393, + "rewards/margins": 2.7152488231658936, + "rewards/rejected": -3.496945858001709, + "step": 2989 + }, + { + "epoch": 0.35, + "learning_rate": 1.9862997519782686e-07, + "logits/chosen": -2.094048500061035, + "logits/rejected": -2.2418534755706787, + "logps/chosen": -181.994140625, + "logps/rejected": -133.42947387695312, + "loss": 0.5018, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7052596807479858, + "rewards/margins": 1.3199505805969238, + "rewards/rejected": -2.025210380554199, + "step": 2990 + }, + { + "epoch": 0.35, + "learning_rate": 1.9859454352190858e-07, + "logits/chosen": -2.3440046310424805, + "logits/rejected": -2.161285161972046, + "logps/chosen": -358.63232421875, + "logps/rejected": -209.40484619140625, + "loss": 0.4257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6296483874320984, + "rewards/margins": 1.4024100303649902, + "rewards/rejected": -2.0320584774017334, + "step": 2991 + }, + { + "epoch": 0.35, + "learning_rate": 1.985591118459903e-07, + "logits/chosen": -2.4367330074310303, + "logits/rejected": -2.5656981468200684, + "logps/chosen": -302.6728515625, + "logps/rejected": -251.53436279296875, + "loss": 0.5902, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8423860669136047, + "rewards/margins": 1.7068713903427124, + "rewards/rejected": -2.549257516860962, + "step": 2992 + }, + { + "epoch": 0.35, + "learning_rate": 1.9852368017007203e-07, + "logits/chosen": -2.4103927612304688, + "logits/rejected": -2.3722410202026367, + "logps/chosen": -163.59613037109375, + "logps/rejected": -187.11683654785156, + "loss": 0.2976, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.244042992591858, + "rewards/margins": 1.6250025033950806, + "rewards/rejected": -2.8690457344055176, + "step": 2993 + }, + { + "epoch": 0.35, + "learning_rate": 1.9848824849415375e-07, + "logits/chosen": -2.537788152694702, + "logits/rejected": -2.557664632797241, + "logps/chosen": -285.9046630859375, + "logps/rejected": -212.3981170654297, + "loss": 0.2143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3959222733974457, + "rewards/margins": 2.684755802154541, + "rewards/rejected": -3.0806779861450195, + "step": 2994 + }, + { + "epoch": 0.35, + "learning_rate": 1.9845281681823547e-07, + "logits/chosen": -2.4008352756500244, + "logits/rejected": -2.4128642082214355, + "logps/chosen": -225.70220947265625, + "logps/rejected": -173.80136108398438, + "loss": 0.2537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7729071378707886, + "rewards/margins": 1.7236690521240234, + "rewards/rejected": -2.4965763092041016, + "step": 2995 + }, + { + "epoch": 0.35, + "learning_rate": 1.984173851423172e-07, + "logits/chosen": -2.0448684692382812, + "logits/rejected": -2.454012870788574, + "logps/chosen": -336.7002258300781, + "logps/rejected": -271.1465148925781, + "loss": 0.1749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2043069750070572, + "rewards/margins": 2.5484235286712646, + "rewards/rejected": -2.752730369567871, + "step": 2996 + }, + { + "epoch": 0.35, + "learning_rate": 1.9838195346639897e-07, + "logits/chosen": -2.50107741355896, + "logits/rejected": -2.298346519470215, + "logps/chosen": -121.91665649414062, + "logps/rejected": -178.97055053710938, + "loss": 0.5876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6283455491065979, + "rewards/margins": 1.3679494857788086, + "rewards/rejected": -1.9962950944900513, + "step": 2997 + }, + { + "epoch": 0.35, + "learning_rate": 1.983465217904807e-07, + "logits/chosen": -2.1958560943603516, + "logits/rejected": -1.8425116539001465, + "logps/chosen": -205.7991485595703, + "logps/rejected": -294.8202209472656, + "loss": 0.4541, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9130134582519531, + "rewards/margins": 1.8140803575515747, + "rewards/rejected": -2.7270936965942383, + "step": 2998 + }, + { + "epoch": 0.35, + "learning_rate": 1.9831109011456241e-07, + "logits/chosen": -2.493767499923706, + "logits/rejected": -2.607168674468994, + "logps/chosen": -221.44847106933594, + "logps/rejected": -389.8901062011719, + "loss": 0.3622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3364153206348419, + "rewards/margins": 2.517232894897461, + "rewards/rejected": -2.8536481857299805, + "step": 2999 + }, + { + "epoch": 0.35, + "learning_rate": 1.9827565843864414e-07, + "logits/chosen": -2.1696486473083496, + "logits/rejected": -2.1916065216064453, + "logps/chosen": -202.68438720703125, + "logps/rejected": -253.47877502441406, + "loss": 0.5182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9760220050811768, + "rewards/margins": 1.5308241844177246, + "rewards/rejected": -2.5068461894989014, + "step": 3000 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -1.7466588020324707, + "eval_logits/rejected": -1.746543049812317, + "eval_logps/chosen": -277.8332824707031, + "eval_logps/rejected": -275.6523132324219, + "eval_loss": 0.3812580704689026, + "eval_rewards/accuracies": 0.8405172228813171, + "eval_rewards/chosen": -0.5667834281921387, + "eval_rewards/margins": 1.897790551185608, + "eval_rewards/rejected": -2.464573621749878, + "eval_runtime": 237.7682, + "eval_samples_per_second": 2.923, + "eval_steps_per_second": 1.464, + "step": 3000 + }, + { + "epoch": 0.35, + "learning_rate": 1.9824022676272586e-07, + "logits/chosen": -2.4594738483428955, + "logits/rejected": -2.363607406616211, + "logps/chosen": -146.41091918945312, + "logps/rejected": -274.72247314453125, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9387666583061218, + "rewards/margins": 2.9524550437927246, + "rewards/rejected": -3.891221523284912, + "step": 3001 + }, + { + "epoch": 0.35, + "learning_rate": 1.982047950868076e-07, + "logits/chosen": -2.6245057582855225, + "logits/rejected": -2.642518997192383, + "logps/chosen": -190.01617431640625, + "logps/rejected": -242.55023193359375, + "loss": 0.3491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6915631294250488, + "rewards/margins": 1.4898996353149414, + "rewards/rejected": -2.1814630031585693, + "step": 3002 + }, + { + "epoch": 0.35, + "learning_rate": 1.9816936341088933e-07, + "logits/chosen": -2.780846118927002, + "logits/rejected": -2.767791986465454, + "logps/chosen": -173.71292114257812, + "logps/rejected": -163.12527465820312, + "loss": 0.3204, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.555416464805603, + "rewards/margins": 1.2818272113800049, + "rewards/rejected": -1.8372435569763184, + "step": 3003 + }, + { + "epoch": 0.35, + "learning_rate": 1.9813393173497105e-07, + "logits/chosen": -2.277650833129883, + "logits/rejected": -2.189649820327759, + "logps/chosen": -346.3839111328125, + "logps/rejected": -313.9497985839844, + "loss": 0.3721, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8681928515434265, + "rewards/margins": 2.0290160179138184, + "rewards/rejected": -2.8972091674804688, + "step": 3004 + }, + { + "epoch": 0.35, + "learning_rate": 1.9809850005905277e-07, + "logits/chosen": -2.9929909706115723, + "logits/rejected": -3.0167157649993896, + "logps/chosen": -138.9799346923828, + "logps/rejected": -199.08450317382812, + "loss": 0.1956, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1523414850234985, + "rewards/margins": 2.8133692741394043, + "rewards/rejected": -3.9657106399536133, + "step": 3005 + }, + { + "epoch": 0.35, + "learning_rate": 1.980630683831345e-07, + "logits/chosen": -2.255199909210205, + "logits/rejected": -2.5363755226135254, + "logps/chosen": -293.1972961425781, + "logps/rejected": -168.693115234375, + "loss": 0.6522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8815634250640869, + "rewards/margins": 1.6672542095184326, + "rewards/rejected": -2.5488173961639404, + "step": 3006 + }, + { + "epoch": 0.35, + "learning_rate": 1.9802763670721622e-07, + "logits/chosen": -1.6118335723876953, + "logits/rejected": -1.9646233320236206, + "logps/chosen": -553.3547973632812, + "logps/rejected": -306.34893798828125, + "loss": 0.5595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7926628589630127, + "rewards/margins": 2.412933826446533, + "rewards/rejected": -4.205596446990967, + "step": 3007 + }, + { + "epoch": 0.35, + "learning_rate": 1.97992205031298e-07, + "logits/chosen": -1.5852832794189453, + "logits/rejected": -2.020991802215576, + "logps/chosen": -670.7092895507812, + "logps/rejected": -305.908935546875, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4429447054862976, + "rewards/margins": 2.6377742290496826, + "rewards/rejected": -3.080718994140625, + "step": 3008 + }, + { + "epoch": 0.35, + "learning_rate": 1.9795677335537971e-07, + "logits/chosen": -1.9796062707901, + "logits/rejected": -2.377922773361206, + "logps/chosen": -340.1313171386719, + "logps/rejected": -198.8621826171875, + "loss": 0.2767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20771107077598572, + "rewards/margins": 2.2245826721191406, + "rewards/rejected": -2.4322938919067383, + "step": 3009 + }, + { + "epoch": 0.35, + "learning_rate": 1.9792134167946144e-07, + "logits/chosen": -2.693833589553833, + "logits/rejected": -2.7357425689697266, + "logps/chosen": -191.77951049804688, + "logps/rejected": -179.29135131835938, + "loss": 0.8104, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1371018886566162, + "rewards/margins": 0.0901307538151741, + "rewards/rejected": -1.227232575416565, + "step": 3010 + }, + { + "epoch": 0.35, + "learning_rate": 1.9788591000354316e-07, + "logits/chosen": -2.0227575302124023, + "logits/rejected": -1.629905343055725, + "logps/chosen": -232.49853515625, + "logps/rejected": -318.7529296875, + "loss": 0.6374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5750538110733032, + "rewards/margins": 0.24198950827121735, + "rewards/rejected": -0.8170433044433594, + "step": 3011 + }, + { + "epoch": 0.35, + "learning_rate": 1.9785047832762488e-07, + "logits/chosen": -1.9970715045928955, + "logits/rejected": -1.8324120044708252, + "logps/chosen": -277.4881591796875, + "logps/rejected": -350.7144775390625, + "loss": 0.4483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8784058094024658, + "rewards/margins": 1.7513728141784668, + "rewards/rejected": -2.6297788619995117, + "step": 3012 + }, + { + "epoch": 0.35, + "learning_rate": 1.9781504665170663e-07, + "logits/chosen": -2.326165199279785, + "logits/rejected": -2.4515037536621094, + "logps/chosen": -378.49078369140625, + "logps/rejected": -348.53515625, + "loss": 0.4571, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7918878793716431, + "rewards/margins": 1.3227030038833618, + "rewards/rejected": -2.114590883255005, + "step": 3013 + }, + { + "epoch": 0.35, + "learning_rate": 1.9777961497578835e-07, + "logits/chosen": -1.9172874689102173, + "logits/rejected": -2.1663930416107178, + "logps/chosen": -510.93011474609375, + "logps/rejected": -243.97113037109375, + "loss": 0.2106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03988611698150635, + "rewards/margins": 3.142962694168091, + "rewards/rejected": -3.1828489303588867, + "step": 3014 + }, + { + "epoch": 0.35, + "learning_rate": 1.9774418329987007e-07, + "logits/chosen": -2.1951584815979004, + "logits/rejected": -1.9230486154556274, + "logps/chosen": -256.08148193359375, + "logps/rejected": -294.5352783203125, + "loss": 0.3055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9217814803123474, + "rewards/margins": 3.449796676635742, + "rewards/rejected": -4.371578216552734, + "step": 3015 + }, + { + "epoch": 0.35, + "learning_rate": 1.977087516239518e-07, + "logits/chosen": -2.2998878955841064, + "logits/rejected": -2.2646679878234863, + "logps/chosen": -303.7473449707031, + "logps/rejected": -297.41607666015625, + "loss": 0.3862, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0641454458236694, + "rewards/margins": 1.3899402618408203, + "rewards/rejected": -2.4540855884552, + "step": 3016 + }, + { + "epoch": 0.35, + "learning_rate": 1.9767331994803352e-07, + "logits/chosen": -2.2941417694091797, + "logits/rejected": -2.199272632598877, + "logps/chosen": -265.89361572265625, + "logps/rejected": -349.03594970703125, + "loss": 0.377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.644376277923584, + "rewards/margins": 2.3195011615753174, + "rewards/rejected": -2.9638774394989014, + "step": 3017 + }, + { + "epoch": 0.35, + "learning_rate": 1.9763788827211524e-07, + "logits/chosen": -2.039112091064453, + "logits/rejected": -1.9755923748016357, + "logps/chosen": -292.5597839355469, + "logps/rejected": -273.22222900390625, + "loss": 0.1079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.065636157989502, + "rewards/margins": 3.8276336193084717, + "rewards/rejected": -4.8932695388793945, + "step": 3018 + }, + { + "epoch": 0.35, + "learning_rate": 1.97602456596197e-07, + "logits/chosen": -2.5239152908325195, + "logits/rejected": -2.5838866233825684, + "logps/chosen": -139.35659790039062, + "logps/rejected": -242.7225341796875, + "loss": 0.5887, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1285662651062012, + "rewards/margins": 2.7950539588928223, + "rewards/rejected": -3.9236199855804443, + "step": 3019 + }, + { + "epoch": 0.35, + "learning_rate": 1.9756702492027874e-07, + "logits/chosen": -2.066732883453369, + "logits/rejected": -2.1423020362854004, + "logps/chosen": -169.10150146484375, + "logps/rejected": -290.64788818359375, + "loss": 0.737, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1237233877182007, + "rewards/margins": 1.1853047609329224, + "rewards/rejected": -2.309027910232544, + "step": 3020 + }, + { + "epoch": 0.35, + "learning_rate": 1.9753159324436046e-07, + "logits/chosen": -2.687124729156494, + "logits/rejected": -2.672086477279663, + "logps/chosen": -116.39154052734375, + "logps/rejected": -194.3573760986328, + "loss": 0.4016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3717831373214722, + "rewards/margins": 1.6063649654388428, + "rewards/rejected": -2.9781484603881836, + "step": 3021 + }, + { + "epoch": 0.35, + "learning_rate": 1.9749616156844218e-07, + "logits/chosen": -2.2653307914733887, + "logits/rejected": -1.9556455612182617, + "logps/chosen": -264.12860107421875, + "logps/rejected": -309.27215576171875, + "loss": 0.5846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030281245708465576, + "rewards/margins": 1.4140753746032715, + "rewards/rejected": -1.4443565607070923, + "step": 3022 + }, + { + "epoch": 0.35, + "learning_rate": 1.974607298925239e-07, + "logits/chosen": -2.6611971855163574, + "logits/rejected": -2.529115915298462, + "logps/chosen": -301.7157897949219, + "logps/rejected": -355.5975341796875, + "loss": 0.2084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6033135056495667, + "rewards/margins": 2.247802257537842, + "rewards/rejected": -2.8511159420013428, + "step": 3023 + }, + { + "epoch": 0.35, + "learning_rate": 1.9742529821660565e-07, + "logits/chosen": -2.738468647003174, + "logits/rejected": -2.705026865005493, + "logps/chosen": -280.864990234375, + "logps/rejected": -298.3419494628906, + "loss": 0.2549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5234990119934082, + "rewards/margins": 2.169316291809082, + "rewards/rejected": -2.6928153038024902, + "step": 3024 + }, + { + "epoch": 0.35, + "learning_rate": 1.9738986654068737e-07, + "logits/chosen": -2.534022092819214, + "logits/rejected": -2.8444674015045166, + "logps/chosen": -244.8070831298828, + "logps/rejected": -249.51451110839844, + "loss": 0.398, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.063251256942749, + "rewards/margins": 1.882554292678833, + "rewards/rejected": -2.945805549621582, + "step": 3025 + }, + { + "epoch": 0.35, + "learning_rate": 1.973544348647691e-07, + "logits/chosen": -2.283919095993042, + "logits/rejected": -2.44333553314209, + "logps/chosen": -178.06307983398438, + "logps/rejected": -216.8458251953125, + "loss": 0.2114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7997996807098389, + "rewards/margins": 3.042936325073242, + "rewards/rejected": -3.842735767364502, + "step": 3026 + }, + { + "epoch": 0.35, + "learning_rate": 1.9731900318885082e-07, + "logits/chosen": -2.255034923553467, + "logits/rejected": -2.3941080570220947, + "logps/chosen": -414.7364196777344, + "logps/rejected": -465.2244873046875, + "loss": 0.2052, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.063772052526474, + "rewards/margins": 2.7206168174743652, + "rewards/rejected": -2.784389019012451, + "step": 3027 + }, + { + "epoch": 0.35, + "learning_rate": 1.9728357151293254e-07, + "logits/chosen": -2.4632627964019775, + "logits/rejected": -2.5817079544067383, + "logps/chosen": -128.80618286132812, + "logps/rejected": -214.5970458984375, + "loss": 0.2904, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7817670106887817, + "rewards/margins": 2.3830456733703613, + "rewards/rejected": -3.1648128032684326, + "step": 3028 + }, + { + "epoch": 0.35, + "learning_rate": 1.9724813983701426e-07, + "logits/chosen": -2.1246497631073, + "logits/rejected": -2.068174362182617, + "logps/chosen": -421.6078186035156, + "logps/rejected": -337.85955810546875, + "loss": 0.9095, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6912754774093628, + "rewards/margins": 0.7982547879219055, + "rewards/rejected": -2.489530324935913, + "step": 3029 + }, + { + "epoch": 0.35, + "learning_rate": 1.9721270816109599e-07, + "logits/chosen": -2.2448923587799072, + "logits/rejected": -2.1223881244659424, + "logps/chosen": -301.55438232421875, + "logps/rejected": -315.1142578125, + "loss": 0.7824, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.768424153327942, + "rewards/margins": 1.113840103149414, + "rewards/rejected": -2.8822643756866455, + "step": 3030 + }, + { + "epoch": 0.35, + "learning_rate": 1.9717727648517773e-07, + "logits/chosen": -2.2851743698120117, + "logits/rejected": -2.413346767425537, + "logps/chosen": -368.11212158203125, + "logps/rejected": -255.84933471679688, + "loss": 0.3918, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0641793012619019, + "rewards/margins": 1.5954036712646484, + "rewards/rejected": -2.6595828533172607, + "step": 3031 + }, + { + "epoch": 0.35, + "learning_rate": 1.9714184480925948e-07, + "logits/chosen": -1.6360836029052734, + "logits/rejected": -1.790245771408081, + "logps/chosen": -335.57537841796875, + "logps/rejected": -321.9815368652344, + "loss": 0.6121, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6329741477966309, + "rewards/margins": 0.8294532299041748, + "rewards/rejected": -1.4624273777008057, + "step": 3032 + }, + { + "epoch": 0.35, + "learning_rate": 1.971064131333412e-07, + "logits/chosen": -2.2159547805786133, + "logits/rejected": -2.468369483947754, + "logps/chosen": -426.2157897949219, + "logps/rejected": -252.64630126953125, + "loss": 0.5882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47872400283813477, + "rewards/margins": 1.574688196182251, + "rewards/rejected": -2.0534119606018066, + "step": 3033 + }, + { + "epoch": 0.35, + "learning_rate": 1.9707098145742293e-07, + "logits/chosen": -2.431584596633911, + "logits/rejected": -2.1954903602600098, + "logps/chosen": -227.67306518554688, + "logps/rejected": -393.4449768066406, + "loss": 0.6734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8990190029144287, + "rewards/margins": 0.8050597906112671, + "rewards/rejected": -1.7040787935256958, + "step": 3034 + }, + { + "epoch": 0.35, + "learning_rate": 1.9703554978150465e-07, + "logits/chosen": -2.5853936672210693, + "logits/rejected": -2.7979090213775635, + "logps/chosen": -228.4013671875, + "logps/rejected": -157.54718017578125, + "loss": 0.3534, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.823639988899231, + "rewards/margins": 1.9678676128387451, + "rewards/rejected": -2.7915077209472656, + "step": 3035 + }, + { + "epoch": 0.35, + "learning_rate": 1.970001181055864e-07, + "logits/chosen": -2.774946689605713, + "logits/rejected": -2.5650579929351807, + "logps/chosen": -182.18905639648438, + "logps/rejected": -185.5701446533203, + "loss": 0.4161, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1382135152816772, + "rewards/margins": 1.5018057823181152, + "rewards/rejected": -2.640019416809082, + "step": 3036 + }, + { + "epoch": 0.35, + "learning_rate": 1.9696468642966812e-07, + "logits/chosen": -2.361912965774536, + "logits/rejected": -2.1005945205688477, + "logps/chosen": -125.73505401611328, + "logps/rejected": -326.3161926269531, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3146044909954071, + "rewards/margins": 3.5657927989959717, + "rewards/rejected": -3.880397319793701, + "step": 3037 + }, + { + "epoch": 0.35, + "learning_rate": 1.9692925475374984e-07, + "logits/chosen": -2.1438868045806885, + "logits/rejected": -2.056556463241577, + "logps/chosen": -339.1431884765625, + "logps/rejected": -337.5471496582031, + "loss": 0.811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1610056161880493, + "rewards/margins": 0.3940655589103699, + "rewards/rejected": -1.555071234703064, + "step": 3038 + }, + { + "epoch": 0.35, + "learning_rate": 1.9689382307783156e-07, + "logits/chosen": -2.820479393005371, + "logits/rejected": -2.6825873851776123, + "logps/chosen": -171.345458984375, + "logps/rejected": -213.25445556640625, + "loss": 0.3305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8060693144798279, + "rewards/margins": 2.465203046798706, + "rewards/rejected": -3.2712721824645996, + "step": 3039 + }, + { + "epoch": 0.35, + "learning_rate": 1.9685839140191329e-07, + "logits/chosen": -2.2133054733276367, + "logits/rejected": -1.7539443969726562, + "logps/chosen": -261.4666748046875, + "logps/rejected": -445.5120849609375, + "loss": 0.2601, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6042224764823914, + "rewards/margins": 3.9756383895874023, + "rewards/rejected": -4.579861164093018, + "step": 3040 + }, + { + "epoch": 0.35, + "learning_rate": 1.96822959725995e-07, + "logits/chosen": -1.61419677734375, + "logits/rejected": -1.696324110031128, + "logps/chosen": -388.1860046386719, + "logps/rejected": -428.0269775390625, + "loss": 0.5707, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4056317806243896, + "rewards/margins": 1.5454752445220947, + "rewards/rejected": -2.9511070251464844, + "step": 3041 + }, + { + "epoch": 0.35, + "learning_rate": 1.9678752805007676e-07, + "logits/chosen": -2.667227268218994, + "logits/rejected": -2.4604671001434326, + "logps/chosen": -243.31875610351562, + "logps/rejected": -245.55548095703125, + "loss": 0.2512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9633252620697021, + "rewards/margins": 2.358229637145996, + "rewards/rejected": -3.3215551376342773, + "step": 3042 + }, + { + "epoch": 0.35, + "learning_rate": 1.967520963741585e-07, + "logits/chosen": -2.3172240257263184, + "logits/rejected": -2.187418222427368, + "logps/chosen": -275.0868835449219, + "logps/rejected": -253.36917114257812, + "loss": 0.3858, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0002658367156982, + "rewards/margins": 2.307159900665283, + "rewards/rejected": -3.3074259757995605, + "step": 3043 + }, + { + "epoch": 0.35, + "learning_rate": 1.9671666469824023e-07, + "logits/chosen": -2.287874221801758, + "logits/rejected": -2.1239354610443115, + "logps/chosen": -236.24758911132812, + "logps/rejected": -270.7436218261719, + "loss": 0.354, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0777268409729004, + "rewards/margins": 1.7625505924224854, + "rewards/rejected": -2.8402771949768066, + "step": 3044 + }, + { + "epoch": 0.35, + "learning_rate": 1.9668123302232195e-07, + "logits/chosen": -2.6026806831359863, + "logits/rejected": -2.573774814605713, + "logps/chosen": -258.40960693359375, + "logps/rejected": -279.1078796386719, + "loss": 0.6847, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7767550945281982, + "rewards/margins": 0.6069813966751099, + "rewards/rejected": -2.3837363719940186, + "step": 3045 + }, + { + "epoch": 0.35, + "learning_rate": 1.9664580134640367e-07, + "logits/chosen": -3.045938491821289, + "logits/rejected": -2.9854049682617188, + "logps/chosen": -265.9449768066406, + "logps/rejected": -171.61712646484375, + "loss": 0.3814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7812365293502808, + "rewards/margins": 1.5574370622634888, + "rewards/rejected": -2.3386735916137695, + "step": 3046 + }, + { + "epoch": 0.35, + "learning_rate": 1.9661036967048542e-07, + "logits/chosen": -1.602947473526001, + "logits/rejected": -1.9173753261566162, + "logps/chosen": -445.0979309082031, + "logps/rejected": -367.5957946777344, + "loss": 0.6145, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.136256605386734, + "rewards/margins": 1.3297594785690308, + "rewards/rejected": -1.4660160541534424, + "step": 3047 + }, + { + "epoch": 0.35, + "learning_rate": 1.9657493799456714e-07, + "logits/chosen": -2.202681303024292, + "logits/rejected": -2.1134557723999023, + "logps/chosen": -356.2322998046875, + "logps/rejected": -447.5849914550781, + "loss": 0.3043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7635381817817688, + "rewards/margins": 2.8975396156311035, + "rewards/rejected": -3.6610779762268066, + "step": 3048 + }, + { + "epoch": 0.35, + "learning_rate": 1.9653950631864886e-07, + "logits/chosen": -2.311034679412842, + "logits/rejected": -2.165663003921509, + "logps/chosen": -312.968505859375, + "logps/rejected": -493.8941650390625, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.760169267654419, + "rewards/margins": 2.8355534076690674, + "rewards/rejected": -3.5957226753234863, + "step": 3049 + }, + { + "epoch": 0.35, + "learning_rate": 1.9650407464273059e-07, + "logits/chosen": -2.7979226112365723, + "logits/rejected": -2.599684476852417, + "logps/chosen": -107.00436401367188, + "logps/rejected": -132.50714111328125, + "loss": 0.3813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8473031520843506, + "rewards/margins": 1.4625849723815918, + "rewards/rejected": -2.3098881244659424, + "step": 3050 + }, + { + "epoch": 0.35, + "learning_rate": 1.964686429668123e-07, + "logits/chosen": -2.096634864807129, + "logits/rejected": -2.1523337364196777, + "logps/chosen": -328.902587890625, + "logps/rejected": -343.0173034667969, + "loss": 0.4095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6052758693695068, + "rewards/margins": 1.3955286741256714, + "rewards/rejected": -2.0008046627044678, + "step": 3051 + }, + { + "epoch": 0.36, + "learning_rate": 1.9643321129089403e-07, + "logits/chosen": -2.555698871612549, + "logits/rejected": -2.5644805431365967, + "logps/chosen": -345.969970703125, + "logps/rejected": -239.98837280273438, + "loss": 0.1566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9914661645889282, + "rewards/margins": 2.1565024852752686, + "rewards/rejected": -3.1479687690734863, + "step": 3052 + }, + { + "epoch": 0.36, + "learning_rate": 1.9639777961497578e-07, + "logits/chosen": -2.8092093467712402, + "logits/rejected": -2.7629144191741943, + "logps/chosen": -196.1188507080078, + "logps/rejected": -183.78057861328125, + "loss": 0.3435, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4851170778274536, + "rewards/margins": 1.7952828407287598, + "rewards/rejected": -2.280399799346924, + "step": 3053 + }, + { + "epoch": 0.36, + "learning_rate": 1.963623479390575e-07, + "logits/chosen": -2.3625946044921875, + "logits/rejected": -2.498396158218384, + "logps/chosen": -100.30036926269531, + "logps/rejected": -171.9199981689453, + "loss": 0.5455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5099483728408813, + "rewards/margins": 1.9526798725128174, + "rewards/rejected": -2.462628126144409, + "step": 3054 + }, + { + "epoch": 0.36, + "learning_rate": 1.9632691626313925e-07, + "logits/chosen": -2.6012723445892334, + "logits/rejected": -2.2539377212524414, + "logps/chosen": -234.70997619628906, + "logps/rejected": -305.70782470703125, + "loss": 0.3875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5423040986061096, + "rewards/margins": 1.9892863035202026, + "rewards/rejected": -2.531590223312378, + "step": 3055 + }, + { + "epoch": 0.36, + "learning_rate": 1.9629148458722097e-07, + "logits/chosen": -2.3057796955108643, + "logits/rejected": -2.6744885444641113, + "logps/chosen": -265.74713134765625, + "logps/rejected": -150.86221313476562, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6600843667984009, + "rewards/margins": 1.7885726690292358, + "rewards/rejected": -2.448657274246216, + "step": 3056 + }, + { + "epoch": 0.36, + "learning_rate": 1.962560529113027e-07, + "logits/chosen": -1.8908133506774902, + "logits/rejected": -1.6980204582214355, + "logps/chosen": -484.2646179199219, + "logps/rejected": -501.28961181640625, + "loss": 0.429, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.662290096282959, + "rewards/margins": 1.6878539323806763, + "rewards/rejected": -2.3501439094543457, + "step": 3057 + }, + { + "epoch": 0.36, + "learning_rate": 1.9622062123538444e-07, + "logits/chosen": -2.3585987091064453, + "logits/rejected": -2.1797397136688232, + "logps/chosen": -189.91006469726562, + "logps/rejected": -297.7913818359375, + "loss": 0.5103, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.766045868396759, + "rewards/margins": 1.836298942565918, + "rewards/rejected": -2.6023447513580322, + "step": 3058 + }, + { + "epoch": 0.36, + "learning_rate": 1.9618518955946617e-07, + "logits/chosen": -2.548710823059082, + "logits/rejected": -2.476182699203491, + "logps/chosen": -207.40208435058594, + "logps/rejected": -222.58419799804688, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.594593346118927, + "rewards/margins": 1.672452449798584, + "rewards/rejected": -2.2670459747314453, + "step": 3059 + }, + { + "epoch": 0.36, + "learning_rate": 1.961497578835479e-07, + "logits/chosen": -2.3417482376098633, + "logits/rejected": -2.456052541732788, + "logps/chosen": -261.672607421875, + "logps/rejected": -289.4530029296875, + "loss": 0.2638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4028152525424957, + "rewards/margins": 1.9479931592941284, + "rewards/rejected": -2.3508083820343018, + "step": 3060 + }, + { + "epoch": 0.36, + "learning_rate": 1.961143262076296e-07, + "logits/chosen": -2.9097137451171875, + "logits/rejected": -2.752378225326538, + "logps/chosen": -241.68771362304688, + "logps/rejected": -350.2965087890625, + "loss": 0.5437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7554417848587036, + "rewards/margins": 1.824181318283081, + "rewards/rejected": -2.579623222351074, + "step": 3061 + }, + { + "epoch": 0.36, + "learning_rate": 1.9607889453171133e-07, + "logits/chosen": -1.8009636402130127, + "logits/rejected": -1.760741949081421, + "logps/chosen": -198.28707885742188, + "logps/rejected": -276.1143493652344, + "loss": 0.3623, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0645872354507446, + "rewards/margins": 1.5824391841888428, + "rewards/rejected": -2.647026300430298, + "step": 3062 + }, + { + "epoch": 0.36, + "learning_rate": 1.9604346285579305e-07, + "logits/chosen": -2.268942356109619, + "logits/rejected": -2.323070764541626, + "logps/chosen": -362.32147216796875, + "logps/rejected": -338.8183288574219, + "loss": 0.4187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4711971879005432, + "rewards/margins": 1.6433734893798828, + "rewards/rejected": -2.1145706176757812, + "step": 3063 + }, + { + "epoch": 0.36, + "learning_rate": 1.9600803117987478e-07, + "logits/chosen": -2.310106039047241, + "logits/rejected": -2.692974805831909, + "logps/chosen": -214.76718139648438, + "logps/rejected": -164.80487060546875, + "loss": 0.4218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7426090240478516, + "rewards/margins": 0.9774537086486816, + "rewards/rejected": -1.7200628519058228, + "step": 3064 + }, + { + "epoch": 0.36, + "learning_rate": 1.9597259950395652e-07, + "logits/chosen": -2.5107953548431396, + "logits/rejected": -2.5724096298217773, + "logps/chosen": -212.32220458984375, + "logps/rejected": -177.16461181640625, + "loss": 0.478, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.750869631767273, + "rewards/margins": 1.648637294769287, + "rewards/rejected": -2.3995070457458496, + "step": 3065 + }, + { + "epoch": 0.36, + "learning_rate": 1.9593716782803825e-07, + "logits/chosen": -1.6984198093414307, + "logits/rejected": -1.6876379251480103, + "logps/chosen": -310.6319885253906, + "logps/rejected": -316.47149658203125, + "loss": 0.986, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.828258514404297, + "rewards/margins": 0.3625556230545044, + "rewards/rejected": -3.1908140182495117, + "step": 3066 + }, + { + "epoch": 0.36, + "learning_rate": 1.9590173615212e-07, + "logits/chosen": -2.5617804527282715, + "logits/rejected": -2.786628246307373, + "logps/chosen": -186.99172973632812, + "logps/rejected": -202.25515747070312, + "loss": 0.2187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3378148376941681, + "rewards/margins": 2.022547721862793, + "rewards/rejected": -2.3603627681732178, + "step": 3067 + }, + { + "epoch": 0.36, + "learning_rate": 1.9586630447620172e-07, + "logits/chosen": -2.2520787715911865, + "logits/rejected": -2.1152021884918213, + "logps/chosen": -303.16363525390625, + "logps/rejected": -241.32870483398438, + "loss": 0.3544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7017867565155029, + "rewards/margins": 2.899522304534912, + "rewards/rejected": -3.601309061050415, + "step": 3068 + }, + { + "epoch": 0.36, + "learning_rate": 1.9583087280028347e-07, + "logits/chosen": -1.8861809968948364, + "logits/rejected": -1.9619195461273193, + "logps/chosen": -229.7706756591797, + "logps/rejected": -218.89755249023438, + "loss": 0.3756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5653438568115234, + "rewards/margins": 2.2450454235076904, + "rewards/rejected": -2.810389518737793, + "step": 3069 + }, + { + "epoch": 0.36, + "learning_rate": 1.957954411243652e-07, + "logits/chosen": -2.3462936878204346, + "logits/rejected": -2.492610454559326, + "logps/chosen": -176.53970336914062, + "logps/rejected": -275.4410400390625, + "loss": 0.4486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.462695837020874, + "rewards/margins": 2.3066556453704834, + "rewards/rejected": -2.7693514823913574, + "step": 3070 + }, + { + "epoch": 0.36, + "learning_rate": 1.957600094484469e-07, + "logits/chosen": -2.7490100860595703, + "logits/rejected": -2.8418877124786377, + "logps/chosen": -137.25155639648438, + "logps/rejected": -158.52777099609375, + "loss": 0.404, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26092809438705444, + "rewards/margins": 2.4463939666748047, + "rewards/rejected": -2.707322120666504, + "step": 3071 + }, + { + "epoch": 0.36, + "learning_rate": 1.9572457777252863e-07, + "logits/chosen": -2.5266342163085938, + "logits/rejected": -2.472921371459961, + "logps/chosen": -422.720458984375, + "logps/rejected": -464.9222412109375, + "loss": 0.2407, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7467414736747742, + "rewards/margins": 1.8848586082458496, + "rewards/rejected": -2.6315999031066895, + "step": 3072 + }, + { + "epoch": 0.36, + "learning_rate": 1.9568914609661035e-07, + "logits/chosen": -2.4075984954833984, + "logits/rejected": -2.510087728500366, + "logps/chosen": -281.11444091796875, + "logps/rejected": -300.56378173828125, + "loss": 0.3065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1748780608177185, + "rewards/margins": 1.968912124633789, + "rewards/rejected": -2.1437902450561523, + "step": 3073 + }, + { + "epoch": 0.36, + "learning_rate": 1.9565371442069208e-07, + "logits/chosen": -2.2236740589141846, + "logits/rejected": -2.3185322284698486, + "logps/chosen": -365.0279235839844, + "logps/rejected": -310.16162109375, + "loss": 0.394, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0041489601135254, + "rewards/margins": 1.437568187713623, + "rewards/rejected": -2.4417169094085693, + "step": 3074 + }, + { + "epoch": 0.36, + "learning_rate": 1.956182827447738e-07, + "logits/chosen": -2.2502241134643555, + "logits/rejected": -2.277646541595459, + "logps/chosen": -318.6170959472656, + "logps/rejected": -357.46002197265625, + "loss": 0.4725, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25281912088394165, + "rewards/margins": 1.5596362352371216, + "rewards/rejected": -1.812455415725708, + "step": 3075 + }, + { + "epoch": 0.36, + "learning_rate": 1.9558285106885555e-07, + "logits/chosen": -2.744269609451294, + "logits/rejected": -2.8026366233825684, + "logps/chosen": -251.04922485351562, + "logps/rejected": -244.90658569335938, + "loss": 1.0798, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3697755336761475, + "rewards/margins": 0.5562127828598022, + "rewards/rejected": -1.9259884357452393, + "step": 3076 + }, + { + "epoch": 0.36, + "learning_rate": 1.9554741939293727e-07, + "logits/chosen": -2.1796066761016846, + "logits/rejected": -2.51393985748291, + "logps/chosen": -493.0710144042969, + "logps/rejected": -361.9283447265625, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7756271362304688, + "rewards/margins": 1.6182520389556885, + "rewards/rejected": -2.393878936767578, + "step": 3077 + }, + { + "epoch": 0.36, + "learning_rate": 1.9551198771701902e-07, + "logits/chosen": -2.298384666442871, + "logits/rejected": -2.4795377254486084, + "logps/chosen": -156.8967742919922, + "logps/rejected": -161.52896118164062, + "loss": 0.4335, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0661638975143433, + "rewards/margins": 1.6871165037155151, + "rewards/rejected": -2.7532804012298584, + "step": 3078 + }, + { + "epoch": 0.36, + "learning_rate": 1.9547655604110074e-07, + "logits/chosen": -2.139148473739624, + "logits/rejected": -2.3867995738983154, + "logps/chosen": -225.09780883789062, + "logps/rejected": -276.9072570800781, + "loss": 0.3089, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31729376316070557, + "rewards/margins": 2.4318580627441406, + "rewards/rejected": -2.7491519451141357, + "step": 3079 + }, + { + "epoch": 0.36, + "learning_rate": 1.9544112436518246e-07, + "logits/chosen": -2.761261463165283, + "logits/rejected": -2.780332326889038, + "logps/chosen": -171.3284912109375, + "logps/rejected": -206.00689697265625, + "loss": 0.3987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.61780846118927, + "rewards/margins": 1.9389897584915161, + "rewards/rejected": -2.556798219680786, + "step": 3080 + }, + { + "epoch": 0.36, + "learning_rate": 1.954056926892642e-07, + "logits/chosen": -1.691419243812561, + "logits/rejected": -2.125296115875244, + "logps/chosen": -365.7619323730469, + "logps/rejected": -267.72296142578125, + "loss": 0.4409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7751481533050537, + "rewards/margins": 1.0066146850585938, + "rewards/rejected": -1.781762957572937, + "step": 3081 + }, + { + "epoch": 0.36, + "learning_rate": 1.9537026101334593e-07, + "logits/chosen": -1.7622246742248535, + "logits/rejected": -2.2715725898742676, + "logps/chosen": -583.3938598632812, + "logps/rejected": -357.3657531738281, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8566279411315918, + "rewards/margins": 0.6388593912124634, + "rewards/rejected": -2.4954874515533447, + "step": 3082 + }, + { + "epoch": 0.36, + "learning_rate": 1.9533482933742765e-07, + "logits/chosen": -2.4055278301239014, + "logits/rejected": -2.5267140865325928, + "logps/chosen": -201.18251037597656, + "logps/rejected": -233.51934814453125, + "loss": 0.1476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5946815013885498, + "rewards/margins": 2.7114150524139404, + "rewards/rejected": -3.3060965538024902, + "step": 3083 + }, + { + "epoch": 0.36, + "learning_rate": 1.9529939766150938e-07, + "logits/chosen": -2.5423502922058105, + "logits/rejected": -2.625427722930908, + "logps/chosen": -386.2813415527344, + "logps/rejected": -258.6645812988281, + "loss": 0.8179, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.235407829284668, + "rewards/margins": 2.2106876373291016, + "rewards/rejected": -4.4460954666137695, + "step": 3084 + }, + { + "epoch": 0.36, + "learning_rate": 1.952639659855911e-07, + "logits/chosen": -1.863872766494751, + "logits/rejected": -1.8901612758636475, + "logps/chosen": -275.6691589355469, + "logps/rejected": -235.2613067626953, + "loss": 0.2872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33936962485313416, + "rewards/margins": 1.726625919342041, + "rewards/rejected": -2.065995454788208, + "step": 3085 + }, + { + "epoch": 0.36, + "learning_rate": 1.9522853430967282e-07, + "logits/chosen": -2.193173408508301, + "logits/rejected": -2.205641746520996, + "logps/chosen": -222.06204223632812, + "logps/rejected": -275.3194274902344, + "loss": 0.2699, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13196103274822235, + "rewards/margins": 2.625652551651001, + "rewards/rejected": -2.7576136589050293, + "step": 3086 + }, + { + "epoch": 0.36, + "learning_rate": 1.9519310263375457e-07, + "logits/chosen": -2.003970146179199, + "logits/rejected": -2.077258825302124, + "logps/chosen": -316.2494201660156, + "logps/rejected": -367.93499755859375, + "loss": 0.2564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4094979166984558, + "rewards/margins": 2.5925018787384033, + "rewards/rejected": -3.002000093460083, + "step": 3087 + }, + { + "epoch": 0.36, + "learning_rate": 1.951576709578363e-07, + "logits/chosen": -2.466526508331299, + "logits/rejected": -2.3165440559387207, + "logps/chosen": -148.6517333984375, + "logps/rejected": -225.70298767089844, + "loss": 0.2519, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8260928392410278, + "rewards/margins": 2.692384719848633, + "rewards/rejected": -3.51847767829895, + "step": 3088 + }, + { + "epoch": 0.36, + "learning_rate": 1.9512223928191801e-07, + "logits/chosen": -1.9633926153182983, + "logits/rejected": -1.7110555171966553, + "logps/chosen": -400.64117431640625, + "logps/rejected": -626.545654296875, + "loss": 0.2578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9234839677810669, + "rewards/margins": 3.7648444175720215, + "rewards/rejected": -4.688328266143799, + "step": 3089 + }, + { + "epoch": 0.36, + "learning_rate": 1.9508680760599976e-07, + "logits/chosen": -2.1981005668640137, + "logits/rejected": -2.250837802886963, + "logps/chosen": -226.04299926757812, + "logps/rejected": -357.83270263671875, + "loss": 0.5753, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8571372032165527, + "rewards/margins": 1.8876385688781738, + "rewards/rejected": -2.7447757720947266, + "step": 3090 + }, + { + "epoch": 0.36, + "learning_rate": 1.9505137593008148e-07, + "logits/chosen": -2.577970504760742, + "logits/rejected": -2.7372524738311768, + "logps/chosen": -286.6099853515625, + "logps/rejected": -237.63589477539062, + "loss": 0.7865, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2667179107666016, + "rewards/margins": 0.23975539207458496, + "rewards/rejected": -2.5064735412597656, + "step": 3091 + }, + { + "epoch": 0.36, + "learning_rate": 1.9501594425416323e-07, + "logits/chosen": -2.3509678840637207, + "logits/rejected": -2.6302804946899414, + "logps/chosen": -319.7651062011719, + "logps/rejected": -203.45895385742188, + "loss": 0.6763, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3953826427459717, + "rewards/margins": 1.3449151515960693, + "rewards/rejected": -2.740297794342041, + "step": 3092 + }, + { + "epoch": 0.36, + "learning_rate": 1.9498051257824496e-07, + "logits/chosen": -1.8278895616531372, + "logits/rejected": -1.8975833654403687, + "logps/chosen": -500.2528381347656, + "logps/rejected": -508.5038146972656, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30173200368881226, + "rewards/margins": 3.3749606609344482, + "rewards/rejected": -3.676692485809326, + "step": 3093 + }, + { + "epoch": 0.36, + "learning_rate": 1.9494508090232668e-07, + "logits/chosen": -1.5039117336273193, + "logits/rejected": -1.9227688312530518, + "logps/chosen": -463.9488220214844, + "logps/rejected": -399.75213623046875, + "loss": 0.0679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04540317505598068, + "rewards/margins": 3.228289842605591, + "rewards/rejected": -3.273693084716797, + "step": 3094 + }, + { + "epoch": 0.36, + "learning_rate": 1.949096492264084e-07, + "logits/chosen": -2.2632195949554443, + "logits/rejected": -2.4194135665893555, + "logps/chosen": -394.73480224609375, + "logps/rejected": -223.79910278320312, + "loss": 0.5495, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1289913654327393, + "rewards/margins": 0.8319392204284668, + "rewards/rejected": -1.960930585861206, + "step": 3095 + }, + { + "epoch": 0.36, + "learning_rate": 1.9487421755049012e-07, + "logits/chosen": -2.183795690536499, + "logits/rejected": -2.347364664077759, + "logps/chosen": -403.4247131347656, + "logps/rejected": -249.07310485839844, + "loss": 0.6306, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4496487379074097, + "rewards/margins": 1.746886968612671, + "rewards/rejected": -3.196535587310791, + "step": 3096 + }, + { + "epoch": 0.36, + "learning_rate": 1.9483878587457184e-07, + "logits/chosen": -1.8170714378356934, + "logits/rejected": -2.2644705772399902, + "logps/chosen": -383.1539306640625, + "logps/rejected": -222.98590087890625, + "loss": 0.6317, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6372826099395752, + "rewards/margins": 2.0703272819519043, + "rewards/rejected": -2.7076096534729004, + "step": 3097 + }, + { + "epoch": 0.36, + "learning_rate": 1.948033541986536e-07, + "logits/chosen": -2.527939558029175, + "logits/rejected": -2.755084276199341, + "logps/chosen": -246.33460998535156, + "logps/rejected": -275.1799621582031, + "loss": 0.4103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9714311957359314, + "rewards/margins": 3.269047737121582, + "rewards/rejected": -4.240478992462158, + "step": 3098 + }, + { + "epoch": 0.36, + "learning_rate": 1.9476792252273531e-07, + "logits/chosen": -2.2928264141082764, + "logits/rejected": -2.0483834743499756, + "logps/chosen": -186.25051879882812, + "logps/rejected": -290.2942199707031, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8996459245681763, + "rewards/margins": 1.9352480173110962, + "rewards/rejected": -2.8348939418792725, + "step": 3099 + }, + { + "epoch": 0.36, + "learning_rate": 1.9473249084681704e-07, + "logits/chosen": -2.523756265640259, + "logits/rejected": -2.684831142425537, + "logps/chosen": -341.0999755859375, + "logps/rejected": -172.13128662109375, + "loss": 0.7798, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0476539134979248, + "rewards/margins": 0.6210567355155945, + "rewards/rejected": -1.6687105894088745, + "step": 3100 + }, + { + "epoch": 0.36, + "learning_rate": 1.9469705917089876e-07, + "logits/chosen": -2.1315619945526123, + "logits/rejected": -1.8821651935577393, + "logps/chosen": -157.85919189453125, + "logps/rejected": -200.81057739257812, + "loss": 0.458, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8662946224212646, + "rewards/margins": 1.5491145849227905, + "rewards/rejected": -3.4154093265533447, + "step": 3101 + }, + { + "epoch": 0.36, + "learning_rate": 1.946616274949805e-07, + "logits/chosen": -2.6267032623291016, + "logits/rejected": -2.5020735263824463, + "logps/chosen": -217.08343505859375, + "logps/rejected": -344.1680908203125, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15955981612205505, + "rewards/margins": 2.61568546295166, + "rewards/rejected": -2.775245428085327, + "step": 3102 + }, + { + "epoch": 0.36, + "learning_rate": 1.9462619581906226e-07, + "logits/chosen": -1.9715205430984497, + "logits/rejected": -2.3447420597076416, + "logps/chosen": -512.128173828125, + "logps/rejected": -314.21331787109375, + "loss": 0.3411, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.272097110748291, + "rewards/margins": 2.964650869369507, + "rewards/rejected": -2.6925535202026367, + "step": 3103 + }, + { + "epoch": 0.36, + "learning_rate": 1.9459076414314398e-07, + "logits/chosen": -2.1395809650421143, + "logits/rejected": -2.515711545944214, + "logps/chosen": -174.10145568847656, + "logps/rejected": -156.0001220703125, + "loss": 0.3387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29176539182662964, + "rewards/margins": 1.6570239067077637, + "rewards/rejected": -1.948789358139038, + "step": 3104 + }, + { + "epoch": 0.36, + "learning_rate": 1.945553324672257e-07, + "logits/chosen": -2.255340099334717, + "logits/rejected": -2.1561477184295654, + "logps/chosen": -428.8955078125, + "logps/rejected": -367.0755615234375, + "loss": 0.6539, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2377064228057861, + "rewards/margins": 1.6460140943527222, + "rewards/rejected": -2.8837203979492188, + "step": 3105 + }, + { + "epoch": 0.36, + "learning_rate": 1.9451990079130742e-07, + "logits/chosen": -1.3099322319030762, + "logits/rejected": -1.6241211891174316, + "logps/chosen": -401.2363586425781, + "logps/rejected": -285.8721618652344, + "loss": 0.7805, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.53331458568573, + "rewards/margins": 0.006419472396373749, + "rewards/rejected": -1.539734125137329, + "step": 3106 + }, + { + "epoch": 0.36, + "learning_rate": 1.9448446911538914e-07, + "logits/chosen": -1.6691772937774658, + "logits/rejected": -1.7718544006347656, + "logps/chosen": -439.49578857421875, + "logps/rejected": -377.2452392578125, + "loss": 0.8446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8918315172195435, + "rewards/margins": 2.0388946533203125, + "rewards/rejected": -2.9307260513305664, + "step": 3107 + }, + { + "epoch": 0.36, + "learning_rate": 1.9444903743947087e-07, + "logits/chosen": -2.6120729446411133, + "logits/rejected": -2.7166614532470703, + "logps/chosen": -380.427490234375, + "logps/rejected": -381.2755126953125, + "loss": 0.4833, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5260157585144043, + "rewards/margins": 1.6505365371704102, + "rewards/rejected": -2.1765522956848145, + "step": 3108 + }, + { + "epoch": 0.36, + "learning_rate": 1.944136057635526e-07, + "logits/chosen": -2.2808775901794434, + "logits/rejected": -2.4095587730407715, + "logps/chosen": -507.74395751953125, + "logps/rejected": -337.41571044921875, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2569407224655151, + "rewards/margins": 2.731412887573242, + "rewards/rejected": -3.988353729248047, + "step": 3109 + }, + { + "epoch": 0.36, + "learning_rate": 1.9437817408763434e-07, + "logits/chosen": -2.2663347721099854, + "logits/rejected": -2.319885492324829, + "logps/chosen": -213.2877197265625, + "logps/rejected": -244.46157836914062, + "loss": 0.4708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5261576175689697, + "rewards/margins": 1.0919712781906128, + "rewards/rejected": -1.6181288957595825, + "step": 3110 + }, + { + "epoch": 0.36, + "learning_rate": 1.9434274241171606e-07, + "logits/chosen": -2.2333898544311523, + "logits/rejected": -2.3654563426971436, + "logps/chosen": -407.58697509765625, + "logps/rejected": -380.197509765625, + "loss": 0.4992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6908873319625854, + "rewards/margins": 1.0724596977233887, + "rewards/rejected": -1.7633470296859741, + "step": 3111 + }, + { + "epoch": 0.36, + "learning_rate": 1.9430731073579778e-07, + "logits/chosen": -1.9532456398010254, + "logits/rejected": -2.08958101272583, + "logps/chosen": -262.1343078613281, + "logps/rejected": -237.88241577148438, + "loss": 0.1801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5578480362892151, + "rewards/margins": 2.770270347595215, + "rewards/rejected": -3.328118324279785, + "step": 3112 + }, + { + "epoch": 0.36, + "learning_rate": 1.9427187905987953e-07, + "logits/chosen": -2.6000728607177734, + "logits/rejected": -2.4458532333374023, + "logps/chosen": -382.8772888183594, + "logps/rejected": -310.2657470703125, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4247158765792847, + "rewards/margins": 0.8124868273735046, + "rewards/rejected": -2.2372026443481445, + "step": 3113 + }, + { + "epoch": 0.36, + "learning_rate": 1.9423644738396128e-07, + "logits/chosen": -1.7292454242706299, + "logits/rejected": -1.625171422958374, + "logps/chosen": -132.97607421875, + "logps/rejected": -171.45257568359375, + "loss": 1.2613, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.566391944885254, + "rewards/margins": 0.7467284798622131, + "rewards/rejected": -3.3131203651428223, + "step": 3114 + }, + { + "epoch": 0.36, + "learning_rate": 1.94201015708043e-07, + "logits/chosen": -2.256432294845581, + "logits/rejected": -2.1059908866882324, + "logps/chosen": -199.9241180419922, + "logps/rejected": -217.54931640625, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7063655853271484, + "rewards/margins": 1.918487310409546, + "rewards/rejected": -2.6248531341552734, + "step": 3115 + }, + { + "epoch": 0.36, + "learning_rate": 1.9416558403212472e-07, + "logits/chosen": -2.3934261798858643, + "logits/rejected": -2.551150321960449, + "logps/chosen": -192.21408081054688, + "logps/rejected": -264.39056396484375, + "loss": 0.1731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5673214197158813, + "rewards/margins": 3.1848888397216797, + "rewards/rejected": -3.7522099018096924, + "step": 3116 + }, + { + "epoch": 0.36, + "learning_rate": 1.9413015235620645e-07, + "logits/chosen": -2.4616799354553223, + "logits/rejected": -2.571305751800537, + "logps/chosen": -108.89421844482422, + "logps/rejected": -232.10964965820312, + "loss": 0.3327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5497379899024963, + "rewards/margins": 2.1916420459747314, + "rewards/rejected": -2.741380214691162, + "step": 3117 + }, + { + "epoch": 0.36, + "learning_rate": 1.9409472068028817e-07, + "logits/chosen": -2.4977517127990723, + "logits/rejected": -2.0010085105895996, + "logps/chosen": -83.21744537353516, + "logps/rejected": -276.8382568359375, + "loss": 0.4153, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9481070637702942, + "rewards/margins": 2.5234591960906982, + "rewards/rejected": -3.4715659618377686, + "step": 3118 + }, + { + "epoch": 0.36, + "learning_rate": 1.940592890043699e-07, + "logits/chosen": -2.4068498611450195, + "logits/rejected": -2.175837516784668, + "logps/chosen": -235.85528564453125, + "logps/rejected": -398.30291748046875, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3214884400367737, + "rewards/margins": 2.6764488220214844, + "rewards/rejected": -2.9979374408721924, + "step": 3119 + }, + { + "epoch": 0.36, + "learning_rate": 1.940238573284516e-07, + "logits/chosen": -2.5489022731781006, + "logits/rejected": -2.4398512840270996, + "logps/chosen": -324.2091979980469, + "logps/rejected": -352.88848876953125, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11648707091808319, + "rewards/margins": 2.299185276031494, + "rewards/rejected": -2.415672540664673, + "step": 3120 + }, + { + "epoch": 0.36, + "learning_rate": 1.9398842565253336e-07, + "logits/chosen": -2.684638023376465, + "logits/rejected": -2.5656890869140625, + "logps/chosen": -182.70233154296875, + "logps/rejected": -240.39285278320312, + "loss": 0.2162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8190944194793701, + "rewards/margins": 2.705848455429077, + "rewards/rejected": -3.5249428749084473, + "step": 3121 + }, + { + "epoch": 0.36, + "learning_rate": 1.9395299397661508e-07, + "logits/chosen": -1.9103986024856567, + "logits/rejected": -2.164799928665161, + "logps/chosen": -393.04437255859375, + "logps/rejected": -335.9977722167969, + "loss": 0.4371, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1821215152740479, + "rewards/margins": 1.507992148399353, + "rewards/rejected": -2.6901135444641113, + "step": 3122 + }, + { + "epoch": 0.36, + "learning_rate": 1.939175623006968e-07, + "logits/chosen": -2.592072010040283, + "logits/rejected": -2.6159229278564453, + "logps/chosen": -116.55342102050781, + "logps/rejected": -210.6845245361328, + "loss": 0.2051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46971070766448975, + "rewards/margins": 2.2071855068206787, + "rewards/rejected": -2.676896095275879, + "step": 3123 + }, + { + "epoch": 0.36, + "learning_rate": 1.9388213062477853e-07, + "logits/chosen": -2.10007905960083, + "logits/rejected": -2.185770034790039, + "logps/chosen": -343.1531982421875, + "logps/rejected": -320.8360595703125, + "loss": 0.2492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20104816555976868, + "rewards/margins": 1.8057745695114136, + "rewards/rejected": -2.0068225860595703, + "step": 3124 + }, + { + "epoch": 0.36, + "learning_rate": 1.9384669894886028e-07, + "logits/chosen": -1.4998385906219482, + "logits/rejected": -2.3586649894714355, + "logps/chosen": -636.711181640625, + "logps/rejected": -382.38177490234375, + "loss": 0.5859, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1133610010147095, + "rewards/margins": 1.0986320972442627, + "rewards/rejected": -2.2119932174682617, + "step": 3125 + }, + { + "epoch": 0.36, + "learning_rate": 1.9381126727294202e-07, + "logits/chosen": -2.4803004264831543, + "logits/rejected": -2.161444902420044, + "logps/chosen": -172.16583251953125, + "logps/rejected": -286.1757507324219, + "loss": 0.374, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0674363225698471, + "rewards/margins": 2.683539390563965, + "rewards/rejected": -2.7509756088256836, + "step": 3126 + }, + { + "epoch": 0.36, + "learning_rate": 1.9377583559702375e-07, + "logits/chosen": -2.656484603881836, + "logits/rejected": -2.694640874862671, + "logps/chosen": -80.12544250488281, + "logps/rejected": -138.1623992919922, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07886181771755219, + "rewards/margins": 2.4925155639648438, + "rewards/rejected": -2.41365385055542, + "step": 3127 + }, + { + "epoch": 0.36, + "learning_rate": 1.9374040392110547e-07, + "logits/chosen": -2.350421905517578, + "logits/rejected": -2.515573263168335, + "logps/chosen": -277.6192932128906, + "logps/rejected": -240.78457641601562, + "loss": 0.2561, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5066509246826172, + "rewards/margins": 2.982818365097046, + "rewards/rejected": -3.489469051361084, + "step": 3128 + }, + { + "epoch": 0.36, + "learning_rate": 1.937049722451872e-07, + "logits/chosen": -2.965205669403076, + "logits/rejected": -3.0474088191986084, + "logps/chosen": -329.3662414550781, + "logps/rejected": -249.338134765625, + "loss": 0.8615, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2708778381347656, + "rewards/margins": 1.2647546529769897, + "rewards/rejected": -2.535632610321045, + "step": 3129 + }, + { + "epoch": 0.36, + "learning_rate": 1.936695405692689e-07, + "logits/chosen": -2.4371161460876465, + "logits/rejected": -2.088001251220703, + "logps/chosen": -170.34738159179688, + "logps/rejected": -288.82403564453125, + "loss": 0.3816, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2606197595596313, + "rewards/margins": 2.0035548210144043, + "rewards/rejected": -3.2641749382019043, + "step": 3130 + }, + { + "epoch": 0.36, + "learning_rate": 1.9363410889335063e-07, + "logits/chosen": -2.080554485321045, + "logits/rejected": -2.3817286491394043, + "logps/chosen": -264.6495361328125, + "logps/rejected": -149.86404418945312, + "loss": 0.5214, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7187601327896118, + "rewards/margins": 1.0175445079803467, + "rewards/rejected": -1.736304521560669, + "step": 3131 + }, + { + "epoch": 0.36, + "learning_rate": 1.9359867721743238e-07, + "logits/chosen": -2.275902271270752, + "logits/rejected": -2.3267109394073486, + "logps/chosen": -207.8766632080078, + "logps/rejected": -230.2354278564453, + "loss": 0.2692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6790817975997925, + "rewards/margins": 1.982696771621704, + "rewards/rejected": -2.661778450012207, + "step": 3132 + }, + { + "epoch": 0.36, + "learning_rate": 1.935632455415141e-07, + "logits/chosen": -2.3230996131896973, + "logits/rejected": -2.574122428894043, + "logps/chosen": -382.58636474609375, + "logps/rejected": -217.22059631347656, + "loss": 0.2448, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12732012569904327, + "rewards/margins": 1.843235969543457, + "rewards/rejected": -1.7159159183502197, + "step": 3133 + }, + { + "epoch": 0.36, + "learning_rate": 1.9352781386559583e-07, + "logits/chosen": -2.2557919025421143, + "logits/rejected": -2.490957498550415, + "logps/chosen": -587.0347900390625, + "logps/rejected": -331.9078369140625, + "loss": 0.3415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.62187659740448, + "rewards/margins": 2.351989984512329, + "rewards/rejected": -2.9738669395446777, + "step": 3134 + }, + { + "epoch": 0.36, + "learning_rate": 1.9349238218967755e-07, + "logits/chosen": -2.4150912761688232, + "logits/rejected": -2.300534248352051, + "logps/chosen": -217.19692993164062, + "logps/rejected": -440.0042724609375, + "loss": 0.4936, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.277635097503662, + "rewards/margins": 2.6255717277526855, + "rewards/rejected": -3.9032068252563477, + "step": 3135 + }, + { + "epoch": 0.36, + "learning_rate": 1.9345695051375927e-07, + "logits/chosen": -2.434382438659668, + "logits/rejected": -2.4949252605438232, + "logps/chosen": -252.11721801757812, + "logps/rejected": -251.63311767578125, + "loss": 0.3064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.801544725894928, + "rewards/margins": 2.8959438800811768, + "rewards/rejected": -3.697488784790039, + "step": 3136 + }, + { + "epoch": 0.36, + "learning_rate": 1.9342151883784105e-07, + "logits/chosen": -2.308393955230713, + "logits/rejected": -2.4564592838287354, + "logps/chosen": -444.9427490234375, + "logps/rejected": -336.61981201171875, + "loss": 0.312, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0134586095809937, + "rewards/margins": 1.9564151763916016, + "rewards/rejected": -2.9698739051818848, + "step": 3137 + }, + { + "epoch": 0.37, + "learning_rate": 1.9338608716192277e-07, + "logits/chosen": -2.2691216468811035, + "logits/rejected": -2.1121933460235596, + "logps/chosen": -582.8369140625, + "logps/rejected": -380.7342529296875, + "loss": 0.8403, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.307189702987671, + "rewards/margins": 0.9894444942474365, + "rewards/rejected": -2.2966341972351074, + "step": 3138 + }, + { + "epoch": 0.37, + "learning_rate": 1.933506554860045e-07, + "logits/chosen": -2.5083045959472656, + "logits/rejected": -2.6909873485565186, + "logps/chosen": -303.33453369140625, + "logps/rejected": -339.1681823730469, + "loss": 0.4014, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9819588661193848, + "rewards/margins": 1.6858787536621094, + "rewards/rejected": -2.667837619781494, + "step": 3139 + }, + { + "epoch": 0.37, + "learning_rate": 1.933152238100862e-07, + "logits/chosen": -2.160820722579956, + "logits/rejected": -1.9663872718811035, + "logps/chosen": -325.99127197265625, + "logps/rejected": -417.8299560546875, + "loss": 0.2755, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45534124970436096, + "rewards/margins": 2.8656198978424072, + "rewards/rejected": -3.320960760116577, + "step": 3140 + }, + { + "epoch": 0.37, + "learning_rate": 1.9327979213416794e-07, + "logits/chosen": -2.3190388679504395, + "logits/rejected": -2.156068801879883, + "logps/chosen": -191.9080810546875, + "logps/rejected": -203.80789184570312, + "loss": 0.5179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8741163015365601, + "rewards/margins": 1.057347297668457, + "rewards/rejected": -1.9314637184143066, + "step": 3141 + }, + { + "epoch": 0.37, + "learning_rate": 1.9324436045824966e-07, + "logits/chosen": -2.2184793949127197, + "logits/rejected": -2.405529022216797, + "logps/chosen": -276.3310852050781, + "logps/rejected": -269.9331359863281, + "loss": 1.3409, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8734275102615356, + "rewards/margins": 1.4713757038116455, + "rewards/rejected": -3.3448030948638916, + "step": 3142 + }, + { + "epoch": 0.37, + "learning_rate": 1.932089287823314e-07, + "logits/chosen": -2.256789207458496, + "logits/rejected": -2.385599374771118, + "logps/chosen": -395.928466796875, + "logps/rejected": -247.03604125976562, + "loss": 0.4809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.973597526550293, + "rewards/margins": 1.6667382717132568, + "rewards/rejected": -2.640336036682129, + "step": 3143 + }, + { + "epoch": 0.37, + "learning_rate": 1.9317349710641313e-07, + "logits/chosen": -2.290410041809082, + "logits/rejected": -2.6533286571502686, + "logps/chosen": -298.1980285644531, + "logps/rejected": -231.24752807617188, + "loss": 0.5155, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2942817211151123, + "rewards/margins": 0.6732892990112305, + "rewards/rejected": -1.9675710201263428, + "step": 3144 + }, + { + "epoch": 0.37, + "learning_rate": 1.9313806543049485e-07, + "logits/chosen": -2.188337564468384, + "logits/rejected": -2.416543960571289, + "logps/chosen": -318.5128479003906, + "logps/rejected": -234.77174377441406, + "loss": 0.3441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8966893553733826, + "rewards/margins": 1.403678297996521, + "rewards/rejected": -2.300367832183838, + "step": 3145 + }, + { + "epoch": 0.37, + "learning_rate": 1.9310263375457657e-07, + "logits/chosen": -2.6692333221435547, + "logits/rejected": -2.755354881286621, + "logps/chosen": -169.9678955078125, + "logps/rejected": -182.23582458496094, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14124518632888794, + "rewards/margins": 2.9847562313079834, + "rewards/rejected": -2.8435111045837402, + "step": 3146 + }, + { + "epoch": 0.37, + "learning_rate": 1.930672020786583e-07, + "logits/chosen": -1.7486519813537598, + "logits/rejected": -1.798386573791504, + "logps/chosen": -477.0087585449219, + "logps/rejected": -409.324462890625, + "loss": 0.6296, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.899261474609375, + "rewards/margins": 1.913628339767456, + "rewards/rejected": -2.812889814376831, + "step": 3147 + }, + { + "epoch": 0.37, + "learning_rate": 1.9303177040274007e-07, + "logits/chosen": -2.1254711151123047, + "logits/rejected": -2.646773338317871, + "logps/chosen": -298.5115966796875, + "logps/rejected": -299.6465148925781, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0887181758880615, + "rewards/margins": 2.2744734287261963, + "rewards/rejected": -3.363191604614258, + "step": 3148 + }, + { + "epoch": 0.37, + "learning_rate": 1.929963387268218e-07, + "logits/chosen": -2.6941685676574707, + "logits/rejected": -2.668522357940674, + "logps/chosen": -186.5926513671875, + "logps/rejected": -417.25933837890625, + "loss": 0.1505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8929291367530823, + "rewards/margins": 3.580596446990967, + "rewards/rejected": -4.473525524139404, + "step": 3149 + }, + { + "epoch": 0.37, + "learning_rate": 1.9296090705090351e-07, + "logits/chosen": -2.7407705783843994, + "logits/rejected": -2.7407851219177246, + "logps/chosen": -226.93617248535156, + "logps/rejected": -251.0614471435547, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4730029106140137, + "rewards/margins": 3.7094790935516357, + "rewards/rejected": -5.18248176574707, + "step": 3150 + }, + { + "epoch": 0.37, + "learning_rate": 1.9292547537498524e-07, + "logits/chosen": -2.237391948699951, + "logits/rejected": -2.5306034088134766, + "logps/chosen": -343.9668273925781, + "logps/rejected": -294.36614990234375, + "loss": 0.4401, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7083986401557922, + "rewards/margins": 2.134974956512451, + "rewards/rejected": -2.8433735370635986, + "step": 3151 + }, + { + "epoch": 0.37, + "learning_rate": 1.9289004369906696e-07, + "logits/chosen": -2.045839548110962, + "logits/rejected": -1.9975348711013794, + "logps/chosen": -159.90927124023438, + "logps/rejected": -158.82867431640625, + "loss": 0.4595, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0179097652435303, + "rewards/margins": 0.8473888635635376, + "rewards/rejected": -1.8652986288070679, + "step": 3152 + }, + { + "epoch": 0.37, + "learning_rate": 1.9285461202314868e-07, + "logits/chosen": -2.5274221897125244, + "logits/rejected": -2.0660452842712402, + "logps/chosen": -126.86803436279297, + "logps/rejected": -264.5057678222656, + "loss": 0.2647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14727041125297546, + "rewards/margins": 2.82851243019104, + "rewards/rejected": -2.975782871246338, + "step": 3153 + }, + { + "epoch": 0.37, + "learning_rate": 1.928191803472304e-07, + "logits/chosen": -2.249345302581787, + "logits/rejected": -2.2968902587890625, + "logps/chosen": -79.97516632080078, + "logps/rejected": -149.7945098876953, + "loss": 0.2891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.019741669297218323, + "rewards/margins": 2.123882293701172, + "rewards/rejected": -2.1436238288879395, + "step": 3154 + }, + { + "epoch": 0.37, + "learning_rate": 1.9278374867131215e-07, + "logits/chosen": -2.892702579498291, + "logits/rejected": -2.7949483394622803, + "logps/chosen": -123.44725799560547, + "logps/rejected": -174.04078674316406, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2171828746795654, + "rewards/margins": 2.4107046127319336, + "rewards/rejected": -3.627887487411499, + "step": 3155 + }, + { + "epoch": 0.37, + "learning_rate": 1.9274831699539387e-07, + "logits/chosen": -2.4653091430664062, + "logits/rejected": -2.7025585174560547, + "logps/chosen": -258.10858154296875, + "logps/rejected": -185.49456787109375, + "loss": 0.2058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.931549072265625, + "rewards/margins": 1.700240135192871, + "rewards/rejected": -2.631789207458496, + "step": 3156 + }, + { + "epoch": 0.37, + "learning_rate": 1.927128853194756e-07, + "logits/chosen": -2.163545846939087, + "logits/rejected": -2.6042652130126953, + "logps/chosen": -401.03607177734375, + "logps/rejected": -233.97898864746094, + "loss": 0.1997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5937491655349731, + "rewards/margins": 2.5653302669525146, + "rewards/rejected": -3.1590797901153564, + "step": 3157 + }, + { + "epoch": 0.37, + "learning_rate": 1.9267745364355732e-07, + "logits/chosen": -2.768998622894287, + "logits/rejected": -2.6972715854644775, + "logps/chosen": -180.02444458007812, + "logps/rejected": -222.41339111328125, + "loss": 1.3841, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2711691856384277, + "rewards/margins": 0.2155253291130066, + "rewards/rejected": -2.4866943359375, + "step": 3158 + }, + { + "epoch": 0.37, + "learning_rate": 1.9264202196763904e-07, + "logits/chosen": -1.850663661956787, + "logits/rejected": -1.9897730350494385, + "logps/chosen": -541.7745971679688, + "logps/rejected": -489.3761291503906, + "loss": 0.0812, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13470707833766937, + "rewards/margins": 3.1683897972106934, + "rewards/rejected": -3.0336828231811523, + "step": 3159 + }, + { + "epoch": 0.37, + "learning_rate": 1.9260659029172081e-07, + "logits/chosen": -2.499582529067993, + "logits/rejected": -2.44433331489563, + "logps/chosen": -245.42471313476562, + "logps/rejected": -302.79852294921875, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6744850277900696, + "rewards/margins": 3.5585427284240723, + "rewards/rejected": -4.233027935028076, + "step": 3160 + }, + { + "epoch": 0.37, + "learning_rate": 1.9257115861580254e-07, + "logits/chosen": -2.900247097015381, + "logits/rejected": -2.930180072784424, + "logps/chosen": -389.54107666015625, + "logps/rejected": -310.0992431640625, + "loss": 0.1728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9990042448043823, + "rewards/margins": 2.1150031089782715, + "rewards/rejected": -3.1140072345733643, + "step": 3161 + }, + { + "epoch": 0.37, + "learning_rate": 1.9253572693988426e-07, + "logits/chosen": -2.3940377235412598, + "logits/rejected": -2.246613025665283, + "logps/chosen": -301.7984313964844, + "logps/rejected": -341.15692138671875, + "loss": 0.2776, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.946528434753418, + "rewards/margins": 2.131150245666504, + "rewards/rejected": -3.077678680419922, + "step": 3162 + }, + { + "epoch": 0.37, + "learning_rate": 1.9250029526396598e-07, + "logits/chosen": -2.44759202003479, + "logits/rejected": -2.4825141429901123, + "logps/chosen": -316.13604736328125, + "logps/rejected": -286.5953369140625, + "loss": 0.316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8119725584983826, + "rewards/margins": 2.597079038619995, + "rewards/rejected": -3.4090514183044434, + "step": 3163 + }, + { + "epoch": 0.37, + "learning_rate": 1.924648635880477e-07, + "logits/chosen": -2.7020111083984375, + "logits/rejected": -2.660953998565674, + "logps/chosen": -148.32684326171875, + "logps/rejected": -237.59823608398438, + "loss": 0.2714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7312800288200378, + "rewards/margins": 2.763472080230713, + "rewards/rejected": -3.4947519302368164, + "step": 3164 + }, + { + "epoch": 0.37, + "learning_rate": 1.9242943191212942e-07, + "logits/chosen": -2.0983476638793945, + "logits/rejected": -2.643665075302124, + "logps/chosen": -421.32574462890625, + "logps/rejected": -178.2677001953125, + "loss": 0.3181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8557639718055725, + "rewards/margins": 1.6461113691329956, + "rewards/rejected": -2.501875162124634, + "step": 3165 + }, + { + "epoch": 0.37, + "learning_rate": 1.9239400023621117e-07, + "logits/chosen": -1.6538810729980469, + "logits/rejected": -2.04146671295166, + "logps/chosen": -437.01287841796875, + "logps/rejected": -357.8394775390625, + "loss": 0.1927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8159582018852234, + "rewards/margins": 2.586364984512329, + "rewards/rejected": -3.402323007583618, + "step": 3166 + }, + { + "epoch": 0.37, + "learning_rate": 1.923585685602929e-07, + "logits/chosen": -2.6980769634246826, + "logits/rejected": -2.9216294288635254, + "logps/chosen": -384.0958251953125, + "logps/rejected": -358.8319396972656, + "loss": 0.265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9607188701629639, + "rewards/margins": 2.52705454826355, + "rewards/rejected": -3.4877734184265137, + "step": 3167 + }, + { + "epoch": 0.37, + "learning_rate": 1.9232313688437462e-07, + "logits/chosen": -2.5672430992126465, + "logits/rejected": -2.5246002674102783, + "logps/chosen": -348.85107421875, + "logps/rejected": -260.0709228515625, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9269664287567139, + "rewards/margins": 2.1458261013031006, + "rewards/rejected": -3.0727925300598145, + "step": 3168 + }, + { + "epoch": 0.37, + "learning_rate": 1.9228770520845634e-07, + "logits/chosen": -1.9530051946640015, + "logits/rejected": -2.399528741836548, + "logps/chosen": -406.9411926269531, + "logps/rejected": -260.3360595703125, + "loss": 0.5938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7623782753944397, + "rewards/margins": 0.5734649896621704, + "rewards/rejected": -1.3358433246612549, + "step": 3169 + }, + { + "epoch": 0.37, + "learning_rate": 1.9225227353253806e-07, + "logits/chosen": -2.3042895793914795, + "logits/rejected": -2.6620395183563232, + "logps/chosen": -272.5081787109375, + "logps/rejected": -210.39694213867188, + "loss": 0.2503, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06712014973163605, + "rewards/margins": 2.498213768005371, + "rewards/rejected": -2.431093692779541, + "step": 3170 + }, + { + "epoch": 0.37, + "learning_rate": 1.9221684185661978e-07, + "logits/chosen": -2.5497233867645264, + "logits/rejected": -2.7116408348083496, + "logps/chosen": -202.58700561523438, + "logps/rejected": -200.17637634277344, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09004423022270203, + "rewards/margins": 1.9502902030944824, + "rewards/rejected": -2.0403342247009277, + "step": 3171 + }, + { + "epoch": 0.37, + "learning_rate": 1.9218141018070156e-07, + "logits/chosen": -2.507938861846924, + "logits/rejected": -2.4972078800201416, + "logps/chosen": -296.51654052734375, + "logps/rejected": -281.16644287109375, + "loss": 0.3491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.139602780342102, + "rewards/margins": 1.7851760387420654, + "rewards/rejected": -2.924778938293457, + "step": 3172 + }, + { + "epoch": 0.37, + "learning_rate": 1.9214597850478328e-07, + "logits/chosen": -2.0462560653686523, + "logits/rejected": -1.9870109558105469, + "logps/chosen": -281.8150634765625, + "logps/rejected": -310.6997375488281, + "loss": 0.377, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4008017778396606, + "rewards/margins": 2.5081920623779297, + "rewards/rejected": -3.9089934825897217, + "step": 3173 + }, + { + "epoch": 0.37, + "learning_rate": 1.92110546828865e-07, + "logits/chosen": -1.8173969984054565, + "logits/rejected": -2.0294840335845947, + "logps/chosen": -319.11834716796875, + "logps/rejected": -298.73309326171875, + "loss": 0.6093, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8669200539588928, + "rewards/margins": 0.6802937984466553, + "rewards/rejected": -1.5472137928009033, + "step": 3174 + }, + { + "epoch": 0.37, + "learning_rate": 1.9207511515294673e-07, + "logits/chosen": -1.8864620923995972, + "logits/rejected": -2.190478801727295, + "logps/chosen": -474.81103515625, + "logps/rejected": -387.6461486816406, + "loss": 0.3311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8746293187141418, + "rewards/margins": 2.1003055572509766, + "rewards/rejected": -2.9749348163604736, + "step": 3175 + }, + { + "epoch": 0.37, + "learning_rate": 1.9203968347702845e-07, + "logits/chosen": -2.182969093322754, + "logits/rejected": -2.1879749298095703, + "logps/chosen": -150.87913513183594, + "logps/rejected": -309.70782470703125, + "loss": 0.4148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3453094959259033, + "rewards/margins": 1.3505222797393799, + "rewards/rejected": -1.6958316564559937, + "step": 3176 + }, + { + "epoch": 0.37, + "learning_rate": 1.920042518011102e-07, + "logits/chosen": -1.974729061126709, + "logits/rejected": -2.245504379272461, + "logps/chosen": -306.3042297363281, + "logps/rejected": -292.2158203125, + "loss": 0.2734, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09486120939254761, + "rewards/margins": 2.164905309677124, + "rewards/rejected": -2.2597665786743164, + "step": 3177 + }, + { + "epoch": 0.37, + "learning_rate": 1.9196882012519192e-07, + "logits/chosen": -2.327970504760742, + "logits/rejected": -2.292393684387207, + "logps/chosen": -330.66845703125, + "logps/rejected": -250.999755859375, + "loss": 0.4247, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4500001072883606, + "rewards/margins": 0.8633212447166443, + "rewards/rejected": -1.3133213520050049, + "step": 3178 + }, + { + "epoch": 0.37, + "learning_rate": 1.9193338844927364e-07, + "logits/chosen": -2.018627643585205, + "logits/rejected": -2.206972599029541, + "logps/chosen": -199.66067504882812, + "logps/rejected": -248.08346557617188, + "loss": 0.6225, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3957357406616211, + "rewards/margins": 0.9450592398643494, + "rewards/rejected": -1.3407950401306152, + "step": 3179 + }, + { + "epoch": 0.37, + "learning_rate": 1.9189795677335536e-07, + "logits/chosen": -1.895951509475708, + "logits/rejected": -2.53849196434021, + "logps/chosen": -433.8330993652344, + "logps/rejected": -242.42787170410156, + "loss": 0.2672, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41876792907714844, + "rewards/margins": 1.9294724464416504, + "rewards/rejected": -2.348240375518799, + "step": 3180 + }, + { + "epoch": 0.37, + "learning_rate": 1.9186252509743708e-07, + "logits/chosen": -2.1511030197143555, + "logits/rejected": -2.1521201133728027, + "logps/chosen": -132.5138702392578, + "logps/rejected": -133.8966064453125, + "loss": 0.6024, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7199131846427917, + "rewards/margins": 1.7673122882843018, + "rewards/rejected": -2.4872255325317383, + "step": 3181 + }, + { + "epoch": 0.37, + "learning_rate": 1.918270934215188e-07, + "logits/chosen": -1.8453233242034912, + "logits/rejected": -2.110959768295288, + "logps/chosen": -309.5833435058594, + "logps/rejected": -255.5559539794922, + "loss": 0.2443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36813098192214966, + "rewards/margins": 2.3243069648742676, + "rewards/rejected": -2.6924376487731934, + "step": 3182 + }, + { + "epoch": 0.37, + "learning_rate": 1.9179166174560058e-07, + "logits/chosen": -2.0539145469665527, + "logits/rejected": -1.9143383502960205, + "logps/chosen": -339.960693359375, + "logps/rejected": -368.6610107421875, + "loss": 0.5098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8677932620048523, + "rewards/margins": 0.8465698957443237, + "rewards/rejected": -1.7143632173538208, + "step": 3183 + }, + { + "epoch": 0.37, + "learning_rate": 1.917562300696823e-07, + "logits/chosen": -2.1474761962890625, + "logits/rejected": -2.2780191898345947, + "logps/chosen": -164.1271514892578, + "logps/rejected": -212.37091064453125, + "loss": 0.4573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38927680253982544, + "rewards/margins": 1.5299155712127686, + "rewards/rejected": -1.9191921949386597, + "step": 3184 + }, + { + "epoch": 0.37, + "learning_rate": 1.9172079839376403e-07, + "logits/chosen": -2.792558193206787, + "logits/rejected": -2.643643379211426, + "logps/chosen": -193.36093139648438, + "logps/rejected": -273.80584716796875, + "loss": 0.4996, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.536325454711914, + "rewards/margins": 0.8020482659339905, + "rewards/rejected": -2.338373899459839, + "step": 3185 + }, + { + "epoch": 0.37, + "learning_rate": 1.9168536671784575e-07, + "logits/chosen": -2.2104530334472656, + "logits/rejected": -1.7676036357879639, + "logps/chosen": -175.49789428710938, + "logps/rejected": -302.2394714355469, + "loss": 0.2512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9535512924194336, + "rewards/margins": 2.583712577819824, + "rewards/rejected": -3.5372636318206787, + "step": 3186 + }, + { + "epoch": 0.37, + "learning_rate": 1.9164993504192747e-07, + "logits/chosen": -1.6194193363189697, + "logits/rejected": -1.925923466682434, + "logps/chosen": -225.3070068359375, + "logps/rejected": -163.70974731445312, + "loss": 0.6318, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9462329149246216, + "rewards/margins": 0.6181483864784241, + "rewards/rejected": -1.5643812417984009, + "step": 3187 + }, + { + "epoch": 0.37, + "learning_rate": 1.9161450336600922e-07, + "logits/chosen": -2.3876099586486816, + "logits/rejected": -2.440568208694458, + "logps/chosen": -333.3180847167969, + "logps/rejected": -325.70404052734375, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7548568844795227, + "rewards/margins": 3.3911831378936768, + "rewards/rejected": -4.146039962768555, + "step": 3188 + }, + { + "epoch": 0.37, + "learning_rate": 1.9157907169009094e-07, + "logits/chosen": -2.2270219326019287, + "logits/rejected": -2.1326770782470703, + "logps/chosen": -322.4561767578125, + "logps/rejected": -348.9518737792969, + "loss": 0.5374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6574346423149109, + "rewards/margins": 0.8381863832473755, + "rewards/rejected": -1.4956210851669312, + "step": 3189 + }, + { + "epoch": 0.37, + "learning_rate": 1.9154364001417266e-07, + "logits/chosen": -2.464693069458008, + "logits/rejected": -2.4156620502471924, + "logps/chosen": -185.22540283203125, + "logps/rejected": -257.880615234375, + "loss": 0.5815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5320767760276794, + "rewards/margins": 1.382102608680725, + "rewards/rejected": -1.9141794443130493, + "step": 3190 + }, + { + "epoch": 0.37, + "learning_rate": 1.9150820833825439e-07, + "logits/chosen": -2.4733452796936035, + "logits/rejected": -2.435786485671997, + "logps/chosen": -241.92214965820312, + "logps/rejected": -242.48731994628906, + "loss": 0.7388, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8836735486984253, + "rewards/margins": 1.1027183532714844, + "rewards/rejected": -1.9863920211791992, + "step": 3191 + }, + { + "epoch": 0.37, + "learning_rate": 1.914727766623361e-07, + "logits/chosen": -2.5483412742614746, + "logits/rejected": -2.20320725440979, + "logps/chosen": -242.97195434570312, + "logps/rejected": -254.82826232910156, + "loss": 0.3482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7763811945915222, + "rewards/margins": 1.9565354585647583, + "rewards/rejected": -2.732916831970215, + "step": 3192 + }, + { + "epoch": 0.37, + "learning_rate": 1.9143734498641783e-07, + "logits/chosen": -1.8268365859985352, + "logits/rejected": -2.417933940887451, + "logps/chosen": -357.09478759765625, + "logps/rejected": -255.75735473632812, + "loss": 0.3739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7232149839401245, + "rewards/margins": 1.1114208698272705, + "rewards/rejected": -1.8346359729766846, + "step": 3193 + }, + { + "epoch": 0.37, + "learning_rate": 1.9140191331049955e-07, + "logits/chosen": -2.2324602603912354, + "logits/rejected": -1.8773897886276245, + "logps/chosen": -118.87921905517578, + "logps/rejected": -299.29290771484375, + "loss": 0.2625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2545661926269531, + "rewards/margins": 2.1697733402252197, + "rewards/rejected": -2.424339532852173, + "step": 3194 + }, + { + "epoch": 0.37, + "learning_rate": 1.9136648163458133e-07, + "logits/chosen": -2.559506416320801, + "logits/rejected": -2.5846002101898193, + "logps/chosen": -282.648681640625, + "logps/rejected": -256.5959777832031, + "loss": 0.3366, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5970479249954224, + "rewards/margins": 1.9208812713623047, + "rewards/rejected": -2.5179290771484375, + "step": 3195 + }, + { + "epoch": 0.37, + "learning_rate": 1.9133104995866305e-07, + "logits/chosen": -2.651726484298706, + "logits/rejected": -2.7787532806396484, + "logps/chosen": -364.31707763671875, + "logps/rejected": -193.00221252441406, + "loss": 0.4738, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6019917130470276, + "rewards/margins": 1.286452054977417, + "rewards/rejected": -1.8884437084197998, + "step": 3196 + }, + { + "epoch": 0.37, + "learning_rate": 1.9129561828274477e-07, + "logits/chosen": -2.733668565750122, + "logits/rejected": -2.750742197036743, + "logps/chosen": -136.57806396484375, + "logps/rejected": -129.05055236816406, + "loss": 0.3988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0261073112487793, + "rewards/margins": 1.187927007675171, + "rewards/rejected": -2.21403431892395, + "step": 3197 + }, + { + "epoch": 0.37, + "learning_rate": 1.912601866068265e-07, + "logits/chosen": -2.331082820892334, + "logits/rejected": -2.3602089881896973, + "logps/chosen": -348.1024169921875, + "logps/rejected": -240.73379516601562, + "loss": 0.712, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0845441818237305, + "rewards/margins": 0.7099411487579346, + "rewards/rejected": -1.794485330581665, + "step": 3198 + }, + { + "epoch": 0.37, + "learning_rate": 1.9122475493090822e-07, + "logits/chosen": -2.169079303741455, + "logits/rejected": -2.362210512161255, + "logps/chosen": -91.36421203613281, + "logps/rejected": -93.60712432861328, + "loss": 1.3947, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6875498294830322, + "rewards/margins": 0.17984771728515625, + "rewards/rejected": -1.867397665977478, + "step": 3199 + }, + { + "epoch": 0.37, + "learning_rate": 1.9118932325498996e-07, + "logits/chosen": -2.5542519092559814, + "logits/rejected": -2.444429874420166, + "logps/chosen": -184.71420288085938, + "logps/rejected": -225.59666442871094, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.667992353439331, + "rewards/margins": 2.259141445159912, + "rewards/rejected": -2.927133560180664, + "step": 3200 + }, + { + "epoch": 0.37, + "learning_rate": 1.9115389157907169e-07, + "logits/chosen": -2.5047459602355957, + "logits/rejected": -2.534607410430908, + "logps/chosen": -506.90509033203125, + "logps/rejected": -404.16741943359375, + "loss": 0.1579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10554055869579315, + "rewards/margins": 3.207212448120117, + "rewards/rejected": -3.312753200531006, + "step": 3201 + }, + { + "epoch": 0.37, + "learning_rate": 1.911184599031534e-07, + "logits/chosen": -1.6969332695007324, + "logits/rejected": -1.32732355594635, + "logps/chosen": -366.0934143066406, + "logps/rejected": -515.6815185546875, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1854037195444107, + "rewards/margins": 2.7835452556610107, + "rewards/rejected": -2.9689488410949707, + "step": 3202 + }, + { + "epoch": 0.37, + "learning_rate": 1.9108302822723513e-07, + "logits/chosen": -2.7645230293273926, + "logits/rejected": -2.7652127742767334, + "logps/chosen": -273.9284362792969, + "logps/rejected": -238.3545684814453, + "loss": 0.2577, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47424209117889404, + "rewards/margins": 3.1590867042541504, + "rewards/rejected": -3.633328676223755, + "step": 3203 + }, + { + "epoch": 0.37, + "learning_rate": 1.9104759655131685e-07, + "logits/chosen": -2.684561252593994, + "logits/rejected": -2.6680617332458496, + "logps/chosen": -172.08779907226562, + "logps/rejected": -259.9454345703125, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4344140887260437, + "rewards/margins": 3.1675710678100586, + "rewards/rejected": -3.601985454559326, + "step": 3204 + }, + { + "epoch": 0.37, + "learning_rate": 1.9101216487539857e-07, + "logits/chosen": -2.579371929168701, + "logits/rejected": -2.4444425106048584, + "logps/chosen": -247.86573791503906, + "logps/rejected": -755.3213500976562, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3130439519882202, + "rewards/margins": 4.38079309463501, + "rewards/rejected": -5.6938371658325195, + "step": 3205 + }, + { + "epoch": 0.37, + "learning_rate": 1.9097673319948032e-07, + "logits/chosen": -2.394746780395508, + "logits/rejected": -2.4037437438964844, + "logps/chosen": -390.1556091308594, + "logps/rejected": -254.78860473632812, + "loss": 0.5292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46477609872817993, + "rewards/margins": 1.7637434005737305, + "rewards/rejected": -2.2285194396972656, + "step": 3206 + }, + { + "epoch": 0.37, + "learning_rate": 1.9094130152356207e-07, + "logits/chosen": -2.0833282470703125, + "logits/rejected": -1.9953733682632446, + "logps/chosen": -252.40838623046875, + "logps/rejected": -304.6759338378906, + "loss": 1.0719, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.583401083946228, + "rewards/margins": 2.3307087421417236, + "rewards/rejected": -3.9141101837158203, + "step": 3207 + }, + { + "epoch": 0.37, + "learning_rate": 1.909058698476438e-07, + "logits/chosen": -2.0746684074401855, + "logits/rejected": -2.3198964595794678, + "logps/chosen": -402.735595703125, + "logps/rejected": -260.41046142578125, + "loss": 0.3226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5031176209449768, + "rewards/margins": 1.4772083759307861, + "rewards/rejected": -1.9803260564804077, + "step": 3208 + }, + { + "epoch": 0.37, + "learning_rate": 1.9087043817172552e-07, + "logits/chosen": -2.644043445587158, + "logits/rejected": -2.176420211791992, + "logps/chosen": -143.150146484375, + "logps/rejected": -329.0374755859375, + "loss": 0.6718, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4195330142974854, + "rewards/margins": 1.9932042360305786, + "rewards/rejected": -3.4127373695373535, + "step": 3209 + }, + { + "epoch": 0.37, + "learning_rate": 1.9083500649580724e-07, + "logits/chosen": -1.914677619934082, + "logits/rejected": -2.0118980407714844, + "logps/chosen": -233.90213012695312, + "logps/rejected": -245.00588989257812, + "loss": 0.6469, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3744927644729614, + "rewards/margins": 1.8965227603912354, + "rewards/rejected": -3.2710154056549072, + "step": 3210 + }, + { + "epoch": 0.37, + "learning_rate": 1.9079957481988899e-07, + "logits/chosen": -2.3674724102020264, + "logits/rejected": -2.420836925506592, + "logps/chosen": -389.5140380859375, + "logps/rejected": -313.31268310546875, + "loss": 0.2617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.637345552444458, + "rewards/margins": 1.5520628690719604, + "rewards/rejected": -2.189408302307129, + "step": 3211 + }, + { + "epoch": 0.37, + "learning_rate": 1.907641431439707e-07, + "logits/chosen": -2.3276522159576416, + "logits/rejected": -2.312445640563965, + "logps/chosen": -204.3212890625, + "logps/rejected": -280.382568359375, + "loss": 0.4871, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8925344944000244, + "rewards/margins": 1.6479724645614624, + "rewards/rejected": -2.5405070781707764, + "step": 3212 + }, + { + "epoch": 0.37, + "learning_rate": 1.9072871146805243e-07, + "logits/chosen": -2.2591464519500732, + "logits/rejected": -2.060910224914551, + "logps/chosen": -264.9720458984375, + "logps/rejected": -304.6864929199219, + "loss": 0.3779, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0994749069213867, + "rewards/margins": 2.774629831314087, + "rewards/rejected": -4.8741044998168945, + "step": 3213 + }, + { + "epoch": 0.37, + "learning_rate": 1.9069327979213415e-07, + "logits/chosen": -2.688176155090332, + "logits/rejected": -2.602663278579712, + "logps/chosen": -214.14093017578125, + "logps/rejected": -239.98794555664062, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2367576360702515, + "rewards/margins": 2.65828800201416, + "rewards/rejected": -3.895045518875122, + "step": 3214 + }, + { + "epoch": 0.37, + "learning_rate": 1.9065784811621588e-07, + "logits/chosen": -2.7085845470428467, + "logits/rejected": -2.707627296447754, + "logps/chosen": -359.3642578125, + "logps/rejected": -258.1665344238281, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21059304475784302, + "rewards/margins": 3.703244209289551, + "rewards/rejected": -3.91383695602417, + "step": 3215 + }, + { + "epoch": 0.37, + "learning_rate": 1.906224164402976e-07, + "logits/chosen": -2.687804937362671, + "logits/rejected": -2.605192184448242, + "logps/chosen": -221.1123046875, + "logps/rejected": -235.78416442871094, + "loss": 0.5507, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4222317636013031, + "rewards/margins": 1.3833495378494263, + "rewards/rejected": -1.8055810928344727, + "step": 3216 + }, + { + "epoch": 0.37, + "learning_rate": 1.9058698476437932e-07, + "logits/chosen": -2.241328477859497, + "logits/rejected": -2.6773998737335205, + "logps/chosen": -314.7541809082031, + "logps/rejected": -300.5948791503906, + "loss": 0.5495, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47973719239234924, + "rewards/margins": 1.2005685567855835, + "rewards/rejected": -1.6803057193756104, + "step": 3217 + }, + { + "epoch": 0.37, + "learning_rate": 1.905515530884611e-07, + "logits/chosen": -2.600045919418335, + "logits/rejected": -2.5837719440460205, + "logps/chosen": -174.25775146484375, + "logps/rejected": -255.02529907226562, + "loss": 0.755, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2335269451141357, + "rewards/margins": 0.5501248836517334, + "rewards/rejected": -1.7836518287658691, + "step": 3218 + }, + { + "epoch": 0.37, + "learning_rate": 1.9051612141254282e-07, + "logits/chosen": -2.5625696182250977, + "logits/rejected": -2.2703685760498047, + "logps/chosen": -163.1625518798828, + "logps/rejected": -264.955322265625, + "loss": 0.175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49817001819610596, + "rewards/margins": 3.069423198699951, + "rewards/rejected": -3.5675930976867676, + "step": 3219 + }, + { + "epoch": 0.37, + "learning_rate": 1.9048068973662454e-07, + "logits/chosen": -1.7626054286956787, + "logits/rejected": -1.8993251323699951, + "logps/chosen": -294.11614990234375, + "logps/rejected": -316.83099365234375, + "loss": 0.3901, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6766389012336731, + "rewards/margins": 0.9292320013046265, + "rewards/rejected": -1.6058709621429443, + "step": 3220 + }, + { + "epoch": 0.37, + "learning_rate": 1.9044525806070626e-07, + "logits/chosen": -2.450798273086548, + "logits/rejected": -2.672447443008423, + "logps/chosen": -289.308349609375, + "logps/rejected": -204.59765625, + "loss": 0.3008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5295712351799011, + "rewards/margins": 1.7788383960723877, + "rewards/rejected": -2.3084096908569336, + "step": 3221 + }, + { + "epoch": 0.37, + "learning_rate": 1.90409826384788e-07, + "logits/chosen": -2.841648817062378, + "logits/rejected": -2.8426687717437744, + "logps/chosen": -499.3968200683594, + "logps/rejected": -519.694580078125, + "loss": 0.2382, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.220902681350708, + "rewards/margins": 2.7731587886810303, + "rewards/rejected": -3.9940614700317383, + "step": 3222 + }, + { + "epoch": 0.37, + "learning_rate": 1.9037439470886973e-07, + "logits/chosen": -2.506439208984375, + "logits/rejected": -2.4853339195251465, + "logps/chosen": -231.01251220703125, + "logps/rejected": -169.3791961669922, + "loss": 0.3347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6539261341094971, + "rewards/margins": 1.743590235710144, + "rewards/rejected": -2.3975162506103516, + "step": 3223 + }, + { + "epoch": 0.38, + "learning_rate": 1.9033896303295145e-07, + "logits/chosen": -2.6239452362060547, + "logits/rejected": -2.7804253101348877, + "logps/chosen": -273.1101989746094, + "logps/rejected": -157.99765014648438, + "loss": 0.4335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2694106698036194, + "rewards/margins": 2.116637706756592, + "rewards/rejected": -2.3860483169555664, + "step": 3224 + }, + { + "epoch": 0.38, + "learning_rate": 1.9030353135703318e-07, + "logits/chosen": -2.3311619758605957, + "logits/rejected": -2.4225945472717285, + "logps/chosen": -78.176025390625, + "logps/rejected": -115.0626220703125, + "loss": 0.3385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29209184646606445, + "rewards/margins": 1.7206566333770752, + "rewards/rejected": -2.0127484798431396, + "step": 3225 + }, + { + "epoch": 0.38, + "learning_rate": 1.902680996811149e-07, + "logits/chosen": -2.4940054416656494, + "logits/rejected": -2.4528303146362305, + "logps/chosen": -265.793212890625, + "logps/rejected": -231.36289978027344, + "loss": 0.3222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.062372103333473206, + "rewards/margins": 2.670358419418335, + "rewards/rejected": -2.7327306270599365, + "step": 3226 + }, + { + "epoch": 0.38, + "learning_rate": 1.9023266800519662e-07, + "logits/chosen": -1.8420449495315552, + "logits/rejected": -2.151578903198242, + "logps/chosen": -529.20361328125, + "logps/rejected": -408.520751953125, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2404475212097168, + "rewards/margins": 2.4566221237182617, + "rewards/rejected": -3.6970691680908203, + "step": 3227 + }, + { + "epoch": 0.38, + "learning_rate": 1.9019723632927834e-07, + "logits/chosen": -2.2002663612365723, + "logits/rejected": -2.0867955684661865, + "logps/chosen": -154.4943084716797, + "logps/rejected": -182.96224975585938, + "loss": 0.1634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9413011074066162, + "rewards/margins": 2.0985305309295654, + "rewards/rejected": -3.0398316383361816, + "step": 3228 + }, + { + "epoch": 0.38, + "learning_rate": 1.901618046533601e-07, + "logits/chosen": -1.637128233909607, + "logits/rejected": -1.6729846000671387, + "logps/chosen": -326.263427734375, + "logps/rejected": -385.0272216796875, + "loss": 0.3685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7870007157325745, + "rewards/margins": 1.9854624271392822, + "rewards/rejected": -2.772463083267212, + "step": 3229 + }, + { + "epoch": 0.38, + "learning_rate": 1.9012637297744184e-07, + "logits/chosen": -2.356210231781006, + "logits/rejected": -2.3210883140563965, + "logps/chosen": -136.46890258789062, + "logps/rejected": -216.78048706054688, + "loss": 0.4094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4427909255027771, + "rewards/margins": 2.1402082443237305, + "rewards/rejected": -2.5829989910125732, + "step": 3230 + }, + { + "epoch": 0.38, + "learning_rate": 1.9009094130152356e-07, + "logits/chosen": -2.2593088150024414, + "logits/rejected": -1.9057867527008057, + "logps/chosen": -258.94329833984375, + "logps/rejected": -406.21307373046875, + "loss": 0.1049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.540289044380188, + "rewards/margins": 4.383500576019287, + "rewards/rejected": -4.9237895011901855, + "step": 3231 + }, + { + "epoch": 0.38, + "learning_rate": 1.9005550962560528e-07, + "logits/chosen": -1.9971282482147217, + "logits/rejected": -1.6898767948150635, + "logps/chosen": -306.839599609375, + "logps/rejected": -442.9520568847656, + "loss": 0.5708, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1713738441467285, + "rewards/margins": 1.1274758577346802, + "rewards/rejected": -2.2988498210906982, + "step": 3232 + }, + { + "epoch": 0.38, + "learning_rate": 1.90020077949687e-07, + "logits/chosen": -2.4572126865386963, + "logits/rejected": -2.5742881298065186, + "logps/chosen": -206.15101623535156, + "logps/rejected": -256.31146240234375, + "loss": 0.3748, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4443996846675873, + "rewards/margins": 2.25850248336792, + "rewards/rejected": -2.70290207862854, + "step": 3233 + }, + { + "epoch": 0.38, + "learning_rate": 1.8998464627376875e-07, + "logits/chosen": -2.2815403938293457, + "logits/rejected": -2.492597818374634, + "logps/chosen": -245.51341247558594, + "logps/rejected": -259.1561279296875, + "loss": 0.5709, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1496930122375488, + "rewards/margins": 1.920180082321167, + "rewards/rejected": -3.069873332977295, + "step": 3234 + }, + { + "epoch": 0.38, + "learning_rate": 1.8994921459785048e-07, + "logits/chosen": -2.298266887664795, + "logits/rejected": -2.4823861122131348, + "logps/chosen": -277.4986877441406, + "logps/rejected": -293.2684020996094, + "loss": 0.2074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3420056402683258, + "rewards/margins": 1.8934776782989502, + "rewards/rejected": -2.235483407974243, + "step": 3235 + }, + { + "epoch": 0.38, + "learning_rate": 1.899137829219322e-07, + "logits/chosen": -2.55987811088562, + "logits/rejected": -2.3394289016723633, + "logps/chosen": -212.00074768066406, + "logps/rejected": -288.7040100097656, + "loss": 0.1006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3443097174167633, + "rewards/margins": 3.84413480758667, + "rewards/rejected": -4.1884446144104, + "step": 3236 + }, + { + "epoch": 0.38, + "learning_rate": 1.8987835124601392e-07, + "logits/chosen": -2.2606234550476074, + "logits/rejected": -2.6480982303619385, + "logps/chosen": -210.07150268554688, + "logps/rejected": -129.44456481933594, + "loss": 0.576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3498830199241638, + "rewards/margins": 2.073061943054199, + "rewards/rejected": -2.422945261001587, + "step": 3237 + }, + { + "epoch": 0.38, + "learning_rate": 1.8984291957009564e-07, + "logits/chosen": -2.804220199584961, + "logits/rejected": -2.641544818878174, + "logps/chosen": -287.5760498046875, + "logps/rejected": -255.9549560546875, + "loss": 0.1771, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42140915989875793, + "rewards/margins": 2.1233999729156494, + "rewards/rejected": -2.544809341430664, + "step": 3238 + }, + { + "epoch": 0.38, + "learning_rate": 1.8980748789417737e-07, + "logits/chosen": -2.1498894691467285, + "logits/rejected": -2.084552049636841, + "logps/chosen": -245.97970581054688, + "logps/rejected": -197.21173095703125, + "loss": 0.7039, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9468976259231567, + "rewards/margins": 0.6123813390731812, + "rewards/rejected": -1.5592788457870483, + "step": 3239 + }, + { + "epoch": 0.38, + "learning_rate": 1.8977205621825911e-07, + "logits/chosen": -2.7211756706237793, + "logits/rejected": -2.6205661296844482, + "logps/chosen": -148.29661560058594, + "logps/rejected": -164.67198181152344, + "loss": 0.4059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8614287972450256, + "rewards/margins": 1.9615739583969116, + "rewards/rejected": -2.823002815246582, + "step": 3240 + }, + { + "epoch": 0.38, + "learning_rate": 1.8973662454234084e-07, + "logits/chosen": -2.555750846862793, + "logits/rejected": -2.4533040523529053, + "logps/chosen": -278.0552978515625, + "logps/rejected": -296.5425720214844, + "loss": 0.835, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4452159404754639, + "rewards/margins": 1.44575035572052, + "rewards/rejected": -2.8909664154052734, + "step": 3241 + }, + { + "epoch": 0.38, + "learning_rate": 1.8970119286642258e-07, + "logits/chosen": -2.185974597930908, + "logits/rejected": -2.089214324951172, + "logps/chosen": -359.0887756347656, + "logps/rejected": -382.88623046875, + "loss": 0.5292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7457459568977356, + "rewards/margins": 1.3702107667922974, + "rewards/rejected": -2.1159567832946777, + "step": 3242 + }, + { + "epoch": 0.38, + "learning_rate": 1.896657611905043e-07, + "logits/chosen": -2.3580374717712402, + "logits/rejected": -2.51841139793396, + "logps/chosen": -391.99407958984375, + "logps/rejected": -149.8769989013672, + "loss": 0.9725, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3286888599395752, + "rewards/margins": 0.42823508381843567, + "rewards/rejected": -1.756924033164978, + "step": 3243 + }, + { + "epoch": 0.38, + "learning_rate": 1.8963032951458603e-07, + "logits/chosen": -2.07189679145813, + "logits/rejected": -2.239478826522827, + "logps/chosen": -278.524658203125, + "logps/rejected": -297.58721923828125, + "loss": 0.2304, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0030953884124756, + "rewards/margins": 2.5819036960601807, + "rewards/rejected": -3.5849990844726562, + "step": 3244 + }, + { + "epoch": 0.38, + "learning_rate": 1.8959489783866778e-07, + "logits/chosen": -2.0882723331451416, + "logits/rejected": -2.2723264694213867, + "logps/chosen": -509.1351318359375, + "logps/rejected": -338.858642578125, + "loss": 0.1637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43399888277053833, + "rewards/margins": 2.8785440921783447, + "rewards/rejected": -3.3125431537628174, + "step": 3245 + }, + { + "epoch": 0.38, + "learning_rate": 1.895594661627495e-07, + "logits/chosen": -1.9129339456558228, + "logits/rejected": -2.246053695678711, + "logps/chosen": -290.80548095703125, + "logps/rejected": -286.33477783203125, + "loss": 0.3913, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0789754390716553, + "rewards/margins": 2.2127249240875244, + "rewards/rejected": -3.2917003631591797, + "step": 3246 + }, + { + "epoch": 0.38, + "learning_rate": 1.8952403448683122e-07, + "logits/chosen": -2.2596805095672607, + "logits/rejected": -2.291738748550415, + "logps/chosen": -343.4935302734375, + "logps/rejected": -524.1539916992188, + "loss": 0.6708, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.792493999004364, + "rewards/margins": 1.9889382123947144, + "rewards/rejected": -2.7814323902130127, + "step": 3247 + }, + { + "epoch": 0.38, + "learning_rate": 1.8948860281091294e-07, + "logits/chosen": -2.271575927734375, + "logits/rejected": -2.4065840244293213, + "logps/chosen": -327.3310546875, + "logps/rejected": -208.06649780273438, + "loss": 0.631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8533061146736145, + "rewards/margins": 0.3383706212043762, + "rewards/rejected": -1.1916767358779907, + "step": 3248 + }, + { + "epoch": 0.38, + "learning_rate": 1.8945317113499467e-07, + "logits/chosen": -2.5871002674102783, + "logits/rejected": -2.4718213081359863, + "logps/chosen": -264.62835693359375, + "logps/rejected": -275.6920471191406, + "loss": 0.591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6940630674362183, + "rewards/margins": 0.9341267347335815, + "rewards/rejected": -2.6281898021698, + "step": 3249 + }, + { + "epoch": 0.38, + "learning_rate": 1.894177394590764e-07, + "logits/chosen": -2.6770834922790527, + "logits/rejected": -2.6993303298950195, + "logps/chosen": -231.24710083007812, + "logps/rejected": -329.3340759277344, + "loss": 0.3595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2437734454870224, + "rewards/margins": 1.5285239219665527, + "rewards/rejected": -1.7722972631454468, + "step": 3250 + }, + { + "epoch": 0.38, + "learning_rate": 1.8938230778315814e-07, + "logits/chosen": -2.0170881748199463, + "logits/rejected": -2.368675947189331, + "logps/chosen": -218.9362030029297, + "logps/rejected": -138.83453369140625, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6683643460273743, + "rewards/margins": 1.1873081922531128, + "rewards/rejected": -1.8556725978851318, + "step": 3251 + }, + { + "epoch": 0.38, + "learning_rate": 1.8934687610723986e-07, + "logits/chosen": -1.7311886548995972, + "logits/rejected": -1.9576821327209473, + "logps/chosen": -360.599609375, + "logps/rejected": -247.40048217773438, + "loss": 1.2568, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.505693197250366, + "rewards/margins": -0.039053529500961304, + "rewards/rejected": -2.466639518737793, + "step": 3252 + }, + { + "epoch": 0.38, + "learning_rate": 1.8931144443132158e-07, + "logits/chosen": -2.3881936073303223, + "logits/rejected": -2.609907627105713, + "logps/chosen": -402.769775390625, + "logps/rejected": -373.51251220703125, + "loss": 0.5153, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1083673238754272, + "rewards/margins": 1.5676758289337158, + "rewards/rejected": -2.6760432720184326, + "step": 3253 + }, + { + "epoch": 0.38, + "learning_rate": 1.8927601275540333e-07, + "logits/chosen": -2.4687955379486084, + "logits/rejected": -2.5008156299591064, + "logps/chosen": -162.7052459716797, + "logps/rejected": -206.4859619140625, + "loss": 0.2254, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6865992546081543, + "rewards/margins": 1.9023888111114502, + "rewards/rejected": -2.5889883041381836, + "step": 3254 + }, + { + "epoch": 0.38, + "learning_rate": 1.8924058107948505e-07, + "logits/chosen": -2.119002342224121, + "logits/rejected": -2.344454050064087, + "logps/chosen": -370.7475280761719, + "logps/rejected": -223.86312866210938, + "loss": 0.3289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6575223803520203, + "rewards/margins": 1.6050114631652832, + "rewards/rejected": -2.2625339031219482, + "step": 3255 + }, + { + "epoch": 0.38, + "learning_rate": 1.892051494035668e-07, + "logits/chosen": -2.538059711456299, + "logits/rejected": -2.309659004211426, + "logps/chosen": -534.3564453125, + "logps/rejected": -439.9421081542969, + "loss": 0.4138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6898093223571777, + "rewards/margins": 2.3398847579956055, + "rewards/rejected": -3.029693841934204, + "step": 3256 + }, + { + "epoch": 0.38, + "learning_rate": 1.8916971772764852e-07, + "logits/chosen": -2.649397611618042, + "logits/rejected": -2.3945024013519287, + "logps/chosen": -276.1242370605469, + "logps/rejected": -340.9031066894531, + "loss": 0.5471, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0764834880828857, + "rewards/margins": 1.5946435928344727, + "rewards/rejected": -2.6711270809173584, + "step": 3257 + }, + { + "epoch": 0.38, + "learning_rate": 1.8913428605173024e-07, + "logits/chosen": -2.056574583053589, + "logits/rejected": -2.2691707611083984, + "logps/chosen": -459.6457214355469, + "logps/rejected": -466.3200988769531, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4385409355163574, + "rewards/margins": 2.2731130123138428, + "rewards/rejected": -3.711653709411621, + "step": 3258 + }, + { + "epoch": 0.38, + "learning_rate": 1.8909885437581197e-07, + "logits/chosen": -2.0088343620300293, + "logits/rejected": -2.321969985961914, + "logps/chosen": -545.6524047851562, + "logps/rejected": -334.8851318359375, + "loss": 0.431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30487310886383057, + "rewards/margins": 1.9125112295150757, + "rewards/rejected": -2.2173843383789062, + "step": 3259 + }, + { + "epoch": 0.38, + "learning_rate": 1.890634226998937e-07, + "logits/chosen": -1.913032054901123, + "logits/rejected": -1.9833568334579468, + "logps/chosen": -358.41131591796875, + "logps/rejected": -286.3663024902344, + "loss": 0.5419, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.520216464996338, + "rewards/margins": 1.5905377864837646, + "rewards/rejected": -3.1107544898986816, + "step": 3260 + }, + { + "epoch": 0.38, + "learning_rate": 1.890279910239754e-07, + "logits/chosen": -2.4498484134674072, + "logits/rejected": -2.6825144290924072, + "logps/chosen": -471.0520324707031, + "logps/rejected": -364.9278564453125, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4404642581939697, + "rewards/margins": 1.9710488319396973, + "rewards/rejected": -3.411513090133667, + "step": 3261 + }, + { + "epoch": 0.38, + "learning_rate": 1.8899255934805713e-07, + "logits/chosen": -2.439056396484375, + "logits/rejected": -2.253410816192627, + "logps/chosen": -384.49566650390625, + "logps/rejected": -277.30072021484375, + "loss": 0.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1500232219696045, + "rewards/margins": 1.5387393236160278, + "rewards/rejected": -2.688762664794922, + "step": 3262 + }, + { + "epoch": 0.38, + "learning_rate": 1.8895712767213888e-07, + "logits/chosen": -2.501188278198242, + "logits/rejected": -2.6026458740234375, + "logps/chosen": -256.1303405761719, + "logps/rejected": -280.6025390625, + "loss": 0.215, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.015978217124939, + "rewards/margins": 3.0172438621520996, + "rewards/rejected": -4.033222198486328, + "step": 3263 + }, + { + "epoch": 0.38, + "learning_rate": 1.889216959962206e-07, + "logits/chosen": -2.377229690551758, + "logits/rejected": -2.1464321613311768, + "logps/chosen": -296.6788330078125, + "logps/rejected": -265.0169372558594, + "loss": 0.9112, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1680712699890137, + "rewards/margins": 1.4681212902069092, + "rewards/rejected": -2.636192798614502, + "step": 3264 + }, + { + "epoch": 0.38, + "learning_rate": 1.8888626432030235e-07, + "logits/chosen": -2.1957216262817383, + "logits/rejected": -2.1763265132904053, + "logps/chosen": -209.71295166015625, + "logps/rejected": -273.5195007324219, + "loss": 0.462, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8503550291061401, + "rewards/margins": 1.5570666790008545, + "rewards/rejected": -2.407421588897705, + "step": 3265 + }, + { + "epoch": 0.38, + "learning_rate": 1.8885083264438407e-07, + "logits/chosen": -1.683905005455017, + "logits/rejected": -2.0291144847869873, + "logps/chosen": -347.59271240234375, + "logps/rejected": -278.66033935546875, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7877626419067383, + "rewards/margins": 0.8475530743598938, + "rewards/rejected": -1.6353156566619873, + "step": 3266 + }, + { + "epoch": 0.38, + "learning_rate": 1.8881540096846582e-07, + "logits/chosen": -1.8197193145751953, + "logits/rejected": -2.1332387924194336, + "logps/chosen": -265.1026916503906, + "logps/rejected": -189.19644165039062, + "loss": 1.1772, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0541492700576782, + "rewards/margins": -0.2400013506412506, + "rewards/rejected": -0.81414794921875, + "step": 3267 + }, + { + "epoch": 0.38, + "learning_rate": 1.8877996929254754e-07, + "logits/chosen": -2.113678455352783, + "logits/rejected": -2.051194667816162, + "logps/chosen": -318.1328125, + "logps/rejected": -292.32940673828125, + "loss": 0.4659, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2011120319366455, + "rewards/margins": 0.7260014414787292, + "rewards/rejected": -1.9271135330200195, + "step": 3268 + }, + { + "epoch": 0.38, + "learning_rate": 1.8874453761662927e-07, + "logits/chosen": -2.189897298812866, + "logits/rejected": -2.47031569480896, + "logps/chosen": -364.5947570800781, + "logps/rejected": -344.44622802734375, + "loss": 0.4095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9895294904708862, + "rewards/margins": 1.9544157981872559, + "rewards/rejected": -2.9439454078674316, + "step": 3269 + }, + { + "epoch": 0.38, + "learning_rate": 1.88709105940711e-07, + "logits/chosen": -2.3104217052459717, + "logits/rejected": -2.359304904937744, + "logps/chosen": -331.6579284667969, + "logps/rejected": -277.7344665527344, + "loss": 0.5275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7964348793029785, + "rewards/margins": 1.7797231674194336, + "rewards/rejected": -2.576158285140991, + "step": 3270 + }, + { + "epoch": 0.38, + "learning_rate": 1.886736742647927e-07, + "logits/chosen": -2.1030499935150146, + "logits/rejected": -2.3495609760284424, + "logps/chosen": -482.6959228515625, + "logps/rejected": -287.880126953125, + "loss": 0.8062, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0059123039245605, + "rewards/margins": 2.1069846153259277, + "rewards/rejected": -4.112896919250488, + "step": 3271 + }, + { + "epoch": 0.38, + "learning_rate": 1.8863824258887443e-07, + "logits/chosen": -2.1760880947113037, + "logits/rejected": -2.185364007949829, + "logps/chosen": -374.39385986328125, + "logps/rejected": -390.8062438964844, + "loss": 0.3791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2656015455722809, + "rewards/margins": 2.6022567749023438, + "rewards/rejected": -2.867858409881592, + "step": 3272 + }, + { + "epoch": 0.38, + "learning_rate": 1.8860281091295616e-07, + "logits/chosen": -2.666940450668335, + "logits/rejected": -2.614044666290283, + "logps/chosen": -228.04689025878906, + "logps/rejected": -327.348876953125, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7845311164855957, + "rewards/margins": 3.8655014038085938, + "rewards/rejected": -4.6500325202941895, + "step": 3273 + }, + { + "epoch": 0.38, + "learning_rate": 1.885673792370379e-07, + "logits/chosen": -2.316056966781616, + "logits/rejected": -2.647771120071411, + "logps/chosen": -466.0149230957031, + "logps/rejected": -297.01220703125, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7311259508132935, + "rewards/margins": 2.488952159881592, + "rewards/rejected": -3.2200779914855957, + "step": 3274 + }, + { + "epoch": 0.38, + "learning_rate": 1.8853194756111963e-07, + "logits/chosen": -1.5578582286834717, + "logits/rejected": -1.7535136938095093, + "logps/chosen": -298.4570617675781, + "logps/rejected": -274.2190856933594, + "loss": 0.6043, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8602393865585327, + "rewards/margins": 1.0028367042541504, + "rewards/rejected": -1.863076090812683, + "step": 3275 + }, + { + "epoch": 0.38, + "learning_rate": 1.8849651588520135e-07, + "logits/chosen": -2.7714154720306396, + "logits/rejected": -2.6975979804992676, + "logps/chosen": -99.29049682617188, + "logps/rejected": -199.0389404296875, + "loss": 0.5736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8301012516021729, + "rewards/margins": 1.476645588874817, + "rewards/rejected": -2.3067469596862793, + "step": 3276 + }, + { + "epoch": 0.38, + "learning_rate": 1.884610842092831e-07, + "logits/chosen": -2.0784099102020264, + "logits/rejected": -2.207335948944092, + "logps/chosen": -341.85150146484375, + "logps/rejected": -304.4019775390625, + "loss": 0.5567, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1250044107437134, + "rewards/margins": 1.435920238494873, + "rewards/rejected": -2.560924530029297, + "step": 3277 + }, + { + "epoch": 0.38, + "learning_rate": 1.8842565253336482e-07, + "logits/chosen": -2.7889060974121094, + "logits/rejected": -2.8827128410339355, + "logps/chosen": -222.6475067138672, + "logps/rejected": -238.67617797851562, + "loss": 0.0812, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7017053365707397, + "rewards/margins": 3.163712978363037, + "rewards/rejected": -3.8654184341430664, + "step": 3278 + }, + { + "epoch": 0.38, + "learning_rate": 1.8839022085744657e-07, + "logits/chosen": -2.3735227584838867, + "logits/rejected": -2.4404592514038086, + "logps/chosen": -151.88734436035156, + "logps/rejected": -173.66217041015625, + "loss": 0.1397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38649022579193115, + "rewards/margins": 2.8479325771331787, + "rewards/rejected": -3.2344229221343994, + "step": 3279 + }, + { + "epoch": 0.38, + "learning_rate": 1.883547891815283e-07, + "logits/chosen": -2.619227170944214, + "logits/rejected": -2.6384117603302, + "logps/chosen": -196.28121948242188, + "logps/rejected": -295.298095703125, + "loss": 0.3877, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.427058219909668, + "rewards/margins": 2.388202667236328, + "rewards/rejected": -3.815260887145996, + "step": 3280 + }, + { + "epoch": 0.38, + "learning_rate": 1.8831935750561e-07, + "logits/chosen": -2.4417858123779297, + "logits/rejected": -2.1641342639923096, + "logps/chosen": -320.58404541015625, + "logps/rejected": -301.60162353515625, + "loss": 0.3924, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0585428476333618, + "rewards/margins": 1.3287712335586548, + "rewards/rejected": -2.3873140811920166, + "step": 3281 + }, + { + "epoch": 0.38, + "learning_rate": 1.8828392582969173e-07, + "logits/chosen": -2.4424550533294678, + "logits/rejected": -2.5853452682495117, + "logps/chosen": -364.84979248046875, + "logps/rejected": -251.2431640625, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5532305240631104, + "rewards/margins": 1.5544167757034302, + "rewards/rejected": -2.107647180557251, + "step": 3282 + }, + { + "epoch": 0.38, + "learning_rate": 1.8824849415377346e-07, + "logits/chosen": -2.5711097717285156, + "logits/rejected": -2.64656400680542, + "logps/chosen": -258.8346862792969, + "logps/rejected": -215.57003784179688, + "loss": 0.3759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32285669445991516, + "rewards/margins": 1.3606055974960327, + "rewards/rejected": -1.6834622621536255, + "step": 3283 + }, + { + "epoch": 0.38, + "learning_rate": 1.8821306247785518e-07, + "logits/chosen": -2.6931533813476562, + "logits/rejected": -2.821028709411621, + "logps/chosen": -201.779541015625, + "logps/rejected": -174.5984649658203, + "loss": 0.33, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0129868984222412, + "rewards/margins": 1.2468655109405518, + "rewards/rejected": -2.259852409362793, + "step": 3284 + }, + { + "epoch": 0.38, + "learning_rate": 1.8817763080193693e-07, + "logits/chosen": -2.5199334621429443, + "logits/rejected": -2.24521803855896, + "logps/chosen": -221.0295867919922, + "logps/rejected": -266.2992858886719, + "loss": 0.8316, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0914595127105713, + "rewards/margins": 1.4784820079803467, + "rewards/rejected": -3.569941759109497, + "step": 3285 + }, + { + "epoch": 0.38, + "learning_rate": 1.8814219912601865e-07, + "logits/chosen": -2.4212422370910645, + "logits/rejected": -2.3263494968414307, + "logps/chosen": -317.54193115234375, + "logps/rejected": -367.0589599609375, + "loss": 0.3953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20900674164295197, + "rewards/margins": 2.5605926513671875, + "rewards/rejected": -2.769599437713623, + "step": 3286 + }, + { + "epoch": 0.38, + "learning_rate": 1.8810676745010037e-07, + "logits/chosen": -2.948509931564331, + "logits/rejected": -2.9453582763671875, + "logps/chosen": -162.92503356933594, + "logps/rejected": -175.3579559326172, + "loss": 0.2717, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.030333571135997772, + "rewards/margins": 1.5389050245285034, + "rewards/rejected": -1.569238543510437, + "step": 3287 + }, + { + "epoch": 0.38, + "learning_rate": 1.880713357741821e-07, + "logits/chosen": -2.790242910385132, + "logits/rejected": -2.395946979522705, + "logps/chosen": -258.07012939453125, + "logps/rejected": -408.6767578125, + "loss": 0.4176, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7302748560905457, + "rewards/margins": 2.8038430213928223, + "rewards/rejected": -3.534118175506592, + "step": 3288 + }, + { + "epoch": 0.38, + "learning_rate": 1.8803590409826384e-07, + "logits/chosen": -2.131655693054199, + "logits/rejected": -2.0914385318756104, + "logps/chosen": -338.2203369140625, + "logps/rejected": -334.618408203125, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8387415409088135, + "rewards/margins": 2.3420779705047607, + "rewards/rejected": -3.180819511413574, + "step": 3289 + }, + { + "epoch": 0.38, + "learning_rate": 1.880004724223456e-07, + "logits/chosen": -1.38021981716156, + "logits/rejected": -1.765668272972107, + "logps/chosen": -527.9132080078125, + "logps/rejected": -331.218994140625, + "loss": 0.7581, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265608310699463, + "rewards/margins": 0.8357164859771729, + "rewards/rejected": -2.1013247966766357, + "step": 3290 + }, + { + "epoch": 0.38, + "learning_rate": 1.879650407464273e-07, + "logits/chosen": -1.8684849739074707, + "logits/rejected": -1.780892252922058, + "logps/chosen": -277.1417236328125, + "logps/rejected": -269.4858093261719, + "loss": 0.7831, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2440983057022095, + "rewards/margins": 1.630279541015625, + "rewards/rejected": -2.874377965927124, + "step": 3291 + }, + { + "epoch": 0.38, + "learning_rate": 1.8792960907050903e-07, + "logits/chosen": -2.1279478073120117, + "logits/rejected": -2.0628511905670166, + "logps/chosen": -239.98599243164062, + "logps/rejected": -387.88665771484375, + "loss": 1.0234, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5104138851165771, + "rewards/margins": 0.2929292917251587, + "rewards/rejected": -1.8033432960510254, + "step": 3292 + }, + { + "epoch": 0.38, + "learning_rate": 1.8789417739459076e-07, + "logits/chosen": -2.607560634613037, + "logits/rejected": -2.4234440326690674, + "logps/chosen": -194.35614013671875, + "logps/rejected": -328.53851318359375, + "loss": 0.9115, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8143310546875, + "rewards/margins": 1.6550230979919434, + "rewards/rejected": -3.4693541526794434, + "step": 3293 + }, + { + "epoch": 0.38, + "learning_rate": 1.8785874571867248e-07, + "logits/chosen": -2.297886848449707, + "logits/rejected": -2.2505717277526855, + "logps/chosen": -256.78399658203125, + "logps/rejected": -271.07916259765625, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13164746761322021, + "rewards/margins": 1.5230202674865723, + "rewards/rejected": -1.654667615890503, + "step": 3294 + }, + { + "epoch": 0.38, + "learning_rate": 1.878233140427542e-07, + "logits/chosen": -2.346222400665283, + "logits/rejected": -2.397308111190796, + "logps/chosen": -332.24151611328125, + "logps/rejected": -271.36566162109375, + "loss": 0.6777, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8540563583374023, + "rewards/margins": 1.443103551864624, + "rewards/rejected": -3.2971601486206055, + "step": 3295 + }, + { + "epoch": 0.38, + "learning_rate": 1.8778788236683595e-07, + "logits/chosen": -1.509308934211731, + "logits/rejected": -1.9815223217010498, + "logps/chosen": -465.8705749511719, + "logps/rejected": -283.3115234375, + "loss": 0.3616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9027110934257507, + "rewards/margins": 1.0970818996429443, + "rewards/rejected": -1.9997931718826294, + "step": 3296 + }, + { + "epoch": 0.38, + "learning_rate": 1.8775245069091767e-07, + "logits/chosen": -2.637381076812744, + "logits/rejected": -2.630558490753174, + "logps/chosen": -233.5251922607422, + "logps/rejected": -222.3778076171875, + "loss": 0.3755, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1114683151245117, + "rewards/margins": 2.5344152450561523, + "rewards/rejected": -3.645883560180664, + "step": 3297 + }, + { + "epoch": 0.38, + "learning_rate": 1.877170190149994e-07, + "logits/chosen": -2.675461530685425, + "logits/rejected": -2.8047633171081543, + "logps/chosen": -366.94696044921875, + "logps/rejected": -371.2447509765625, + "loss": 0.3709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.653447151184082, + "rewards/margins": 2.008774518966675, + "rewards/rejected": -2.662221670150757, + "step": 3298 + }, + { + "epoch": 0.38, + "learning_rate": 1.8768158733908112e-07, + "logits/chosen": -2.0736334323883057, + "logits/rejected": -2.4159953594207764, + "logps/chosen": -346.5773620605469, + "logps/rejected": -199.7591552734375, + "loss": 0.4795, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7982228398323059, + "rewards/margins": 1.3728286027908325, + "rewards/rejected": -2.171051502227783, + "step": 3299 + }, + { + "epoch": 0.38, + "learning_rate": 1.8764615566316286e-07, + "logits/chosen": -2.1672611236572266, + "logits/rejected": -2.326845169067383, + "logps/chosen": -254.47702026367188, + "logps/rejected": -196.19400024414062, + "loss": 0.5652, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0665678977966309, + "rewards/margins": 1.664721965789795, + "rewards/rejected": -2.731289863586426, + "step": 3300 + }, + { + "epoch": 0.38, + "learning_rate": 1.876107239872446e-07, + "logits/chosen": -2.2654995918273926, + "logits/rejected": -2.5289788246154785, + "logps/chosen": -446.5240478515625, + "logps/rejected": -277.4591369628906, + "loss": 0.3351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6383839249610901, + "rewards/margins": 1.9719722270965576, + "rewards/rejected": -2.610355854034424, + "step": 3301 + }, + { + "epoch": 0.38, + "learning_rate": 1.8757529231132634e-07, + "logits/chosen": -2.534900188446045, + "logits/rejected": -2.5250163078308105, + "logps/chosen": -384.9508972167969, + "logps/rejected": -206.67236328125, + "loss": 0.4618, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.322751760482788, + "rewards/margins": 1.9341816902160645, + "rewards/rejected": -3.2569332122802734, + "step": 3302 + }, + { + "epoch": 0.38, + "learning_rate": 1.8753986063540806e-07, + "logits/chosen": -2.2728404998779297, + "logits/rejected": -2.2574386596679688, + "logps/chosen": -228.38821411132812, + "logps/rejected": -289.6147155761719, + "loss": 0.3669, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0045157670974731, + "rewards/margins": 2.9450840950012207, + "rewards/rejected": -3.949599504470825, + "step": 3303 + }, + { + "epoch": 0.38, + "learning_rate": 1.8750442895948978e-07, + "logits/chosen": -2.7738311290740967, + "logits/rejected": -2.824504852294922, + "logps/chosen": -341.946533203125, + "logps/rejected": -231.15078735351562, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4048522710800171, + "rewards/margins": 2.0366220474243164, + "rewards/rejected": -2.441474199295044, + "step": 3304 + }, + { + "epoch": 0.38, + "learning_rate": 1.874689972835715e-07, + "logits/chosen": -2.6926231384277344, + "logits/rejected": -2.7381725311279297, + "logps/chosen": -390.7256774902344, + "logps/rejected": -281.3282470703125, + "loss": 0.2631, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5844161510467529, + "rewards/margins": 2.045635223388672, + "rewards/rejected": -2.630051374435425, + "step": 3305 + }, + { + "epoch": 0.38, + "learning_rate": 1.8743356560765322e-07, + "logits/chosen": -2.7192962169647217, + "logits/rejected": -2.581277370452881, + "logps/chosen": -201.28512573242188, + "logps/rejected": -250.05494689941406, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2981903553009033, + "rewards/margins": 2.9865493774414062, + "rewards/rejected": -4.284739971160889, + "step": 3306 + }, + { + "epoch": 0.38, + "learning_rate": 1.8739813393173495e-07, + "logits/chosen": -2.1047887802124023, + "logits/rejected": -2.3058788776397705, + "logps/chosen": -305.2047119140625, + "logps/rejected": -299.7403564453125, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5046166181564331, + "rewards/margins": 3.402752161026001, + "rewards/rejected": -3.9073686599731445, + "step": 3307 + }, + { + "epoch": 0.38, + "learning_rate": 1.873627022558167e-07, + "logits/chosen": -2.3415985107421875, + "logits/rejected": -1.9895336627960205, + "logps/chosen": -161.0898895263672, + "logps/rejected": -352.61944580078125, + "loss": 0.7431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3483014106750488, + "rewards/margins": 1.316733479499817, + "rewards/rejected": -2.665034770965576, + "step": 3308 + }, + { + "epoch": 0.38, + "learning_rate": 1.8732727057989842e-07, + "logits/chosen": -1.78541100025177, + "logits/rejected": -1.8967807292938232, + "logps/chosen": -250.46212768554688, + "logps/rejected": -154.62986755371094, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5219288468360901, + "rewards/margins": 2.0804286003112793, + "rewards/rejected": -2.6023576259613037, + "step": 3309 + }, + { + "epoch": 0.39, + "learning_rate": 1.8729183890398014e-07, + "logits/chosen": -2.682833194732666, + "logits/rejected": -2.683387041091919, + "logps/chosen": -292.8410339355469, + "logps/rejected": -383.4903259277344, + "loss": 0.4671, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.433631181716919, + "rewards/margins": 4.542508125305176, + "rewards/rejected": -5.976139545440674, + "step": 3310 + }, + { + "epoch": 0.39, + "learning_rate": 1.8725640722806186e-07, + "logits/chosen": -1.8614394664764404, + "logits/rejected": -2.1195318698883057, + "logps/chosen": -159.45297241210938, + "logps/rejected": -161.68798828125, + "loss": 0.6485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5986157059669495, + "rewards/margins": 0.7332737445831299, + "rewards/rejected": -1.3318895101547241, + "step": 3311 + }, + { + "epoch": 0.39, + "learning_rate": 1.8722097555214364e-07, + "logits/chosen": -2.314818859100342, + "logits/rejected": -2.3607988357543945, + "logps/chosen": -303.1656494140625, + "logps/rejected": -217.95291137695312, + "loss": 0.389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.169060468673706, + "rewards/margins": 1.576852798461914, + "rewards/rejected": -2.745913028717041, + "step": 3312 + }, + { + "epoch": 0.39, + "learning_rate": 1.8718554387622536e-07, + "logits/chosen": -2.5871691703796387, + "logits/rejected": -2.842733383178711, + "logps/chosen": -356.845458984375, + "logps/rejected": -220.0530242919922, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04655265063047409, + "rewards/margins": 1.8413044214248657, + "rewards/rejected": -1.8878570795059204, + "step": 3313 + }, + { + "epoch": 0.39, + "learning_rate": 1.8715011220030708e-07, + "logits/chosen": -2.221605062484741, + "logits/rejected": -2.3359503746032715, + "logps/chosen": -200.07846069335938, + "logps/rejected": -165.93438720703125, + "loss": 0.32, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7202491760253906, + "rewards/margins": 1.2252986431121826, + "rewards/rejected": -1.9455479383468628, + "step": 3314 + }, + { + "epoch": 0.39, + "learning_rate": 1.871146805243888e-07, + "logits/chosen": -2.6385338306427, + "logits/rejected": -2.315721035003662, + "logps/chosen": -104.0951919555664, + "logps/rejected": -194.08631896972656, + "loss": 0.4855, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4786313772201538, + "rewards/margins": 1.1688787937164307, + "rewards/rejected": -1.6475101709365845, + "step": 3315 + }, + { + "epoch": 0.39, + "learning_rate": 1.8707924884847052e-07, + "logits/chosen": -2.3582026958465576, + "logits/rejected": -1.9953635931015015, + "logps/chosen": -276.3548583984375, + "logps/rejected": -320.47955322265625, + "loss": 0.2439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21279442310333252, + "rewards/margins": 1.643676519393921, + "rewards/rejected": -1.8564708232879639, + "step": 3316 + }, + { + "epoch": 0.39, + "learning_rate": 1.8704381717255225e-07, + "logits/chosen": -2.866072177886963, + "logits/rejected": -2.8295044898986816, + "logps/chosen": -291.04632568359375, + "logps/rejected": -254.11065673828125, + "loss": 0.191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6383817791938782, + "rewards/margins": 2.701434373855591, + "rewards/rejected": -3.339816093444824, + "step": 3317 + }, + { + "epoch": 0.39, + "learning_rate": 1.8700838549663397e-07, + "logits/chosen": -1.829728364944458, + "logits/rejected": -2.4846200942993164, + "logps/chosen": -409.5906982421875, + "logps/rejected": -211.8067626953125, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7719281911849976, + "rewards/margins": 2.0962283611297607, + "rewards/rejected": -2.868156671524048, + "step": 3318 + }, + { + "epoch": 0.39, + "learning_rate": 1.8697295382071572e-07, + "logits/chosen": -2.8726956844329834, + "logits/rejected": -2.3012900352478027, + "logps/chosen": -184.71572875976562, + "logps/rejected": -337.9961853027344, + "loss": 0.1151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8557183146476746, + "rewards/margins": 4.99931526184082, + "rewards/rejected": -5.855033874511719, + "step": 3319 + }, + { + "epoch": 0.39, + "learning_rate": 1.8693752214479744e-07, + "logits/chosen": -1.928252935409546, + "logits/rejected": -2.077303886413574, + "logps/chosen": -354.3191223144531, + "logps/rejected": -289.010986328125, + "loss": 0.3326, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1839746236801147, + "rewards/margins": 2.1675992012023926, + "rewards/rejected": -3.3515734672546387, + "step": 3320 + }, + { + "epoch": 0.39, + "learning_rate": 1.8690209046887916e-07, + "logits/chosen": -2.381877899169922, + "logits/rejected": -2.5532679557800293, + "logps/chosen": -356.2655944824219, + "logps/rejected": -237.33538818359375, + "loss": 0.2109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0129700899124146, + "rewards/margins": 1.920851469039917, + "rewards/rejected": -2.933821439743042, + "step": 3321 + }, + { + "epoch": 0.39, + "learning_rate": 1.8686665879296088e-07, + "logits/chosen": -2.1813547611236572, + "logits/rejected": -2.044093608856201, + "logps/chosen": -213.29417419433594, + "logps/rejected": -284.45184326171875, + "loss": 0.2663, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.775046706199646, + "rewards/margins": 4.860607147216797, + "rewards/rejected": -5.635654449462891, + "step": 3322 + }, + { + "epoch": 0.39, + "learning_rate": 1.868312271170426e-07, + "logits/chosen": -2.2203164100646973, + "logits/rejected": -2.448683977127075, + "logps/chosen": -390.7969665527344, + "logps/rejected": -443.055419921875, + "loss": 0.1291, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0547288656234741, + "rewards/margins": 2.9086837768554688, + "rewards/rejected": -3.9634127616882324, + "step": 3323 + }, + { + "epoch": 0.39, + "learning_rate": 1.8679579544112438e-07, + "logits/chosen": -2.5076441764831543, + "logits/rejected": -2.720520496368408, + "logps/chosen": -439.2734680175781, + "logps/rejected": -361.6002197265625, + "loss": 0.452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31858623027801514, + "rewards/margins": 1.3467223644256592, + "rewards/rejected": -1.6653087139129639, + "step": 3324 + }, + { + "epoch": 0.39, + "learning_rate": 1.867603637652061e-07, + "logits/chosen": -1.9937670230865479, + "logits/rejected": -1.9443154335021973, + "logps/chosen": -344.8470458984375, + "logps/rejected": -230.26991271972656, + "loss": 0.3954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.81369549036026, + "rewards/margins": 1.363511085510254, + "rewards/rejected": -2.177206516265869, + "step": 3325 + }, + { + "epoch": 0.39, + "learning_rate": 1.8672493208928783e-07, + "logits/chosen": -2.5026955604553223, + "logits/rejected": -2.5758228302001953, + "logps/chosen": -258.6585693359375, + "logps/rejected": -217.68820190429688, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6799204349517822, + "rewards/margins": 0.4473345875740051, + "rewards/rejected": -2.1272552013397217, + "step": 3326 + }, + { + "epoch": 0.39, + "learning_rate": 1.8668950041336955e-07, + "logits/chosen": -2.3101446628570557, + "logits/rejected": -2.4943928718566895, + "logps/chosen": -245.93479919433594, + "logps/rejected": -253.57289123535156, + "loss": 0.5011, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1684565544128418, + "rewards/margins": 1.916216492652893, + "rewards/rejected": -3.0846729278564453, + "step": 3327 + }, + { + "epoch": 0.39, + "learning_rate": 1.8665406873745127e-07, + "logits/chosen": -2.1146552562713623, + "logits/rejected": -1.8615665435791016, + "logps/chosen": -343.9508056640625, + "logps/rejected": -263.4842834472656, + "loss": 0.2565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9381371140480042, + "rewards/margins": 2.7295093536376953, + "rewards/rejected": -3.6676464080810547, + "step": 3328 + }, + { + "epoch": 0.39, + "learning_rate": 1.86618637061533e-07, + "logits/chosen": -2.3748559951782227, + "logits/rejected": -2.1856462955474854, + "logps/chosen": -392.9393310546875, + "logps/rejected": -284.84002685546875, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1904180645942688, + "rewards/margins": 1.7698698043823242, + "rewards/rejected": -1.9602878093719482, + "step": 3329 + }, + { + "epoch": 0.39, + "learning_rate": 1.8658320538561474e-07, + "logits/chosen": -2.0648443698883057, + "logits/rejected": -2.3510148525238037, + "logps/chosen": -437.35052490234375, + "logps/rejected": -214.12652587890625, + "loss": 0.2125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6996749639511108, + "rewards/margins": 2.5047664642333984, + "rewards/rejected": -3.2044413089752197, + "step": 3330 + }, + { + "epoch": 0.39, + "learning_rate": 1.8654777370969646e-07, + "logits/chosen": -2.55344820022583, + "logits/rejected": -2.7280197143554688, + "logps/chosen": -246.74685668945312, + "logps/rejected": -162.6654510498047, + "loss": 0.279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.968451976776123, + "rewards/margins": 1.7407293319702148, + "rewards/rejected": -2.709181308746338, + "step": 3331 + }, + { + "epoch": 0.39, + "learning_rate": 1.8651234203377818e-07, + "logits/chosen": -2.50717830657959, + "logits/rejected": -2.4567785263061523, + "logps/chosen": -153.53237915039062, + "logps/rejected": -120.8471908569336, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0896887332201004, + "rewards/margins": 1.7906875610351562, + "rewards/rejected": -1.7009987831115723, + "step": 3332 + }, + { + "epoch": 0.39, + "learning_rate": 1.864769103578599e-07, + "logits/chosen": -2.581838607788086, + "logits/rejected": -2.47988224029541, + "logps/chosen": -183.25698852539062, + "logps/rejected": -304.2426452636719, + "loss": 0.2916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9398155212402344, + "rewards/margins": 2.5162930488586426, + "rewards/rejected": -3.456108808517456, + "step": 3333 + }, + { + "epoch": 0.39, + "learning_rate": 1.8644147868194163e-07, + "logits/chosen": -2.358576774597168, + "logits/rejected": -2.3485751152038574, + "logps/chosen": -123.43535614013672, + "logps/rejected": -158.7224884033203, + "loss": 0.4572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5003291368484497, + "rewards/margins": 1.1466755867004395, + "rewards/rejected": -1.6470046043395996, + "step": 3334 + }, + { + "epoch": 0.39, + "learning_rate": 1.864060470060234e-07, + "logits/chosen": -1.9941812753677368, + "logits/rejected": -2.103116035461426, + "logps/chosen": -261.99432373046875, + "logps/rejected": -258.77264404296875, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5648093223571777, + "rewards/margins": 1.7207016944885254, + "rewards/rejected": -2.285511016845703, + "step": 3335 + }, + { + "epoch": 0.39, + "learning_rate": 1.8637061533010513e-07, + "logits/chosen": -2.4754061698913574, + "logits/rejected": -2.5634381771087646, + "logps/chosen": -177.4255828857422, + "logps/rejected": -200.84011840820312, + "loss": 0.5608, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8623149394989014, + "rewards/margins": 1.6991163492202759, + "rewards/rejected": -2.5614311695098877, + "step": 3336 + }, + { + "epoch": 0.39, + "learning_rate": 1.8633518365418685e-07, + "logits/chosen": -1.8519277572631836, + "logits/rejected": -2.1480677127838135, + "logps/chosen": -413.74554443359375, + "logps/rejected": -283.88446044921875, + "loss": 0.3933, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4207637310028076, + "rewards/margins": 1.6770353317260742, + "rewards/rejected": -3.097799062728882, + "step": 3337 + }, + { + "epoch": 0.39, + "learning_rate": 1.8629975197826857e-07, + "logits/chosen": -2.4018783569335938, + "logits/rejected": -2.290470838546753, + "logps/chosen": -95.01655578613281, + "logps/rejected": -212.64405822753906, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5135877728462219, + "rewards/margins": 3.3934874534606934, + "rewards/rejected": -3.9070754051208496, + "step": 3338 + }, + { + "epoch": 0.39, + "learning_rate": 1.862643203023503e-07, + "logits/chosen": -2.568800210952759, + "logits/rejected": -2.854388952255249, + "logps/chosen": -224.6456756591797, + "logps/rejected": -269.7056579589844, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.078813076019287, + "rewards/margins": 1.965907096862793, + "rewards/rejected": -3.04472017288208, + "step": 3339 + }, + { + "epoch": 0.39, + "learning_rate": 1.8622888862643201e-07, + "logits/chosen": -2.9694817066192627, + "logits/rejected": -2.994757652282715, + "logps/chosen": -344.67596435546875, + "logps/rejected": -305.9635314941406, + "loss": 0.67, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1518006324768066, + "rewards/margins": 1.0053941011428833, + "rewards/rejected": -2.1571946144104004, + "step": 3340 + }, + { + "epoch": 0.39, + "learning_rate": 1.8619345695051376e-07, + "logits/chosen": -2.6398398876190186, + "logits/rejected": -2.655496120452881, + "logps/chosen": -353.6522521972656, + "logps/rejected": -320.16552734375, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7402046918869019, + "rewards/margins": 3.9241464138031006, + "rewards/rejected": -4.664350986480713, + "step": 3341 + }, + { + "epoch": 0.39, + "learning_rate": 1.8615802527459548e-07, + "logits/chosen": -2.510622501373291, + "logits/rejected": -2.5672836303710938, + "logps/chosen": -251.099609375, + "logps/rejected": -227.36441040039062, + "loss": 0.2572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9761390686035156, + "rewards/margins": 2.103545665740967, + "rewards/rejected": -3.0796849727630615, + "step": 3342 + }, + { + "epoch": 0.39, + "learning_rate": 1.861225935986772e-07, + "logits/chosen": -2.2911269664764404, + "logits/rejected": -2.1103334426879883, + "logps/chosen": -291.43133544921875, + "logps/rejected": -416.4708557128906, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5300992727279663, + "rewards/margins": 2.452591896057129, + "rewards/rejected": -2.9826908111572266, + "step": 3343 + }, + { + "epoch": 0.39, + "learning_rate": 1.8608716192275893e-07, + "logits/chosen": -2.241710662841797, + "logits/rejected": -2.292165517807007, + "logps/chosen": -233.99810791015625, + "logps/rejected": -321.11309814453125, + "loss": 0.1604, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0217769145965576, + "rewards/margins": 2.6047046184539795, + "rewards/rejected": -3.626481533050537, + "step": 3344 + }, + { + "epoch": 0.39, + "learning_rate": 1.8605173024684065e-07, + "logits/chosen": -2.350900650024414, + "logits/rejected": -2.3255608081817627, + "logps/chosen": -221.67034912109375, + "logps/rejected": -238.7288360595703, + "loss": 0.2186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5745895504951477, + "rewards/margins": 1.7933828830718994, + "rewards/rejected": -2.3679726123809814, + "step": 3345 + }, + { + "epoch": 0.39, + "learning_rate": 1.8601629857092237e-07, + "logits/chosen": -2.5243687629699707, + "logits/rejected": -2.4382429122924805, + "logps/chosen": -283.927490234375, + "logps/rejected": -219.6595001220703, + "loss": 0.3144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6615235805511475, + "rewards/margins": 1.6599574089050293, + "rewards/rejected": -2.321481227874756, + "step": 3346 + }, + { + "epoch": 0.39, + "learning_rate": 1.8598086689500415e-07, + "logits/chosen": -1.4668315649032593, + "logits/rejected": -1.9588223695755005, + "logps/chosen": -418.39691162109375, + "logps/rejected": -253.52468872070312, + "loss": 0.1594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04646792262792587, + "rewards/margins": 2.7656056880950928, + "rewards/rejected": -2.8120737075805664, + "step": 3347 + }, + { + "epoch": 0.39, + "learning_rate": 1.8594543521908587e-07, + "logits/chosen": -2.0182158946990967, + "logits/rejected": -2.154639720916748, + "logps/chosen": -308.4435729980469, + "logps/rejected": -269.21868896484375, + "loss": 0.3398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5667620301246643, + "rewards/margins": 1.583616852760315, + "rewards/rejected": -2.150378704071045, + "step": 3348 + }, + { + "epoch": 0.39, + "learning_rate": 1.859100035431676e-07, + "logits/chosen": -2.190103769302368, + "logits/rejected": -2.234595775604248, + "logps/chosen": -274.7421875, + "logps/rejected": -274.5881042480469, + "loss": 0.1005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4870738983154297, + "rewards/margins": 3.19370698928833, + "rewards/rejected": -3.6807806491851807, + "step": 3349 + }, + { + "epoch": 0.39, + "learning_rate": 1.8587457186724931e-07, + "logits/chosen": -2.8689074516296387, + "logits/rejected": -2.886915445327759, + "logps/chosen": -180.06417846679688, + "logps/rejected": -134.15542602539062, + "loss": 0.41, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3957795798778534, + "rewards/margins": 0.9580878019332886, + "rewards/rejected": -1.3538674116134644, + "step": 3350 + }, + { + "epoch": 0.39, + "learning_rate": 1.8583914019133104e-07, + "logits/chosen": -2.2750868797302246, + "logits/rejected": -2.3264975547790527, + "logps/chosen": -108.59992980957031, + "logps/rejected": -254.7178955078125, + "loss": 0.3528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6613905429840088, + "rewards/margins": 4.389074802398682, + "rewards/rejected": -5.0504655838012695, + "step": 3351 + }, + { + "epoch": 0.39, + "learning_rate": 1.8580370851541276e-07, + "logits/chosen": -2.9245758056640625, + "logits/rejected": -2.8773388862609863, + "logps/chosen": -208.82894897460938, + "logps/rejected": -121.12397766113281, + "loss": 0.3857, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3245733976364136, + "rewards/margins": 2.0999860763549805, + "rewards/rejected": -2.4245593547821045, + "step": 3352 + }, + { + "epoch": 0.39, + "learning_rate": 1.857682768394945e-07, + "logits/chosen": -2.0385944843292236, + "logits/rejected": -2.108271837234497, + "logps/chosen": -360.73211669921875, + "logps/rejected": -335.3489685058594, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5335421562194824, + "rewards/margins": 2.035780429840088, + "rewards/rejected": -2.5693225860595703, + "step": 3353 + }, + { + "epoch": 0.39, + "learning_rate": 1.8573284516357623e-07, + "logits/chosen": -2.2143685817718506, + "logits/rejected": -2.388627529144287, + "logps/chosen": -331.9945068359375, + "logps/rejected": -288.0626220703125, + "loss": 0.3354, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1735994815826416, + "rewards/margins": 2.3883297443389893, + "rewards/rejected": -3.561929225921631, + "step": 3354 + }, + { + "epoch": 0.39, + "learning_rate": 1.8569741348765795e-07, + "logits/chosen": -2.5897045135498047, + "logits/rejected": -2.4716248512268066, + "logps/chosen": -211.24725341796875, + "logps/rejected": -258.7600402832031, + "loss": 0.5871, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1549172401428223, + "rewards/margins": 1.1625418663024902, + "rewards/rejected": -2.3174591064453125, + "step": 3355 + }, + { + "epoch": 0.39, + "learning_rate": 1.8566198181173967e-07, + "logits/chosen": -1.9563812017440796, + "logits/rejected": -2.3838822841644287, + "logps/chosen": -279.91558837890625, + "logps/rejected": -194.49490356445312, + "loss": 0.5686, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.37327241897583, + "rewards/margins": 0.907346785068512, + "rewards/rejected": -2.2806191444396973, + "step": 3356 + }, + { + "epoch": 0.39, + "learning_rate": 1.856265501358214e-07, + "logits/chosen": -2.3209033012390137, + "logits/rejected": -2.299435615539551, + "logps/chosen": -277.2779846191406, + "logps/rejected": -425.1461486816406, + "loss": 0.3113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6206858158111572, + "rewards/margins": 2.0329325199127197, + "rewards/rejected": -2.653618335723877, + "step": 3357 + }, + { + "epoch": 0.39, + "learning_rate": 1.8559111845990312e-07, + "logits/chosen": -2.6779251098632812, + "logits/rejected": -2.851982355117798, + "logps/chosen": -298.42529296875, + "logps/rejected": -236.91897583007812, + "loss": 0.1444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49911510944366455, + "rewards/margins": 3.199662923812866, + "rewards/rejected": -3.698777914047241, + "step": 3358 + }, + { + "epoch": 0.39, + "learning_rate": 1.855556867839849e-07, + "logits/chosen": -2.393836498260498, + "logits/rejected": -2.3785572052001953, + "logps/chosen": -197.07711791992188, + "logps/rejected": -330.8213195800781, + "loss": 0.3793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5398982167243958, + "rewards/margins": 2.9000461101531982, + "rewards/rejected": -3.439944267272949, + "step": 3359 + }, + { + "epoch": 0.39, + "learning_rate": 1.8552025510806662e-07, + "logits/chosen": -2.209611415863037, + "logits/rejected": -1.9604334831237793, + "logps/chosen": -401.68878173828125, + "logps/rejected": -423.21710205078125, + "loss": 0.7055, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6965991854667664, + "rewards/margins": 1.432546854019165, + "rewards/rejected": -2.129146099090576, + "step": 3360 + }, + { + "epoch": 0.39, + "learning_rate": 1.8548482343214834e-07, + "logits/chosen": -2.1547179222106934, + "logits/rejected": -1.8922131061553955, + "logps/chosen": -266.1447448730469, + "logps/rejected": -443.15411376953125, + "loss": 0.2395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.760726809501648, + "rewards/margins": 2.46378231048584, + "rewards/rejected": -3.2245092391967773, + "step": 3361 + }, + { + "epoch": 0.39, + "learning_rate": 1.8544939175623006e-07, + "logits/chosen": -2.4448699951171875, + "logits/rejected": -2.257011890411377, + "logps/chosen": -414.1623840332031, + "logps/rejected": -479.10333251953125, + "loss": 0.4236, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.068786859512329, + "rewards/margins": 2.298985242843628, + "rewards/rejected": -3.367772102355957, + "step": 3362 + }, + { + "epoch": 0.39, + "learning_rate": 1.8541396008031178e-07, + "logits/chosen": -2.2770187854766846, + "logits/rejected": -2.4864161014556885, + "logps/chosen": -297.4179992675781, + "logps/rejected": -198.1750946044922, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6367000937461853, + "rewards/margins": 2.038423538208008, + "rewards/rejected": -2.675123453140259, + "step": 3363 + }, + { + "epoch": 0.39, + "learning_rate": 1.8537852840439353e-07, + "logits/chosen": -2.6885786056518555, + "logits/rejected": -2.8835196495056152, + "logps/chosen": -392.40936279296875, + "logps/rejected": -195.53433227539062, + "loss": 0.4347, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3198297023773193, + "rewards/margins": 1.8627830743789673, + "rewards/rejected": -3.182612895965576, + "step": 3364 + }, + { + "epoch": 0.39, + "learning_rate": 1.8534309672847525e-07, + "logits/chosen": -2.5977823734283447, + "logits/rejected": -2.455334186553955, + "logps/chosen": -196.82725524902344, + "logps/rejected": -311.94415283203125, + "loss": 0.1176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7752014398574829, + "rewards/margins": 3.6826090812683105, + "rewards/rejected": -4.457810401916504, + "step": 3365 + }, + { + "epoch": 0.39, + "learning_rate": 1.8530766505255697e-07, + "logits/chosen": -2.8230714797973633, + "logits/rejected": -2.935361385345459, + "logps/chosen": -192.60580444335938, + "logps/rejected": -184.5399169921875, + "loss": 0.3158, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5311586856842041, + "rewards/margins": 1.745026707649231, + "rewards/rejected": -2.2761855125427246, + "step": 3366 + }, + { + "epoch": 0.39, + "learning_rate": 1.852722333766387e-07, + "logits/chosen": -3.038447618484497, + "logits/rejected": -2.913233757019043, + "logps/chosen": -260.3134765625, + "logps/rejected": -241.8472442626953, + "loss": 0.5531, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3188987970352173, + "rewards/margins": 1.4595441818237305, + "rewards/rejected": -2.778442859649658, + "step": 3367 + }, + { + "epoch": 0.39, + "learning_rate": 1.8523680170072042e-07, + "logits/chosen": -1.9022458791732788, + "logits/rejected": -1.920288324356079, + "logps/chosen": -379.50567626953125, + "logps/rejected": -335.2005920410156, + "loss": 0.4192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7813849449157715, + "rewards/margins": 1.1830203533172607, + "rewards/rejected": -1.9644052982330322, + "step": 3368 + }, + { + "epoch": 0.39, + "learning_rate": 1.8520137002480214e-07, + "logits/chosen": -2.3535709381103516, + "logits/rejected": -2.518467426300049, + "logps/chosen": -278.2225646972656, + "logps/rejected": -237.33392333984375, + "loss": 0.4634, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3926496505737305, + "rewards/margins": 2.7786455154418945, + "rewards/rejected": -4.171295166015625, + "step": 3369 + }, + { + "epoch": 0.39, + "learning_rate": 1.8516593834888392e-07, + "logits/chosen": -2.7618722915649414, + "logits/rejected": -2.703028917312622, + "logps/chosen": -260.31829833984375, + "logps/rejected": -308.255615234375, + "loss": 0.1919, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1674389839172363, + "rewards/margins": 3.200131416320801, + "rewards/rejected": -4.367570400238037, + "step": 3370 + }, + { + "epoch": 0.39, + "learning_rate": 1.8513050667296564e-07, + "logits/chosen": -2.4295482635498047, + "logits/rejected": -2.465958833694458, + "logps/chosen": -212.90882873535156, + "logps/rejected": -267.43682861328125, + "loss": 0.3377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6721929311752319, + "rewards/margins": 2.1515798568725586, + "rewards/rejected": -2.82377290725708, + "step": 3371 + }, + { + "epoch": 0.39, + "learning_rate": 1.8509507499704736e-07, + "logits/chosen": -2.2857778072357178, + "logits/rejected": -2.4301228523254395, + "logps/chosen": -289.69012451171875, + "logps/rejected": -287.66156005859375, + "loss": 0.5664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4596249461174011, + "rewards/margins": 1.9070607423782349, + "rewards/rejected": -2.366685390472412, + "step": 3372 + }, + { + "epoch": 0.39, + "learning_rate": 1.8505964332112908e-07, + "logits/chosen": -2.5876855850219727, + "logits/rejected": -2.7474260330200195, + "logps/chosen": -246.04519653320312, + "logps/rejected": -299.189453125, + "loss": 0.4011, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.074571132659912, + "rewards/margins": 2.1021828651428223, + "rewards/rejected": -3.176753520965576, + "step": 3373 + }, + { + "epoch": 0.39, + "learning_rate": 1.850242116452108e-07, + "logits/chosen": -2.898502826690674, + "logits/rejected": -2.7643239498138428, + "logps/chosen": -229.05201721191406, + "logps/rejected": -252.4426727294922, + "loss": 0.9962, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1958707571029663, + "rewards/margins": 0.9954532384872437, + "rewards/rejected": -2.191323757171631, + "step": 3374 + }, + { + "epoch": 0.39, + "learning_rate": 1.8498877996929255e-07, + "logits/chosen": -2.798494815826416, + "logits/rejected": -2.7742629051208496, + "logps/chosen": -323.30474853515625, + "logps/rejected": -243.24293518066406, + "loss": 0.284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5761392712593079, + "rewards/margins": 2.4230294227600098, + "rewards/rejected": -2.999168872833252, + "step": 3375 + }, + { + "epoch": 0.39, + "learning_rate": 1.8495334829337428e-07, + "logits/chosen": -2.1511106491088867, + "logits/rejected": -2.350090503692627, + "logps/chosen": -401.6890563964844, + "logps/rejected": -266.52703857421875, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7185509204864502, + "rewards/margins": 1.0081268548965454, + "rewards/rejected": -1.726677656173706, + "step": 3376 + }, + { + "epoch": 0.39, + "learning_rate": 1.84917916617456e-07, + "logits/chosen": -2.380931854248047, + "logits/rejected": -2.5055646896362305, + "logps/chosen": -353.9587707519531, + "logps/rejected": -282.37164306640625, + "loss": 0.637, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.744989275932312, + "rewards/margins": 1.0861908197402954, + "rewards/rejected": -1.8311800956726074, + "step": 3377 + }, + { + "epoch": 0.39, + "learning_rate": 1.8488248494153772e-07, + "logits/chosen": -2.0063726902008057, + "logits/rejected": -2.1342761516571045, + "logps/chosen": -243.80661010742188, + "logps/rejected": -296.5588073730469, + "loss": 0.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0722347497940063, + "rewards/margins": 0.9052100777626038, + "rewards/rejected": -1.9774448871612549, + "step": 3378 + }, + { + "epoch": 0.39, + "learning_rate": 1.8484705326561944e-07, + "logits/chosen": -2.2630808353424072, + "logits/rejected": -1.9485901594161987, + "logps/chosen": -250.26153564453125, + "logps/rejected": -317.00982666015625, + "loss": 0.4519, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9268052577972412, + "rewards/margins": 1.1187419891357422, + "rewards/rejected": -3.0455470085144043, + "step": 3379 + }, + { + "epoch": 0.39, + "learning_rate": 1.8481162158970116e-07, + "logits/chosen": -2.35150146484375, + "logits/rejected": -2.490654468536377, + "logps/chosen": -191.84776306152344, + "logps/rejected": -213.4203643798828, + "loss": 0.1698, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9626486301422119, + "rewards/margins": 2.2764291763305664, + "rewards/rejected": -3.2390778064727783, + "step": 3380 + }, + { + "epoch": 0.39, + "learning_rate": 1.8477618991378289e-07, + "logits/chosen": -2.8254480361938477, + "logits/rejected": -2.891606092453003, + "logps/chosen": -239.35528564453125, + "logps/rejected": -481.6474609375, + "loss": 0.305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9276472926139832, + "rewards/margins": 2.4272212982177734, + "rewards/rejected": -3.3548684120178223, + "step": 3381 + }, + { + "epoch": 0.39, + "learning_rate": 1.8474075823786466e-07, + "logits/chosen": -1.466334342956543, + "logits/rejected": -1.8428864479064941, + "logps/chosen": -464.8097839355469, + "logps/rejected": -301.1081237792969, + "loss": 0.5536, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9402018785476685, + "rewards/margins": 0.8904042840003967, + "rewards/rejected": -1.83060622215271, + "step": 3382 + }, + { + "epoch": 0.39, + "learning_rate": 1.8470532656194638e-07, + "logits/chosen": -2.291431188583374, + "logits/rejected": -2.4347662925720215, + "logps/chosen": -232.44873046875, + "logps/rejected": -221.07321166992188, + "loss": 0.152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09062960743904114, + "rewards/margins": 2.6399500370025635, + "rewards/rejected": -2.7305796146392822, + "step": 3383 + }, + { + "epoch": 0.39, + "learning_rate": 1.846698948860281e-07, + "logits/chosen": -1.9974241256713867, + "logits/rejected": -2.2461326122283936, + "logps/chosen": -377.51641845703125, + "logps/rejected": -318.126220703125, + "loss": 0.2322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5250531435012817, + "rewards/margins": 2.056821346282959, + "rewards/rejected": -2.581874370574951, + "step": 3384 + }, + { + "epoch": 0.39, + "learning_rate": 1.8463446321010983e-07, + "logits/chosen": -1.4070454835891724, + "logits/rejected": -1.860363483428955, + "logps/chosen": -354.82623291015625, + "logps/rejected": -215.9857940673828, + "loss": 0.5349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0824726819992065, + "rewards/margins": 1.2527027130126953, + "rewards/rejected": -2.3351755142211914, + "step": 3385 + }, + { + "epoch": 0.39, + "learning_rate": 1.8459903153419155e-07, + "logits/chosen": -2.058957815170288, + "logits/rejected": -2.2481486797332764, + "logps/chosen": -336.38543701171875, + "logps/rejected": -279.0199890136719, + "loss": 0.4237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6062447428703308, + "rewards/margins": 1.8290809392929077, + "rewards/rejected": -2.4353256225585938, + "step": 3386 + }, + { + "epoch": 0.39, + "learning_rate": 1.845635998582733e-07, + "logits/chosen": -2.434199333190918, + "logits/rejected": -2.3138539791107178, + "logps/chosen": -119.56906127929688, + "logps/rejected": -196.87399291992188, + "loss": 0.2916, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3057142496109009, + "rewards/margins": 3.055112361907959, + "rewards/rejected": -4.36082649230957, + "step": 3387 + }, + { + "epoch": 0.39, + "learning_rate": 1.8452816818235502e-07, + "logits/chosen": -2.657477378845215, + "logits/rejected": -2.8132357597351074, + "logps/chosen": -223.03981018066406, + "logps/rejected": -177.70660400390625, + "loss": 0.1743, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9501574635505676, + "rewards/margins": 1.9658327102661133, + "rewards/rejected": -2.915990114212036, + "step": 3388 + }, + { + "epoch": 0.39, + "learning_rate": 1.8449273650643674e-07, + "logits/chosen": -2.0795319080352783, + "logits/rejected": -2.0661044120788574, + "logps/chosen": -232.7060546875, + "logps/rejected": -232.88661193847656, + "loss": 0.3979, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5535342693328857, + "rewards/margins": 1.6858792304992676, + "rewards/rejected": -4.239413261413574, + "step": 3389 + }, + { + "epoch": 0.39, + "learning_rate": 1.8445730483051846e-07, + "logits/chosen": -2.5190484523773193, + "logits/rejected": -2.4008631706237793, + "logps/chosen": -250.22972106933594, + "logps/rejected": -362.1461181640625, + "loss": 0.3935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47159525752067566, + "rewards/margins": 2.203805685043335, + "rewards/rejected": -2.675400733947754, + "step": 3390 + }, + { + "epoch": 0.39, + "learning_rate": 1.8442187315460019e-07, + "logits/chosen": -2.583657741546631, + "logits/rejected": -2.720963478088379, + "logps/chosen": -319.7049255371094, + "logps/rejected": -337.41748046875, + "loss": 0.2782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33775949478149414, + "rewards/margins": 2.493699073791504, + "rewards/rejected": -2.831458568572998, + "step": 3391 + }, + { + "epoch": 0.39, + "learning_rate": 1.843864414786819e-07, + "logits/chosen": -1.9466845989227295, + "logits/rejected": -1.7100924253463745, + "logps/chosen": -440.768310546875, + "logps/rejected": -363.82562255859375, + "loss": 0.1139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025063902139663696, + "rewards/margins": 3.02437686920166, + "rewards/rejected": -3.049440860748291, + "step": 3392 + }, + { + "epoch": 0.39, + "learning_rate": 1.8435100980276366e-07, + "logits/chosen": -2.856184959411621, + "logits/rejected": -2.6893389225006104, + "logps/chosen": -341.54571533203125, + "logps/rejected": -271.4803466796875, + "loss": 0.7048, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7417503595352173, + "rewards/margins": 1.1092075109481812, + "rewards/rejected": -2.8509578704833984, + "step": 3393 + }, + { + "epoch": 0.39, + "learning_rate": 1.843155781268454e-07, + "logits/chosen": -2.6863150596618652, + "logits/rejected": -2.6962392330169678, + "logps/chosen": -153.38583374023438, + "logps/rejected": -241.77810668945312, + "loss": 0.2749, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.200852870941162, + "rewards/margins": 2.3777523040771484, + "rewards/rejected": -3.5786051750183105, + "step": 3394 + }, + { + "epoch": 0.39, + "learning_rate": 1.8428014645092713e-07, + "logits/chosen": -2.6275739669799805, + "logits/rejected": -2.694567918777466, + "logps/chosen": -150.1112060546875, + "logps/rejected": -224.17312622070312, + "loss": 0.97, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0985922813415527, + "rewards/margins": 1.3035974502563477, + "rewards/rejected": -2.4021897315979004, + "step": 3395 + }, + { + "epoch": 0.4, + "learning_rate": 1.8424471477500885e-07, + "logits/chosen": -2.5806496143341064, + "logits/rejected": -2.53804087638855, + "logps/chosen": -244.8392791748047, + "logps/rejected": -339.36016845703125, + "loss": 0.528, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7698785066604614, + "rewards/margins": 2.193786144256592, + "rewards/rejected": -3.9636647701263428, + "step": 3396 + }, + { + "epoch": 0.4, + "learning_rate": 1.8420928309909057e-07, + "logits/chosen": -2.183762311935425, + "logits/rejected": -2.3645591735839844, + "logps/chosen": -436.77374267578125, + "logps/rejected": -274.9459228515625, + "loss": 0.2079, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17809192836284637, + "rewards/margins": 2.2717766761779785, + "rewards/rejected": -2.449868679046631, + "step": 3397 + }, + { + "epoch": 0.4, + "learning_rate": 1.8417385142317232e-07, + "logits/chosen": -2.506166458129883, + "logits/rejected": -2.466632604598999, + "logps/chosen": -244.1412353515625, + "logps/rejected": -421.9756774902344, + "loss": 0.2104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5013792514801025, + "rewards/margins": 2.5061001777648926, + "rewards/rejected": -3.007479429244995, + "step": 3398 + }, + { + "epoch": 0.4, + "learning_rate": 1.8413841974725404e-07, + "logits/chosen": -2.421644687652588, + "logits/rejected": -2.436990261077881, + "logps/chosen": -109.25733947753906, + "logps/rejected": -184.52667236328125, + "loss": 0.2479, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5189033150672913, + "rewards/margins": 2.108459711074829, + "rewards/rejected": -2.6273629665374756, + "step": 3399 + }, + { + "epoch": 0.4, + "learning_rate": 1.8410298807133577e-07, + "logits/chosen": -2.79703688621521, + "logits/rejected": -2.5895659923553467, + "logps/chosen": -217.38339233398438, + "logps/rejected": -234.58453369140625, + "loss": 0.3201, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4397207498550415, + "rewards/margins": 2.3359646797180176, + "rewards/rejected": -3.7756855487823486, + "step": 3400 + }, + { + "epoch": 0.4, + "learning_rate": 1.840675563954175e-07, + "logits/chosen": -2.2023959159851074, + "logits/rejected": -2.325378656387329, + "logps/chosen": -531.6226806640625, + "logps/rejected": -475.36907958984375, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48696762323379517, + "rewards/margins": 4.032780647277832, + "rewards/rejected": -4.519747734069824, + "step": 3401 + }, + { + "epoch": 0.4, + "learning_rate": 1.840321247194992e-07, + "logits/chosen": -2.315439224243164, + "logits/rejected": -2.7421669960021973, + "logps/chosen": -287.54510498046875, + "logps/rejected": -193.55133056640625, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4484342336654663, + "rewards/margins": 4.028387069702148, + "rewards/rejected": -4.4768218994140625, + "step": 3402 + }, + { + "epoch": 0.4, + "learning_rate": 1.8399669304358093e-07, + "logits/chosen": -2.279280424118042, + "logits/rejected": -2.2649879455566406, + "logps/chosen": -265.00482177734375, + "logps/rejected": -320.2498474121094, + "loss": 0.2106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13400880992412567, + "rewards/margins": 2.6128296852111816, + "rewards/rejected": -2.746838331222534, + "step": 3403 + }, + { + "epoch": 0.4, + "learning_rate": 1.8396126136766268e-07, + "logits/chosen": -1.9287952184677124, + "logits/rejected": -1.9925240278244019, + "logps/chosen": -310.74298095703125, + "logps/rejected": -380.138427734375, + "loss": 0.4418, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1386207342147827, + "rewards/margins": 2.426340341567993, + "rewards/rejected": -3.5649609565734863, + "step": 3404 + }, + { + "epoch": 0.4, + "learning_rate": 1.8392582969174443e-07, + "logits/chosen": -1.8607568740844727, + "logits/rejected": -1.7253414392471313, + "logps/chosen": -274.81195068359375, + "logps/rejected": -341.8802185058594, + "loss": 0.4892, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2398412227630615, + "rewards/margins": 0.9031581878662109, + "rewards/rejected": -2.1429994106292725, + "step": 3405 + }, + { + "epoch": 0.4, + "learning_rate": 1.8389039801582615e-07, + "logits/chosen": -2.2108569145202637, + "logits/rejected": -2.370239734649658, + "logps/chosen": -122.34679412841797, + "logps/rejected": -191.93954467773438, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17209647595882416, + "rewards/margins": 2.146580457687378, + "rewards/rejected": -2.3186769485473633, + "step": 3406 + }, + { + "epoch": 0.4, + "learning_rate": 1.8385496633990787e-07, + "logits/chosen": -2.393409490585327, + "logits/rejected": -1.99038565158844, + "logps/chosen": -225.8734588623047, + "logps/rejected": -338.2386779785156, + "loss": 0.1444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4594680964946747, + "rewards/margins": 3.004650592803955, + "rewards/rejected": -3.464118480682373, + "step": 3407 + }, + { + "epoch": 0.4, + "learning_rate": 1.838195346639896e-07, + "logits/chosen": -1.5098017454147339, + "logits/rejected": -1.5717741250991821, + "logps/chosen": -453.8023376464844, + "logps/rejected": -414.75274658203125, + "loss": 0.5073, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2314974069595337, + "rewards/margins": 1.3030622005462646, + "rewards/rejected": -2.534559726715088, + "step": 3408 + }, + { + "epoch": 0.4, + "learning_rate": 1.8378410298807134e-07, + "logits/chosen": -2.160737991333008, + "logits/rejected": -1.99154794216156, + "logps/chosen": -235.7310333251953, + "logps/rejected": -359.54864501953125, + "loss": 0.7449, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.389125943183899, + "rewards/margins": 0.7373011112213135, + "rewards/rejected": -2.126427173614502, + "step": 3409 + }, + { + "epoch": 0.4, + "learning_rate": 1.8374867131215307e-07, + "logits/chosen": -2.0576906204223633, + "logits/rejected": -2.491405487060547, + "logps/chosen": -336.63446044921875, + "logps/rejected": -346.96923828125, + "loss": 0.5451, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0263760089874268, + "rewards/margins": 1.245700716972351, + "rewards/rejected": -2.2720768451690674, + "step": 3410 + }, + { + "epoch": 0.4, + "learning_rate": 1.837132396362348e-07, + "logits/chosen": -1.917681097984314, + "logits/rejected": -1.8454885482788086, + "logps/chosen": -148.79788208007812, + "logps/rejected": -215.19613647460938, + "loss": 0.3919, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3400630950927734, + "rewards/margins": 2.0094645023345947, + "rewards/rejected": -3.349527597427368, + "step": 3411 + }, + { + "epoch": 0.4, + "learning_rate": 1.836778079603165e-07, + "logits/chosen": -2.0476274490356445, + "logits/rejected": -2.1638777256011963, + "logps/chosen": -367.7312927246094, + "logps/rejected": -271.1231689453125, + "loss": 0.737, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.064182758331299, + "rewards/margins": 1.6046191453933716, + "rewards/rejected": -3.668801784515381, + "step": 3412 + }, + { + "epoch": 0.4, + "learning_rate": 1.8364237628439823e-07, + "logits/chosen": -2.203263998031616, + "logits/rejected": -2.3298511505126953, + "logps/chosen": -425.442626953125, + "logps/rejected": -334.72650146484375, + "loss": 0.3528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.96681147813797, + "rewards/margins": 1.7388677597045898, + "rewards/rejected": -2.705679416656494, + "step": 3413 + }, + { + "epoch": 0.4, + "learning_rate": 1.8360694460847995e-07, + "logits/chosen": -2.0349578857421875, + "logits/rejected": -1.8259761333465576, + "logps/chosen": -425.437744140625, + "logps/rejected": -288.0302734375, + "loss": 0.347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34897899627685547, + "rewards/margins": 1.5794318914413452, + "rewards/rejected": -1.9284110069274902, + "step": 3414 + }, + { + "epoch": 0.4, + "learning_rate": 1.8357151293256168e-07, + "logits/chosen": -1.8451001644134521, + "logits/rejected": -2.286156415939331, + "logps/chosen": -465.2957458496094, + "logps/rejected": -228.60989379882812, + "loss": 0.5056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9549244046211243, + "rewards/margins": 1.3354610204696655, + "rewards/rejected": -2.2903852462768555, + "step": 3415 + }, + { + "epoch": 0.4, + "learning_rate": 1.8353608125664343e-07, + "logits/chosen": -2.524022102355957, + "logits/rejected": -2.590106964111328, + "logps/chosen": -217.944580078125, + "logps/rejected": -173.87588500976562, + "loss": 0.6161, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7616043090820312, + "rewards/margins": 1.2919037342071533, + "rewards/rejected": -2.0535080432891846, + "step": 3416 + }, + { + "epoch": 0.4, + "learning_rate": 1.8350064958072517e-07, + "logits/chosen": -1.838868260383606, + "logits/rejected": -2.116912603378296, + "logps/chosen": -363.2449951171875, + "logps/rejected": -340.6737365722656, + "loss": 0.1579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15411730110645294, + "rewards/margins": 2.877066135406494, + "rewards/rejected": -3.0311832427978516, + "step": 3417 + }, + { + "epoch": 0.4, + "learning_rate": 1.834652179048069e-07, + "logits/chosen": -2.3090085983276367, + "logits/rejected": -2.4306185245513916, + "logps/chosen": -248.18258666992188, + "logps/rejected": -208.00596618652344, + "loss": 0.4291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4839983582496643, + "rewards/margins": 1.2085407972335815, + "rewards/rejected": -1.692539095878601, + "step": 3418 + }, + { + "epoch": 0.4, + "learning_rate": 1.8342978622888862e-07, + "logits/chosen": -2.5586283206939697, + "logits/rejected": -2.6537680625915527, + "logps/chosen": -309.36273193359375, + "logps/rejected": -265.89404296875, + "loss": 0.2422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.695320188999176, + "rewards/margins": 1.7607567310333252, + "rewards/rejected": -2.4560768604278564, + "step": 3419 + }, + { + "epoch": 0.4, + "learning_rate": 1.8339435455297037e-07, + "logits/chosen": -2.0043845176696777, + "logits/rejected": -1.89007568359375, + "logps/chosen": -342.0821533203125, + "logps/rejected": -367.994873046875, + "loss": 0.3976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.033517688512802124, + "rewards/margins": 1.744642972946167, + "rewards/rejected": -1.7781606912612915, + "step": 3420 + }, + { + "epoch": 0.4, + "learning_rate": 1.833589228770521e-07, + "logits/chosen": -2.3497707843780518, + "logits/rejected": -2.291553020477295, + "logps/chosen": -181.2816162109375, + "logps/rejected": -284.2139587402344, + "loss": 2.2077, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.2609784603118896, + "rewards/margins": -0.9471589922904968, + "rewards/rejected": -2.313819408416748, + "step": 3421 + }, + { + "epoch": 0.4, + "learning_rate": 1.833234912011338e-07, + "logits/chosen": -1.9613993167877197, + "logits/rejected": -1.8648691177368164, + "logps/chosen": -191.7077178955078, + "logps/rejected": -179.7085723876953, + "loss": 0.4297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1766455173492432, + "rewards/margins": 1.1588020324707031, + "rewards/rejected": -2.3354475498199463, + "step": 3422 + }, + { + "epoch": 0.4, + "learning_rate": 1.8328805952521553e-07, + "logits/chosen": -1.9801139831542969, + "logits/rejected": -2.2710070610046387, + "logps/chosen": -436.3531799316406, + "logps/rejected": -286.6208801269531, + "loss": 0.3139, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5968255996704102, + "rewards/margins": 1.8534002304077148, + "rewards/rejected": -3.450226068496704, + "step": 3423 + }, + { + "epoch": 0.4, + "learning_rate": 1.8325262784929726e-07, + "logits/chosen": -2.6476526260375977, + "logits/rejected": -2.8960063457489014, + "logps/chosen": -261.958984375, + "logps/rejected": -184.93478393554688, + "loss": 0.6177, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7138870358467102, + "rewards/margins": 1.0215260982513428, + "rewards/rejected": -1.7354131937026978, + "step": 3424 + }, + { + "epoch": 0.4, + "learning_rate": 1.8321719617337898e-07, + "logits/chosen": -1.9933302402496338, + "logits/rejected": -2.072636604309082, + "logps/chosen": -294.7523193359375, + "logps/rejected": -340.1819763183594, + "loss": 0.4764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9526187181472778, + "rewards/margins": 2.197521209716797, + "rewards/rejected": -3.1501400470733643, + "step": 3425 + }, + { + "epoch": 0.4, + "learning_rate": 1.831817644974607e-07, + "logits/chosen": -2.3068716526031494, + "logits/rejected": -2.191986560821533, + "logps/chosen": -282.1511535644531, + "logps/rejected": -385.1468811035156, + "loss": 0.4565, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1387066841125488, + "rewards/margins": 1.0770667791366577, + "rewards/rejected": -2.215773582458496, + "step": 3426 + }, + { + "epoch": 0.4, + "learning_rate": 1.8314633282154245e-07, + "logits/chosen": -1.8716621398925781, + "logits/rejected": -1.7429122924804688, + "logps/chosen": -135.0226593017578, + "logps/rejected": -155.77450561523438, + "loss": 0.7122, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9409748315811157, + "rewards/margins": 0.2603878974914551, + "rewards/rejected": -1.2013627290725708, + "step": 3427 + }, + { + "epoch": 0.4, + "learning_rate": 1.8311090114562417e-07, + "logits/chosen": -2.2367982864379883, + "logits/rejected": -2.2505195140838623, + "logps/chosen": -320.6040344238281, + "logps/rejected": -292.02801513671875, + "loss": 0.3037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5819922685623169, + "rewards/margins": 1.9463709592819214, + "rewards/rejected": -2.5283632278442383, + "step": 3428 + }, + { + "epoch": 0.4, + "learning_rate": 1.8307546946970592e-07, + "logits/chosen": -2.063913345336914, + "logits/rejected": -2.2774219512939453, + "logps/chosen": -333.12786865234375, + "logps/rejected": -265.0030517578125, + "loss": 0.3549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33602482080459595, + "rewards/margins": 1.420503854751587, + "rewards/rejected": -1.756528615951538, + "step": 3429 + }, + { + "epoch": 0.4, + "learning_rate": 1.8304003779378764e-07, + "logits/chosen": -1.988664150238037, + "logits/rejected": -2.303290367126465, + "logps/chosen": -389.89019775390625, + "logps/rejected": -414.17626953125, + "loss": 0.2615, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5854094624519348, + "rewards/margins": 2.9702248573303223, + "rewards/rejected": -3.5556344985961914, + "step": 3430 + }, + { + "epoch": 0.4, + "learning_rate": 1.8300460611786936e-07, + "logits/chosen": -2.9957938194274902, + "logits/rejected": -2.9585652351379395, + "logps/chosen": -230.4889373779297, + "logps/rejected": -334.8667297363281, + "loss": 0.502, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8904938697814941, + "rewards/margins": 3.502958059310913, + "rewards/rejected": -5.3934526443481445, + "step": 3431 + }, + { + "epoch": 0.4, + "learning_rate": 1.829691744419511e-07, + "logits/chosen": -1.8003120422363281, + "logits/rejected": -1.9285058975219727, + "logps/chosen": -459.41912841796875, + "logps/rejected": -311.0648498535156, + "loss": 0.7339, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9928489923477173, + "rewards/margins": 0.5760178565979004, + "rewards/rejected": -2.568866729736328, + "step": 3432 + }, + { + "epoch": 0.4, + "learning_rate": 1.8293374276603283e-07, + "logits/chosen": -2.255795478820801, + "logits/rejected": -2.1986513137817383, + "logps/chosen": -223.6953887939453, + "logps/rejected": -255.81591796875, + "loss": 0.7323, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2311562299728394, + "rewards/margins": 0.9870451092720032, + "rewards/rejected": -2.2182013988494873, + "step": 3433 + }, + { + "epoch": 0.4, + "learning_rate": 1.8289831109011456e-07, + "logits/chosen": -2.3228206634521484, + "logits/rejected": -2.08485746383667, + "logps/chosen": -87.28106689453125, + "logps/rejected": -177.53013610839844, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07531140744686127, + "rewards/margins": 2.7301158905029297, + "rewards/rejected": -2.805427312850952, + "step": 3434 + }, + { + "epoch": 0.4, + "learning_rate": 1.8286287941419628e-07, + "logits/chosen": -2.026492118835449, + "logits/rejected": -2.1857399940490723, + "logps/chosen": -321.1061706542969, + "logps/rejected": -274.46502685546875, + "loss": 0.4373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5961174964904785, + "rewards/margins": 1.5987675189971924, + "rewards/rejected": -2.19488525390625, + "step": 3435 + }, + { + "epoch": 0.4, + "learning_rate": 1.82827447738278e-07, + "logits/chosen": -2.499723434448242, + "logits/rejected": -2.3671600818634033, + "logps/chosen": -135.6142578125, + "logps/rejected": -153.54759216308594, + "loss": 0.2885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8509641885757446, + "rewards/margins": 2.261913776397705, + "rewards/rejected": -3.11287784576416, + "step": 3436 + }, + { + "epoch": 0.4, + "learning_rate": 1.8279201606235972e-07, + "logits/chosen": -2.418226718902588, + "logits/rejected": -2.295963764190674, + "logps/chosen": -350.7650451660156, + "logps/rejected": -445.7560729980469, + "loss": 0.7336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4148454666137695, + "rewards/margins": 1.1387925148010254, + "rewards/rejected": -2.553637742996216, + "step": 3437 + }, + { + "epoch": 0.4, + "learning_rate": 1.8275658438644147e-07, + "logits/chosen": -2.3481552600860596, + "logits/rejected": -2.3030970096588135, + "logps/chosen": -284.3382568359375, + "logps/rejected": -284.246826171875, + "loss": 0.2341, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24014732241630554, + "rewards/margins": 1.6432760953903198, + "rewards/rejected": -1.8834234476089478, + "step": 3438 + }, + { + "epoch": 0.4, + "learning_rate": 1.827211527105232e-07, + "logits/chosen": -2.6225171089172363, + "logits/rejected": -2.5971035957336426, + "logps/chosen": -244.91941833496094, + "logps/rejected": -251.77198791503906, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6536723375320435, + "rewards/margins": 2.9695374965667725, + "rewards/rejected": -3.6232099533081055, + "step": 3439 + }, + { + "epoch": 0.4, + "learning_rate": 1.8268572103460494e-07, + "logits/chosen": -2.3729755878448486, + "logits/rejected": -2.5604405403137207, + "logps/chosen": -186.47593688964844, + "logps/rejected": -178.91043090820312, + "loss": 0.579, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7151098847389221, + "rewards/margins": 1.7111750841140747, + "rewards/rejected": -2.4262850284576416, + "step": 3440 + }, + { + "epoch": 0.4, + "learning_rate": 1.8265028935868666e-07, + "logits/chosen": -2.422905445098877, + "logits/rejected": -2.6406314373016357, + "logps/chosen": -317.6475830078125, + "logps/rejected": -228.85494995117188, + "loss": 0.4481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7034437656402588, + "rewards/margins": 2.0382862091064453, + "rewards/rejected": -2.741729974746704, + "step": 3441 + }, + { + "epoch": 0.4, + "learning_rate": 1.8261485768276839e-07, + "logits/chosen": -2.862699270248413, + "logits/rejected": -2.724250078201294, + "logps/chosen": -204.6074981689453, + "logps/rejected": -257.3564453125, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8959665298461914, + "rewards/margins": 3.011035442352295, + "rewards/rejected": -3.9070017337799072, + "step": 3442 + }, + { + "epoch": 0.4, + "learning_rate": 1.8257942600685013e-07, + "logits/chosen": -2.3302226066589355, + "logits/rejected": -2.0286476612091064, + "logps/chosen": -257.14666748046875, + "logps/rejected": -306.5703125, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.779371976852417, + "rewards/margins": 1.4865121841430664, + "rewards/rejected": -2.2658839225769043, + "step": 3443 + }, + { + "epoch": 0.4, + "learning_rate": 1.8254399433093186e-07, + "logits/chosen": -1.9845811128616333, + "logits/rejected": -2.1462204456329346, + "logps/chosen": -474.25921630859375, + "logps/rejected": -438.884765625, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42834004759788513, + "rewards/margins": 2.771885871887207, + "rewards/rejected": -3.200226068496704, + "step": 3444 + }, + { + "epoch": 0.4, + "learning_rate": 1.8250856265501358e-07, + "logits/chosen": -2.149412155151367, + "logits/rejected": -2.2782368659973145, + "logps/chosen": -260.5132751464844, + "logps/rejected": -336.49951171875, + "loss": 0.4143, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7634851932525635, + "rewards/margins": 1.7064323425292969, + "rewards/rejected": -2.4699175357818604, + "step": 3445 + }, + { + "epoch": 0.4, + "learning_rate": 1.824731309790953e-07, + "logits/chosen": -1.6671053171157837, + "logits/rejected": -1.535812258720398, + "logps/chosen": -506.6976013183594, + "logps/rejected": -442.39404296875, + "loss": 0.2963, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20182761549949646, + "rewards/margins": 2.8442916870117188, + "rewards/rejected": -2.6424641609191895, + "step": 3446 + }, + { + "epoch": 0.4, + "learning_rate": 1.8243769930317702e-07, + "logits/chosen": -1.7697808742523193, + "logits/rejected": -1.8003170490264893, + "logps/chosen": -234.4547119140625, + "logps/rejected": -236.1967315673828, + "loss": 0.6206, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.318357229232788, + "rewards/margins": 1.2679312229156494, + "rewards/rejected": -2.5862884521484375, + "step": 3447 + }, + { + "epoch": 0.4, + "learning_rate": 1.8240226762725874e-07, + "logits/chosen": -1.6967637538909912, + "logits/rejected": -1.7848191261291504, + "logps/chosen": -140.1956787109375, + "logps/rejected": -134.94064331054688, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9871538281440735, + "rewards/margins": 1.8120685815811157, + "rewards/rejected": -2.799222469329834, + "step": 3448 + }, + { + "epoch": 0.4, + "learning_rate": 1.823668359513405e-07, + "logits/chosen": -2.6802191734313965, + "logits/rejected": -2.652160167694092, + "logps/chosen": -142.31756591796875, + "logps/rejected": -176.07008361816406, + "loss": 0.2669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6251325607299805, + "rewards/margins": 2.531029224395752, + "rewards/rejected": -3.1561617851257324, + "step": 3449 + }, + { + "epoch": 0.4, + "learning_rate": 1.8233140427542222e-07, + "logits/chosen": -2.280028820037842, + "logits/rejected": -2.7876639366149902, + "logps/chosen": -387.5239562988281, + "logps/rejected": -139.0717315673828, + "loss": 0.1644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22632421553134918, + "rewards/margins": 2.5755815505981445, + "rewards/rejected": -2.801905870437622, + "step": 3450 + }, + { + "epoch": 0.4, + "learning_rate": 1.8229597259950394e-07, + "logits/chosen": -2.146669864654541, + "logits/rejected": -2.1526191234588623, + "logps/chosen": -177.8876953125, + "logps/rejected": -180.3375244140625, + "loss": 0.4018, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0363225936889648, + "rewards/margins": 1.62081778049469, + "rewards/rejected": -2.6571404933929443, + "step": 3451 + }, + { + "epoch": 0.4, + "learning_rate": 1.8226054092358569e-07, + "logits/chosen": -1.8966842889785767, + "logits/rejected": -2.008312463760376, + "logps/chosen": -269.3478088378906, + "logps/rejected": -235.667236328125, + "loss": 0.2421, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.623069703578949, + "rewards/margins": 2.0242176055908203, + "rewards/rejected": -2.647287368774414, + "step": 3452 + }, + { + "epoch": 0.4, + "learning_rate": 1.822251092476674e-07, + "logits/chosen": -2.3339154720306396, + "logits/rejected": -2.684643507003784, + "logps/chosen": -418.045166015625, + "logps/rejected": -324.20648193359375, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8053046464920044, + "rewards/margins": 1.9975948333740234, + "rewards/rejected": -2.8028995990753174, + "step": 3453 + }, + { + "epoch": 0.4, + "learning_rate": 1.8218967757174916e-07, + "logits/chosen": -2.811805486679077, + "logits/rejected": -2.9165151119232178, + "logps/chosen": -187.67620849609375, + "logps/rejected": -164.572265625, + "loss": 0.6514, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7946363687515259, + "rewards/margins": 1.0353386402130127, + "rewards/rejected": -1.8299751281738281, + "step": 3454 + }, + { + "epoch": 0.4, + "learning_rate": 1.8215424589583088e-07, + "logits/chosen": -1.6600738763809204, + "logits/rejected": -1.6296623945236206, + "logps/chosen": -454.31903076171875, + "logps/rejected": -482.933837890625, + "loss": 0.252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30861684679985046, + "rewards/margins": 4.083954811096191, + "rewards/rejected": -4.392571449279785, + "step": 3455 + }, + { + "epoch": 0.4, + "learning_rate": 1.821188142199126e-07, + "logits/chosen": -2.2531771659851074, + "logits/rejected": -2.2589941024780273, + "logps/chosen": -224.0650177001953, + "logps/rejected": -161.78729248046875, + "loss": 0.3826, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6734862923622131, + "rewards/margins": 1.4671533107757568, + "rewards/rejected": -2.140639543533325, + "step": 3456 + }, + { + "epoch": 0.4, + "learning_rate": 1.8208338254399432e-07, + "logits/chosen": -2.6245877742767334, + "logits/rejected": -2.4835093021392822, + "logps/chosen": -185.7711181640625, + "logps/rejected": -348.203857421875, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6677801609039307, + "rewards/margins": 3.8298797607421875, + "rewards/rejected": -4.497659683227539, + "step": 3457 + }, + { + "epoch": 0.4, + "learning_rate": 1.8204795086807605e-07, + "logits/chosen": -2.172866106033325, + "logits/rejected": -2.2109076976776123, + "logps/chosen": -343.8246154785156, + "logps/rejected": -275.88568115234375, + "loss": 0.347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9842840433120728, + "rewards/margins": 1.6339099407196045, + "rewards/rejected": -2.618194103240967, + "step": 3458 + }, + { + "epoch": 0.4, + "learning_rate": 1.8201251919215777e-07, + "logits/chosen": -2.7038345336914062, + "logits/rejected": -2.5690157413482666, + "logps/chosen": -208.51568603515625, + "logps/rejected": -256.2114562988281, + "loss": 0.1692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43005993962287903, + "rewards/margins": 3.176769256591797, + "rewards/rejected": -3.6068289279937744, + "step": 3459 + }, + { + "epoch": 0.4, + "learning_rate": 1.819770875162395e-07, + "logits/chosen": -2.684129238128662, + "logits/rejected": -2.689105749130249, + "logps/chosen": -261.56329345703125, + "logps/rejected": -306.5220947265625, + "loss": 0.6021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6093653440475464, + "rewards/margins": 1.2726601362228394, + "rewards/rejected": -1.8820254802703857, + "step": 3460 + }, + { + "epoch": 0.4, + "learning_rate": 1.8194165584032124e-07, + "logits/chosen": -2.184813976287842, + "logits/rejected": -2.0836241245269775, + "logps/chosen": -183.69354248046875, + "logps/rejected": -216.99349975585938, + "loss": 0.3274, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1437641382217407, + "rewards/margins": 1.7714369297027588, + "rewards/rejected": -2.915201187133789, + "step": 3461 + }, + { + "epoch": 0.4, + "learning_rate": 1.8190622416440296e-07, + "logits/chosen": -2.167269706726074, + "logits/rejected": -2.369142770767212, + "logps/chosen": -197.41116333007812, + "logps/rejected": -217.00262451171875, + "loss": 0.2199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5151286125183105, + "rewards/margins": 1.6658915281295776, + "rewards/rejected": -2.1810202598571777, + "step": 3462 + }, + { + "epoch": 0.4, + "learning_rate": 1.8187079248848468e-07, + "logits/chosen": -1.7284345626831055, + "logits/rejected": -1.8576667308807373, + "logps/chosen": -281.6728515625, + "logps/rejected": -262.4928894042969, + "loss": 0.3819, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8051906228065491, + "rewards/margins": 2.4754250049591064, + "rewards/rejected": -3.28061580657959, + "step": 3463 + }, + { + "epoch": 0.4, + "learning_rate": 1.8183536081256643e-07, + "logits/chosen": -2.0820486545562744, + "logits/rejected": -1.8396461009979248, + "logps/chosen": -257.5462341308594, + "logps/rejected": -349.35980224609375, + "loss": 0.5407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8029916286468506, + "rewards/margins": 2.4207277297973633, + "rewards/rejected": -4.223719120025635, + "step": 3464 + }, + { + "epoch": 0.4, + "learning_rate": 1.8179992913664818e-07, + "logits/chosen": -2.2973759174346924, + "logits/rejected": -2.3458855152130127, + "logps/chosen": -267.74725341796875, + "logps/rejected": -211.85549926757812, + "loss": 0.2421, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9000104665756226, + "rewards/margins": 1.9934169054031372, + "rewards/rejected": -2.8934273719787598, + "step": 3465 + }, + { + "epoch": 0.4, + "learning_rate": 1.817644974607299e-07, + "logits/chosen": -2.381632089614868, + "logits/rejected": -2.385927677154541, + "logps/chosen": -350.2432556152344, + "logps/rejected": -376.9503173828125, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8776895999908447, + "rewards/margins": 1.1777267456054688, + "rewards/rejected": -2.0554161071777344, + "step": 3466 + }, + { + "epoch": 0.4, + "learning_rate": 1.8172906578481162e-07, + "logits/chosen": -1.6446380615234375, + "logits/rejected": -1.8471620082855225, + "logps/chosen": -323.9928283691406, + "logps/rejected": -194.76776123046875, + "loss": 0.5308, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0138981342315674, + "rewards/margins": 1.8179337978363037, + "rewards/rejected": -2.831831932067871, + "step": 3467 + }, + { + "epoch": 0.4, + "learning_rate": 1.8169363410889335e-07, + "logits/chosen": -2.436187505722046, + "logits/rejected": -2.117535352706909, + "logps/chosen": -215.30230712890625, + "logps/rejected": -349.6695251464844, + "loss": 0.5181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7542338371276855, + "rewards/margins": 1.3095452785491943, + "rewards/rejected": -3.063779354095459, + "step": 3468 + }, + { + "epoch": 0.4, + "learning_rate": 1.8165820243297507e-07, + "logits/chosen": -2.23575496673584, + "logits/rejected": -2.237022638320923, + "logps/chosen": -209.94631958007812, + "logps/rejected": -187.55474853515625, + "loss": 0.5578, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0012305974960327, + "rewards/margins": 1.7419180870056152, + "rewards/rejected": -2.7431485652923584, + "step": 3469 + }, + { + "epoch": 0.4, + "learning_rate": 1.816227707570568e-07, + "logits/chosen": -2.0229835510253906, + "logits/rejected": -2.3649144172668457, + "logps/chosen": -256.272216796875, + "logps/rejected": -245.07492065429688, + "loss": 0.4463, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6389966011047363, + "rewards/margins": 1.7969733476638794, + "rewards/rejected": -2.435969829559326, + "step": 3470 + }, + { + "epoch": 0.4, + "learning_rate": 1.815873390811385e-07, + "logits/chosen": -2.314714193344116, + "logits/rejected": -2.226654529571533, + "logps/chosen": -143.49826049804688, + "logps/rejected": -255.7918243408203, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21251842379570007, + "rewards/margins": 1.3284201622009277, + "rewards/rejected": -1.5409386157989502, + "step": 3471 + }, + { + "epoch": 0.4, + "learning_rate": 1.8155190740522026e-07, + "logits/chosen": -2.4801931381225586, + "logits/rejected": -2.494591236114502, + "logps/chosen": -287.0316162109375, + "logps/rejected": -273.40869140625, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.858190655708313, + "rewards/margins": 1.8553781509399414, + "rewards/rejected": -2.713568687438965, + "step": 3472 + }, + { + "epoch": 0.4, + "learning_rate": 1.8151647572930198e-07, + "logits/chosen": -2.9624462127685547, + "logits/rejected": -2.8155722618103027, + "logps/chosen": -196.74366760253906, + "logps/rejected": -230.84494018554688, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.003164291381836, + "rewards/margins": 1.9147225618362427, + "rewards/rejected": -2.917886734008789, + "step": 3473 + }, + { + "epoch": 0.4, + "learning_rate": 1.814810440533837e-07, + "logits/chosen": -2.722914695739746, + "logits/rejected": -2.4370079040527344, + "logps/chosen": -241.6461181640625, + "logps/rejected": -175.7159423828125, + "loss": 0.1466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03708430379629135, + "rewards/margins": 2.8979039192199707, + "rewards/rejected": -2.934988498687744, + "step": 3474 + }, + { + "epoch": 0.4, + "learning_rate": 1.8144561237746545e-07, + "logits/chosen": -1.7048097848892212, + "logits/rejected": -1.9716781377792358, + "logps/chosen": -178.8767852783203, + "logps/rejected": -158.35025024414062, + "loss": 0.3327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5342893600463867, + "rewards/margins": 1.9375873804092407, + "rewards/rejected": -2.471876859664917, + "step": 3475 + }, + { + "epoch": 0.4, + "learning_rate": 1.8141018070154718e-07, + "logits/chosen": -2.469611167907715, + "logits/rejected": -2.6071360111236572, + "logps/chosen": -411.48944091796875, + "logps/rejected": -358.63494873046875, + "loss": 0.408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9457463026046753, + "rewards/margins": 0.9883435368537903, + "rewards/rejected": -1.9340897798538208, + "step": 3476 + }, + { + "epoch": 0.4, + "learning_rate": 1.8137474902562892e-07, + "logits/chosen": -2.190236806869507, + "logits/rejected": -2.340275764465332, + "logps/chosen": -371.406982421875, + "logps/rejected": -312.7951965332031, + "loss": 1.0041, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7384518384933472, + "rewards/margins": 1.2432079315185547, + "rewards/rejected": -2.9816598892211914, + "step": 3477 + }, + { + "epoch": 0.4, + "learning_rate": 1.8133931734971065e-07, + "logits/chosen": -2.0100231170654297, + "logits/rejected": -2.238492727279663, + "logps/chosen": -540.544677734375, + "logps/rejected": -260.81964111328125, + "loss": 0.1303, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0537341833114624, + "rewards/margins": 2.8624868392944336, + "rewards/rejected": -3.9162209033966064, + "step": 3478 + }, + { + "epoch": 0.4, + "learning_rate": 1.8130388567379237e-07, + "logits/chosen": -2.8566157817840576, + "logits/rejected": -2.6121459007263184, + "logps/chosen": -286.8302917480469, + "logps/rejected": -219.6306915283203, + "loss": 0.5166, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4558669328689575, + "rewards/margins": 3.2675795555114746, + "rewards/rejected": -4.723446369171143, + "step": 3479 + }, + { + "epoch": 0.4, + "learning_rate": 1.812684539978741e-07, + "logits/chosen": -2.5391106605529785, + "logits/rejected": -2.3765158653259277, + "logps/chosen": -248.482177734375, + "logps/rejected": -293.71917724609375, + "loss": 0.1909, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5479005575180054, + "rewards/margins": 2.992936372756958, + "rewards/rejected": -3.540836811065674, + "step": 3480 + }, + { + "epoch": 0.4, + "learning_rate": 1.812330223219558e-07, + "logits/chosen": -2.289834499359131, + "logits/rejected": -2.6613874435424805, + "logps/chosen": -453.4770202636719, + "logps/rejected": -229.69097900390625, + "loss": 0.4775, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9666922092437744, + "rewards/margins": 1.94059419631958, + "rewards/rejected": -2.9072864055633545, + "step": 3481 + }, + { + "epoch": 0.41, + "learning_rate": 1.8119759064603754e-07, + "logits/chosen": -1.8712096214294434, + "logits/rejected": -1.8772907257080078, + "logps/chosen": -374.19696044921875, + "logps/rejected": -354.5101013183594, + "loss": 0.4168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7251015305519104, + "rewards/margins": 2.4606475830078125, + "rewards/rejected": -3.185749053955078, + "step": 3482 + }, + { + "epoch": 0.41, + "learning_rate": 1.8116215897011928e-07, + "logits/chosen": -2.292731285095215, + "logits/rejected": -2.1961114406585693, + "logps/chosen": -315.6981506347656, + "logps/rejected": -263.0450134277344, + "loss": 0.3967, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0341318845748901, + "rewards/margins": 1.7900776863098145, + "rewards/rejected": -2.824209690093994, + "step": 3483 + }, + { + "epoch": 0.41, + "learning_rate": 1.81126727294201e-07, + "logits/chosen": -2.0701613426208496, + "logits/rejected": -2.54555344581604, + "logps/chosen": -369.0523681640625, + "logps/rejected": -306.4394836425781, + "loss": 0.5889, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5256201028823853, + "rewards/margins": 1.6960835456848145, + "rewards/rejected": -2.2217037677764893, + "step": 3484 + }, + { + "epoch": 0.41, + "learning_rate": 1.8109129561828273e-07, + "logits/chosen": -1.6034080982208252, + "logits/rejected": -1.8846237659454346, + "logps/chosen": -285.5194396972656, + "logps/rejected": -279.722412109375, + "loss": 0.6802, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2692179679870605, + "rewards/margins": 1.1913917064666748, + "rewards/rejected": -3.4606096744537354, + "step": 3485 + }, + { + "epoch": 0.41, + "learning_rate": 1.8105586394236445e-07, + "logits/chosen": -1.8610413074493408, + "logits/rejected": -1.8935245275497437, + "logps/chosen": -180.952392578125, + "logps/rejected": -164.76702880859375, + "loss": 0.3643, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8129696249961853, + "rewards/margins": 1.7661027908325195, + "rewards/rejected": -2.5790724754333496, + "step": 3486 + }, + { + "epoch": 0.41, + "learning_rate": 1.810204322664462e-07, + "logits/chosen": -2.23494291305542, + "logits/rejected": -2.0021286010742188, + "logps/chosen": -220.959716796875, + "logps/rejected": -366.25482177734375, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27513551712036133, + "rewards/margins": 2.357337474822998, + "rewards/rejected": -2.6324732303619385, + "step": 3487 + }, + { + "epoch": 0.41, + "learning_rate": 1.8098500059052795e-07, + "logits/chosen": -2.4572815895080566, + "logits/rejected": -2.74783992767334, + "logps/chosen": -450.77935791015625, + "logps/rejected": -287.99066162109375, + "loss": 0.932, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7419416904449463, + "rewards/margins": 1.1591975688934326, + "rewards/rejected": -3.9011390209198, + "step": 3488 + }, + { + "epoch": 0.41, + "learning_rate": 1.8094956891460967e-07, + "logits/chosen": -2.1882410049438477, + "logits/rejected": -2.083984851837158, + "logps/chosen": -210.57313537597656, + "logps/rejected": -311.7477722167969, + "loss": 0.6179, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1949985027313232, + "rewards/margins": 1.1025619506835938, + "rewards/rejected": -2.297560691833496, + "step": 3489 + }, + { + "epoch": 0.41, + "learning_rate": 1.809141372386914e-07, + "logits/chosen": -2.760089874267578, + "logits/rejected": -2.685616970062256, + "logps/chosen": -164.60475158691406, + "logps/rejected": -216.81430053710938, + "loss": 0.2386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5426988005638123, + "rewards/margins": 2.8257641792297363, + "rewards/rejected": -3.3684630393981934, + "step": 3490 + }, + { + "epoch": 0.41, + "learning_rate": 1.8087870556277311e-07, + "logits/chosen": -2.046090841293335, + "logits/rejected": -1.7054047584533691, + "logps/chosen": -238.82742309570312, + "logps/rejected": -287.22332763671875, + "loss": 0.862, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7993741035461426, + "rewards/margins": -0.12083463370800018, + "rewards/rejected": -1.678539514541626, + "step": 3491 + }, + { + "epoch": 0.41, + "learning_rate": 1.8084327388685484e-07, + "logits/chosen": -2.0527124404907227, + "logits/rejected": -1.7574806213378906, + "logps/chosen": -231.71376037597656, + "logps/rejected": -313.06182861328125, + "loss": 1.0016, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0811411142349243, + "rewards/margins": 1.249955415725708, + "rewards/rejected": -2.3310964107513428, + "step": 3492 + }, + { + "epoch": 0.41, + "learning_rate": 1.8080784221093656e-07, + "logits/chosen": -2.3059990406036377, + "logits/rejected": -2.4513185024261475, + "logps/chosen": -288.9188232421875, + "logps/rejected": -256.84271240234375, + "loss": 0.4665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2827642858028412, + "rewards/margins": 2.0554280281066895, + "rewards/rejected": -2.3381922245025635, + "step": 3493 + }, + { + "epoch": 0.41, + "learning_rate": 1.807724105350183e-07, + "logits/chosen": -2.375497817993164, + "logits/rejected": -2.4758927822113037, + "logps/chosen": -301.5941467285156, + "logps/rejected": -253.87278747558594, + "loss": 0.2251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.983431339263916, + "rewards/margins": 2.6547458171844482, + "rewards/rejected": -3.6381771564483643, + "step": 3494 + }, + { + "epoch": 0.41, + "learning_rate": 1.8073697885910003e-07, + "logits/chosen": -2.234487533569336, + "logits/rejected": -2.447134256362915, + "logps/chosen": -375.3169250488281, + "logps/rejected": -286.3021240234375, + "loss": 0.5735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8082699775695801, + "rewards/margins": 1.4392603635787964, + "rewards/rejected": -2.247530460357666, + "step": 3495 + }, + { + "epoch": 0.41, + "learning_rate": 1.8070154718318175e-07, + "logits/chosen": -2.3936147689819336, + "logits/rejected": -2.562300205230713, + "logps/chosen": -483.7294006347656, + "logps/rejected": -306.5520324707031, + "loss": 0.5889, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3517714738845825, + "rewards/margins": 1.3163055181503296, + "rewards/rejected": -2.668076992034912, + "step": 3496 + }, + { + "epoch": 0.41, + "learning_rate": 1.8066611550726347e-07, + "logits/chosen": -2.871753454208374, + "logits/rejected": -2.822007179260254, + "logps/chosen": -96.70232391357422, + "logps/rejected": -265.482666015625, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031190797686576843, + "rewards/margins": 4.913817405700684, + "rewards/rejected": -4.945008277893066, + "step": 3497 + }, + { + "epoch": 0.41, + "learning_rate": 1.806306838313452e-07, + "logits/chosen": -1.9708285331726074, + "logits/rejected": -2.004542589187622, + "logps/chosen": -295.3489685058594, + "logps/rejected": -360.9647521972656, + "loss": 0.4208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8711240887641907, + "rewards/margins": 1.293111801147461, + "rewards/rejected": -2.164236068725586, + "step": 3498 + }, + { + "epoch": 0.41, + "learning_rate": 1.8059525215542697e-07, + "logits/chosen": -1.8350584506988525, + "logits/rejected": -1.9046859741210938, + "logps/chosen": -368.670654296875, + "logps/rejected": -369.3427734375, + "loss": 0.5639, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.269379734992981, + "rewards/margins": 1.5329030752182007, + "rewards/rejected": -2.8022828102111816, + "step": 3499 + }, + { + "epoch": 0.41, + "learning_rate": 1.805598204795087e-07, + "logits/chosen": -2.3209214210510254, + "logits/rejected": -2.073077440261841, + "logps/chosen": -311.97979736328125, + "logps/rejected": -241.4569091796875, + "loss": 0.1676, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1335914134979248, + "rewards/margins": 2.1335089206695557, + "rewards/rejected": -3.2671003341674805, + "step": 3500 + }, + { + "epoch": 0.41, + "learning_rate": 1.8052438880359041e-07, + "logits/chosen": -2.7670509815216064, + "logits/rejected": -2.384779453277588, + "logps/chosen": -229.99908447265625, + "logps/rejected": -215.5937957763672, + "loss": 0.4046, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6581175327301025, + "rewards/margins": 1.1227374076843262, + "rewards/rejected": -1.7808549404144287, + "step": 3501 + }, + { + "epoch": 0.41, + "learning_rate": 1.8048895712767214e-07, + "logits/chosen": -1.5841097831726074, + "logits/rejected": -1.7556402683258057, + "logps/chosen": -474.21783447265625, + "logps/rejected": -300.3135986328125, + "loss": 0.8974, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8037099242210388, + "rewards/margins": 1.026113748550415, + "rewards/rejected": -1.829823613166809, + "step": 3502 + }, + { + "epoch": 0.41, + "learning_rate": 1.8045352545175386e-07, + "logits/chosen": -1.4763684272766113, + "logits/rejected": -1.6163674592971802, + "logps/chosen": -275.1263122558594, + "logps/rejected": -289.859375, + "loss": 0.29, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3430905342102051, + "rewards/margins": 1.8014004230499268, + "rewards/rejected": -2.144490957260132, + "step": 3503 + }, + { + "epoch": 0.41, + "learning_rate": 1.8041809377583558e-07, + "logits/chosen": -2.443110942840576, + "logits/rejected": -2.2346606254577637, + "logps/chosen": -175.27273559570312, + "logps/rejected": -355.02685546875, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27173158526420593, + "rewards/margins": 3.894028663635254, + "rewards/rejected": -4.165760040283203, + "step": 3504 + }, + { + "epoch": 0.41, + "learning_rate": 1.803826620999173e-07, + "logits/chosen": -1.6112265586853027, + "logits/rejected": -1.9491150379180908, + "logps/chosen": -521.761962890625, + "logps/rejected": -339.87762451171875, + "loss": 0.3651, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6493743658065796, + "rewards/margins": 1.5721012353897095, + "rewards/rejected": -2.221475601196289, + "step": 3505 + }, + { + "epoch": 0.41, + "learning_rate": 1.8034723042399905e-07, + "logits/chosen": -2.650249481201172, + "logits/rejected": -2.5606021881103516, + "logps/chosen": -93.33280944824219, + "logps/rejected": -166.89125061035156, + "loss": 0.3666, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5659737586975098, + "rewards/margins": 1.8510222434997559, + "rewards/rejected": -3.4169960021972656, + "step": 3506 + }, + { + "epoch": 0.41, + "learning_rate": 1.8031179874808077e-07, + "logits/chosen": -2.2135753631591797, + "logits/rejected": -2.378681182861328, + "logps/chosen": -398.87066650390625, + "logps/rejected": -267.5250244140625, + "loss": 0.3053, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0656906366348267, + "rewards/margins": 2.171907901763916, + "rewards/rejected": -3.237598180770874, + "step": 3507 + }, + { + "epoch": 0.41, + "learning_rate": 1.802763670721625e-07, + "logits/chosen": -2.9413599967956543, + "logits/rejected": -2.9726080894470215, + "logps/chosen": -162.19247436523438, + "logps/rejected": -218.4949188232422, + "loss": 0.5164, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9628802537918091, + "rewards/margins": 1.8480364084243774, + "rewards/rejected": -2.8109166622161865, + "step": 3508 + }, + { + "epoch": 0.41, + "learning_rate": 1.8024093539624422e-07, + "logits/chosen": -2.6523141860961914, + "logits/rejected": -2.5905001163482666, + "logps/chosen": -301.1302490234375, + "logps/rejected": -346.9701232910156, + "loss": 0.4735, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9804761409759521, + "rewards/margins": 1.849817156791687, + "rewards/rejected": -2.8302931785583496, + "step": 3509 + }, + { + "epoch": 0.41, + "learning_rate": 1.80205503720326e-07, + "logits/chosen": -2.1097044944763184, + "logits/rejected": -2.0307507514953613, + "logps/chosen": -324.13385009765625, + "logps/rejected": -342.94622802734375, + "loss": 0.3942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5603328347206116, + "rewards/margins": 1.3314919471740723, + "rewards/rejected": -1.8918248414993286, + "step": 3510 + }, + { + "epoch": 0.41, + "learning_rate": 1.8017007204440772e-07, + "logits/chosen": -3.062049150466919, + "logits/rejected": -3.036087989807129, + "logps/chosen": -294.62030029296875, + "logps/rejected": -280.7332763671875, + "loss": 0.3501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.458784282207489, + "rewards/margins": 2.485551357269287, + "rewards/rejected": -2.944335460662842, + "step": 3511 + }, + { + "epoch": 0.41, + "learning_rate": 1.8013464036848944e-07, + "logits/chosen": -2.5273399353027344, + "logits/rejected": -2.7085394859313965, + "logps/chosen": -256.06280517578125, + "logps/rejected": -320.10601806640625, + "loss": 0.3613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.516601026058197, + "rewards/margins": 2.4955244064331055, + "rewards/rejected": -3.0121254920959473, + "step": 3512 + }, + { + "epoch": 0.41, + "learning_rate": 1.8009920869257116e-07, + "logits/chosen": -2.1126956939697266, + "logits/rejected": -1.93386709690094, + "logps/chosen": -398.8091735839844, + "logps/rejected": -679.89794921875, + "loss": 0.1578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3739142417907715, + "rewards/margins": 2.4208693504333496, + "rewards/rejected": -2.794783592224121, + "step": 3513 + }, + { + "epoch": 0.41, + "learning_rate": 1.8006377701665288e-07, + "logits/chosen": -1.780928373336792, + "logits/rejected": -1.9208498001098633, + "logps/chosen": -295.7181091308594, + "logps/rejected": -297.3642883300781, + "loss": 0.3326, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5296118855476379, + "rewards/margins": 2.3239622116088867, + "rewards/rejected": -2.853574275970459, + "step": 3514 + }, + { + "epoch": 0.41, + "learning_rate": 1.800283453407346e-07, + "logits/chosen": -2.2927370071411133, + "logits/rejected": -2.2423202991485596, + "logps/chosen": -244.54112243652344, + "logps/rejected": -206.78167724609375, + "loss": 0.5312, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6663179397583008, + "rewards/margins": 0.9385527968406677, + "rewards/rejected": -1.6048707962036133, + "step": 3515 + }, + { + "epoch": 0.41, + "learning_rate": 1.7999291366481633e-07, + "logits/chosen": -2.1340131759643555, + "logits/rejected": -1.9963161945343018, + "logps/chosen": -292.4080810546875, + "logps/rejected": -326.043212890625, + "loss": 0.24, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6062695980072021, + "rewards/margins": 2.322446346282959, + "rewards/rejected": -2.928715705871582, + "step": 3516 + }, + { + "epoch": 0.41, + "learning_rate": 1.7995748198889807e-07, + "logits/chosen": -2.1126465797424316, + "logits/rejected": -2.2372891902923584, + "logps/chosen": -256.61053466796875, + "logps/rejected": -286.8475646972656, + "loss": 0.3407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41370606422424316, + "rewards/margins": 2.21718168258667, + "rewards/rejected": -2.630887746810913, + "step": 3517 + }, + { + "epoch": 0.41, + "learning_rate": 1.799220503129798e-07, + "logits/chosen": -1.7795729637145996, + "logits/rejected": -1.4197008609771729, + "logps/chosen": -255.68893432617188, + "logps/rejected": -491.74407958984375, + "loss": 0.4916, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4601662158966064, + "rewards/margins": 1.8375284671783447, + "rewards/rejected": -3.297694683074951, + "step": 3518 + }, + { + "epoch": 0.41, + "learning_rate": 1.7988661863706152e-07, + "logits/chosen": -2.5678749084472656, + "logits/rejected": -2.712343215942383, + "logps/chosen": -325.0626220703125, + "logps/rejected": -282.9248046875, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5546925067901611, + "rewards/margins": 3.773632526397705, + "rewards/rejected": -4.328324794769287, + "step": 3519 + }, + { + "epoch": 0.41, + "learning_rate": 1.7985118696114324e-07, + "logits/chosen": -2.684727907180786, + "logits/rejected": -2.7776882648468018, + "logps/chosen": -306.3103332519531, + "logps/rejected": -250.71902465820312, + "loss": 0.1226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16544845700263977, + "rewards/margins": 3.3635013103485107, + "rewards/rejected": -3.5289502143859863, + "step": 3520 + }, + { + "epoch": 0.41, + "learning_rate": 1.7981575528522496e-07, + "logits/chosen": -2.343658924102783, + "logits/rejected": -2.5683722496032715, + "logps/chosen": -337.3077392578125, + "logps/rejected": -239.2969512939453, + "loss": 0.3855, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0015336275100708, + "rewards/margins": 1.9261853694915771, + "rewards/rejected": -2.9277191162109375, + "step": 3521 + }, + { + "epoch": 0.41, + "learning_rate": 1.7978032360930674e-07, + "logits/chosen": -2.182919502258301, + "logits/rejected": -2.2048633098602295, + "logps/chosen": -460.90325927734375, + "logps/rejected": -355.2412109375, + "loss": 0.2833, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2295035123825073, + "rewards/margins": 2.1774675846099854, + "rewards/rejected": -3.406970977783203, + "step": 3522 + }, + { + "epoch": 0.41, + "learning_rate": 1.7974489193338846e-07, + "logits/chosen": -1.9187732934951782, + "logits/rejected": -1.9722661972045898, + "logps/chosen": -400.7926330566406, + "logps/rejected": -451.9626770019531, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5209985971450806, + "rewards/margins": 2.953122615814209, + "rewards/rejected": -3.47412109375, + "step": 3523 + }, + { + "epoch": 0.41, + "learning_rate": 1.7970946025747018e-07, + "logits/chosen": -2.0372047424316406, + "logits/rejected": -1.7956178188323975, + "logps/chosen": -256.33807373046875, + "logps/rejected": -227.98388671875, + "loss": 0.841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6912475824356079, + "rewards/margins": 1.1883018016815186, + "rewards/rejected": -1.8795496225357056, + "step": 3524 + }, + { + "epoch": 0.41, + "learning_rate": 1.796740285815519e-07, + "logits/chosen": -2.0953521728515625, + "logits/rejected": -2.1761341094970703, + "logps/chosen": -265.03973388671875, + "logps/rejected": -220.97825622558594, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3149622678756714, + "rewards/margins": 0.9173247218132019, + "rewards/rejected": -2.2322869300842285, + "step": 3525 + }, + { + "epoch": 0.41, + "learning_rate": 1.7963859690563363e-07, + "logits/chosen": -1.6164772510528564, + "logits/rejected": -1.7761706113815308, + "logps/chosen": -340.1525573730469, + "logps/rejected": -401.46380615234375, + "loss": 0.6148, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1540478467941284, + "rewards/margins": 1.143075942993164, + "rewards/rejected": -2.297123908996582, + "step": 3526 + }, + { + "epoch": 0.41, + "learning_rate": 1.7960316522971535e-07, + "logits/chosen": -1.5704371929168701, + "logits/rejected": -2.412461757659912, + "logps/chosen": -430.100830078125, + "logps/rejected": -238.9053192138672, + "loss": 0.8264, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6638660430908203, + "rewards/margins": 1.5185625553131104, + "rewards/rejected": -3.1824283599853516, + "step": 3527 + }, + { + "epoch": 0.41, + "learning_rate": 1.795677335537971e-07, + "logits/chosen": -2.345571994781494, + "logits/rejected": -2.446469783782959, + "logps/chosen": -253.28292846679688, + "logps/rejected": -382.1732177734375, + "loss": 0.8381, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7856773138046265, + "rewards/margins": 2.4898681640625, + "rewards/rejected": -4.275545597076416, + "step": 3528 + }, + { + "epoch": 0.41, + "learning_rate": 1.7953230187787882e-07, + "logits/chosen": -1.757242202758789, + "logits/rejected": -1.9060916900634766, + "logps/chosen": -436.24365234375, + "logps/rejected": -448.3809814453125, + "loss": 0.3436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5778489112854004, + "rewards/margins": 1.7005529403686523, + "rewards/rejected": -2.2784018516540527, + "step": 3529 + }, + { + "epoch": 0.41, + "learning_rate": 1.7949687020196054e-07, + "logits/chosen": -2.4470512866973877, + "logits/rejected": -2.5518784523010254, + "logps/chosen": -501.38458251953125, + "logps/rejected": -380.09503173828125, + "loss": 0.2604, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.362999439239502, + "rewards/margins": 3.1923904418945312, + "rewards/rejected": -4.555389881134033, + "step": 3530 + }, + { + "epoch": 0.41, + "learning_rate": 1.7946143852604226e-07, + "logits/chosen": -2.3042213916778564, + "logits/rejected": -2.732530355453491, + "logps/chosen": -238.48646545410156, + "logps/rejected": -145.545166015625, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6837102174758911, + "rewards/margins": 1.9494374990463257, + "rewards/rejected": -2.633147716522217, + "step": 3531 + }, + { + "epoch": 0.41, + "learning_rate": 1.7942600685012399e-07, + "logits/chosen": -2.0871894359588623, + "logits/rejected": -1.7665197849273682, + "logps/chosen": -252.0256805419922, + "logps/rejected": -283.9460144042969, + "loss": 0.9837, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7659492492675781, + "rewards/margins": 0.19532887637615204, + "rewards/rejected": -1.961277961730957, + "step": 3532 + }, + { + "epoch": 0.41, + "learning_rate": 1.793905751742057e-07, + "logits/chosen": -2.557657241821289, + "logits/rejected": -2.61757755279541, + "logps/chosen": -335.87109375, + "logps/rejected": -241.59959411621094, + "loss": 1.2604, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.322140693664551, + "rewards/margins": 0.5192098617553711, + "rewards/rejected": -2.8413500785827637, + "step": 3533 + }, + { + "epoch": 0.41, + "learning_rate": 1.7935514349828748e-07, + "logits/chosen": -1.8706696033477783, + "logits/rejected": -1.7128409147262573, + "logps/chosen": -422.0047912597656, + "logps/rejected": -315.1707458496094, + "loss": 0.4431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0879558324813843, + "rewards/margins": 0.8794743418693542, + "rewards/rejected": -1.9674302339553833, + "step": 3534 + }, + { + "epoch": 0.41, + "learning_rate": 1.793197118223692e-07, + "logits/chosen": -2.341827154159546, + "logits/rejected": -2.3963005542755127, + "logps/chosen": -353.41265869140625, + "logps/rejected": -372.6905517578125, + "loss": 0.5782, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9014909267425537, + "rewards/margins": 1.74837327003479, + "rewards/rejected": -3.6498639583587646, + "step": 3535 + }, + { + "epoch": 0.41, + "learning_rate": 1.7928428014645093e-07, + "logits/chosen": -2.5787148475646973, + "logits/rejected": -2.742330312728882, + "logps/chosen": -245.3394012451172, + "logps/rejected": -267.8628234863281, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1837018728256226, + "rewards/margins": 2.1545286178588867, + "rewards/rejected": -3.3382303714752197, + "step": 3536 + }, + { + "epoch": 0.41, + "learning_rate": 1.7924884847053265e-07, + "logits/chosen": -2.4647176265716553, + "logits/rejected": -2.455451488494873, + "logps/chosen": -295.1525573730469, + "logps/rejected": -391.12188720703125, + "loss": 0.4218, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8545994758605957, + "rewards/margins": 0.995447039604187, + "rewards/rejected": -2.850046157836914, + "step": 3537 + }, + { + "epoch": 0.41, + "learning_rate": 1.7921341679461437e-07, + "logits/chosen": -1.7004281282424927, + "logits/rejected": -1.9507696628570557, + "logps/chosen": -494.37646484375, + "logps/rejected": -227.14193725585938, + "loss": 0.4808, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1681797504425049, + "rewards/margins": 0.7754212617874146, + "rewards/rejected": -1.9436010122299194, + "step": 3538 + }, + { + "epoch": 0.41, + "learning_rate": 1.7917798511869612e-07, + "logits/chosen": -1.7739512920379639, + "logits/rejected": -1.9718095064163208, + "logps/chosen": -437.89849853515625, + "logps/rejected": -357.6981201171875, + "loss": 0.5267, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0080795288085938, + "rewards/margins": 0.7793274521827698, + "rewards/rejected": -1.7874071598052979, + "step": 3539 + }, + { + "epoch": 0.41, + "learning_rate": 1.7914255344277784e-07, + "logits/chosen": -2.911754608154297, + "logits/rejected": -2.804948568344116, + "logps/chosen": -157.03355407714844, + "logps/rejected": -169.161865234375, + "loss": 0.2739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6699576377868652, + "rewards/margins": 2.372821807861328, + "rewards/rejected": -3.0427799224853516, + "step": 3540 + }, + { + "epoch": 0.41, + "learning_rate": 1.7910712176685956e-07, + "logits/chosen": -2.305417537689209, + "logits/rejected": -2.5289762020111084, + "logps/chosen": -529.061279296875, + "logps/rejected": -326.00439453125, + "loss": 0.2355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.999625563621521, + "rewards/margins": 2.4277429580688477, + "rewards/rejected": -3.427368640899658, + "step": 3541 + }, + { + "epoch": 0.41, + "learning_rate": 1.7907169009094129e-07, + "logits/chosen": -2.047551393508911, + "logits/rejected": -2.28385329246521, + "logps/chosen": -385.5389709472656, + "logps/rejected": -283.9618225097656, + "loss": 0.3993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022163957357406616, + "rewards/margins": 1.9295943975448608, + "rewards/rejected": -1.9517585039138794, + "step": 3542 + }, + { + "epoch": 0.41, + "learning_rate": 1.79036258415023e-07, + "logits/chosen": -2.390101671218872, + "logits/rejected": -2.5970568656921387, + "logps/chosen": -252.03076171875, + "logps/rejected": -175.04025268554688, + "loss": 0.644, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.897097110748291, + "rewards/margins": 0.5689926147460938, + "rewards/rejected": -1.4660897254943848, + "step": 3543 + }, + { + "epoch": 0.41, + "learning_rate": 1.7900082673910473e-07, + "logits/chosen": -2.293724536895752, + "logits/rejected": -2.166513442993164, + "logps/chosen": -289.19854736328125, + "logps/rejected": -250.8894805908203, + "loss": 0.4317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5054531097412109, + "rewards/margins": 1.270387887954712, + "rewards/rejected": -1.7758409976959229, + "step": 3544 + }, + { + "epoch": 0.41, + "learning_rate": 1.789653950631865e-07, + "logits/chosen": -2.3611490726470947, + "logits/rejected": -2.4121742248535156, + "logps/chosen": -354.8812255859375, + "logps/rejected": -219.28604125976562, + "loss": 0.1796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8508141040802002, + "rewards/margins": 2.677259683609009, + "rewards/rejected": -3.528073787689209, + "step": 3545 + }, + { + "epoch": 0.41, + "learning_rate": 1.7892996338726823e-07, + "logits/chosen": -2.5104284286499023, + "logits/rejected": -2.669137954711914, + "logps/chosen": -206.39553833007812, + "logps/rejected": -268.5229797363281, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.100898265838623, + "rewards/margins": 2.9684388637542725, + "rewards/rejected": -4.069336891174316, + "step": 3546 + }, + { + "epoch": 0.41, + "learning_rate": 1.7889453171134995e-07, + "logits/chosen": -2.480896234512329, + "logits/rejected": -2.676133155822754, + "logps/chosen": -329.6253356933594, + "logps/rejected": -279.0306091308594, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6654515266418457, + "rewards/margins": 3.365018367767334, + "rewards/rejected": -4.03046989440918, + "step": 3547 + }, + { + "epoch": 0.41, + "learning_rate": 1.7885910003543167e-07, + "logits/chosen": -2.8447577953338623, + "logits/rejected": -2.5669243335723877, + "logps/chosen": -155.79615783691406, + "logps/rejected": -218.80670166015625, + "loss": 0.3051, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4439754486083984, + "rewards/margins": 2.430471897125244, + "rewards/rejected": -3.8744473457336426, + "step": 3548 + }, + { + "epoch": 0.41, + "learning_rate": 1.788236683595134e-07, + "logits/chosen": -2.5551083087921143, + "logits/rejected": -2.35359525680542, + "logps/chosen": -274.81097412109375, + "logps/rejected": -388.0503845214844, + "loss": 0.286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6405341625213623, + "rewards/margins": 2.932678699493408, + "rewards/rejected": -3.5732128620147705, + "step": 3549 + }, + { + "epoch": 0.41, + "learning_rate": 1.7878823668359512e-07, + "logits/chosen": -1.6210076808929443, + "logits/rejected": -1.4251939058303833, + "logps/chosen": -289.1561279296875, + "logps/rejected": -381.55828857421875, + "loss": 0.682, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0639681816101074, + "rewards/margins": 1.4740254878997803, + "rewards/rejected": -2.537993907928467, + "step": 3550 + }, + { + "epoch": 0.41, + "learning_rate": 1.7875280500767686e-07, + "logits/chosen": -2.367678165435791, + "logits/rejected": -2.5169663429260254, + "logps/chosen": -225.14309692382812, + "logps/rejected": -190.63180541992188, + "loss": 0.4571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6098225116729736, + "rewards/margins": 1.59074068069458, + "rewards/rejected": -2.2005631923675537, + "step": 3551 + }, + { + "epoch": 0.41, + "learning_rate": 1.787173733317586e-07, + "logits/chosen": -2.5995543003082275, + "logits/rejected": -2.5345382690429688, + "logps/chosen": -90.4053955078125, + "logps/rejected": -151.53567504882812, + "loss": 1.2431, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2667312622070312, + "rewards/margins": -0.011912524700164795, + "rewards/rejected": -2.2548186779022217, + "step": 3552 + }, + { + "epoch": 0.41, + "learning_rate": 1.786819416558403e-07, + "logits/chosen": -2.086848258972168, + "logits/rejected": -2.436957597732544, + "logps/chosen": -279.67852783203125, + "logps/rejected": -192.96359252929688, + "loss": 0.1879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26876595616340637, + "rewards/margins": 3.0440266132354736, + "rewards/rejected": -3.3127923011779785, + "step": 3553 + }, + { + "epoch": 0.41, + "learning_rate": 1.7864650997992203e-07, + "logits/chosen": -2.781536340713501, + "logits/rejected": -2.6414291858673096, + "logps/chosen": -183.00436401367188, + "logps/rejected": -178.260009765625, + "loss": 0.3951, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7767140865325928, + "rewards/margins": 1.3530120849609375, + "rewards/rejected": -3.1297261714935303, + "step": 3554 + }, + { + "epoch": 0.41, + "learning_rate": 1.7861107830400375e-07, + "logits/chosen": -2.7459869384765625, + "logits/rejected": -2.7292261123657227, + "logps/chosen": -306.73883056640625, + "logps/rejected": -355.31353759765625, + "loss": 0.4011, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7559719085693359, + "rewards/margins": 3.492588996887207, + "rewards/rejected": -4.248560905456543, + "step": 3555 + }, + { + "epoch": 0.41, + "learning_rate": 1.7857564662808548e-07, + "logits/chosen": -2.2535552978515625, + "logits/rejected": -2.2723939418792725, + "logps/chosen": -247.45884704589844, + "logps/rejected": -276.532958984375, + "loss": 0.4472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.026872158050537, + "rewards/margins": 2.3950953483581543, + "rewards/rejected": -3.4219675064086914, + "step": 3556 + }, + { + "epoch": 0.41, + "learning_rate": 1.7854021495216725e-07, + "logits/chosen": -2.839066982269287, + "logits/rejected": -2.7557475566864014, + "logps/chosen": -110.80368041992188, + "logps/rejected": -173.53857421875, + "loss": 0.3137, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4238817095756531, + "rewards/margins": 2.6966474056243896, + "rewards/rejected": -3.1205291748046875, + "step": 3557 + }, + { + "epoch": 0.41, + "learning_rate": 1.7850478327624897e-07, + "logits/chosen": -2.57080078125, + "logits/rejected": -2.656045913696289, + "logps/chosen": -177.0863494873047, + "logps/rejected": -321.3726501464844, + "loss": 0.3506, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4827063083648682, + "rewards/margins": 2.9566259384155273, + "rewards/rejected": -4.439332008361816, + "step": 3558 + }, + { + "epoch": 0.41, + "learning_rate": 1.784693516003307e-07, + "logits/chosen": -1.5809412002563477, + "logits/rejected": -2.154513359069824, + "logps/chosen": -367.22540283203125, + "logps/rejected": -354.9560546875, + "loss": 0.2447, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.758314311504364, + "rewards/margins": 3.441124200820923, + "rewards/rejected": -4.19943904876709, + "step": 3559 + }, + { + "epoch": 0.41, + "learning_rate": 1.7843391992441242e-07, + "logits/chosen": -2.4254488945007324, + "logits/rejected": -2.488272190093994, + "logps/chosen": -233.5460205078125, + "logps/rejected": -254.53948974609375, + "loss": 0.5147, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7740948796272278, + "rewards/margins": 1.2447701692581177, + "rewards/rejected": -2.0188651084899902, + "step": 3560 + }, + { + "epoch": 0.41, + "learning_rate": 1.7839848824849414e-07, + "logits/chosen": -1.762966275215149, + "logits/rejected": -1.8379013538360596, + "logps/chosen": -369.89849853515625, + "logps/rejected": -246.66749572753906, + "loss": 0.6498, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5052419900894165, + "rewards/margins": 0.4532448947429657, + "rewards/rejected": -0.9584868550300598, + "step": 3561 + }, + { + "epoch": 0.41, + "learning_rate": 1.783630565725759e-07, + "logits/chosen": -2.361143112182617, + "logits/rejected": -2.192044496536255, + "logps/chosen": -293.4867858886719, + "logps/rejected": -277.02056884765625, + "loss": 0.6629, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1997603178024292, + "rewards/margins": 3.244347095489502, + "rewards/rejected": -4.4441070556640625, + "step": 3562 + }, + { + "epoch": 0.41, + "learning_rate": 1.783276248966576e-07, + "logits/chosen": -2.2934041023254395, + "logits/rejected": -2.162379026412964, + "logps/chosen": -462.61248779296875, + "logps/rejected": -396.71942138671875, + "loss": 0.1872, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.041495442390442, + "rewards/margins": 3.1131086349487305, + "rewards/rejected": -4.154603958129883, + "step": 3563 + }, + { + "epoch": 0.41, + "learning_rate": 1.7829219322073933e-07, + "logits/chosen": -2.6462767124176025, + "logits/rejected": -2.7737021446228027, + "logps/chosen": -317.071533203125, + "logps/rejected": -165.41400146484375, + "loss": 0.2775, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1857931911945343, + "rewards/margins": 2.4301185607910156, + "rewards/rejected": -2.6159117221832275, + "step": 3564 + }, + { + "epoch": 0.41, + "learning_rate": 1.7825676154482105e-07, + "logits/chosen": -2.790363073348999, + "logits/rejected": -2.722480297088623, + "logps/chosen": -193.44439697265625, + "logps/rejected": -508.78594970703125, + "loss": 0.7189, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.300142526626587, + "rewards/margins": 1.875503420829773, + "rewards/rejected": -3.1756460666656494, + "step": 3565 + }, + { + "epoch": 0.41, + "learning_rate": 1.7822132986890278e-07, + "logits/chosen": -2.493271827697754, + "logits/rejected": -2.254833221435547, + "logps/chosen": -310.09539794921875, + "logps/rejected": -253.7480926513672, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2590174674987793, + "rewards/margins": 1.9139039516448975, + "rewards/rejected": -3.172921657562256, + "step": 3566 + }, + { + "epoch": 0.41, + "learning_rate": 1.781858981929845e-07, + "logits/chosen": -1.979998230934143, + "logits/rejected": -1.9005886316299438, + "logps/chosen": -170.67770385742188, + "logps/rejected": -251.8497314453125, + "loss": 0.3279, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5751266479492188, + "rewards/margins": 3.0666005611419678, + "rewards/rejected": -3.6417269706726074, + "step": 3567 + }, + { + "epoch": 0.42, + "learning_rate": 1.7815046651706625e-07, + "logits/chosen": -2.5845284461975098, + "logits/rejected": -2.626157760620117, + "logps/chosen": -179.29840087890625, + "logps/rejected": -229.39495849609375, + "loss": 0.2864, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.083378553390503, + "rewards/margins": 4.380307197570801, + "rewards/rejected": -5.463685512542725, + "step": 3568 + }, + { + "epoch": 0.42, + "learning_rate": 1.78115034841148e-07, + "logits/chosen": -2.1874494552612305, + "logits/rejected": -2.142787456512451, + "logps/chosen": -426.765625, + "logps/rejected": -378.71051025390625, + "loss": 0.1005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21815919876098633, + "rewards/margins": 3.244081497192383, + "rewards/rejected": -3.462240695953369, + "step": 3569 + }, + { + "epoch": 0.42, + "learning_rate": 1.7807960316522972e-07, + "logits/chosen": -2.0221614837646484, + "logits/rejected": -1.90340256690979, + "logps/chosen": -316.963623046875, + "logps/rejected": -304.0686950683594, + "loss": 0.4104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34536200761795044, + "rewards/margins": 1.8000059127807617, + "rewards/rejected": -2.1453678607940674, + "step": 3570 + }, + { + "epoch": 0.42, + "learning_rate": 1.7804417148931144e-07, + "logits/chosen": -2.0624890327453613, + "logits/rejected": -2.423828125, + "logps/chosen": -468.12786865234375, + "logps/rejected": -317.5612487792969, + "loss": 0.4791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40505450963974, + "rewards/margins": 2.180227041244507, + "rewards/rejected": -2.5852818489074707, + "step": 3571 + }, + { + "epoch": 0.42, + "learning_rate": 1.7800873981339316e-07, + "logits/chosen": -2.6220409870147705, + "logits/rejected": -2.660689115524292, + "logps/chosen": -244.15060424804688, + "logps/rejected": -222.98370361328125, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.73484867811203, + "rewards/margins": 1.7305066585540771, + "rewards/rejected": -2.465355157852173, + "step": 3572 + }, + { + "epoch": 0.42, + "learning_rate": 1.779733081374749e-07, + "logits/chosen": -2.7767302989959717, + "logits/rejected": -2.654404401779175, + "logps/chosen": -159.46878051757812, + "logps/rejected": -207.09323120117188, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4634143114089966, + "rewards/margins": 3.207425594329834, + "rewards/rejected": -3.670839786529541, + "step": 3573 + }, + { + "epoch": 0.42, + "learning_rate": 1.7793787646155663e-07, + "logits/chosen": -2.261578321456909, + "logits/rejected": -2.408339262008667, + "logps/chosen": -311.24273681640625, + "logps/rejected": -213.4822540283203, + "loss": 0.4469, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9702577590942383, + "rewards/margins": 1.3133666515350342, + "rewards/rejected": -3.2836246490478516, + "step": 3574 + }, + { + "epoch": 0.42, + "learning_rate": 1.7790244478563835e-07, + "logits/chosen": -2.3036224842071533, + "logits/rejected": -2.3730084896087646, + "logps/chosen": -290.65936279296875, + "logps/rejected": -193.30018615722656, + "loss": 0.3107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6184695959091187, + "rewards/margins": 2.126649856567383, + "rewards/rejected": -2.745119333267212, + "step": 3575 + }, + { + "epoch": 0.42, + "learning_rate": 1.7786701310972008e-07, + "logits/chosen": -2.6591644287109375, + "logits/rejected": -2.596353530883789, + "logps/chosen": -357.39239501953125, + "logps/rejected": -297.5425720214844, + "loss": 1.1002, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.900606632232666, + "rewards/margins": 1.0658588409423828, + "rewards/rejected": -2.966465473175049, + "step": 3576 + }, + { + "epoch": 0.42, + "learning_rate": 1.778315814338018e-07, + "logits/chosen": -2.6177947521209717, + "logits/rejected": -2.5440125465393066, + "logps/chosen": -627.4856567382812, + "logps/rejected": -329.6288757324219, + "loss": 0.1689, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8175778388977051, + "rewards/margins": 3.001054048538208, + "rewards/rejected": -3.818632125854492, + "step": 3577 + }, + { + "epoch": 0.42, + "learning_rate": 1.7779614975788352e-07, + "logits/chosen": -2.025712490081787, + "logits/rejected": -2.3920509815216064, + "logps/chosen": -350.3843994140625, + "logps/rejected": -237.90760803222656, + "loss": 0.2389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4798944294452667, + "rewards/margins": 2.5500729084014893, + "rewards/rejected": -3.0299673080444336, + "step": 3578 + }, + { + "epoch": 0.42, + "learning_rate": 1.7776071808196524e-07, + "logits/chosen": -2.768082857131958, + "logits/rejected": -2.6145167350769043, + "logps/chosen": -87.90728759765625, + "logps/rejected": -210.63143920898438, + "loss": 0.6045, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2232578992843628, + "rewards/margins": 2.6498827934265137, + "rewards/rejected": -3.873140335083008, + "step": 3579 + }, + { + "epoch": 0.42, + "learning_rate": 1.77725286406047e-07, + "logits/chosen": -1.7401509284973145, + "logits/rejected": -1.832111120223999, + "logps/chosen": -368.4840393066406, + "logps/rejected": -313.94329833984375, + "loss": 1.2187, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.2665634155273438, + "rewards/margins": -0.09929028153419495, + "rewards/rejected": -3.1672730445861816, + "step": 3580 + }, + { + "epoch": 0.42, + "learning_rate": 1.7768985473012874e-07, + "logits/chosen": -2.432283401489258, + "logits/rejected": -2.452744245529175, + "logps/chosen": -257.9860534667969, + "logps/rejected": -193.98097229003906, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2247353196144104, + "rewards/margins": 2.963430166244507, + "rewards/rejected": -2.738694667816162, + "step": 3581 + }, + { + "epoch": 0.42, + "learning_rate": 1.7765442305421046e-07, + "logits/chosen": -1.7374093532562256, + "logits/rejected": -1.4071621894836426, + "logps/chosen": -240.60130310058594, + "logps/rejected": -265.11376953125, + "loss": 0.4674, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6125986576080322, + "rewards/margins": 1.462754249572754, + "rewards/rejected": -2.075352907180786, + "step": 3582 + }, + { + "epoch": 0.42, + "learning_rate": 1.7761899137829218e-07, + "logits/chosen": -2.8444929122924805, + "logits/rejected": -2.783632278442383, + "logps/chosen": -343.1923828125, + "logps/rejected": -237.08937072753906, + "loss": 0.4051, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.251845121383667, + "rewards/margins": 2.149898052215576, + "rewards/rejected": -3.401743173599243, + "step": 3583 + }, + { + "epoch": 0.42, + "learning_rate": 1.775835597023739e-07, + "logits/chosen": -2.0380043983459473, + "logits/rejected": -2.1664633750915527, + "logps/chosen": -218.8868408203125, + "logps/rejected": -207.6944122314453, + "loss": 0.7806, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3493804931640625, + "rewards/margins": 0.6057875156402588, + "rewards/rejected": -1.9551681280136108, + "step": 3584 + }, + { + "epoch": 0.42, + "learning_rate": 1.7754812802645566e-07, + "logits/chosen": -1.9674994945526123, + "logits/rejected": -2.103658437728882, + "logps/chosen": -514.6777954101562, + "logps/rejected": -317.1293640136719, + "loss": 0.4516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8153533935546875, + "rewards/margins": 2.0653188228607178, + "rewards/rejected": -2.880671977996826, + "step": 3585 + }, + { + "epoch": 0.42, + "learning_rate": 1.7751269635053738e-07, + "logits/chosen": -2.6532673835754395, + "logits/rejected": -2.511963367462158, + "logps/chosen": -189.17442321777344, + "logps/rejected": -206.99172973632812, + "loss": 0.2706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9026861190795898, + "rewards/margins": 1.764678955078125, + "rewards/rejected": -2.667365074157715, + "step": 3586 + }, + { + "epoch": 0.42, + "learning_rate": 1.774772646746191e-07, + "logits/chosen": -2.826728105545044, + "logits/rejected": -2.6105151176452637, + "logps/chosen": -142.25827026367188, + "logps/rejected": -294.17462158203125, + "loss": 0.2584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6224987506866455, + "rewards/margins": 2.319270133972168, + "rewards/rejected": -2.9417686462402344, + "step": 3587 + }, + { + "epoch": 0.42, + "learning_rate": 1.7744183299870082e-07, + "logits/chosen": -2.3481814861297607, + "logits/rejected": -2.044620990753174, + "logps/chosen": -205.29986572265625, + "logps/rejected": -316.3489685058594, + "loss": 1.0331, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.477330207824707, + "rewards/margins": 1.7766166925430298, + "rewards/rejected": -3.2539470195770264, + "step": 3588 + }, + { + "epoch": 0.42, + "learning_rate": 1.7740640132278254e-07, + "logits/chosen": -2.2341949939727783, + "logits/rejected": -2.1889734268188477, + "logps/chosen": -226.51527404785156, + "logps/rejected": -238.739501953125, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7186123132705688, + "rewards/margins": 3.299710273742676, + "rewards/rejected": -4.018322467803955, + "step": 3589 + }, + { + "epoch": 0.42, + "learning_rate": 1.7737096964686427e-07, + "logits/chosen": -2.0105953216552734, + "logits/rejected": -1.8021154403686523, + "logps/chosen": -151.1517333984375, + "logps/rejected": -382.5098876953125, + "loss": 0.4521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39452704787254333, + "rewards/margins": 1.8812806606292725, + "rewards/rejected": -2.2758076190948486, + "step": 3590 + }, + { + "epoch": 0.42, + "learning_rate": 1.7733553797094601e-07, + "logits/chosen": -2.51668119430542, + "logits/rejected": -2.247572898864746, + "logps/chosen": -171.36346435546875, + "logps/rejected": -397.53155517578125, + "loss": 0.4308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9683553576469421, + "rewards/margins": 1.9785417318344116, + "rewards/rejected": -2.946897029876709, + "step": 3591 + }, + { + "epoch": 0.42, + "learning_rate": 1.7730010629502776e-07, + "logits/chosen": -2.838071584701538, + "logits/rejected": -2.5414888858795166, + "logps/chosen": -153.71627807617188, + "logps/rejected": -215.041748046875, + "loss": 0.2599, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3468116521835327, + "rewards/margins": 2.122192144393921, + "rewards/rejected": -2.469003915786743, + "step": 3592 + }, + { + "epoch": 0.42, + "learning_rate": 1.7726467461910949e-07, + "logits/chosen": -2.336592435836792, + "logits/rejected": -2.4484665393829346, + "logps/chosen": -267.7991027832031, + "logps/rejected": -228.49066162109375, + "loss": 0.1501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13436336815357208, + "rewards/margins": 2.288339853286743, + "rewards/rejected": -2.422703266143799, + "step": 3593 + }, + { + "epoch": 0.42, + "learning_rate": 1.772292429431912e-07, + "logits/chosen": -2.1479883193969727, + "logits/rejected": -2.079428195953369, + "logps/chosen": -200.84573364257812, + "logps/rejected": -323.9367980957031, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07172293961048126, + "rewards/margins": 3.0589191913604736, + "rewards/rejected": -3.1306421756744385, + "step": 3594 + }, + { + "epoch": 0.42, + "learning_rate": 1.7719381126727293e-07, + "logits/chosen": -2.1414804458618164, + "logits/rejected": -2.124013662338257, + "logps/chosen": -233.09112548828125, + "logps/rejected": -219.2594757080078, + "loss": 0.4309, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9435594081878662, + "rewards/margins": 1.9618418216705322, + "rewards/rejected": -2.9054014682769775, + "step": 3595 + }, + { + "epoch": 0.42, + "learning_rate": 1.7715837959135468e-07, + "logits/chosen": -2.3214995861053467, + "logits/rejected": -2.4274303913116455, + "logps/chosen": -323.5358581542969, + "logps/rejected": -292.90985107421875, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2833102941513062, + "rewards/margins": 2.7156105041503906, + "rewards/rejected": -3.9989206790924072, + "step": 3596 + }, + { + "epoch": 0.42, + "learning_rate": 1.771229479154364e-07, + "logits/chosen": -2.029998302459717, + "logits/rejected": -2.204063653945923, + "logps/chosen": -362.75396728515625, + "logps/rejected": -280.8029479980469, + "loss": 0.3929, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.520453691482544, + "rewards/margins": 1.7143495082855225, + "rewards/rejected": -3.2348031997680664, + "step": 3597 + }, + { + "epoch": 0.42, + "learning_rate": 1.7708751623951812e-07, + "logits/chosen": -2.3789286613464355, + "logits/rejected": -2.2715697288513184, + "logps/chosen": -193.6046600341797, + "logps/rejected": -333.04931640625, + "loss": 0.8181, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7458158135414124, + "rewards/margins": 0.49195536971092224, + "rewards/rejected": -1.2377711534500122, + "step": 3598 + }, + { + "epoch": 0.42, + "learning_rate": 1.7705208456359984e-07, + "logits/chosen": -2.5348358154296875, + "logits/rejected": -2.5544633865356445, + "logps/chosen": -243.3099365234375, + "logps/rejected": -220.73489379882812, + "loss": 0.2718, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08000697195529938, + "rewards/margins": 2.8680825233459473, + "rewards/rejected": -2.948089122772217, + "step": 3599 + }, + { + "epoch": 0.42, + "learning_rate": 1.7701665288768157e-07, + "logits/chosen": -2.3766307830810547, + "logits/rejected": -2.594350576400757, + "logps/chosen": -362.82098388671875, + "logps/rejected": -447.5337829589844, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0997234582901, + "rewards/margins": 4.818094730377197, + "rewards/rejected": -5.917818069458008, + "step": 3600 + }, + { + "epoch": 0.42, + "learning_rate": 1.769812212117633e-07, + "logits/chosen": -2.704664945602417, + "logits/rejected": -2.80230712890625, + "logps/chosen": -238.21820068359375, + "logps/rejected": -227.79002380371094, + "loss": 0.4534, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3818296790122986, + "rewards/margins": 3.0973236560821533, + "rewards/rejected": -3.4791531562805176, + "step": 3601 + }, + { + "epoch": 0.42, + "learning_rate": 1.7694578953584504e-07, + "logits/chosen": -2.4974918365478516, + "logits/rejected": -2.3183693885803223, + "logps/chosen": -202.5987548828125, + "logps/rejected": -327.6900329589844, + "loss": 0.1487, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.104593276977539, + "rewards/margins": 3.6806461811065674, + "rewards/rejected": -4.7852396965026855, + "step": 3602 + }, + { + "epoch": 0.42, + "learning_rate": 1.7691035785992676e-07, + "logits/chosen": -2.3356196880340576, + "logits/rejected": -2.3622946739196777, + "logps/chosen": -164.20257568359375, + "logps/rejected": -286.4875793457031, + "loss": 0.552, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9016519784927368, + "rewards/margins": 3.297826051712036, + "rewards/rejected": -4.1994781494140625, + "step": 3603 + }, + { + "epoch": 0.42, + "learning_rate": 1.768749261840085e-07, + "logits/chosen": -2.0031211376190186, + "logits/rejected": -1.7946248054504395, + "logps/chosen": -196.7084197998047, + "logps/rejected": -336.53106689453125, + "loss": 0.4314, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8308393955230713, + "rewards/margins": 1.4122384786605835, + "rewards/rejected": -2.2430777549743652, + "step": 3604 + }, + { + "epoch": 0.42, + "learning_rate": 1.7683949450809023e-07, + "logits/chosen": -2.0044994354248047, + "logits/rejected": -2.441993236541748, + "logps/chosen": -317.69305419921875, + "logps/rejected": -299.50958251953125, + "loss": 0.2989, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8257529735565186, + "rewards/margins": 3.0481374263763428, + "rewards/rejected": -3.8738903999328613, + "step": 3605 + }, + { + "epoch": 0.42, + "learning_rate": 1.7680406283217195e-07, + "logits/chosen": -2.4023056030273438, + "logits/rejected": -2.4707415103912354, + "logps/chosen": -249.3460693359375, + "logps/rejected": -272.2886047363281, + "loss": 0.6729, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8009041547775269, + "rewards/margins": 1.5948766469955444, + "rewards/rejected": -3.395780563354492, + "step": 3606 + }, + { + "epoch": 0.42, + "learning_rate": 1.767686311562537e-07, + "logits/chosen": -2.607919692993164, + "logits/rejected": -2.7359237670898438, + "logps/chosen": -207.57656860351562, + "logps/rejected": -148.43223571777344, + "loss": 0.1703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9396916031837463, + "rewards/margins": 2.1709530353546143, + "rewards/rejected": -3.1106443405151367, + "step": 3607 + }, + { + "epoch": 0.42, + "learning_rate": 1.7673319948033542e-07, + "logits/chosen": -2.688372850418091, + "logits/rejected": -2.842137098312378, + "logps/chosen": -299.37530517578125, + "logps/rejected": -236.3158721923828, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48713594675064087, + "rewards/margins": 2.918518543243408, + "rewards/rejected": -3.405653953552246, + "step": 3608 + }, + { + "epoch": 0.42, + "learning_rate": 1.7669776780441714e-07, + "logits/chosen": -1.997929334640503, + "logits/rejected": -2.156121253967285, + "logps/chosen": -471.1583557128906, + "logps/rejected": -377.3288879394531, + "loss": 0.1142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7391446232795715, + "rewards/margins": 5.186575889587402, + "rewards/rejected": -5.92572021484375, + "step": 3609 + }, + { + "epoch": 0.42, + "learning_rate": 1.7666233612849887e-07, + "logits/chosen": -2.4074432849884033, + "logits/rejected": -2.3415870666503906, + "logps/chosen": -397.9464416503906, + "logps/rejected": -233.77035522460938, + "loss": 0.3947, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9254862666130066, + "rewards/margins": 1.8310589790344238, + "rewards/rejected": -2.756545305252075, + "step": 3610 + }, + { + "epoch": 0.42, + "learning_rate": 1.766269044525806e-07, + "logits/chosen": -1.8646204471588135, + "logits/rejected": -2.292757272720337, + "logps/chosen": -481.5156555175781, + "logps/rejected": -306.01080322265625, + "loss": 0.4113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6206088662147522, + "rewards/margins": 1.5286095142364502, + "rewards/rejected": -2.1492183208465576, + "step": 3611 + }, + { + "epoch": 0.42, + "learning_rate": 1.765914727766623e-07, + "logits/chosen": -1.4575624465942383, + "logits/rejected": -1.3540453910827637, + "logps/chosen": -148.73287963867188, + "logps/rejected": -260.62713623046875, + "loss": 0.442, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3345039188861847, + "rewards/margins": 2.223738431930542, + "rewards/rejected": -2.5582425594329834, + "step": 3612 + }, + { + "epoch": 0.42, + "learning_rate": 1.7655604110074403e-07, + "logits/chosen": -2.8486075401306152, + "logits/rejected": -2.6044223308563232, + "logps/chosen": -255.39425659179688, + "logps/rejected": -167.30819702148438, + "loss": 0.6194, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9307503700256348, + "rewards/margins": 1.1964340209960938, + "rewards/rejected": -3.1271843910217285, + "step": 3613 + }, + { + "epoch": 0.42, + "learning_rate": 1.7652060942482578e-07, + "logits/chosen": -2.1573729515075684, + "logits/rejected": -2.346149206161499, + "logps/chosen": -295.8320007324219, + "logps/rejected": -279.00494384765625, + "loss": 0.6039, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9237630367279053, + "rewards/margins": 1.9030187129974365, + "rewards/rejected": -2.826781749725342, + "step": 3614 + }, + { + "epoch": 0.42, + "learning_rate": 1.764851777489075e-07, + "logits/chosen": -2.639853000640869, + "logits/rejected": -2.5196051597595215, + "logps/chosen": -185.4152069091797, + "logps/rejected": -212.96261596679688, + "loss": 0.2606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6488139629364014, + "rewards/margins": 1.8429820537567139, + "rewards/rejected": -2.4917960166931152, + "step": 3615 + }, + { + "epoch": 0.42, + "learning_rate": 1.7644974607298925e-07, + "logits/chosen": -2.2430191040039062, + "logits/rejected": -2.585036277770996, + "logps/chosen": -310.9378967285156, + "logps/rejected": -220.43612670898438, + "loss": 0.3892, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0127099752426147, + "rewards/margins": 1.9801961183547974, + "rewards/rejected": -2.992906332015991, + "step": 3616 + }, + { + "epoch": 0.42, + "learning_rate": 1.7641431439707097e-07, + "logits/chosen": -2.369838237762451, + "logits/rejected": -2.4044392108917236, + "logps/chosen": -320.7086181640625, + "logps/rejected": -406.14361572265625, + "loss": 0.194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8556973338127136, + "rewards/margins": 2.885791063308716, + "rewards/rejected": -3.741488456726074, + "step": 3617 + }, + { + "epoch": 0.42, + "learning_rate": 1.7637888272115272e-07, + "logits/chosen": -2.0076959133148193, + "logits/rejected": -2.0877718925476074, + "logps/chosen": -154.7200927734375, + "logps/rejected": -253.339111328125, + "loss": 0.4229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2616881728172302, + "rewards/margins": 1.908022403717041, + "rewards/rejected": -2.169710636138916, + "step": 3618 + }, + { + "epoch": 0.42, + "learning_rate": 1.7634345104523445e-07, + "logits/chosen": -2.677542209625244, + "logits/rejected": -2.627998113632202, + "logps/chosen": -192.49197387695312, + "logps/rejected": -240.58126831054688, + "loss": 0.2186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06599956750869751, + "rewards/margins": 2.7855265140533447, + "rewards/rejected": -2.8515260219573975, + "step": 3619 + }, + { + "epoch": 0.42, + "learning_rate": 1.7630801936931617e-07, + "logits/chosen": -2.1320834159851074, + "logits/rejected": -2.513894557952881, + "logps/chosen": -550.2503662109375, + "logps/rejected": -219.17367553710938, + "loss": 0.3941, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6197540760040283, + "rewards/margins": 2.113739013671875, + "rewards/rejected": -2.7334930896759033, + "step": 3620 + }, + { + "epoch": 0.42, + "learning_rate": 1.762725876933979e-07, + "logits/chosen": -2.001929998397827, + "logits/rejected": -2.2676005363464355, + "logps/chosen": -362.23370361328125, + "logps/rejected": -468.70123291015625, + "loss": 0.1128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1526196300983429, + "rewards/margins": 4.143484592437744, + "rewards/rejected": -4.296104431152344, + "step": 3621 + }, + { + "epoch": 0.42, + "learning_rate": 1.762371560174796e-07, + "logits/chosen": -2.197519063949585, + "logits/rejected": -2.4499287605285645, + "logps/chosen": -274.94390869140625, + "logps/rejected": -260.0486145019531, + "loss": 0.6271, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.656251907348633, + "rewards/margins": 0.9504786133766174, + "rewards/rejected": -3.6067304611206055, + "step": 3622 + }, + { + "epoch": 0.42, + "learning_rate": 1.7620172434156133e-07, + "logits/chosen": -1.92734956741333, + "logits/rejected": -1.7617801427841187, + "logps/chosen": -218.84906005859375, + "logps/rejected": -155.29173278808594, + "loss": 0.6482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2556428909301758, + "rewards/margins": 0.8617305159568787, + "rewards/rejected": -1.1173733472824097, + "step": 3623 + }, + { + "epoch": 0.42, + "learning_rate": 1.7616629266564306e-07, + "logits/chosen": -2.460552930831909, + "logits/rejected": -2.45141863822937, + "logps/chosen": -275.479736328125, + "logps/rejected": -338.16973876953125, + "loss": 0.254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38652104139328003, + "rewards/margins": 1.5926625728607178, + "rewards/rejected": -1.9791836738586426, + "step": 3624 + }, + { + "epoch": 0.42, + "learning_rate": 1.761308609897248e-07, + "logits/chosen": -2.2082760334014893, + "logits/rejected": -2.326277494430542, + "logps/chosen": -494.88446044921875, + "logps/rejected": -403.92779541015625, + "loss": 0.2491, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027856767177581787, + "rewards/margins": 1.723174810409546, + "rewards/rejected": -1.6953179836273193, + "step": 3625 + }, + { + "epoch": 0.42, + "learning_rate": 1.7609542931380653e-07, + "logits/chosen": -2.5149309635162354, + "logits/rejected": -2.5833613872528076, + "logps/chosen": -487.967529296875, + "logps/rejected": -341.82525634765625, + "loss": 0.2632, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7141487002372742, + "rewards/margins": 2.247788429260254, + "rewards/rejected": -2.9619369506835938, + "step": 3626 + }, + { + "epoch": 0.42, + "learning_rate": 1.7605999763788828e-07, + "logits/chosen": -2.285888195037842, + "logits/rejected": -2.4170308113098145, + "logps/chosen": -368.3277587890625, + "logps/rejected": -204.67825317382812, + "loss": 0.8157, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4751067161560059, + "rewards/margins": 1.2618650197982788, + "rewards/rejected": -2.736971616744995, + "step": 3627 + }, + { + "epoch": 0.42, + "learning_rate": 1.7602456596197e-07, + "logits/chosen": -2.5381674766540527, + "logits/rejected": -2.5913352966308594, + "logps/chosen": -122.52070617675781, + "logps/rejected": -194.04750061035156, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49838101863861084, + "rewards/margins": 1.9317799806594849, + "rewards/rejected": -2.4301609992980957, + "step": 3628 + }, + { + "epoch": 0.42, + "learning_rate": 1.7598913428605172e-07, + "logits/chosen": -2.540062665939331, + "logits/rejected": -2.578360080718994, + "logps/chosen": -324.86517333984375, + "logps/rejected": -314.5015563964844, + "loss": 0.4111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8811057209968567, + "rewards/margins": 2.2924718856811523, + "rewards/rejected": -3.1735775470733643, + "step": 3629 + }, + { + "epoch": 0.42, + "learning_rate": 1.7595370261013347e-07, + "logits/chosen": -2.856123208999634, + "logits/rejected": -2.9191625118255615, + "logps/chosen": -173.734619140625, + "logps/rejected": -193.050537109375, + "loss": 0.9113, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9070520401000977, + "rewards/margins": 0.49310481548309326, + "rewards/rejected": -2.4001567363739014, + "step": 3630 + }, + { + "epoch": 0.42, + "learning_rate": 1.759182709342152e-07, + "logits/chosen": -2.822242259979248, + "logits/rejected": -2.734461545944214, + "logps/chosen": -270.19036865234375, + "logps/rejected": -331.5507507324219, + "loss": 0.322, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7706082463264465, + "rewards/margins": 2.7023744583129883, + "rewards/rejected": -3.47298264503479, + "step": 3631 + }, + { + "epoch": 0.42, + "learning_rate": 1.758828392582969e-07, + "logits/chosen": -2.628826856613159, + "logits/rejected": -2.6068978309631348, + "logps/chosen": -303.2485046386719, + "logps/rejected": -346.0715637207031, + "loss": 0.2163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9181984066963196, + "rewards/margins": 2.5616564750671387, + "rewards/rejected": -3.4798545837402344, + "step": 3632 + }, + { + "epoch": 0.42, + "learning_rate": 1.7584740758237863e-07, + "logits/chosen": -2.658538341522217, + "logits/rejected": -2.7123095989227295, + "logps/chosen": -240.0086669921875, + "logps/rejected": -241.41644287109375, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4360697567462921, + "rewards/margins": 3.5168418884277344, + "rewards/rejected": -3.952911853790283, + "step": 3633 + }, + { + "epoch": 0.42, + "learning_rate": 1.7581197590646036e-07, + "logits/chosen": -2.311310291290283, + "logits/rejected": -2.070061683654785, + "logps/chosen": -455.2848205566406, + "logps/rejected": -367.4862060546875, + "loss": 0.6661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7782317996025085, + "rewards/margins": 1.1486155986785889, + "rewards/rejected": -1.9268474578857422, + "step": 3634 + }, + { + "epoch": 0.42, + "learning_rate": 1.7577654423054208e-07, + "logits/chosen": -2.2321739196777344, + "logits/rejected": -2.442387819290161, + "logps/chosen": -183.42288208007812, + "logps/rejected": -225.12030029296875, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8367000222206116, + "rewards/margins": 1.8083546161651611, + "rewards/rejected": -2.645054817199707, + "step": 3635 + }, + { + "epoch": 0.42, + "learning_rate": 1.7574111255462383e-07, + "logits/chosen": -2.8326711654663086, + "logits/rejected": -2.5293850898742676, + "logps/chosen": -166.69435119628906, + "logps/rejected": -209.31329345703125, + "loss": 0.219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25717100501060486, + "rewards/margins": 2.0742545127868652, + "rewards/rejected": -2.331425428390503, + "step": 3636 + }, + { + "epoch": 0.42, + "learning_rate": 1.7570568087870555e-07, + "logits/chosen": -2.351154088973999, + "logits/rejected": -2.0335121154785156, + "logps/chosen": -205.51963806152344, + "logps/rejected": -263.6110534667969, + "loss": 0.3445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8186399936676025, + "rewards/margins": 0.9557206630706787, + "rewards/rejected": -1.7743606567382812, + "step": 3637 + }, + { + "epoch": 0.42, + "learning_rate": 1.7567024920278727e-07, + "logits/chosen": -1.7017322778701782, + "logits/rejected": -1.820804238319397, + "logps/chosen": -571.2379150390625, + "logps/rejected": -458.09307861328125, + "loss": 0.1829, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46206966042518616, + "rewards/margins": 3.1792893409729004, + "rewards/rejected": -3.6413590908050537, + "step": 3638 + }, + { + "epoch": 0.42, + "learning_rate": 1.7563481752686902e-07, + "logits/chosen": -2.2000551223754883, + "logits/rejected": -2.4209840297698975, + "logps/chosen": -322.6048583984375, + "logps/rejected": -389.43841552734375, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8882119655609131, + "rewards/margins": 2.910886287689209, + "rewards/rejected": -3.799098253250122, + "step": 3639 + }, + { + "epoch": 0.42, + "learning_rate": 1.7559938585095074e-07, + "logits/chosen": -2.1074576377868652, + "logits/rejected": -2.356843948364258, + "logps/chosen": -326.22528076171875, + "logps/rejected": -241.0065460205078, + "loss": 0.3365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6816136837005615, + "rewards/margins": 1.553244709968567, + "rewards/rejected": -2.234858512878418, + "step": 3640 + }, + { + "epoch": 0.42, + "learning_rate": 1.755639541750325e-07, + "logits/chosen": -2.316636562347412, + "logits/rejected": -2.364795207977295, + "logps/chosen": -220.51646423339844, + "logps/rejected": -208.14425659179688, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9230340123176575, + "rewards/margins": 1.4553242921829224, + "rewards/rejected": -2.3783583641052246, + "step": 3641 + }, + { + "epoch": 0.42, + "learning_rate": 1.7552852249911421e-07, + "logits/chosen": -2.191530466079712, + "logits/rejected": -2.504563808441162, + "logps/chosen": -304.494873046875, + "logps/rejected": -238.13687133789062, + "loss": 0.3144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.708904504776001, + "rewards/margins": 1.8391530513763428, + "rewards/rejected": -2.548057794570923, + "step": 3642 + }, + { + "epoch": 0.42, + "learning_rate": 1.7549309082319594e-07, + "logits/chosen": -2.3753397464752197, + "logits/rejected": -2.7118935585021973, + "logps/chosen": -337.87139892578125, + "logps/rejected": -161.96112060546875, + "loss": 0.3199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.417232871055603, + "rewards/margins": 1.6350390911102295, + "rewards/rejected": -2.052271842956543, + "step": 3643 + }, + { + "epoch": 0.42, + "learning_rate": 1.7545765914727766e-07, + "logits/chosen": -2.751394510269165, + "logits/rejected": -2.6526002883911133, + "logps/chosen": -264.1773376464844, + "logps/rejected": -324.6272888183594, + "loss": 0.0865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2512415945529938, + "rewards/margins": 4.298307418823242, + "rewards/rejected": -4.549549102783203, + "step": 3644 + }, + { + "epoch": 0.42, + "learning_rate": 1.7542222747135938e-07, + "logits/chosen": -1.7065523862838745, + "logits/rejected": -1.848132848739624, + "logps/chosen": -270.46124267578125, + "logps/rejected": -361.68511962890625, + "loss": 0.1943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6576786637306213, + "rewards/margins": 2.6710641384124756, + "rewards/rejected": -3.328742504119873, + "step": 3645 + }, + { + "epoch": 0.42, + "learning_rate": 1.753867957954411e-07, + "logits/chosen": -2.247823476791382, + "logits/rejected": -2.5076019763946533, + "logps/chosen": -503.8516845703125, + "logps/rejected": -327.9425354003906, + "loss": 0.8168, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.851790428161621, + "rewards/margins": 0.9290279746055603, + "rewards/rejected": -2.780818462371826, + "step": 3646 + }, + { + "epoch": 0.42, + "learning_rate": 1.7535136411952285e-07, + "logits/chosen": -2.3846371173858643, + "logits/rejected": -2.264427661895752, + "logps/chosen": -353.2913818359375, + "logps/rejected": -334.5704345703125, + "loss": 0.4607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6028943061828613, + "rewards/margins": 0.9870700240135193, + "rewards/rejected": -1.5899642705917358, + "step": 3647 + }, + { + "epoch": 0.42, + "learning_rate": 1.7531593244360457e-07, + "logits/chosen": -2.6449031829833984, + "logits/rejected": -2.7855143547058105, + "logps/chosen": -415.25384521484375, + "logps/rejected": -283.5061340332031, + "loss": 0.0944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.947851300239563, + "rewards/margins": 3.3880558013916016, + "rewards/rejected": -4.335906982421875, + "step": 3648 + }, + { + "epoch": 0.42, + "learning_rate": 1.752805007676863e-07, + "logits/chosen": -2.6904072761535645, + "logits/rejected": -2.6580357551574707, + "logps/chosen": -371.0420227050781, + "logps/rejected": -260.58209228515625, + "loss": 0.3959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8946399092674255, + "rewards/margins": 1.2156535387039185, + "rewards/rejected": -2.110293388366699, + "step": 3649 + }, + { + "epoch": 0.42, + "learning_rate": 1.7524506909176802e-07, + "logits/chosen": -2.022993564605713, + "logits/rejected": -2.1597447395324707, + "logps/chosen": -156.243896484375, + "logps/rejected": -219.60995483398438, + "loss": 0.1791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03595474362373352, + "rewards/margins": 3.1561880111694336, + "rewards/rejected": -3.192142963409424, + "step": 3650 + }, + { + "epoch": 0.42, + "learning_rate": 1.7520963741584977e-07, + "logits/chosen": -2.7479283809661865, + "logits/rejected": -2.676039695739746, + "logps/chosen": -270.45074462890625, + "logps/rejected": -174.1519012451172, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0217928886413574, + "rewards/margins": 2.4436194896698, + "rewards/rejected": -4.465412139892578, + "step": 3651 + }, + { + "epoch": 0.42, + "learning_rate": 1.7517420573993151e-07, + "logits/chosen": -1.9016468524932861, + "logits/rejected": -2.3999900817871094, + "logps/chosen": -354.0035095214844, + "logps/rejected": -319.2190246582031, + "loss": 0.528, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6545166969299316, + "rewards/margins": 1.8044581413269043, + "rewards/rejected": -3.458974838256836, + "step": 3652 + }, + { + "epoch": 0.42, + "learning_rate": 1.7513877406401324e-07, + "logits/chosen": -2.8612794876098633, + "logits/rejected": -2.923551082611084, + "logps/chosen": -105.82888793945312, + "logps/rejected": -172.19403076171875, + "loss": 0.2134, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21372301876544952, + "rewards/margins": 3.086059093475342, + "rewards/rejected": -3.2997817993164062, + "step": 3653 + }, + { + "epoch": 0.43, + "learning_rate": 1.7510334238809496e-07, + "logits/chosen": -2.3122005462646484, + "logits/rejected": -2.1965112686157227, + "logps/chosen": -298.6417236328125, + "logps/rejected": -319.1836242675781, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3625435531139374, + "rewards/margins": 2.2177846431732178, + "rewards/rejected": -2.5803279876708984, + "step": 3654 + }, + { + "epoch": 0.43, + "learning_rate": 1.7506791071217668e-07, + "logits/chosen": -1.774406909942627, + "logits/rejected": -1.708235740661621, + "logps/chosen": -230.31338500976562, + "logps/rejected": -284.8048095703125, + "loss": 0.3547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5063499808311462, + "rewards/margins": 1.306469202041626, + "rewards/rejected": -1.8128191232681274, + "step": 3655 + }, + { + "epoch": 0.43, + "learning_rate": 1.750324790362584e-07, + "logits/chosen": -2.140639543533325, + "logits/rejected": -2.2157797813415527, + "logps/chosen": -247.4978485107422, + "logps/rejected": -223.87081909179688, + "loss": 0.6543, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.812157392501831, + "rewards/margins": 0.8908335566520691, + "rewards/rejected": -2.702991008758545, + "step": 3656 + }, + { + "epoch": 0.43, + "learning_rate": 1.7499704736034012e-07, + "logits/chosen": -2.9164481163024902, + "logits/rejected": -2.8174870014190674, + "logps/chosen": -197.41128540039062, + "logps/rejected": -215.27635192871094, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6035594344139099, + "rewards/margins": 2.151754140853882, + "rewards/rejected": -2.7553138732910156, + "step": 3657 + }, + { + "epoch": 0.43, + "learning_rate": 1.7496161568442185e-07, + "logits/chosen": -1.7119629383087158, + "logits/rejected": -1.4527220726013184, + "logps/chosen": -294.8026428222656, + "logps/rejected": -387.5682067871094, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.853924036026001, + "rewards/margins": 2.255444288253784, + "rewards/rejected": -3.109368324279785, + "step": 3658 + }, + { + "epoch": 0.43, + "learning_rate": 1.749261840085036e-07, + "logits/chosen": -1.9631750583648682, + "logits/rejected": -2.1188595294952393, + "logps/chosen": -158.3531494140625, + "logps/rejected": -218.49269104003906, + "loss": 1.4792, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1300792694091797, + "rewards/margins": 0.18798059225082397, + "rewards/rejected": -2.3180601596832275, + "step": 3659 + }, + { + "epoch": 0.43, + "learning_rate": 1.7489075233258532e-07, + "logits/chosen": -2.6155920028686523, + "logits/rejected": -2.1889071464538574, + "logps/chosen": -132.99642944335938, + "logps/rejected": -309.3650817871094, + "loss": 0.2064, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4085540771484375, + "rewards/margins": 3.946611166000366, + "rewards/rejected": -5.355165481567383, + "step": 3660 + }, + { + "epoch": 0.43, + "learning_rate": 1.7485532065666704e-07, + "logits/chosen": -1.891045331954956, + "logits/rejected": -1.6461834907531738, + "logps/chosen": -380.045166015625, + "logps/rejected": -405.4801940917969, + "loss": 0.3355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4075565040111542, + "rewards/margins": 1.602735996246338, + "rewards/rejected": -2.0102925300598145, + "step": 3661 + }, + { + "epoch": 0.43, + "learning_rate": 1.748198889807488e-07, + "logits/chosen": -2.235243320465088, + "logits/rejected": -2.374389410018921, + "logps/chosen": -203.2404022216797, + "logps/rejected": -251.3274688720703, + "loss": 0.3808, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5026143789291382, + "rewards/margins": 1.8153672218322754, + "rewards/rejected": -3.317981719970703, + "step": 3662 + }, + { + "epoch": 0.43, + "learning_rate": 1.7478445730483054e-07, + "logits/chosen": -2.907538890838623, + "logits/rejected": -2.6081912517547607, + "logps/chosen": -296.9075927734375, + "logps/rejected": -287.37152099609375, + "loss": 0.1463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08826004713773727, + "rewards/margins": 2.672557830810547, + "rewards/rejected": -2.7608180046081543, + "step": 3663 + }, + { + "epoch": 0.43, + "learning_rate": 1.7474902562891226e-07, + "logits/chosen": -2.971066951751709, + "logits/rejected": -2.9624288082122803, + "logps/chosen": -211.5835418701172, + "logps/rejected": -316.7471923828125, + "loss": 0.1042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7265113592147827, + "rewards/margins": 3.3798723220825195, + "rewards/rejected": -4.106383323669434, + "step": 3664 + }, + { + "epoch": 0.43, + "learning_rate": 1.7471359395299398e-07, + "logits/chosen": -2.715458393096924, + "logits/rejected": -2.5708374977111816, + "logps/chosen": -177.61712646484375, + "logps/rejected": -219.39047241210938, + "loss": 0.3029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4614828824996948, + "rewards/margins": 2.4243717193603516, + "rewards/rejected": -3.8858542442321777, + "step": 3665 + }, + { + "epoch": 0.43, + "learning_rate": 1.746781622770757e-07, + "logits/chosen": -1.7088422775268555, + "logits/rejected": -2.1317272186279297, + "logps/chosen": -391.8970031738281, + "logps/rejected": -357.0691833496094, + "loss": 0.5218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8623418807983398, + "rewards/margins": 2.220327138900757, + "rewards/rejected": -3.082669258117676, + "step": 3666 + }, + { + "epoch": 0.43, + "learning_rate": 1.7464273060115743e-07, + "logits/chosen": -2.436995506286621, + "logits/rejected": -2.580872058868408, + "logps/chosen": -369.73388671875, + "logps/rejected": -330.9620056152344, + "loss": 0.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7926731705665588, + "rewards/margins": 3.267302989959717, + "rewards/rejected": -4.059976100921631, + "step": 3667 + }, + { + "epoch": 0.43, + "learning_rate": 1.7460729892523915e-07, + "logits/chosen": -2.164095401763916, + "logits/rejected": -2.4507744312286377, + "logps/chosen": -248.36773681640625, + "logps/rejected": -167.18174743652344, + "loss": 0.4431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8717927932739258, + "rewards/margins": 1.6182336807250977, + "rewards/rejected": -2.4900267124176025, + "step": 3668 + }, + { + "epoch": 0.43, + "learning_rate": 1.7457186724932087e-07, + "logits/chosen": -2.2899506092071533, + "logits/rejected": -2.407367706298828, + "logps/chosen": -276.9530944824219, + "logps/rejected": -217.07403564453125, + "loss": 0.811, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265005111694336, + "rewards/margins": 0.9955097436904907, + "rewards/rejected": -2.260514974594116, + "step": 3669 + }, + { + "epoch": 0.43, + "learning_rate": 1.7453643557340262e-07, + "logits/chosen": -2.0404410362243652, + "logits/rejected": -1.8280999660491943, + "logps/chosen": -165.88226318359375, + "logps/rejected": -218.3700408935547, + "loss": 0.645, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9352635741233826, + "rewards/margins": 1.0920538902282715, + "rewards/rejected": -2.0273172855377197, + "step": 3670 + }, + { + "epoch": 0.43, + "learning_rate": 1.7450100389748434e-07, + "logits/chosen": -2.474993944168091, + "logits/rejected": -2.780876636505127, + "logps/chosen": -311.75677490234375, + "logps/rejected": -267.763916015625, + "loss": 0.1587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04807084798812866, + "rewards/margins": 3.2806639671325684, + "rewards/rejected": -3.3287346363067627, + "step": 3671 + }, + { + "epoch": 0.43, + "learning_rate": 1.7446557222156606e-07, + "logits/chosen": -2.5547847747802734, + "logits/rejected": -2.2353501319885254, + "logps/chosen": -157.34701538085938, + "logps/rejected": -216.4887237548828, + "loss": 0.2215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7408726215362549, + "rewards/margins": 2.433030843734741, + "rewards/rejected": -3.173903465270996, + "step": 3672 + }, + { + "epoch": 0.43, + "learning_rate": 1.7443014054564778e-07, + "logits/chosen": -2.6178956031799316, + "logits/rejected": -2.79403018951416, + "logps/chosen": -140.9967803955078, + "logps/rejected": -293.86444091796875, + "loss": 0.1593, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0327720642089844, + "rewards/margins": 3.035249710083008, + "rewards/rejected": -4.06802225112915, + "step": 3673 + }, + { + "epoch": 0.43, + "learning_rate": 1.7439470886972953e-07, + "logits/chosen": -1.8702095746994019, + "logits/rejected": -1.9370931386947632, + "logps/chosen": -490.37139892578125, + "logps/rejected": -314.7297058105469, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.792751669883728, + "rewards/margins": 1.9245471954345703, + "rewards/rejected": -2.717298746109009, + "step": 3674 + }, + { + "epoch": 0.43, + "learning_rate": 1.7435927719381128e-07, + "logits/chosen": -2.0421814918518066, + "logits/rejected": -2.472233772277832, + "logps/chosen": -399.0526123046875, + "logps/rejected": -266.0621643066406, + "loss": 0.2427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.73717200756073, + "rewards/margins": 4.044706344604492, + "rewards/rejected": -4.78187894821167, + "step": 3675 + }, + { + "epoch": 0.43, + "learning_rate": 1.74323845517893e-07, + "logits/chosen": -2.6309547424316406, + "logits/rejected": -2.5544681549072266, + "logps/chosen": -371.57122802734375, + "logps/rejected": -391.730224609375, + "loss": 0.4933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9490096569061279, + "rewards/margins": 1.0312548875808716, + "rewards/rejected": -1.9802645444869995, + "step": 3676 + }, + { + "epoch": 0.43, + "learning_rate": 1.7428841384197473e-07, + "logits/chosen": -2.866204261779785, + "logits/rejected": -2.8542251586914062, + "logps/chosen": -124.91893768310547, + "logps/rejected": -223.81712341308594, + "loss": 0.3337, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.537009596824646, + "rewards/margins": 1.9068275690078735, + "rewards/rejected": -2.4438371658325195, + "step": 3677 + }, + { + "epoch": 0.43, + "learning_rate": 1.7425298216605645e-07, + "logits/chosen": -2.377229690551758, + "logits/rejected": -2.197810173034668, + "logps/chosen": -272.00543212890625, + "logps/rejected": -245.77224731445312, + "loss": 0.2691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.824256956577301, + "rewards/margins": 1.9091439247131348, + "rewards/rejected": -2.733401298522949, + "step": 3678 + }, + { + "epoch": 0.43, + "learning_rate": 1.7421755049013817e-07, + "logits/chosen": -2.0848677158355713, + "logits/rejected": -1.936612606048584, + "logps/chosen": -198.53013610839844, + "logps/rejected": -290.92657470703125, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0603196620941162, + "rewards/margins": 1.729354739189148, + "rewards/rejected": -2.7896745204925537, + "step": 3679 + }, + { + "epoch": 0.43, + "learning_rate": 1.741821188142199e-07, + "logits/chosen": -2.268199920654297, + "logits/rejected": -2.526427745819092, + "logps/chosen": -257.79144287109375, + "logps/rejected": -227.1619415283203, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3463366627693176, + "rewards/margins": 1.14854097366333, + "rewards/rejected": -1.4948776960372925, + "step": 3680 + }, + { + "epoch": 0.43, + "learning_rate": 1.7414668713830164e-07, + "logits/chosen": -1.9737370014190674, + "logits/rejected": -1.9244928359985352, + "logps/chosen": -366.2268981933594, + "logps/rejected": -405.58013916015625, + "loss": 0.6481, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5612242817878723, + "rewards/margins": 1.4461132287979126, + "rewards/rejected": -2.0073373317718506, + "step": 3681 + }, + { + "epoch": 0.43, + "learning_rate": 1.7411125546238336e-07, + "logits/chosen": -2.7329518795013428, + "logits/rejected": -2.6488242149353027, + "logps/chosen": -187.4397430419922, + "logps/rejected": -331.52520751953125, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8117837905883789, + "rewards/margins": 3.612819194793701, + "rewards/rejected": -4.424602508544922, + "step": 3682 + }, + { + "epoch": 0.43, + "learning_rate": 1.7407582378646509e-07, + "logits/chosen": -2.788693904876709, + "logits/rejected": -2.7505176067352295, + "logps/chosen": -296.1656799316406, + "logps/rejected": -266.333984375, + "loss": 0.285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13962876796722412, + "rewards/margins": 2.231947422027588, + "rewards/rejected": -2.3715763092041016, + "step": 3683 + }, + { + "epoch": 0.43, + "learning_rate": 1.740403921105468e-07, + "logits/chosen": -2.2580204010009766, + "logits/rejected": -2.4298202991485596, + "logps/chosen": -325.7272033691406, + "logps/rejected": -216.823486328125, + "loss": 0.5955, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.025530457496643, + "rewards/margins": 1.2183220386505127, + "rewards/rejected": -2.2438526153564453, + "step": 3684 + }, + { + "epoch": 0.43, + "learning_rate": 1.7400496043462853e-07, + "logits/chosen": -2.445176362991333, + "logits/rejected": -2.136286973953247, + "logps/chosen": -283.36590576171875, + "logps/rejected": -303.21734619140625, + "loss": 0.3033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47659796476364136, + "rewards/margins": 2.1696689128875732, + "rewards/rejected": -2.6462669372558594, + "step": 3685 + }, + { + "epoch": 0.43, + "learning_rate": 1.739695287587103e-07, + "logits/chosen": -2.0911810398101807, + "logits/rejected": -1.9011523723602295, + "logps/chosen": -130.83779907226562, + "logps/rejected": -190.7473907470703, + "loss": 0.415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14735761284828186, + "rewards/margins": 1.249127984046936, + "rewards/rejected": -1.3964858055114746, + "step": 3686 + }, + { + "epoch": 0.43, + "learning_rate": 1.7393409708279203e-07, + "logits/chosen": -2.894599199295044, + "logits/rejected": -3.033484935760498, + "logps/chosen": -239.06558227539062, + "logps/rejected": -229.97348022460938, + "loss": 1.3614, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.611859083175659, + "rewards/margins": 0.9862688779830933, + "rewards/rejected": -4.598127841949463, + "step": 3687 + }, + { + "epoch": 0.43, + "learning_rate": 1.7389866540687375e-07, + "logits/chosen": -2.7142202854156494, + "logits/rejected": -2.3902058601379395, + "logps/chosen": -198.78036499023438, + "logps/rejected": -221.470947265625, + "loss": 0.4643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5688228011131287, + "rewards/margins": 2.6083273887634277, + "rewards/rejected": -3.177150249481201, + "step": 3688 + }, + { + "epoch": 0.43, + "learning_rate": 1.7386323373095547e-07, + "logits/chosen": -2.2120471000671387, + "logits/rejected": -1.9061753749847412, + "logps/chosen": -238.64190673828125, + "logps/rejected": -388.8365478515625, + "loss": 0.1696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06947195529937744, + "rewards/margins": 4.040794372558594, + "rewards/rejected": -4.110266208648682, + "step": 3689 + }, + { + "epoch": 0.43, + "learning_rate": 1.738278020550372e-07, + "logits/chosen": -1.9914309978485107, + "logits/rejected": -2.0724055767059326, + "logps/chosen": -282.9148254394531, + "logps/rejected": -324.8716735839844, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.267249345779419, + "rewards/margins": 3.686800003051758, + "rewards/rejected": -4.954049587249756, + "step": 3690 + }, + { + "epoch": 0.43, + "learning_rate": 1.7379237037911892e-07, + "logits/chosen": -2.6573290824890137, + "logits/rejected": -2.6227431297302246, + "logps/chosen": -510.3601989746094, + "logps/rejected": -385.32421875, + "loss": 0.2296, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8787400722503662, + "rewards/margins": 1.974020004272461, + "rewards/rejected": -2.8527603149414062, + "step": 3691 + }, + { + "epoch": 0.43, + "learning_rate": 1.7375693870320066e-07, + "logits/chosen": -2.5903353691101074, + "logits/rejected": -2.598322629928589, + "logps/chosen": -188.16555786132812, + "logps/rejected": -201.2687225341797, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15021520853042603, + "rewards/margins": 2.7779810428619385, + "rewards/rejected": -2.928196430206299, + "step": 3692 + }, + { + "epoch": 0.43, + "learning_rate": 1.7372150702728239e-07, + "logits/chosen": -2.039206027984619, + "logits/rejected": -1.9984009265899658, + "logps/chosen": -318.2369689941406, + "logps/rejected": -484.5920715332031, + "loss": 0.3577, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2695448398590088, + "rewards/margins": 2.803101062774658, + "rewards/rejected": -4.072646141052246, + "step": 3693 + }, + { + "epoch": 0.43, + "learning_rate": 1.736860753513641e-07, + "logits/chosen": -1.696550726890564, + "logits/rejected": -2.2406504154205322, + "logps/chosen": -448.8092346191406, + "logps/rejected": -266.0439758300781, + "loss": 0.365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5180296301841736, + "rewards/margins": 2.553469657897949, + "rewards/rejected": -3.0714993476867676, + "step": 3694 + }, + { + "epoch": 0.43, + "learning_rate": 1.7365064367544583e-07, + "logits/chosen": -2.1854357719421387, + "logits/rejected": -2.035473585128784, + "logps/chosen": -309.9385986328125, + "logps/rejected": -435.86273193359375, + "loss": 0.4115, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6499674320220947, + "rewards/margins": 1.6143262386322021, + "rewards/rejected": -2.2642934322357178, + "step": 3695 + }, + { + "epoch": 0.43, + "learning_rate": 1.7361521199952755e-07, + "logits/chosen": -2.4479734897613525, + "logits/rejected": -2.3185224533081055, + "logps/chosen": -216.40597534179688, + "logps/rejected": -235.10678100585938, + "loss": 0.4099, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.094369411468506, + "rewards/margins": 2.2627477645874023, + "rewards/rejected": -4.357117176055908, + "step": 3696 + }, + { + "epoch": 0.43, + "learning_rate": 1.7357978032360933e-07, + "logits/chosen": -2.672917366027832, + "logits/rejected": -2.635641098022461, + "logps/chosen": -216.56942749023438, + "logps/rejected": -186.6765594482422, + "loss": 0.7269, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3228914737701416, + "rewards/margins": 1.340186357498169, + "rewards/rejected": -2.6630780696868896, + "step": 3697 + }, + { + "epoch": 0.43, + "learning_rate": 1.7354434864769105e-07, + "logits/chosen": -2.0059165954589844, + "logits/rejected": -2.123657703399658, + "logps/chosen": -353.39892578125, + "logps/rejected": -302.18060302734375, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.860396146774292, + "rewards/margins": 2.3353824615478516, + "rewards/rejected": -3.1957786083221436, + "step": 3698 + }, + { + "epoch": 0.43, + "learning_rate": 1.7350891697177277e-07, + "logits/chosen": -1.9109951257705688, + "logits/rejected": -2.07395601272583, + "logps/chosen": -238.9143829345703, + "logps/rejected": -249.48208618164062, + "loss": 0.8497, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8662872314453125, + "rewards/margins": 2.381871461868286, + "rewards/rejected": -4.2481584548950195, + "step": 3699 + }, + { + "epoch": 0.43, + "learning_rate": 1.734734852958545e-07, + "logits/chosen": -2.145282745361328, + "logits/rejected": -2.50301456451416, + "logps/chosen": -399.3047790527344, + "logps/rejected": -314.114501953125, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7943548560142517, + "rewards/margins": 2.3854169845581055, + "rewards/rejected": -3.179771661758423, + "step": 3700 + }, + { + "epoch": 0.43, + "learning_rate": 1.7343805361993622e-07, + "logits/chosen": -2.04487681388855, + "logits/rejected": -1.7817836999893188, + "logps/chosen": -381.40496826171875, + "logps/rejected": -392.351318359375, + "loss": 0.436, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8191475868225098, + "rewards/margins": 1.4170055389404297, + "rewards/rejected": -2.2361531257629395, + "step": 3701 + }, + { + "epoch": 0.43, + "learning_rate": 1.7340262194401794e-07, + "logits/chosen": -1.5543417930603027, + "logits/rejected": -1.7486350536346436, + "logps/chosen": -330.330322265625, + "logps/rejected": -363.55889892578125, + "loss": 0.4802, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3951191902160645, + "rewards/margins": 3.1329269409179688, + "rewards/rejected": -4.528045654296875, + "step": 3702 + }, + { + "epoch": 0.43, + "learning_rate": 1.7336719026809966e-07, + "logits/chosen": -2.1352009773254395, + "logits/rejected": -2.0588603019714355, + "logps/chosen": -368.5689697265625, + "logps/rejected": -307.3861389160156, + "loss": 0.2925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5209349393844604, + "rewards/margins": 1.9946428537368774, + "rewards/rejected": -2.515577793121338, + "step": 3703 + }, + { + "epoch": 0.43, + "learning_rate": 1.733317585921814e-07, + "logits/chosen": -1.746111273765564, + "logits/rejected": -2.0118613243103027, + "logps/chosen": -553.3530883789062, + "logps/rejected": -291.9944152832031, + "loss": 0.5178, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4164044857025146, + "rewards/margins": 1.3938798904418945, + "rewards/rejected": -2.81028413772583, + "step": 3704 + }, + { + "epoch": 0.43, + "learning_rate": 1.7329632691626313e-07, + "logits/chosen": -2.065673589706421, + "logits/rejected": -1.9595096111297607, + "logps/chosen": -243.19338989257812, + "logps/rejected": -327.6018371582031, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2481385469436646, + "rewards/margins": 3.85477352142334, + "rewards/rejected": -5.102911949157715, + "step": 3705 + }, + { + "epoch": 0.43, + "learning_rate": 1.7326089524034485e-07, + "logits/chosen": -2.8990285396575928, + "logits/rejected": -2.6984751224517822, + "logps/chosen": -354.3673400878906, + "logps/rejected": -206.9667510986328, + "loss": 0.2671, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5646005868911743, + "rewards/margins": 2.0903940200805664, + "rewards/rejected": -3.6549947261810303, + "step": 3706 + }, + { + "epoch": 0.43, + "learning_rate": 1.7322546356442657e-07, + "logits/chosen": -2.411559581756592, + "logits/rejected": -2.423661708831787, + "logps/chosen": -258.35968017578125, + "logps/rejected": -278.9551696777344, + "loss": 0.273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.444812536239624, + "rewards/margins": 1.675734043121338, + "rewards/rejected": -2.120546579360962, + "step": 3707 + }, + { + "epoch": 0.43, + "learning_rate": 1.731900318885083e-07, + "logits/chosen": -2.3971192836761475, + "logits/rejected": -2.1488118171691895, + "logps/chosen": -191.97755432128906, + "logps/rejected": -395.44842529296875, + "loss": 0.4075, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7783681154251099, + "rewards/margins": 1.7208970785140991, + "rewards/rejected": -2.499265193939209, + "step": 3708 + }, + { + "epoch": 0.43, + "learning_rate": 1.7315460021259007e-07, + "logits/chosen": -2.4006502628326416, + "logits/rejected": -2.5030553340911865, + "logps/chosen": -243.72093200683594, + "logps/rejected": -182.96682739257812, + "loss": 0.6075, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0155597925186157, + "rewards/margins": 0.6701515316963196, + "rewards/rejected": -1.68571138381958, + "step": 3709 + }, + { + "epoch": 0.43, + "learning_rate": 1.731191685366718e-07, + "logits/chosen": -2.6574642658233643, + "logits/rejected": -2.7064478397369385, + "logps/chosen": -223.228515625, + "logps/rejected": -237.41146850585938, + "loss": 0.6371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9970271587371826, + "rewards/margins": 1.9122612476348877, + "rewards/rejected": -2.9092884063720703, + "step": 3710 + }, + { + "epoch": 0.43, + "learning_rate": 1.7308373686075352e-07, + "logits/chosen": -2.391976833343506, + "logits/rejected": -2.497063159942627, + "logps/chosen": -295.07232666015625, + "logps/rejected": -241.7061767578125, + "loss": 0.3301, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8970012068748474, + "rewards/margins": 2.3239829540252686, + "rewards/rejected": -3.22098445892334, + "step": 3711 + }, + { + "epoch": 0.43, + "learning_rate": 1.7304830518483524e-07, + "logits/chosen": -2.6321370601654053, + "logits/rejected": -2.775364637374878, + "logps/chosen": -217.9580535888672, + "logps/rejected": -236.94781494140625, + "loss": 0.149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3888705372810364, + "rewards/margins": 4.132527828216553, + "rewards/rejected": -4.521398544311523, + "step": 3712 + }, + { + "epoch": 0.43, + "learning_rate": 1.7301287350891696e-07, + "logits/chosen": -2.6394784450531006, + "logits/rejected": -2.5765581130981445, + "logps/chosen": -188.92958068847656, + "logps/rejected": -187.55145263671875, + "loss": 0.739, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3894834518432617, + "rewards/margins": 1.9422335624694824, + "rewards/rejected": -3.331716775894165, + "step": 3713 + }, + { + "epoch": 0.43, + "learning_rate": 1.7297744183299868e-07, + "logits/chosen": -1.965806484222412, + "logits/rejected": -1.7914789915084839, + "logps/chosen": -319.6033935546875, + "logps/rejected": -335.74072265625, + "loss": 0.3654, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9713230133056641, + "rewards/margins": 1.536490797996521, + "rewards/rejected": -2.5078139305114746, + "step": 3714 + }, + { + "epoch": 0.43, + "learning_rate": 1.7294201015708043e-07, + "logits/chosen": -2.0414977073669434, + "logits/rejected": -1.8441251516342163, + "logps/chosen": -302.99169921875, + "logps/rejected": -284.26605224609375, + "loss": 0.506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7070732116699219, + "rewards/margins": 0.9306473731994629, + "rewards/rejected": -1.6377205848693848, + "step": 3715 + }, + { + "epoch": 0.43, + "learning_rate": 1.7290657848116215e-07, + "logits/chosen": -2.5921173095703125, + "logits/rejected": -2.277160167694092, + "logps/chosen": -298.1441345214844, + "logps/rejected": -377.3199768066406, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6873561143875122, + "rewards/margins": 3.1565096378326416, + "rewards/rejected": -3.843865394592285, + "step": 3716 + }, + { + "epoch": 0.43, + "learning_rate": 1.7287114680524388e-07, + "logits/chosen": -2.6138858795166016, + "logits/rejected": -2.3548648357391357, + "logps/chosen": -308.98193359375, + "logps/rejected": -352.6666564941406, + "loss": 0.2856, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3538254499435425, + "rewards/margins": 2.7473530769348145, + "rewards/rejected": -4.1011786460876465, + "step": 3717 + }, + { + "epoch": 0.43, + "learning_rate": 1.728357151293256e-07, + "logits/chosen": -2.5532941818237305, + "logits/rejected": -2.467545509338379, + "logps/chosen": -221.00914001464844, + "logps/rejected": -182.41798400878906, + "loss": 0.339, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2797306776046753, + "rewards/margins": 1.8898282051086426, + "rewards/rejected": -3.1695590019226074, + "step": 3718 + }, + { + "epoch": 0.43, + "learning_rate": 1.7280028345340732e-07, + "logits/chosen": -1.7028050422668457, + "logits/rejected": -2.285731077194214, + "logps/chosen": -667.54052734375, + "logps/rejected": -236.13845825195312, + "loss": 0.2136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5811326503753662, + "rewards/margins": 2.1210849285125732, + "rewards/rejected": -2.7022175788879395, + "step": 3719 + }, + { + "epoch": 0.43, + "learning_rate": 1.7276485177748904e-07, + "logits/chosen": -2.338312864303589, + "logits/rejected": -2.547163963317871, + "logps/chosen": -224.9838409423828, + "logps/rejected": -209.48057556152344, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4627833068370819, + "rewards/margins": 2.9405112266540527, + "rewards/rejected": -3.403294086456299, + "step": 3720 + }, + { + "epoch": 0.43, + "learning_rate": 1.7272942010157082e-07, + "logits/chosen": -2.065648078918457, + "logits/rejected": -2.5420784950256348, + "logps/chosen": -657.2072143554688, + "logps/rejected": -327.7154541015625, + "loss": 0.0936, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005409598350524902, + "rewards/margins": 3.625261068344116, + "rewards/rejected": -3.619851589202881, + "step": 3721 + }, + { + "epoch": 0.43, + "learning_rate": 1.7269398842565254e-07, + "logits/chosen": -2.4185280799865723, + "logits/rejected": -2.457672357559204, + "logps/chosen": -246.00563049316406, + "logps/rejected": -282.01727294921875, + "loss": 0.2141, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5099341869354248, + "rewards/margins": 3.4903016090393066, + "rewards/rejected": -4.000235557556152, + "step": 3722 + }, + { + "epoch": 0.43, + "learning_rate": 1.7265855674973426e-07, + "logits/chosen": -2.240135669708252, + "logits/rejected": -2.002568006515503, + "logps/chosen": -176.00177001953125, + "logps/rejected": -154.2832489013672, + "loss": 0.3663, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14821940660476685, + "rewards/margins": 1.4750512838363647, + "rewards/rejected": -1.6232706308364868, + "step": 3723 + }, + { + "epoch": 0.43, + "learning_rate": 1.7262312507381598e-07, + "logits/chosen": -2.6076619625091553, + "logits/rejected": -2.796152353286743, + "logps/chosen": -211.88922119140625, + "logps/rejected": -223.60079956054688, + "loss": 0.5081, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2487589120864868, + "rewards/margins": 1.393790364265442, + "rewards/rejected": -2.6425490379333496, + "step": 3724 + }, + { + "epoch": 0.43, + "learning_rate": 1.725876933978977e-07, + "logits/chosen": -3.027440309524536, + "logits/rejected": -2.908684730529785, + "logps/chosen": -201.9524688720703, + "logps/rejected": -231.9432373046875, + "loss": 0.1933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9422208666801453, + "rewards/margins": 3.074080228805542, + "rewards/rejected": -4.016301155090332, + "step": 3725 + }, + { + "epoch": 0.43, + "learning_rate": 1.7255226172197945e-07, + "logits/chosen": -2.1773812770843506, + "logits/rejected": -2.0532565116882324, + "logps/chosen": -306.9954833984375, + "logps/rejected": -360.8705749511719, + "loss": 0.1467, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1133252382278442, + "rewards/margins": 3.4586222171783447, + "rewards/rejected": -4.57194709777832, + "step": 3726 + }, + { + "epoch": 0.43, + "learning_rate": 1.7251683004606118e-07, + "logits/chosen": -2.944957733154297, + "logits/rejected": -2.915398359298706, + "logps/chosen": -220.62588500976562, + "logps/rejected": -157.22938537597656, + "loss": 0.7815, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2194383144378662, + "rewards/margins": 0.9688323736190796, + "rewards/rejected": -2.1882705688476562, + "step": 3727 + }, + { + "epoch": 0.43, + "learning_rate": 1.724813983701429e-07, + "logits/chosen": -2.7415571212768555, + "logits/rejected": -2.64961576461792, + "logps/chosen": -272.9709777832031, + "logps/rejected": -193.2978973388672, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.505323052406311, + "rewards/margins": 1.9852540493011475, + "rewards/rejected": -3.490577220916748, + "step": 3728 + }, + { + "epoch": 0.43, + "learning_rate": 1.7244596669422462e-07, + "logits/chosen": -2.6818881034851074, + "logits/rejected": -2.7878992557525635, + "logps/chosen": -153.30621337890625, + "logps/rejected": -172.41943359375, + "loss": 0.5326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9614912271499634, + "rewards/margins": 1.2398717403411865, + "rewards/rejected": -2.2013628482818604, + "step": 3729 + }, + { + "epoch": 0.43, + "learning_rate": 1.7241053501830634e-07, + "logits/chosen": -1.9672021865844727, + "logits/rejected": -2.069403886795044, + "logps/chosen": -390.5616455078125, + "logps/rejected": -296.745849609375, + "loss": 1.4922, + "rewards/accuracies": 0.375, + "rewards/chosen": -3.451507568359375, + "rewards/margins": 0.4091653525829315, + "rewards/rejected": -3.860672950744629, + "step": 3730 + }, + { + "epoch": 0.43, + "learning_rate": 1.7237510334238806e-07, + "logits/chosen": -2.519448757171631, + "logits/rejected": -2.7333261966705322, + "logps/chosen": -343.583984375, + "logps/rejected": -231.3396759033203, + "loss": 0.7934, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8787013292312622, + "rewards/margins": 0.494794636964798, + "rewards/rejected": -2.3734960556030273, + "step": 3731 + }, + { + "epoch": 0.43, + "learning_rate": 1.7233967166646984e-07, + "logits/chosen": -2.2560088634490967, + "logits/rejected": -2.2314107418060303, + "logps/chosen": -377.1141357421875, + "logps/rejected": -310.6207580566406, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7289540767669678, + "rewards/margins": 2.2992656230926514, + "rewards/rejected": -3.028219699859619, + "step": 3732 + }, + { + "epoch": 0.43, + "learning_rate": 1.7230423999055156e-07, + "logits/chosen": -2.3759357929229736, + "logits/rejected": -2.716285228729248, + "logps/chosen": -277.15863037109375, + "logps/rejected": -303.8381042480469, + "loss": 0.1405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9000710248947144, + "rewards/margins": 3.6192898750305176, + "rewards/rejected": -4.5193610191345215, + "step": 3733 + }, + { + "epoch": 0.43, + "learning_rate": 1.7226880831463328e-07, + "logits/chosen": -1.8162533044815063, + "logits/rejected": -1.8569942712783813, + "logps/chosen": -388.8763427734375, + "logps/rejected": -274.67974853515625, + "loss": 0.4228, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9277607202529907, + "rewards/margins": 1.353257417678833, + "rewards/rejected": -2.281018018722534, + "step": 3734 + }, + { + "epoch": 0.43, + "learning_rate": 1.72233376638715e-07, + "logits/chosen": -2.659852981567383, + "logits/rejected": -2.4328322410583496, + "logps/chosen": -399.3940124511719, + "logps/rejected": -344.71380615234375, + "loss": 0.2261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7002333998680115, + "rewards/margins": 2.2400121688842773, + "rewards/rejected": -2.9402458667755127, + "step": 3735 + }, + { + "epoch": 0.43, + "learning_rate": 1.7219794496279673e-07, + "logits/chosen": -2.7664642333984375, + "logits/rejected": -2.6604623794555664, + "logps/chosen": -124.04094696044922, + "logps/rejected": -158.7914276123047, + "loss": 0.3977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6143667101860046, + "rewards/margins": 1.2038018703460693, + "rewards/rejected": -1.8181684017181396, + "step": 3736 + }, + { + "epoch": 0.43, + "learning_rate": 1.7216251328687845e-07, + "logits/chosen": -2.6661806106567383, + "logits/rejected": -2.5586349964141846, + "logps/chosen": -238.3379364013672, + "logps/rejected": -231.2430877685547, + "loss": 0.4086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8307764530181885, + "rewards/margins": 2.457336664199829, + "rewards/rejected": -3.2881131172180176, + "step": 3737 + }, + { + "epoch": 0.43, + "learning_rate": 1.721270816109602e-07, + "logits/chosen": -2.105785608291626, + "logits/rejected": -1.8706681728363037, + "logps/chosen": -300.86761474609375, + "logps/rejected": -347.4740295410156, + "loss": 0.5658, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0396195650100708, + "rewards/margins": 1.0815443992614746, + "rewards/rejected": -2.121164083480835, + "step": 3738 + }, + { + "epoch": 0.43, + "learning_rate": 1.7209164993504192e-07, + "logits/chosen": -1.652765154838562, + "logits/rejected": -2.1318516731262207, + "logps/chosen": -379.6932373046875, + "logps/rejected": -274.25689697265625, + "loss": 0.2882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.770958662033081, + "rewards/margins": 2.4208149909973145, + "rewards/rejected": -3.1917738914489746, + "step": 3739 + }, + { + "epoch": 0.44, + "learning_rate": 1.7205621825912364e-07, + "logits/chosen": -1.6141008138656616, + "logits/rejected": -2.035013198852539, + "logps/chosen": -353.3890686035156, + "logps/rejected": -267.54058837890625, + "loss": 0.3227, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.131401538848877, + "rewards/margins": 2.1745893955230713, + "rewards/rejected": -3.305990695953369, + "step": 3740 + }, + { + "epoch": 0.44, + "learning_rate": 1.7202078658320537e-07, + "logits/chosen": -2.8174548149108887, + "logits/rejected": -2.592378854751587, + "logps/chosen": -168.69192504882812, + "logps/rejected": -236.2540740966797, + "loss": 0.2657, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6635015606880188, + "rewards/margins": 2.921603202819824, + "rewards/rejected": -3.5851047039031982, + "step": 3741 + }, + { + "epoch": 0.44, + "learning_rate": 1.719853549072871e-07, + "logits/chosen": -2.064040422439575, + "logits/rejected": -2.240743637084961, + "logps/chosen": -369.3329162597656, + "logps/rejected": -341.30145263671875, + "loss": 0.5814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5661029815673828, + "rewards/margins": 1.2274757623672485, + "rewards/rejected": -1.7935787439346313, + "step": 3742 + }, + { + "epoch": 0.44, + "learning_rate": 1.719499232313688e-07, + "logits/chosen": -1.7912862300872803, + "logits/rejected": -1.890390157699585, + "logps/chosen": -239.19192504882812, + "logps/rejected": -240.91177368164062, + "loss": 0.2584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9479097723960876, + "rewards/margins": 1.497235655784607, + "rewards/rejected": -2.445145606994629, + "step": 3743 + }, + { + "epoch": 0.44, + "learning_rate": 1.7191449155545058e-07, + "logits/chosen": -2.304654598236084, + "logits/rejected": -2.2825629711151123, + "logps/chosen": -233.4609375, + "logps/rejected": -323.0733642578125, + "loss": 0.0813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6426647901535034, + "rewards/margins": 3.874387264251709, + "rewards/rejected": -4.517051696777344, + "step": 3744 + }, + { + "epoch": 0.44, + "learning_rate": 1.718790598795323e-07, + "logits/chosen": -1.6577391624450684, + "logits/rejected": -1.8928120136260986, + "logps/chosen": -375.6813659667969, + "logps/rejected": -297.20831298828125, + "loss": 0.2241, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1766550540924072, + "rewards/margins": 2.2279229164123535, + "rewards/rejected": -3.4045779705047607, + "step": 3745 + }, + { + "epoch": 0.44, + "learning_rate": 1.7184362820361403e-07, + "logits/chosen": -2.5379116535186768, + "logits/rejected": -2.7222719192504883, + "logps/chosen": -316.79718017578125, + "logps/rejected": -189.24615478515625, + "loss": 0.8262, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4928467273712158, + "rewards/margins": 0.5650280117988586, + "rewards/rejected": -2.0578746795654297, + "step": 3746 + }, + { + "epoch": 0.44, + "learning_rate": 1.7180819652769575e-07, + "logits/chosen": -2.3402161598205566, + "logits/rejected": -2.507727861404419, + "logps/chosen": -370.5191650390625, + "logps/rejected": -365.3287353515625, + "loss": 0.7351, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5296844244003296, + "rewards/margins": 1.0636063814163208, + "rewards/rejected": -2.5932905673980713, + "step": 3747 + }, + { + "epoch": 0.44, + "learning_rate": 1.7177276485177747e-07, + "logits/chosen": -2.437608003616333, + "logits/rejected": -2.531290054321289, + "logps/chosen": -374.33868408203125, + "logps/rejected": -344.0832214355469, + "loss": 0.2847, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.463425636291504, + "rewards/margins": 2.3477416038513184, + "rewards/rejected": -3.811166763305664, + "step": 3748 + }, + { + "epoch": 0.44, + "learning_rate": 1.7173733317585922e-07, + "logits/chosen": -2.2733047008514404, + "logits/rejected": -2.3666274547576904, + "logps/chosen": -168.85377502441406, + "logps/rejected": -153.43067932128906, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.449173927307129, + "rewards/margins": 1.6674580574035645, + "rewards/rejected": -3.1166317462921143, + "step": 3749 + }, + { + "epoch": 0.44, + "learning_rate": 1.7170190149994094e-07, + "logits/chosen": -1.8260974884033203, + "logits/rejected": -2.141425371170044, + "logps/chosen": -307.3590087890625, + "logps/rejected": -267.62139892578125, + "loss": 0.3974, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7419998049736023, + "rewards/margins": 1.3440253734588623, + "rewards/rejected": -2.0860252380371094, + "step": 3750 + }, + { + "epoch": 0.44, + "learning_rate": 1.7166646982402267e-07, + "logits/chosen": -2.31095290184021, + "logits/rejected": -2.496816396713257, + "logps/chosen": -266.34405517578125, + "logps/rejected": -218.66644287109375, + "loss": 0.7084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7933177947998047, + "rewards/margins": 1.474044680595398, + "rewards/rejected": -2.267362594604492, + "step": 3751 + }, + { + "epoch": 0.44, + "learning_rate": 1.716310381481044e-07, + "logits/chosen": -1.9681038856506348, + "logits/rejected": -1.9804086685180664, + "logps/chosen": -153.40902709960938, + "logps/rejected": -185.9090576171875, + "loss": 0.5293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7002599835395813, + "rewards/margins": 1.4783798456192017, + "rewards/rejected": -2.1786398887634277, + "step": 3752 + }, + { + "epoch": 0.44, + "learning_rate": 1.715956064721861e-07, + "logits/chosen": -2.234421730041504, + "logits/rejected": -2.400183916091919, + "logps/chosen": -280.2693786621094, + "logps/rejected": -307.4417419433594, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5054105520248413, + "rewards/margins": 3.0307154655456543, + "rewards/rejected": -3.536125898361206, + "step": 3753 + }, + { + "epoch": 0.44, + "learning_rate": 1.7156017479626783e-07, + "logits/chosen": -2.7145230770111084, + "logits/rejected": -2.8044052124023438, + "logps/chosen": -188.31272888183594, + "logps/rejected": -157.29623413085938, + "loss": 0.3696, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8319178223609924, + "rewards/margins": 2.4807350635528564, + "rewards/rejected": -3.312653064727783, + "step": 3754 + }, + { + "epoch": 0.44, + "learning_rate": 1.7152474312034958e-07, + "logits/chosen": -2.4802496433258057, + "logits/rejected": -2.7808759212493896, + "logps/chosen": -285.4593200683594, + "logps/rejected": -299.30792236328125, + "loss": 0.3339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5036831498146057, + "rewards/margins": 1.7123324871063232, + "rewards/rejected": -2.2160158157348633, + "step": 3755 + }, + { + "epoch": 0.44, + "learning_rate": 1.7148931144443133e-07, + "logits/chosen": -2.574021100997925, + "logits/rejected": -2.441335916519165, + "logps/chosen": -346.53619384765625, + "logps/rejected": -338.0230712890625, + "loss": 0.6001, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.119659423828125, + "rewards/margins": 1.4179264307022095, + "rewards/rejected": -2.537585735321045, + "step": 3756 + }, + { + "epoch": 0.44, + "learning_rate": 1.7145387976851305e-07, + "logits/chosen": -2.50264835357666, + "logits/rejected": -2.5166878700256348, + "logps/chosen": -142.37432861328125, + "logps/rejected": -176.509033203125, + "loss": 0.4811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7571601867675781, + "rewards/margins": 2.1136577129364014, + "rewards/rejected": -2.8708176612854004, + "step": 3757 + }, + { + "epoch": 0.44, + "learning_rate": 1.7141844809259477e-07, + "logits/chosen": -2.1976494789123535, + "logits/rejected": -2.239157199859619, + "logps/chosen": -168.4442138671875, + "logps/rejected": -196.59994506835938, + "loss": 0.4041, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17015379667282104, + "rewards/margins": 0.9558542966842651, + "rewards/rejected": -1.1260080337524414, + "step": 3758 + }, + { + "epoch": 0.44, + "learning_rate": 1.713830164166765e-07, + "logits/chosen": -3.0385544300079346, + "logits/rejected": -2.985635280609131, + "logps/chosen": -325.5106201171875, + "logps/rejected": -247.2646026611328, + "loss": 0.2774, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7209484577178955, + "rewards/margins": 2.325742721557617, + "rewards/rejected": -3.0466909408569336, + "step": 3759 + }, + { + "epoch": 0.44, + "learning_rate": 1.7134758474075824e-07, + "logits/chosen": -2.216569185256958, + "logits/rejected": -2.5557949542999268, + "logps/chosen": -473.6844177246094, + "logps/rejected": -337.4504089355469, + "loss": 0.3678, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.43375563621521, + "rewards/margins": 2.153794050216675, + "rewards/rejected": -3.5875496864318848, + "step": 3760 + }, + { + "epoch": 0.44, + "learning_rate": 1.7131215306483997e-07, + "logits/chosen": -2.2817442417144775, + "logits/rejected": -2.2967941761016846, + "logps/chosen": -178.56314086914062, + "logps/rejected": -238.62457275390625, + "loss": 0.2831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2943980097770691, + "rewards/margins": 2.636892557144165, + "rewards/rejected": -2.9312903881073, + "step": 3761 + }, + { + "epoch": 0.44, + "learning_rate": 1.712767213889217e-07, + "logits/chosen": -1.9332555532455444, + "logits/rejected": -2.3321938514709473, + "logps/chosen": -599.8321533203125, + "logps/rejected": -328.58221435546875, + "loss": 0.2275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8049410581588745, + "rewards/margins": 2.013509750366211, + "rewards/rejected": -2.818450927734375, + "step": 3762 + }, + { + "epoch": 0.44, + "learning_rate": 1.712412897130034e-07, + "logits/chosen": -2.0838022232055664, + "logits/rejected": -2.046811580657959, + "logps/chosen": -236.88043212890625, + "logps/rejected": -331.28875732421875, + "loss": 0.3506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6956567168235779, + "rewards/margins": 1.7534615993499756, + "rewards/rejected": -2.4491183757781982, + "step": 3763 + }, + { + "epoch": 0.44, + "learning_rate": 1.7120585803708513e-07, + "logits/chosen": -2.398810863494873, + "logits/rejected": -2.5273096561431885, + "logps/chosen": -380.80364990234375, + "logps/rejected": -240.86607360839844, + "loss": 0.4475, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8784935474395752, + "rewards/margins": 1.4033005237579346, + "rewards/rejected": -2.2817940711975098, + "step": 3764 + }, + { + "epoch": 0.44, + "learning_rate": 1.7117042636116686e-07, + "logits/chosen": -2.118960380554199, + "logits/rejected": -2.149174213409424, + "logps/chosen": -259.4793701171875, + "logps/rejected": -194.9567108154297, + "loss": 0.3017, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5692083835601807, + "rewards/margins": 2.2336740493774414, + "rewards/rejected": -2.802882432937622, + "step": 3765 + }, + { + "epoch": 0.44, + "learning_rate": 1.7113499468524858e-07, + "logits/chosen": -2.107173204421997, + "logits/rejected": -1.9755395650863647, + "logps/chosen": -374.44677734375, + "logps/rejected": -382.8333435058594, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.695237398147583, + "rewards/margins": 2.859785795211792, + "rewards/rejected": -3.555023193359375, + "step": 3766 + }, + { + "epoch": 0.44, + "learning_rate": 1.7109956300933035e-07, + "logits/chosen": -2.025876045227051, + "logits/rejected": -1.855359435081482, + "logps/chosen": -384.7864990234375, + "logps/rejected": -324.9702453613281, + "loss": 0.4329, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4192999005317688, + "rewards/margins": 1.4119718074798584, + "rewards/rejected": -1.8312718868255615, + "step": 3767 + }, + { + "epoch": 0.44, + "learning_rate": 1.7106413133341207e-07, + "logits/chosen": -2.419177770614624, + "logits/rejected": -2.533783435821533, + "logps/chosen": -229.5596923828125, + "logps/rejected": -118.90814971923828, + "loss": 1.7951, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.095093011856079, + "rewards/margins": -0.29753679037094116, + "rewards/rejected": -1.7975562810897827, + "step": 3768 + }, + { + "epoch": 0.44, + "learning_rate": 1.710286996574938e-07, + "logits/chosen": -1.7026597261428833, + "logits/rejected": -1.985854983329773, + "logps/chosen": -381.9714050292969, + "logps/rejected": -221.5395965576172, + "loss": 0.6559, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4126040935516357, + "rewards/margins": 1.198707938194275, + "rewards/rejected": -2.611311912536621, + "step": 3769 + }, + { + "epoch": 0.44, + "learning_rate": 1.7099326798157552e-07, + "logits/chosen": -2.6705987453460693, + "logits/rejected": -2.822188138961792, + "logps/chosen": -308.56658935546875, + "logps/rejected": -303.24285888671875, + "loss": 0.1802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9306091666221619, + "rewards/margins": 3.848820447921753, + "rewards/rejected": -4.7794294357299805, + "step": 3770 + }, + { + "epoch": 0.44, + "learning_rate": 1.7095783630565727e-07, + "logits/chosen": -2.691370964050293, + "logits/rejected": -2.7330918312072754, + "logps/chosen": -393.5550537109375, + "logps/rejected": -242.232666015625, + "loss": 0.4034, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3191079795360565, + "rewards/margins": 1.3799216747283936, + "rewards/rejected": -1.6990296840667725, + "step": 3771 + }, + { + "epoch": 0.44, + "learning_rate": 1.70922404629739e-07, + "logits/chosen": -2.1604669094085693, + "logits/rejected": -2.2781217098236084, + "logps/chosen": -243.04519653320312, + "logps/rejected": -311.46893310546875, + "loss": 0.5547, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6873865127563477, + "rewards/margins": 1.1615521907806396, + "rewards/rejected": -2.8489387035369873, + "step": 3772 + }, + { + "epoch": 0.44, + "learning_rate": 1.708869729538207e-07, + "logits/chosen": -2.8215832710266113, + "logits/rejected": -2.7337865829467773, + "logps/chosen": -291.673583984375, + "logps/rejected": -281.651123046875, + "loss": 0.4424, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9925156831741333, + "rewards/margins": 3.1228511333465576, + "rewards/rejected": -4.1153669357299805, + "step": 3773 + }, + { + "epoch": 0.44, + "learning_rate": 1.7085154127790243e-07, + "logits/chosen": -2.391026020050049, + "logits/rejected": -2.370880603790283, + "logps/chosen": -417.5521240234375, + "logps/rejected": -322.7779235839844, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3103489875793457, + "rewards/margins": 0.9004037976264954, + "rewards/rejected": -1.2107528448104858, + "step": 3774 + }, + { + "epoch": 0.44, + "learning_rate": 1.7081610960198416e-07, + "logits/chosen": -2.6042723655700684, + "logits/rejected": -2.582612991333008, + "logps/chosen": -101.27119445800781, + "logps/rejected": -201.7843017578125, + "loss": 0.2097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5204496383666992, + "rewards/margins": 3.3009033203125, + "rewards/rejected": -3.821352958679199, + "step": 3775 + }, + { + "epoch": 0.44, + "learning_rate": 1.7078067792606588e-07, + "logits/chosen": -1.8413634300231934, + "logits/rejected": -2.11630916595459, + "logps/chosen": -274.393798828125, + "logps/rejected": -170.50503540039062, + "loss": 0.6766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4597012996673584, + "rewards/margins": 0.5728222131729126, + "rewards/rejected": -1.032523512840271, + "step": 3776 + }, + { + "epoch": 0.44, + "learning_rate": 1.707452462501476e-07, + "logits/chosen": -2.4838674068450928, + "logits/rejected": -2.206958532333374, + "logps/chosen": -171.67654418945312, + "logps/rejected": -316.51708984375, + "loss": 0.3382, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1162574291229248, + "rewards/margins": 2.1751821041107178, + "rewards/rejected": -3.2914395332336426, + "step": 3777 + }, + { + "epoch": 0.44, + "learning_rate": 1.7070981457422935e-07, + "logits/chosen": -2.316068649291992, + "logits/rejected": -2.5531888008117676, + "logps/chosen": -239.25057983398438, + "logps/rejected": -154.2504119873047, + "loss": 0.305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7143497467041016, + "rewards/margins": 1.6597325801849365, + "rewards/rejected": -2.374082088470459, + "step": 3778 + }, + { + "epoch": 0.44, + "learning_rate": 1.706743828983111e-07, + "logits/chosen": -1.8977967500686646, + "logits/rejected": -2.0684292316436768, + "logps/chosen": -289.68499755859375, + "logps/rejected": -273.0801696777344, + "loss": 0.4918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7247843742370605, + "rewards/margins": 0.9484382271766663, + "rewards/rejected": -1.673222541809082, + "step": 3779 + }, + { + "epoch": 0.44, + "learning_rate": 1.7063895122239282e-07, + "logits/chosen": -2.3799855709075928, + "logits/rejected": -2.4277968406677246, + "logps/chosen": -211.29977416992188, + "logps/rejected": -344.5084533691406, + "loss": 0.5896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5603629946708679, + "rewards/margins": 2.235182285308838, + "rewards/rejected": -2.7955453395843506, + "step": 3780 + }, + { + "epoch": 0.44, + "learning_rate": 1.7060351954647454e-07, + "logits/chosen": -2.541721820831299, + "logits/rejected": -2.463068723678589, + "logps/chosen": -389.0673828125, + "logps/rejected": -282.7577819824219, + "loss": 0.2377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8875350952148438, + "rewards/margins": 1.9013620615005493, + "rewards/rejected": -2.7888970375061035, + "step": 3781 + }, + { + "epoch": 0.44, + "learning_rate": 1.7056808787055626e-07, + "logits/chosen": -2.552694320678711, + "logits/rejected": -2.7252066135406494, + "logps/chosen": -256.26812744140625, + "logps/rejected": -221.91213989257812, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2729310691356659, + "rewards/margins": 3.9842653274536133, + "rewards/rejected": -4.257196426391602, + "step": 3782 + }, + { + "epoch": 0.44, + "learning_rate": 1.70532656194638e-07, + "logits/chosen": -2.44295597076416, + "logits/rejected": -2.4547760486602783, + "logps/chosen": -196.97120666503906, + "logps/rejected": -257.48553466796875, + "loss": 0.4387, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2553638219833374, + "rewards/margins": 2.1991448402404785, + "rewards/rejected": -3.4545087814331055, + "step": 3783 + }, + { + "epoch": 0.44, + "learning_rate": 1.7049722451871973e-07, + "logits/chosen": -1.771080493927002, + "logits/rejected": -2.0155606269836426, + "logps/chosen": -573.7744140625, + "logps/rejected": -309.09027099609375, + "loss": 0.3691, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2098865509033203, + "rewards/margins": 1.635573148727417, + "rewards/rejected": -2.8454599380493164, + "step": 3784 + }, + { + "epoch": 0.44, + "learning_rate": 1.7046179284280146e-07, + "logits/chosen": -2.750241279602051, + "logits/rejected": -2.6381092071533203, + "logps/chosen": -181.6207275390625, + "logps/rejected": -310.04962158203125, + "loss": 0.4858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5318410396575928, + "rewards/margins": 1.692322015762329, + "rewards/rejected": -2.224163055419922, + "step": 3785 + }, + { + "epoch": 0.44, + "learning_rate": 1.7042636116688318e-07, + "logits/chosen": -2.147456645965576, + "logits/rejected": -2.265768527984619, + "logps/chosen": -326.4219970703125, + "logps/rejected": -337.4965515136719, + "loss": 0.8649, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0378087759017944, + "rewards/margins": 0.4177129864692688, + "rewards/rejected": -1.455521821975708, + "step": 3786 + }, + { + "epoch": 0.44, + "learning_rate": 1.703909294909649e-07, + "logits/chosen": -2.4669437408447266, + "logits/rejected": -2.426112651824951, + "logps/chosen": -123.30867767333984, + "logps/rejected": -209.21609497070312, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6679010987281799, + "rewards/margins": 2.6773533821105957, + "rewards/rejected": -3.345254421234131, + "step": 3787 + }, + { + "epoch": 0.44, + "learning_rate": 1.7035549781504662e-07, + "logits/chosen": -1.6903389692306519, + "logits/rejected": -1.818345069885254, + "logps/chosen": -372.83050537109375, + "logps/rejected": -348.4471130371094, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5003893375396729, + "rewards/margins": 1.194337248802185, + "rewards/rejected": -2.6947264671325684, + "step": 3788 + }, + { + "epoch": 0.44, + "learning_rate": 1.7032006613912837e-07, + "logits/chosen": -2.6125450134277344, + "logits/rejected": -2.5782251358032227, + "logps/chosen": -352.9016418457031, + "logps/rejected": -274.00128173828125, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.86961430311203, + "rewards/margins": 1.9341182708740234, + "rewards/rejected": -2.8037328720092773, + "step": 3789 + }, + { + "epoch": 0.44, + "learning_rate": 1.702846344632101e-07, + "logits/chosen": -1.9954301118850708, + "logits/rejected": -1.6997146606445312, + "logps/chosen": -207.79861450195312, + "logps/rejected": -411.94195556640625, + "loss": 0.4243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.90308678150177, + "rewards/margins": 2.4553451538085938, + "rewards/rejected": -3.358431816101074, + "step": 3790 + }, + { + "epoch": 0.44, + "learning_rate": 1.7024920278729184e-07, + "logits/chosen": -2.3168773651123047, + "logits/rejected": -2.3801403045654297, + "logps/chosen": -286.3769226074219, + "logps/rejected": -259.5494384765625, + "loss": 0.2446, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1294069290161133, + "rewards/margins": 2.508059501647949, + "rewards/rejected": -4.6374664306640625, + "step": 3791 + }, + { + "epoch": 0.44, + "learning_rate": 1.7021377111137356e-07, + "logits/chosen": -2.6780409812927246, + "logits/rejected": -2.3034110069274902, + "logps/chosen": -192.69677734375, + "logps/rejected": -338.19683837890625, + "loss": 0.3384, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4733058214187622, + "rewards/margins": 1.9423298835754395, + "rewards/rejected": -2.415635585784912, + "step": 3792 + }, + { + "epoch": 0.44, + "learning_rate": 1.7017833943545529e-07, + "logits/chosen": -1.7744717597961426, + "logits/rejected": -1.7573881149291992, + "logps/chosen": -442.5962829589844, + "logps/rejected": -363.1497497558594, + "loss": 0.7051, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.302548408508301, + "rewards/margins": 1.5919055938720703, + "rewards/rejected": -3.894454002380371, + "step": 3793 + }, + { + "epoch": 0.44, + "learning_rate": 1.7014290775953703e-07, + "logits/chosen": -2.6654059886932373, + "logits/rejected": -2.80353045463562, + "logps/chosen": -242.20758056640625, + "logps/rejected": -288.70123291015625, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49731239676475525, + "rewards/margins": 2.529067277908325, + "rewards/rejected": -3.0263798236846924, + "step": 3794 + }, + { + "epoch": 0.44, + "learning_rate": 1.7010747608361876e-07, + "logits/chosen": -1.3455537557601929, + "logits/rejected": -1.25901460647583, + "logps/chosen": -433.4952697753906, + "logps/rejected": -465.3505859375, + "loss": 0.2013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47454094886779785, + "rewards/margins": 2.747326612472534, + "rewards/rejected": -3.221867561340332, + "step": 3795 + }, + { + "epoch": 0.44, + "learning_rate": 1.7007204440770048e-07, + "logits/chosen": -2.9750380516052246, + "logits/rejected": -3.0171055793762207, + "logps/chosen": -194.07867431640625, + "logps/rejected": -265.92193603515625, + "loss": 0.2705, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0400372743606567, + "rewards/margins": 2.950512170791626, + "rewards/rejected": -3.990549325942993, + "step": 3796 + }, + { + "epoch": 0.44, + "learning_rate": 1.700366127317822e-07, + "logits/chosen": -2.4030394554138184, + "logits/rejected": -2.34592866897583, + "logps/chosen": -280.1178283691406, + "logps/rejected": -336.9311218261719, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8290759325027466, + "rewards/margins": 2.9297900199890137, + "rewards/rejected": -3.7588655948638916, + "step": 3797 + }, + { + "epoch": 0.44, + "learning_rate": 1.7000118105586392e-07, + "logits/chosen": -2.0979678630828857, + "logits/rejected": -2.384960651397705, + "logps/chosen": -264.5905456542969, + "logps/rejected": -187.75953674316406, + "loss": 0.3361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.739276647567749, + "rewards/margins": 1.5950044393539429, + "rewards/rejected": -2.3342809677124023, + "step": 3798 + }, + { + "epoch": 0.44, + "learning_rate": 1.6996574937994565e-07, + "logits/chosen": -2.6050305366516113, + "logits/rejected": -2.4710755348205566, + "logps/chosen": -251.95994567871094, + "logps/rejected": -289.75982666015625, + "loss": 0.1, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5248978137969971, + "rewards/margins": 4.193580150604248, + "rewards/rejected": -4.718477725982666, + "step": 3799 + }, + { + "epoch": 0.44, + "learning_rate": 1.699303177040274e-07, + "logits/chosen": -1.9977912902832031, + "logits/rejected": -2.385557174682617, + "logps/chosen": -376.3791198730469, + "logps/rejected": -360.1363525390625, + "loss": 0.1617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7329797148704529, + "rewards/margins": 3.9863181114196777, + "rewards/rejected": -4.719297885894775, + "step": 3800 + }, + { + "epoch": 0.44, + "learning_rate": 1.6989488602810912e-07, + "logits/chosen": -1.998733639717102, + "logits/rejected": -2.0676770210266113, + "logps/chosen": -229.2128448486328, + "logps/rejected": -248.56346130371094, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8291890621185303, + "rewards/margins": 1.2897917032241821, + "rewards/rejected": -2.118980884552002, + "step": 3801 + }, + { + "epoch": 0.44, + "learning_rate": 1.6985945435219086e-07, + "logits/chosen": -2.1137795448303223, + "logits/rejected": -2.646155834197998, + "logps/chosen": -452.292724609375, + "logps/rejected": -245.48651123046875, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49639061093330383, + "rewards/margins": 2.143827199935913, + "rewards/rejected": -2.6402180194854736, + "step": 3802 + }, + { + "epoch": 0.44, + "learning_rate": 1.698240226762726e-07, + "logits/chosen": -2.749824047088623, + "logits/rejected": -2.762599468231201, + "logps/chosen": -105.44390106201172, + "logps/rejected": -176.61407470703125, + "loss": 0.8021, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0743046998977661, + "rewards/margins": 1.1487983465194702, + "rewards/rejected": -2.2231030464172363, + "step": 3803 + }, + { + "epoch": 0.44, + "learning_rate": 1.697885910003543e-07, + "logits/chosen": -2.345707416534424, + "logits/rejected": -2.4009432792663574, + "logps/chosen": -182.53590393066406, + "logps/rejected": -226.47857666015625, + "loss": 0.3841, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.119269847869873, + "rewards/margins": 2.193181037902832, + "rewards/rejected": -3.312450885772705, + "step": 3804 + }, + { + "epoch": 0.44, + "learning_rate": 1.6975315932443606e-07, + "logits/chosen": -1.9573720693588257, + "logits/rejected": -1.7582001686096191, + "logps/chosen": -105.86338806152344, + "logps/rejected": -271.5712585449219, + "loss": 0.3033, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16308534145355225, + "rewards/margins": 2.8431951999664307, + "rewards/rejected": -3.0062804222106934, + "step": 3805 + }, + { + "epoch": 0.44, + "learning_rate": 1.6971772764851778e-07, + "logits/chosen": -1.6990699768066406, + "logits/rejected": -1.642216682434082, + "logps/chosen": -270.33465576171875, + "logps/rejected": -355.02642822265625, + "loss": 0.5785, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5129654407501221, + "rewards/margins": 1.2147226333618164, + "rewards/rejected": -1.7276880741119385, + "step": 3806 + }, + { + "epoch": 0.44, + "learning_rate": 1.696822959725995e-07, + "logits/chosen": -1.8806803226470947, + "logits/rejected": -1.9546822309494019, + "logps/chosen": -272.9195251464844, + "logps/rejected": -348.9990234375, + "loss": 0.3218, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.192997932434082, + "rewards/margins": 2.1266751289367676, + "rewards/rejected": -3.3196730613708496, + "step": 3807 + }, + { + "epoch": 0.44, + "learning_rate": 1.6964686429668122e-07, + "logits/chosen": -2.0564887523651123, + "logits/rejected": -2.413443088531494, + "logps/chosen": -301.5754089355469, + "logps/rejected": -246.79257202148438, + "loss": 0.5525, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9160251617431641, + "rewards/margins": 2.058910369873047, + "rewards/rejected": -2.974936008453369, + "step": 3808 + }, + { + "epoch": 0.44, + "learning_rate": 1.6961143262076295e-07, + "logits/chosen": -2.3290672302246094, + "logits/rejected": -2.4375576972961426, + "logps/chosen": -253.19122314453125, + "logps/rejected": -282.9997253417969, + "loss": 0.2847, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5727555751800537, + "rewards/margins": 3.100311040878296, + "rewards/rejected": -3.6730666160583496, + "step": 3809 + }, + { + "epoch": 0.44, + "learning_rate": 1.6957600094484467e-07, + "logits/chosen": -2.46608567237854, + "logits/rejected": -2.53562331199646, + "logps/chosen": -306.95404052734375, + "logps/rejected": -348.24993896484375, + "loss": 0.5591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8656816482543945, + "rewards/margins": 0.485037624835968, + "rewards/rejected": -1.3507193326950073, + "step": 3810 + }, + { + "epoch": 0.44, + "learning_rate": 1.695405692689264e-07, + "logits/chosen": -2.3709793090820312, + "logits/rejected": -2.5072054862976074, + "logps/chosen": -229.4619140625, + "logps/rejected": -313.1302795410156, + "loss": 0.3482, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.699439287185669, + "rewards/margins": 2.894807815551758, + "rewards/rejected": -3.5942466259002686, + "step": 3811 + }, + { + "epoch": 0.44, + "learning_rate": 1.6950513759300814e-07, + "logits/chosen": -2.248223304748535, + "logits/rejected": -2.088867425918579, + "logps/chosen": -184.47830200195312, + "logps/rejected": -277.65435791015625, + "loss": 0.6972, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6227772831916809, + "rewards/margins": 0.8355299830436707, + "rewards/rejected": -1.4583073854446411, + "step": 3812 + }, + { + "epoch": 0.44, + "learning_rate": 1.6946970591708986e-07, + "logits/chosen": -2.9243359565734863, + "logits/rejected": -2.6137940883636475, + "logps/chosen": -648.00732421875, + "logps/rejected": -369.20367431640625, + "loss": 0.2215, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8166504502296448, + "rewards/margins": 3.634876012802124, + "rewards/rejected": -4.451526641845703, + "step": 3813 + }, + { + "epoch": 0.44, + "learning_rate": 1.694342742411716e-07, + "logits/chosen": -1.915997862815857, + "logits/rejected": -1.8202357292175293, + "logps/chosen": -331.0302734375, + "logps/rejected": -322.8487548828125, + "loss": 0.4651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5964080095291138, + "rewards/margins": 1.3518257141113281, + "rewards/rejected": -2.9482338428497314, + "step": 3814 + }, + { + "epoch": 0.44, + "learning_rate": 1.6939884256525333e-07, + "logits/chosen": -2.5051538944244385, + "logits/rejected": -2.427781343460083, + "logps/chosen": -407.1469421386719, + "logps/rejected": -401.8533630371094, + "loss": 0.5027, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0337773561477661, + "rewards/margins": 2.84604549407959, + "rewards/rejected": -3.8798229694366455, + "step": 3815 + }, + { + "epoch": 0.44, + "learning_rate": 1.6936341088933508e-07, + "logits/chosen": -2.479362726211548, + "logits/rejected": -2.6405086517333984, + "logps/chosen": -266.91314697265625, + "logps/rejected": -330.23004150390625, + "loss": 0.2021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8573217391967773, + "rewards/margins": 2.789606809616089, + "rewards/rejected": -3.646928310394287, + "step": 3816 + }, + { + "epoch": 0.44, + "learning_rate": 1.693279792134168e-07, + "logits/chosen": -2.3473703861236572, + "logits/rejected": -2.3644118309020996, + "logps/chosen": -192.52603149414062, + "logps/rejected": -246.68521118164062, + "loss": 0.5147, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7844061851501465, + "rewards/margins": 3.8912973403930664, + "rewards/rejected": -4.675703525543213, + "step": 3817 + }, + { + "epoch": 0.44, + "learning_rate": 1.6929254753749852e-07, + "logits/chosen": -2.5933048725128174, + "logits/rejected": -2.5572397708892822, + "logps/chosen": -149.1268310546875, + "logps/rejected": -188.1115264892578, + "loss": 1.0319, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.248836040496826, + "rewards/margins": 1.2450098991394043, + "rewards/rejected": -3.4938457012176514, + "step": 3818 + }, + { + "epoch": 0.44, + "learning_rate": 1.6925711586158025e-07, + "logits/chosen": -2.6353275775909424, + "logits/rejected": -2.4235434532165527, + "logps/chosen": -141.26608276367188, + "logps/rejected": -223.60427856445312, + "loss": 0.5521, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7224225997924805, + "rewards/margins": 1.6562871932983398, + "rewards/rejected": -3.3787097930908203, + "step": 3819 + }, + { + "epoch": 0.44, + "learning_rate": 1.6922168418566197e-07, + "logits/chosen": -1.861281156539917, + "logits/rejected": -2.05714750289917, + "logps/chosen": -523.281005859375, + "logps/rejected": -352.99615478515625, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.658854603767395, + "rewards/margins": 2.544095754623413, + "rewards/rejected": -3.2029502391815186, + "step": 3820 + }, + { + "epoch": 0.44, + "learning_rate": 1.691862525097437e-07, + "logits/chosen": -1.50089430809021, + "logits/rejected": -1.8316129446029663, + "logps/chosen": -678.7535400390625, + "logps/rejected": -551.0390625, + "loss": 0.3514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5951563119888306, + "rewards/margins": 2.3509392738342285, + "rewards/rejected": -2.9460954666137695, + "step": 3821 + }, + { + "epoch": 0.44, + "learning_rate": 1.691508208338254e-07, + "logits/chosen": -2.340848445892334, + "logits/rejected": -2.574352502822876, + "logps/chosen": -292.50750732421875, + "logps/rejected": -245.32638549804688, + "loss": 0.9604, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.042235851287842, + "rewards/margins": 0.6551424264907837, + "rewards/rejected": -2.697378158569336, + "step": 3822 + }, + { + "epoch": 0.44, + "learning_rate": 1.6911538915790716e-07, + "logits/chosen": -2.955321788787842, + "logits/rejected": -2.7621734142303467, + "logps/chosen": -266.4412841796875, + "logps/rejected": -286.2760009765625, + "loss": 0.3938, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0673372745513916, + "rewards/margins": 2.9986519813537598, + "rewards/rejected": -4.065989017486572, + "step": 3823 + }, + { + "epoch": 0.44, + "learning_rate": 1.6907995748198888e-07, + "logits/chosen": -2.2558627128601074, + "logits/rejected": -2.4350106716156006, + "logps/chosen": -321.59381103515625, + "logps/rejected": -243.19546508789062, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6221703886985779, + "rewards/margins": 1.205824375152588, + "rewards/rejected": -1.8279948234558105, + "step": 3824 + }, + { + "epoch": 0.44, + "learning_rate": 1.690445258060706e-07, + "logits/chosen": -2.3505730628967285, + "logits/rejected": -2.0519866943359375, + "logps/chosen": -215.49693298339844, + "logps/rejected": -320.3085021972656, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8347928524017334, + "rewards/margins": 2.889139413833618, + "rewards/rejected": -4.723931789398193, + "step": 3825 + }, + { + "epoch": 0.45, + "learning_rate": 1.6900909413015235e-07, + "logits/chosen": -2.0997064113616943, + "logits/rejected": -1.8105250597000122, + "logps/chosen": -330.0223388671875, + "logps/rejected": -371.4913024902344, + "loss": 0.381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36151543259620667, + "rewards/margins": 1.5176966190338135, + "rewards/rejected": -1.8792121410369873, + "step": 3826 + }, + { + "epoch": 0.45, + "learning_rate": 1.6897366245423408e-07, + "logits/chosen": -2.4775309562683105, + "logits/rejected": -2.5103628635406494, + "logps/chosen": -246.88796997070312, + "logps/rejected": -341.9145812988281, + "loss": 0.4795, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9078621864318848, + "rewards/margins": 0.7733854055404663, + "rewards/rejected": -2.6812474727630615, + "step": 3827 + }, + { + "epoch": 0.45, + "learning_rate": 1.6893823077831583e-07, + "logits/chosen": -2.201251268386841, + "logits/rejected": -2.367485523223877, + "logps/chosen": -369.099365234375, + "logps/rejected": -232.81051635742188, + "loss": 0.3126, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2392033338546753, + "rewards/margins": 1.4680966138839722, + "rewards/rejected": -2.7072999477386475, + "step": 3828 + }, + { + "epoch": 0.45, + "learning_rate": 1.6890279910239755e-07, + "logits/chosen": -1.9296603202819824, + "logits/rejected": -2.0190649032592773, + "logps/chosen": -336.2305908203125, + "logps/rejected": -327.4036865234375, + "loss": 0.6317, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0076663494110107, + "rewards/margins": 2.1324057579040527, + "rewards/rejected": -3.1400718688964844, + "step": 3829 + }, + { + "epoch": 0.45, + "learning_rate": 1.6886736742647927e-07, + "logits/chosen": -2.5764341354370117, + "logits/rejected": -2.672109842300415, + "logps/chosen": -294.19342041015625, + "logps/rejected": -276.70465087890625, + "loss": 0.2054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.339808702468872, + "rewards/margins": 3.521073818206787, + "rewards/rejected": -4.860882759094238, + "step": 3830 + }, + { + "epoch": 0.45, + "learning_rate": 1.68831935750561e-07, + "logits/chosen": -2.8378944396972656, + "logits/rejected": -2.6434946060180664, + "logps/chosen": -240.7099151611328, + "logps/rejected": -347.62567138671875, + "loss": 0.5889, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6218593120574951, + "rewards/margins": 2.683563709259033, + "rewards/rejected": -4.305422782897949, + "step": 3831 + }, + { + "epoch": 0.45, + "learning_rate": 1.6879650407464271e-07, + "logits/chosen": -1.9786877632141113, + "logits/rejected": -2.3769423961639404, + "logps/chosen": -503.4774169921875, + "logps/rejected": -282.1924133300781, + "loss": 0.3259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2291380167007446, + "rewards/margins": 1.903969407081604, + "rewards/rejected": -3.1331076622009277, + "step": 3832 + }, + { + "epoch": 0.45, + "learning_rate": 1.6876107239872444e-07, + "logits/chosen": -2.591981887817383, + "logits/rejected": -2.639894485473633, + "logps/chosen": -208.23513793945312, + "logps/rejected": -365.4489440917969, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8175657391548157, + "rewards/margins": 3.630483627319336, + "rewards/rejected": -4.448049545288086, + "step": 3833 + }, + { + "epoch": 0.45, + "learning_rate": 1.6872564072280618e-07, + "logits/chosen": -2.569997787475586, + "logits/rejected": -2.4607348442077637, + "logps/chosen": -136.52171325683594, + "logps/rejected": -193.04742431640625, + "loss": 0.519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4237495958805084, + "rewards/margins": 1.0032405853271484, + "rewards/rejected": -1.4269901514053345, + "step": 3834 + }, + { + "epoch": 0.45, + "learning_rate": 1.686902090468879e-07, + "logits/chosen": -2.391303539276123, + "logits/rejected": -2.215677499771118, + "logps/chosen": -136.81101989746094, + "logps/rejected": -212.63095092773438, + "loss": 0.5272, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5388848781585693, + "rewards/margins": 1.495161533355713, + "rewards/rejected": -3.034046173095703, + "step": 3835 + }, + { + "epoch": 0.45, + "learning_rate": 1.6865477737096963e-07, + "logits/chosen": -1.9732255935668945, + "logits/rejected": -2.1740615367889404, + "logps/chosen": -223.4727783203125, + "logps/rejected": -286.63189697265625, + "loss": 0.2425, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8594789505004883, + "rewards/margins": 3.4460387229919434, + "rewards/rejected": -4.305517196655273, + "step": 3836 + }, + { + "epoch": 0.45, + "learning_rate": 1.6861934569505138e-07, + "logits/chosen": -2.314375162124634, + "logits/rejected": -2.3977975845336914, + "logps/chosen": -323.109619140625, + "logps/rejected": -206.76963806152344, + "loss": 0.4224, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.316775918006897, + "rewards/margins": 2.080357551574707, + "rewards/rejected": -3.3971333503723145, + "step": 3837 + }, + { + "epoch": 0.45, + "learning_rate": 1.685839140191331e-07, + "logits/chosen": -2.1541738510131836, + "logits/rejected": -1.967624306678772, + "logps/chosen": -203.4284210205078, + "logps/rejected": -197.76893615722656, + "loss": 0.2615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9353717565536499, + "rewards/margins": 1.8561707735061646, + "rewards/rejected": -2.7915425300598145, + "step": 3838 + }, + { + "epoch": 0.45, + "learning_rate": 1.6854848234321485e-07, + "logits/chosen": -2.922396183013916, + "logits/rejected": -2.760819911956787, + "logps/chosen": -206.8907470703125, + "logps/rejected": -392.7366943359375, + "loss": 0.2492, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.02013099193573, + "rewards/margins": 3.468158006668091, + "rewards/rejected": -4.4882893562316895, + "step": 3839 + }, + { + "epoch": 0.45, + "learning_rate": 1.6851305066729657e-07, + "logits/chosen": -2.0662384033203125, + "logits/rejected": -2.0066256523132324, + "logps/chosen": -332.0126647949219, + "logps/rejected": -433.36529541015625, + "loss": 0.3628, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7707081437110901, + "rewards/margins": 3.1651487350463867, + "rewards/rejected": -3.935857057571411, + "step": 3840 + }, + { + "epoch": 0.45, + "learning_rate": 1.684776189913783e-07, + "logits/chosen": -2.822439193725586, + "logits/rejected": -2.8959145545959473, + "logps/chosen": -159.08189392089844, + "logps/rejected": -275.6336364746094, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8464915752410889, + "rewards/margins": 4.825191497802734, + "rewards/rejected": -6.671683311462402, + "step": 3841 + }, + { + "epoch": 0.45, + "learning_rate": 1.6844218731546001e-07, + "logits/chosen": -2.0508203506469727, + "logits/rejected": -2.3440301418304443, + "logps/chosen": -314.48138427734375, + "logps/rejected": -242.37493896484375, + "loss": 0.2581, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8466165661811829, + "rewards/margins": 3.564997911453247, + "rewards/rejected": -4.411614418029785, + "step": 3842 + }, + { + "epoch": 0.45, + "learning_rate": 1.6840675563954174e-07, + "logits/chosen": -2.015721559524536, + "logits/rejected": -2.233762264251709, + "logps/chosen": -320.73297119140625, + "logps/rejected": -308.6557922363281, + "loss": 0.2471, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1274835616350174, + "rewards/margins": 1.909595251083374, + "rewards/rejected": -1.782111644744873, + "step": 3843 + }, + { + "epoch": 0.45, + "learning_rate": 1.6837132396362346e-07, + "logits/chosen": -2.3871235847473145, + "logits/rejected": -2.041787624359131, + "logps/chosen": -315.1697998046875, + "logps/rejected": -264.20599365234375, + "loss": 0.2156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6867462992668152, + "rewards/margins": 2.8909568786621094, + "rewards/rejected": -3.577702760696411, + "step": 3844 + }, + { + "epoch": 0.45, + "learning_rate": 1.683358922877052e-07, + "logits/chosen": -1.468582272529602, + "logits/rejected": -1.8102762699127197, + "logps/chosen": -378.8179931640625, + "logps/rejected": -308.75286865234375, + "loss": 0.8428, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6144113540649414, + "rewards/margins": 0.9861509799957275, + "rewards/rejected": -1.600562334060669, + "step": 3845 + }, + { + "epoch": 0.45, + "learning_rate": 1.6830046061178693e-07, + "logits/chosen": -2.0488529205322266, + "logits/rejected": -2.255938768386841, + "logps/chosen": -267.12115478515625, + "logps/rejected": -361.5212707519531, + "loss": 0.6584, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3445175886154175, + "rewards/margins": 2.225180149078369, + "rewards/rejected": -3.569697856903076, + "step": 3846 + }, + { + "epoch": 0.45, + "learning_rate": 1.6826502893586865e-07, + "logits/chosen": -2.8471899032592773, + "logits/rejected": -2.8838720321655273, + "logps/chosen": -159.9838409423828, + "logps/rejected": -159.1259002685547, + "loss": 0.6447, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4881021976470947, + "rewards/margins": 0.7805740237236023, + "rewards/rejected": -2.268676280975342, + "step": 3847 + }, + { + "epoch": 0.45, + "learning_rate": 1.6822959725995037e-07, + "logits/chosen": -2.1884493827819824, + "logits/rejected": -2.104937791824341, + "logps/chosen": -262.7991943359375, + "logps/rejected": -340.73785400390625, + "loss": 0.3151, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5290105938911438, + "rewards/margins": 1.8442312479019165, + "rewards/rejected": -2.373241901397705, + "step": 3848 + }, + { + "epoch": 0.45, + "learning_rate": 1.6819416558403212e-07, + "logits/chosen": -2.0645217895507812, + "logits/rejected": -2.066375732421875, + "logps/chosen": -303.4767761230469, + "logps/rejected": -285.02069091796875, + "loss": 0.5682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48978739976882935, + "rewards/margins": 0.8603252172470093, + "rewards/rejected": -1.3501126766204834, + "step": 3849 + }, + { + "epoch": 0.45, + "learning_rate": 1.6815873390811387e-07, + "logits/chosen": -2.140878200531006, + "logits/rejected": -2.4445528984069824, + "logps/chosen": -255.47531127929688, + "logps/rejected": -204.00942993164062, + "loss": 0.2296, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.862006664276123, + "rewards/margins": 2.726315975189209, + "rewards/rejected": -3.588322639465332, + "step": 3850 + }, + { + "epoch": 0.45, + "learning_rate": 1.681233022321956e-07, + "logits/chosen": -1.8923450708389282, + "logits/rejected": -2.1914265155792236, + "logps/chosen": -481.4262390136719, + "logps/rejected": -272.91558837890625, + "loss": 0.4112, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2347122430801392, + "rewards/margins": 2.110973834991455, + "rewards/rejected": -3.3456859588623047, + "step": 3851 + }, + { + "epoch": 0.45, + "learning_rate": 1.6808787055627732e-07, + "logits/chosen": -1.7963777780532837, + "logits/rejected": -2.1551105976104736, + "logps/chosen": -422.10430908203125, + "logps/rejected": -324.40447998046875, + "loss": 1.0366, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9448168277740479, + "rewards/margins": 1.8680858612060547, + "rewards/rejected": -3.8129026889801025, + "step": 3852 + }, + { + "epoch": 0.45, + "learning_rate": 1.6805243888035904e-07, + "logits/chosen": -2.0492641925811768, + "logits/rejected": -1.9815880060195923, + "logps/chosen": -250.93099975585938, + "logps/rejected": -252.45323181152344, + "loss": 0.4375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7611191868782043, + "rewards/margins": 1.6259925365447998, + "rewards/rejected": -2.3871116638183594, + "step": 3853 + }, + { + "epoch": 0.45, + "learning_rate": 1.6801700720444076e-07, + "logits/chosen": -2.856076240539551, + "logits/rejected": -2.8754382133483887, + "logps/chosen": -370.9661560058594, + "logps/rejected": -450.0996398925781, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6117712259292603, + "rewards/margins": 2.463886260986328, + "rewards/rejected": -3.075657606124878, + "step": 3854 + }, + { + "epoch": 0.45, + "learning_rate": 1.6798157552852248e-07, + "logits/chosen": -2.7387659549713135, + "logits/rejected": -2.8454809188842773, + "logps/chosen": -362.1965637207031, + "logps/rejected": -215.19100952148438, + "loss": 0.1249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7841247320175171, + "rewards/margins": 2.7710533142089844, + "rewards/rejected": -3.555178165435791, + "step": 3855 + }, + { + "epoch": 0.45, + "learning_rate": 1.679461438526042e-07, + "logits/chosen": -2.61087703704834, + "logits/rejected": -2.5356714725494385, + "logps/chosen": -201.07167053222656, + "logps/rejected": -259.5919494628906, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2156479358673096, + "rewards/margins": 1.6319248676300049, + "rewards/rejected": -2.8475728034973145, + "step": 3856 + }, + { + "epoch": 0.45, + "learning_rate": 1.6791071217668595e-07, + "logits/chosen": -2.593956708908081, + "logits/rejected": -2.64593243598938, + "logps/chosen": -324.75665283203125, + "logps/rejected": -215.24440002441406, + "loss": 0.5698, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1936200857162476, + "rewards/margins": 1.683941125869751, + "rewards/rejected": -2.877561330795288, + "step": 3857 + }, + { + "epoch": 0.45, + "learning_rate": 1.6787528050076767e-07, + "logits/chosen": -2.3305928707122803, + "logits/rejected": -2.65920352935791, + "logps/chosen": -256.4825134277344, + "logps/rejected": -154.06976318359375, + "loss": 1.012, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4522407054901123, + "rewards/margins": 0.4977017343044281, + "rewards/rejected": -2.9499423503875732, + "step": 3858 + }, + { + "epoch": 0.45, + "learning_rate": 1.678398488248494e-07, + "logits/chosen": -2.0241971015930176, + "logits/rejected": -1.8860857486724854, + "logps/chosen": -265.261474609375, + "logps/rejected": -366.9164733886719, + "loss": 0.3578, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0568616390228271, + "rewards/margins": 1.6688852310180664, + "rewards/rejected": -2.7257471084594727, + "step": 3859 + }, + { + "epoch": 0.45, + "learning_rate": 1.6780441714893112e-07, + "logits/chosen": -2.1653449535369873, + "logits/rejected": -2.280505895614624, + "logps/chosen": -249.0057373046875, + "logps/rejected": -308.88543701171875, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5435121059417725, + "rewards/margins": 2.4796223640441895, + "rewards/rejected": -4.023134708404541, + "step": 3860 + }, + { + "epoch": 0.45, + "learning_rate": 1.677689854730129e-07, + "logits/chosen": -2.31093430519104, + "logits/rejected": -2.286336660385132, + "logps/chosen": -326.8172302246094, + "logps/rejected": -298.0384826660156, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.251858115196228, + "rewards/margins": 1.6906893253326416, + "rewards/rejected": -1.94254732131958, + "step": 3861 + }, + { + "epoch": 0.45, + "learning_rate": 1.6773355379709462e-07, + "logits/chosen": -2.35324764251709, + "logits/rejected": -2.5623562335968018, + "logps/chosen": -389.74468994140625, + "logps/rejected": -304.6593933105469, + "loss": 0.1591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5386452674865723, + "rewards/margins": 3.6157727241516113, + "rewards/rejected": -4.154417514801025, + "step": 3862 + }, + { + "epoch": 0.45, + "learning_rate": 1.6769812212117634e-07, + "logits/chosen": -2.6122653484344482, + "logits/rejected": -2.692624807357788, + "logps/chosen": -212.1173095703125, + "logps/rejected": -232.46310424804688, + "loss": 0.3578, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8686950206756592, + "rewards/margins": 1.85826575756073, + "rewards/rejected": -2.7269606590270996, + "step": 3863 + }, + { + "epoch": 0.45, + "learning_rate": 1.6766269044525806e-07, + "logits/chosen": -2.4600796699523926, + "logits/rejected": -2.5046467781066895, + "logps/chosen": -156.1494140625, + "logps/rejected": -199.78964233398438, + "loss": 0.3185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43602097034454346, + "rewards/margins": 2.016326427459717, + "rewards/rejected": -2.45234751701355, + "step": 3864 + }, + { + "epoch": 0.45, + "learning_rate": 1.6762725876933978e-07, + "logits/chosen": -2.858031988143921, + "logits/rejected": -2.513591766357422, + "logps/chosen": -272.6485290527344, + "logps/rejected": -313.1524353027344, + "loss": 0.4944, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9666910767555237, + "rewards/margins": 0.8768966197967529, + "rewards/rejected": -1.8435877561569214, + "step": 3865 + }, + { + "epoch": 0.45, + "learning_rate": 1.675918270934215e-07, + "logits/chosen": -2.5844719409942627, + "logits/rejected": -2.799694538116455, + "logps/chosen": -172.38026428222656, + "logps/rejected": -208.25148010253906, + "loss": 0.1677, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05365535244345665, + "rewards/margins": 3.254483461380005, + "rewards/rejected": -3.2008278369903564, + "step": 3866 + }, + { + "epoch": 0.45, + "learning_rate": 1.6755639541750323e-07, + "logits/chosen": -2.787827968597412, + "logits/rejected": -2.8657631874084473, + "logps/chosen": -155.89907836914062, + "logps/rejected": -306.1426086425781, + "loss": 0.2308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4150611460208893, + "rewards/margins": 3.615582227706909, + "rewards/rejected": -4.030643463134766, + "step": 3867 + }, + { + "epoch": 0.45, + "learning_rate": 1.6752096374158498e-07, + "logits/chosen": -1.7727798223495483, + "logits/rejected": -1.77206289768219, + "logps/chosen": -345.71612548828125, + "logps/rejected": -337.3167724609375, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8792455196380615, + "rewards/margins": 2.4160561561584473, + "rewards/rejected": -3.295301675796509, + "step": 3868 + }, + { + "epoch": 0.45, + "learning_rate": 1.674855320656667e-07, + "logits/chosen": -2.1119894981384277, + "logits/rejected": -1.9880409240722656, + "logps/chosen": -338.15277099609375, + "logps/rejected": -363.08428955078125, + "loss": 0.2095, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29326075315475464, + "rewards/margins": 2.445756435394287, + "rewards/rejected": -2.7390172481536865, + "step": 3869 + }, + { + "epoch": 0.45, + "learning_rate": 1.6745010038974842e-07, + "logits/chosen": -2.531792163848877, + "logits/rejected": -2.36822509765625, + "logps/chosen": -256.2059631347656, + "logps/rejected": -242.43502807617188, + "loss": 0.2168, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29968786239624023, + "rewards/margins": 2.91050124168396, + "rewards/rejected": -3.210188865661621, + "step": 3870 + }, + { + "epoch": 0.45, + "learning_rate": 1.6741466871383014e-07, + "logits/chosen": -2.436530351638794, + "logits/rejected": -2.273883819580078, + "logps/chosen": -171.18447875976562, + "logps/rejected": -256.0296325683594, + "loss": 0.2564, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0797022581100464, + "rewards/margins": 1.7788752317428589, + "rewards/rejected": -2.8585774898529053, + "step": 3871 + }, + { + "epoch": 0.45, + "learning_rate": 1.673792370379119e-07, + "logits/chosen": -2.208768367767334, + "logits/rejected": -2.3265886306762695, + "logps/chosen": -287.88824462890625, + "logps/rejected": -327.5821533203125, + "loss": 0.382, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4417760968208313, + "rewards/margins": 1.3021957874298096, + "rewards/rejected": -1.7439717054367065, + "step": 3872 + }, + { + "epoch": 0.45, + "learning_rate": 1.6734380536199364e-07, + "logits/chosen": -2.046501636505127, + "logits/rejected": -1.876584768295288, + "logps/chosen": -315.60260009765625, + "logps/rejected": -404.8988952636719, + "loss": 0.3111, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3199584484100342, + "rewards/margins": 2.9584078788757324, + "rewards/rejected": -4.2783660888671875, + "step": 3873 + }, + { + "epoch": 0.45, + "learning_rate": 1.6730837368607536e-07, + "logits/chosen": -2.037048101425171, + "logits/rejected": -1.9812512397766113, + "logps/chosen": -246.26678466796875, + "logps/rejected": -346.2870788574219, + "loss": 0.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4406641721725464, + "rewards/margins": 3.086928606033325, + "rewards/rejected": -3.527592658996582, + "step": 3874 + }, + { + "epoch": 0.45, + "learning_rate": 1.6727294201015708e-07, + "logits/chosen": -2.1197123527526855, + "logits/rejected": -2.023097276687622, + "logps/chosen": -180.98997497558594, + "logps/rejected": -271.0020446777344, + "loss": 0.4355, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0543116331100464, + "rewards/margins": 0.8606459498405457, + "rewards/rejected": -1.9149576425552368, + "step": 3875 + }, + { + "epoch": 0.45, + "learning_rate": 1.672375103342388e-07, + "logits/chosen": -2.550811529159546, + "logits/rejected": -2.604170322418213, + "logps/chosen": -180.4134521484375, + "logps/rejected": -244.74346923828125, + "loss": 0.6118, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9743334650993347, + "rewards/margins": 2.4420697689056396, + "rewards/rejected": -3.416403293609619, + "step": 3876 + }, + { + "epoch": 0.45, + "learning_rate": 1.6720207865832053e-07, + "logits/chosen": -2.106332778930664, + "logits/rejected": -2.263204336166382, + "logps/chosen": -258.62677001953125, + "logps/rejected": -159.3791961669922, + "loss": 0.6645, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5052158832550049, + "rewards/margins": 0.6435899138450623, + "rewards/rejected": -2.148805618286133, + "step": 3877 + }, + { + "epoch": 0.45, + "learning_rate": 1.6716664698240225e-07, + "logits/chosen": -2.3312036991119385, + "logits/rejected": -2.4568278789520264, + "logps/chosen": -300.5892639160156, + "logps/rejected": -273.6029052734375, + "loss": 0.3891, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0013070106506348, + "rewards/margins": 2.0554420948028564, + "rewards/rejected": -3.056748867034912, + "step": 3878 + }, + { + "epoch": 0.45, + "learning_rate": 1.67131215306484e-07, + "logits/chosen": -2.5587446689605713, + "logits/rejected": -2.565516710281372, + "logps/chosen": -285.08624267578125, + "logps/rejected": -277.631103515625, + "loss": 0.3141, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1133155822753906, + "rewards/margins": 3.5506796836853027, + "rewards/rejected": -4.663995265960693, + "step": 3879 + }, + { + "epoch": 0.45, + "learning_rate": 1.6709578363056572e-07, + "logits/chosen": -2.55995512008667, + "logits/rejected": -2.4365720748901367, + "logps/chosen": -256.30352783203125, + "logps/rejected": -301.9946594238281, + "loss": 0.2077, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0809688568115234, + "rewards/margins": 4.236901760101318, + "rewards/rejected": -5.317870140075684, + "step": 3880 + }, + { + "epoch": 0.45, + "learning_rate": 1.6706035195464744e-07, + "logits/chosen": -1.593240737915039, + "logits/rejected": -1.9584397077560425, + "logps/chosen": -447.4654235839844, + "logps/rejected": -366.2733459472656, + "loss": 0.6007, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4272202253341675, + "rewards/margins": 2.0359816551208496, + "rewards/rejected": -3.4632019996643066, + "step": 3881 + }, + { + "epoch": 0.45, + "learning_rate": 1.6702492027872916e-07, + "logits/chosen": -2.248110294342041, + "logits/rejected": -1.958954095840454, + "logps/chosen": -290.76617431640625, + "logps/rejected": -368.9569091796875, + "loss": 0.3876, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7283235788345337, + "rewards/margins": 1.5859572887420654, + "rewards/rejected": -3.3142809867858887, + "step": 3882 + }, + { + "epoch": 0.45, + "learning_rate": 1.6698948860281089e-07, + "logits/chosen": -2.4536895751953125, + "logits/rejected": -2.537228584289551, + "logps/chosen": -285.68365478515625, + "logps/rejected": -248.40589904785156, + "loss": 0.5856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3895840644836426, + "rewards/margins": 0.8382235765457153, + "rewards/rejected": -1.2278075218200684, + "step": 3883 + }, + { + "epoch": 0.45, + "learning_rate": 1.6695405692689266e-07, + "logits/chosen": -2.2874979972839355, + "logits/rejected": -2.6131410598754883, + "logps/chosen": -196.2591552734375, + "logps/rejected": -177.0309600830078, + "loss": 0.3898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3315287828445435, + "rewards/margins": 1.544589877128601, + "rewards/rejected": -2.8761186599731445, + "step": 3884 + }, + { + "epoch": 0.45, + "learning_rate": 1.6691862525097438e-07, + "logits/chosen": -2.1648576259613037, + "logits/rejected": -2.1737892627716064, + "logps/chosen": -226.75765991210938, + "logps/rejected": -304.38934326171875, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38477036356925964, + "rewards/margins": 2.0864386558532715, + "rewards/rejected": -2.4712088108062744, + "step": 3885 + }, + { + "epoch": 0.45, + "learning_rate": 1.668831935750561e-07, + "logits/chosen": -2.267943859100342, + "logits/rejected": -2.165076971054077, + "logps/chosen": -237.361083984375, + "logps/rejected": -319.022705078125, + "loss": 0.4182, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7557692527770996, + "rewards/margins": 1.766971468925476, + "rewards/rejected": -2.522740602493286, + "step": 3886 + }, + { + "epoch": 0.45, + "learning_rate": 1.6684776189913783e-07, + "logits/chosen": -1.9549814462661743, + "logits/rejected": -1.7990045547485352, + "logps/chosen": -273.0657958984375, + "logps/rejected": -321.4599609375, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5449931025505066, + "rewards/margins": 2.982187271118164, + "rewards/rejected": -3.5271804332733154, + "step": 3887 + }, + { + "epoch": 0.45, + "learning_rate": 1.6681233022321955e-07, + "logits/chosen": -2.3014609813690186, + "logits/rejected": -2.1888084411621094, + "logps/chosen": -239.59417724609375, + "logps/rejected": -294.0715026855469, + "loss": 0.646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36529475450515747, + "rewards/margins": 1.1496926546096802, + "rewards/rejected": -1.5149874687194824, + "step": 3888 + }, + { + "epoch": 0.45, + "learning_rate": 1.6677689854730127e-07, + "logits/chosen": -2.2841453552246094, + "logits/rejected": -2.2381153106689453, + "logps/chosen": -220.14822387695312, + "logps/rejected": -186.7149658203125, + "loss": 0.6736, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0027287006378174, + "rewards/margins": 0.6269006729125977, + "rewards/rejected": -1.6296294927597046, + "step": 3889 + }, + { + "epoch": 0.45, + "learning_rate": 1.6674146687138302e-07, + "logits/chosen": -2.5120835304260254, + "logits/rejected": -2.591970682144165, + "logps/chosen": -210.82000732421875, + "logps/rejected": -192.7767791748047, + "loss": 0.3871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7750200033187866, + "rewards/margins": 1.6773831844329834, + "rewards/rejected": -2.4524030685424805, + "step": 3890 + }, + { + "epoch": 0.45, + "learning_rate": 1.6670603519546474e-07, + "logits/chosen": -2.1459460258483887, + "logits/rejected": -2.1027731895446777, + "logps/chosen": -131.5851593017578, + "logps/rejected": -144.72669982910156, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.914670467376709, + "rewards/margins": 1.7257053852081299, + "rewards/rejected": -2.640375852584839, + "step": 3891 + }, + { + "epoch": 0.45, + "learning_rate": 1.6667060351954646e-07, + "logits/chosen": -1.952289342880249, + "logits/rejected": -2.000966787338257, + "logps/chosen": -344.2003173828125, + "logps/rejected": -394.0994873046875, + "loss": 0.2317, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0872116088867188, + "rewards/margins": 1.9442073106765747, + "rewards/rejected": -3.031418800354004, + "step": 3892 + }, + { + "epoch": 0.45, + "learning_rate": 1.666351718436282e-07, + "logits/chosen": -2.2933108806610107, + "logits/rejected": -2.5552964210510254, + "logps/chosen": -449.40155029296875, + "logps/rejected": -297.04925537109375, + "loss": 0.9894, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.088310718536377, + "rewards/margins": 0.07103085517883301, + "rewards/rejected": -2.15934157371521, + "step": 3893 + }, + { + "epoch": 0.45, + "learning_rate": 1.665997401677099e-07, + "logits/chosen": -2.512629508972168, + "logits/rejected": -2.503746509552002, + "logps/chosen": -308.7872314453125, + "logps/rejected": -310.9063415527344, + "loss": 0.1923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.296944260597229, + "rewards/margins": 3.005025625228882, + "rewards/rejected": -3.301969528198242, + "step": 3894 + }, + { + "epoch": 0.45, + "learning_rate": 1.6656430849179163e-07, + "logits/chosen": -2.131422996520996, + "logits/rejected": -2.4820127487182617, + "logps/chosen": -563.6387329101562, + "logps/rejected": -351.0616760253906, + "loss": 0.4032, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9796396493911743, + "rewards/margins": 2.0491464138031006, + "rewards/rejected": -3.0287861824035645, + "step": 3895 + }, + { + "epoch": 0.45, + "learning_rate": 1.665288768158734e-07, + "logits/chosen": -1.9110342264175415, + "logits/rejected": -1.6736944913864136, + "logps/chosen": -324.4891357421875, + "logps/rejected": -363.16326904296875, + "loss": 0.6152, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0614359378814697, + "rewards/margins": 0.576869547367096, + "rewards/rejected": -1.638305425643921, + "step": 3896 + }, + { + "epoch": 0.45, + "learning_rate": 1.6649344513995513e-07, + "logits/chosen": -1.7349880933761597, + "logits/rejected": -1.6993216276168823, + "logps/chosen": -296.19573974609375, + "logps/rejected": -304.4298095703125, + "loss": 0.4281, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9680901765823364, + "rewards/margins": 2.222815990447998, + "rewards/rejected": -3.190906047821045, + "step": 3897 + }, + { + "epoch": 0.45, + "learning_rate": 1.6645801346403685e-07, + "logits/chosen": -2.2893307209014893, + "logits/rejected": -2.2873010635375977, + "logps/chosen": -481.09088134765625, + "logps/rejected": -465.574462890625, + "loss": 0.3964, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6989883184432983, + "rewards/margins": 2.219311475753784, + "rewards/rejected": -2.918299674987793, + "step": 3898 + }, + { + "epoch": 0.45, + "learning_rate": 1.6642258178811857e-07, + "logits/chosen": -1.9570164680480957, + "logits/rejected": -2.068056106567383, + "logps/chosen": -265.257568359375, + "logps/rejected": -257.0851135253906, + "loss": 0.2853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7775898575782776, + "rewards/margins": 1.5611759424209595, + "rewards/rejected": -2.338765859603882, + "step": 3899 + }, + { + "epoch": 0.45, + "learning_rate": 1.663871501122003e-07, + "logits/chosen": -2.5773377418518066, + "logits/rejected": -2.7421507835388184, + "logps/chosen": -348.7246398925781, + "logps/rejected": -271.55548095703125, + "loss": 0.4629, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9303429126739502, + "rewards/margins": 1.960776448249817, + "rewards/rejected": -3.8911194801330566, + "step": 3900 + }, + { + "epoch": 0.45, + "learning_rate": 1.6635171843628202e-07, + "logits/chosen": -2.056032657623291, + "logits/rejected": -2.264566421508789, + "logps/chosen": -359.62225341796875, + "logps/rejected": -383.87994384765625, + "loss": 0.3998, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.47682785987854, + "rewards/margins": 3.0207619667053223, + "rewards/rejected": -4.497590065002441, + "step": 3901 + }, + { + "epoch": 0.45, + "learning_rate": 1.6631628676036377e-07, + "logits/chosen": -2.2757906913757324, + "logits/rejected": -2.371882915496826, + "logps/chosen": -210.89828491210938, + "logps/rejected": -195.07049560546875, + "loss": 0.5804, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.295375108718872, + "rewards/margins": 1.797385334968567, + "rewards/rejected": -3.0927603244781494, + "step": 3902 + }, + { + "epoch": 0.45, + "learning_rate": 1.662808550844455e-07, + "logits/chosen": -1.6172164678573608, + "logits/rejected": -1.426568865776062, + "logps/chosen": -528.005126953125, + "logps/rejected": -598.367431640625, + "loss": 0.5379, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8151198029518127, + "rewards/margins": 1.8337925672531128, + "rewards/rejected": -2.6489124298095703, + "step": 3903 + }, + { + "epoch": 0.45, + "learning_rate": 1.662454234085272e-07, + "logits/chosen": -2.273738384246826, + "logits/rejected": -2.321366548538208, + "logps/chosen": -123.86614990234375, + "logps/rejected": -181.62725830078125, + "loss": 0.2689, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6715587377548218, + "rewards/margins": 1.542258381843567, + "rewards/rejected": -2.2138171195983887, + "step": 3904 + }, + { + "epoch": 0.45, + "learning_rate": 1.6620999173260893e-07, + "logits/chosen": -2.00551176071167, + "logits/rejected": -2.0700478553771973, + "logps/chosen": -298.8359069824219, + "logps/rejected": -277.1605224609375, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8068256974220276, + "rewards/margins": 1.3865652084350586, + "rewards/rejected": -2.1933910846710205, + "step": 3905 + }, + { + "epoch": 0.45, + "learning_rate": 1.6617456005669065e-07, + "logits/chosen": -1.9215893745422363, + "logits/rejected": -2.0448343753814697, + "logps/chosen": -447.53546142578125, + "logps/rejected": -300.39447021484375, + "loss": 0.4892, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9539096355438232, + "rewards/margins": 1.0335050821304321, + "rewards/rejected": -1.987414836883545, + "step": 3906 + }, + { + "epoch": 0.45, + "learning_rate": 1.6613912838077238e-07, + "logits/chosen": -2.6586968898773193, + "logits/rejected": -2.383171558380127, + "logps/chosen": -198.85208129882812, + "logps/rejected": -280.6541748046875, + "loss": 0.5391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0294334888458252, + "rewards/margins": 1.330021858215332, + "rewards/rejected": -2.3594553470611572, + "step": 3907 + }, + { + "epoch": 0.45, + "learning_rate": 1.6610369670485415e-07, + "logits/chosen": -2.564728021621704, + "logits/rejected": -2.7271430492401123, + "logps/chosen": -207.4271240234375, + "logps/rejected": -125.0422592163086, + "loss": 0.4145, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6323509216308594, + "rewards/margins": 1.0975420475006104, + "rewards/rejected": -1.7298928499221802, + "step": 3908 + }, + { + "epoch": 0.45, + "learning_rate": 1.6606826502893587e-07, + "logits/chosen": -2.1196799278259277, + "logits/rejected": -1.9212218523025513, + "logps/chosen": -221.9855194091797, + "logps/rejected": -424.0548095703125, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4790626764297485, + "rewards/margins": 2.4551355838775635, + "rewards/rejected": -3.9341983795166016, + "step": 3909 + }, + { + "epoch": 0.45, + "learning_rate": 1.660328333530176e-07, + "logits/chosen": -2.0002923011779785, + "logits/rejected": -2.4046037197113037, + "logps/chosen": -406.3184814453125, + "logps/rejected": -297.5899963378906, + "loss": 0.4728, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.39532470703125, + "rewards/margins": 1.999868631362915, + "rewards/rejected": -3.395193338394165, + "step": 3910 + }, + { + "epoch": 0.45, + "learning_rate": 1.6599740167709932e-07, + "logits/chosen": -2.2699995040893555, + "logits/rejected": -2.2734501361846924, + "logps/chosen": -187.1719512939453, + "logps/rejected": -163.6668243408203, + "loss": 0.279, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8037168979644775, + "rewards/margins": 2.3717522621154785, + "rewards/rejected": -3.175469398498535, + "step": 3911 + }, + { + "epoch": 0.46, + "learning_rate": 1.6596197000118104e-07, + "logits/chosen": -2.3722338676452637, + "logits/rejected": -2.7493209838867188, + "logps/chosen": -456.46722412109375, + "logps/rejected": -342.0536804199219, + "loss": 0.6746, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0042991638183594, + "rewards/margins": 1.1311392784118652, + "rewards/rejected": -2.1354384422302246, + "step": 3912 + }, + { + "epoch": 0.46, + "learning_rate": 1.659265383252628e-07, + "logits/chosen": -2.9287166595458984, + "logits/rejected": -2.9435372352600098, + "logps/chosen": -218.3606719970703, + "logps/rejected": -139.0884246826172, + "loss": 0.2999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5440170168876648, + "rewards/margins": 2.1228206157684326, + "rewards/rejected": -2.666837692260742, + "step": 3913 + }, + { + "epoch": 0.46, + "learning_rate": 1.658911066493445e-07, + "logits/chosen": -2.3693840503692627, + "logits/rejected": -2.4994516372680664, + "logps/chosen": -192.30520629882812, + "logps/rejected": -200.2140655517578, + "loss": 0.3718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04872839152812958, + "rewards/margins": 1.548474907875061, + "rewards/rejected": -1.597203254699707, + "step": 3914 + }, + { + "epoch": 0.46, + "learning_rate": 1.6585567497342623e-07, + "logits/chosen": -2.785851001739502, + "logits/rejected": -2.702601909637451, + "logps/chosen": -204.90921020507812, + "logps/rejected": -217.53948974609375, + "loss": 0.4224, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6771920919418335, + "rewards/margins": 1.876772165298462, + "rewards/rejected": -3.553964376449585, + "step": 3915 + }, + { + "epoch": 0.46, + "learning_rate": 1.6582024329750795e-07, + "logits/chosen": -2.8932981491088867, + "logits/rejected": -2.6543326377868652, + "logps/chosen": -363.5979309082031, + "logps/rejected": -272.1506652832031, + "loss": 0.6161, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7143642902374268, + "rewards/margins": 1.3277819156646729, + "rewards/rejected": -3.0421462059020996, + "step": 3916 + }, + { + "epoch": 0.46, + "learning_rate": 1.6578481162158968e-07, + "logits/chosen": -1.404897689819336, + "logits/rejected": -2.0405707359313965, + "logps/chosen": -430.088134765625, + "logps/rejected": -245.97152709960938, + "loss": 0.52, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24080932140350342, + "rewards/margins": 1.2592658996582031, + "rewards/rejected": -1.500075101852417, + "step": 3917 + }, + { + "epoch": 0.46, + "learning_rate": 1.657493799456714e-07, + "logits/chosen": -2.1882576942443848, + "logits/rejected": -2.244640827178955, + "logps/chosen": -230.83399963378906, + "logps/rejected": -331.74176025390625, + "loss": 0.9269, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.364326000213623, + "rewards/margins": 1.4826722145080566, + "rewards/rejected": -2.8469979763031006, + "step": 3918 + }, + { + "epoch": 0.46, + "learning_rate": 1.6571394826975317e-07, + "logits/chosen": -2.0727832317352295, + "logits/rejected": -2.070251226425171, + "logps/chosen": -284.667724609375, + "logps/rejected": -286.181396484375, + "loss": 0.3498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3417971134185791, + "rewards/margins": 1.585906982421875, + "rewards/rejected": -1.927704095840454, + "step": 3919 + }, + { + "epoch": 0.46, + "learning_rate": 1.656785165938349e-07, + "logits/chosen": -1.7428734302520752, + "logits/rejected": -2.0255985260009766, + "logps/chosen": -461.7159729003906, + "logps/rejected": -316.0364685058594, + "loss": 0.9836, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2697467803955078, + "rewards/margins": 1.329374074935913, + "rewards/rejected": -2.599120855331421, + "step": 3920 + }, + { + "epoch": 0.46, + "learning_rate": 1.6564308491791662e-07, + "logits/chosen": -1.9331209659576416, + "logits/rejected": -2.0482234954833984, + "logps/chosen": -369.12921142578125, + "logps/rejected": -340.3963317871094, + "loss": 0.5502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8258023858070374, + "rewards/margins": 0.858640730381012, + "rewards/rejected": -1.6844431161880493, + "step": 3921 + }, + { + "epoch": 0.46, + "learning_rate": 1.6560765324199834e-07, + "logits/chosen": -2.6940712928771973, + "logits/rejected": -2.6113667488098145, + "logps/chosen": -52.231117248535156, + "logps/rejected": -173.03965759277344, + "loss": 0.199, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1006915420293808, + "rewards/margins": 2.283607006072998, + "rewards/rejected": -2.182915687561035, + "step": 3922 + }, + { + "epoch": 0.46, + "learning_rate": 1.6557222156608006e-07, + "logits/chosen": -2.481154441833496, + "logits/rejected": -2.4103400707244873, + "logps/chosen": -280.0674133300781, + "logps/rejected": -273.2349548339844, + "loss": 0.187, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.648833155632019, + "rewards/margins": 2.793947219848633, + "rewards/rejected": -3.4427809715270996, + "step": 3923 + }, + { + "epoch": 0.46, + "learning_rate": 1.655367898901618e-07, + "logits/chosen": -2.5242955684661865, + "logits/rejected": -2.7565605640411377, + "logps/chosen": -165.9093017578125, + "logps/rejected": -230.6689453125, + "loss": 0.5166, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2453938722610474, + "rewards/margins": 2.4396090507507324, + "rewards/rejected": -3.6850030422210693, + "step": 3924 + }, + { + "epoch": 0.46, + "learning_rate": 1.6550135821424353e-07, + "logits/chosen": -2.7933897972106934, + "logits/rejected": -2.8297200202941895, + "logps/chosen": -333.6321105957031, + "logps/rejected": -299.3306884765625, + "loss": 0.1996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5977796912193298, + "rewards/margins": 2.359827756881714, + "rewards/rejected": -2.9576077461242676, + "step": 3925 + }, + { + "epoch": 0.46, + "learning_rate": 1.6546592653832526e-07, + "logits/chosen": -2.687225818634033, + "logits/rejected": -2.8150887489318848, + "logps/chosen": -293.63409423828125, + "logps/rejected": -228.01708984375, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5138037204742432, + "rewards/margins": 2.7215728759765625, + "rewards/rejected": -3.2353765964508057, + "step": 3926 + }, + { + "epoch": 0.46, + "learning_rate": 1.6543049486240698e-07, + "logits/chosen": -2.2444589138031006, + "logits/rejected": -2.273538827896118, + "logps/chosen": -230.5950927734375, + "logps/rejected": -262.35333251953125, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6988614201545715, + "rewards/margins": 2.431675910949707, + "rewards/rejected": -3.130537509918213, + "step": 3927 + }, + { + "epoch": 0.46, + "learning_rate": 1.653950631864887e-07, + "logits/chosen": -2.3953919410705566, + "logits/rejected": -2.6913251876831055, + "logps/chosen": -241.2333984375, + "logps/rejected": -264.0606384277344, + "loss": 0.7335, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.76862633228302, + "rewards/margins": 2.348984718322754, + "rewards/rejected": -4.117610931396484, + "step": 3928 + }, + { + "epoch": 0.46, + "learning_rate": 1.6535963151057042e-07, + "logits/chosen": -2.156292676925659, + "logits/rejected": -2.2925949096679688, + "logps/chosen": -550.068359375, + "logps/rejected": -436.76025390625, + "loss": 0.3793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7540846467018127, + "rewards/margins": 1.5945425033569336, + "rewards/rejected": -2.3486270904541016, + "step": 3929 + }, + { + "epoch": 0.46, + "learning_rate": 1.6532419983465214e-07, + "logits/chosen": -2.116757392883301, + "logits/rejected": -2.0873019695281982, + "logps/chosen": -205.5311279296875, + "logps/rejected": -306.070068359375, + "loss": 0.4203, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.20888090133667, + "rewards/margins": 2.12562894821167, + "rewards/rejected": -3.3345096111297607, + "step": 3930 + }, + { + "epoch": 0.46, + "learning_rate": 1.6528876815873392e-07, + "logits/chosen": -2.4310860633850098, + "logits/rejected": -2.392638683319092, + "logps/chosen": -337.08514404296875, + "logps/rejected": -342.1105651855469, + "loss": 0.4623, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2214616537094116, + "rewards/margins": 2.3843374252319336, + "rewards/rejected": -3.6057987213134766, + "step": 3931 + }, + { + "epoch": 0.46, + "learning_rate": 1.6525333648281564e-07, + "logits/chosen": -2.5516531467437744, + "logits/rejected": -2.4415719509124756, + "logps/chosen": -325.0041809082031, + "logps/rejected": -347.8976135253906, + "loss": 0.2106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6062057614326477, + "rewards/margins": 2.931074857711792, + "rewards/rejected": -3.537280559539795, + "step": 3932 + }, + { + "epoch": 0.46, + "learning_rate": 1.6521790480689736e-07, + "logits/chosen": -2.5044026374816895, + "logits/rejected": -2.5501670837402344, + "logps/chosen": -230.84014892578125, + "logps/rejected": -294.1318359375, + "loss": 0.7257, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.749709963798523, + "rewards/margins": 0.653721809387207, + "rewards/rejected": -2.4034318923950195, + "step": 3933 + }, + { + "epoch": 0.46, + "learning_rate": 1.6518247313097909e-07, + "logits/chosen": -1.5549858808517456, + "logits/rejected": -1.9962610006332397, + "logps/chosen": -305.12139892578125, + "logps/rejected": -221.55844116210938, + "loss": 0.2369, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8440015316009521, + "rewards/margins": 2.5838680267333984, + "rewards/rejected": -4.42786979675293, + "step": 3934 + }, + { + "epoch": 0.46, + "learning_rate": 1.651470414550608e-07, + "logits/chosen": -2.3633902072906494, + "logits/rejected": -2.5667688846588135, + "logps/chosen": -168.75755310058594, + "logps/rejected": -290.710205078125, + "loss": 0.3304, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0187458992004395, + "rewards/margins": 2.9705708026885986, + "rewards/rejected": -3.989316463470459, + "step": 3935 + }, + { + "epoch": 0.46, + "learning_rate": 1.6511160977914256e-07, + "logits/chosen": -2.6231470108032227, + "logits/rejected": -2.577360153198242, + "logps/chosen": -256.7175598144531, + "logps/rejected": -316.27459716796875, + "loss": 0.3439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5895966291427612, + "rewards/margins": 1.558298110961914, + "rewards/rejected": -2.147894859313965, + "step": 3936 + }, + { + "epoch": 0.46, + "learning_rate": 1.6507617810322428e-07, + "logits/chosen": -2.5242719650268555, + "logits/rejected": -2.485776662826538, + "logps/chosen": -287.8277282714844, + "logps/rejected": -299.5181579589844, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5263023376464844, + "rewards/margins": 3.133920431137085, + "rewards/rejected": -3.6602225303649902, + "step": 3937 + }, + { + "epoch": 0.46, + "learning_rate": 1.65040746427306e-07, + "logits/chosen": -1.920130729675293, + "logits/rejected": -2.0677363872528076, + "logps/chosen": -396.0291442871094, + "logps/rejected": -397.230712890625, + "loss": 0.4815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8796119689941406, + "rewards/margins": 1.2572662830352783, + "rewards/rejected": -2.136878252029419, + "step": 3938 + }, + { + "epoch": 0.46, + "learning_rate": 1.6500531475138772e-07, + "logits/chosen": -2.0578975677490234, + "logits/rejected": -2.1588358879089355, + "logps/chosen": -353.5740661621094, + "logps/rejected": -243.0382080078125, + "loss": 0.5404, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.434918761253357, + "rewards/margins": 1.977284550666809, + "rewards/rejected": -3.412203311920166, + "step": 3939 + }, + { + "epoch": 0.46, + "learning_rate": 1.6496988307546944e-07, + "logits/chosen": -1.8925567865371704, + "logits/rejected": -1.7885117530822754, + "logps/chosen": -215.15457153320312, + "logps/rejected": -206.30328369140625, + "loss": 0.6131, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.354048490524292, + "rewards/margins": 1.4506523609161377, + "rewards/rejected": -2.8047006130218506, + "step": 3940 + }, + { + "epoch": 0.46, + "learning_rate": 1.6493445139955117e-07, + "logits/chosen": -2.027043342590332, + "logits/rejected": -2.024496555328369, + "logps/chosen": -307.4137878417969, + "logps/rejected": -424.326416015625, + "loss": 0.2339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6002962589263916, + "rewards/margins": 3.2508492469787598, + "rewards/rejected": -3.8511455059051514, + "step": 3941 + }, + { + "epoch": 0.46, + "learning_rate": 1.6489901972363292e-07, + "logits/chosen": -2.6939916610717773, + "logits/rejected": -2.5931665897369385, + "logps/chosen": -173.3814239501953, + "logps/rejected": -162.88107299804688, + "loss": 0.3034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44174665212631226, + "rewards/margins": 1.7786290645599365, + "rewards/rejected": -2.2203755378723145, + "step": 3942 + }, + { + "epoch": 0.46, + "learning_rate": 1.6486358804771466e-07, + "logits/chosen": -2.2949297428131104, + "logits/rejected": -2.504296064376831, + "logps/chosen": -298.02667236328125, + "logps/rejected": -163.6577911376953, + "loss": 0.6218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044179074466228485, + "rewards/margins": 1.8085417747497559, + "rewards/rejected": -1.852720856666565, + "step": 3943 + }, + { + "epoch": 0.46, + "learning_rate": 1.6482815637179639e-07, + "logits/chosen": -2.456106662750244, + "logits/rejected": -2.376206398010254, + "logps/chosen": -178.7095947265625, + "logps/rejected": -251.34060668945312, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9643751978874207, + "rewards/margins": 1.6535300016403198, + "rewards/rejected": -2.6179051399230957, + "step": 3944 + }, + { + "epoch": 0.46, + "learning_rate": 1.647927246958781e-07, + "logits/chosen": -2.0813639163970947, + "logits/rejected": -2.301832675933838, + "logps/chosen": -188.51470947265625, + "logps/rejected": -208.46986389160156, + "loss": 0.3864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7985163331031799, + "rewards/margins": 2.8208892345428467, + "rewards/rejected": -3.6194052696228027, + "step": 3945 + }, + { + "epoch": 0.46, + "learning_rate": 1.6475729301995983e-07, + "logits/chosen": -2.1538877487182617, + "logits/rejected": -2.3655076026916504, + "logps/chosen": -393.18975830078125, + "logps/rejected": -236.78524780273438, + "loss": 0.5358, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0000369548797607, + "rewards/margins": 0.9976277351379395, + "rewards/rejected": -1.9976646900177002, + "step": 3946 + }, + { + "epoch": 0.46, + "learning_rate": 1.6472186134404158e-07, + "logits/chosen": -2.4258294105529785, + "logits/rejected": -2.6544787883758545, + "logps/chosen": -347.4997253417969, + "logps/rejected": -257.243896484375, + "loss": 0.402, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6963886618614197, + "rewards/margins": 2.235750913619995, + "rewards/rejected": -2.9321393966674805, + "step": 3947 + }, + { + "epoch": 0.46, + "learning_rate": 1.646864296681233e-07, + "logits/chosen": -2.3404111862182617, + "logits/rejected": -2.274977684020996, + "logps/chosen": -228.67739868164062, + "logps/rejected": -218.5743865966797, + "loss": 0.3371, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22307144105434418, + "rewards/margins": 1.3316730260849, + "rewards/rejected": -1.5547446012496948, + "step": 3948 + }, + { + "epoch": 0.46, + "learning_rate": 1.6465099799220502e-07, + "logits/chosen": -2.677840232849121, + "logits/rejected": -2.5003280639648438, + "logps/chosen": -248.92935180664062, + "logps/rejected": -246.9120330810547, + "loss": 0.3518, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.265497088432312, + "rewards/margins": 1.9520766735076904, + "rewards/rejected": -3.217573642730713, + "step": 3949 + }, + { + "epoch": 0.46, + "learning_rate": 1.6461556631628675e-07, + "logits/chosen": -2.2416977882385254, + "logits/rejected": -2.5748939514160156, + "logps/chosen": -490.63818359375, + "logps/rejected": -232.35504150390625, + "loss": 0.329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8970564603805542, + "rewards/margins": 2.0748486518859863, + "rewards/rejected": -2.971904993057251, + "step": 3950 + }, + { + "epoch": 0.46, + "learning_rate": 1.6458013464036847e-07, + "logits/chosen": -1.8287098407745361, + "logits/rejected": -1.8242080211639404, + "logps/chosen": -376.67529296875, + "logps/rejected": -436.81256103515625, + "loss": 0.3504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.960844874382019, + "rewards/margins": 3.16070556640625, + "rewards/rejected": -4.1215500831604, + "step": 3951 + }, + { + "epoch": 0.46, + "learning_rate": 1.645447029644502e-07, + "logits/chosen": -2.2309231758117676, + "logits/rejected": -2.4436192512512207, + "logps/chosen": -208.72853088378906, + "logps/rejected": -285.3692626953125, + "loss": 0.5293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6695567965507507, + "rewards/margins": 3.393733263015747, + "rewards/rejected": -4.063289642333984, + "step": 3952 + }, + { + "epoch": 0.46, + "learning_rate": 1.6450927128853194e-07, + "logits/chosen": -2.13130521774292, + "logits/rejected": -2.097860813140869, + "logps/chosen": -367.9957275390625, + "logps/rejected": -439.1777648925781, + "loss": 0.2111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41047775745391846, + "rewards/margins": 2.7937567234039307, + "rewards/rejected": -3.2042346000671387, + "step": 3953 + }, + { + "epoch": 0.46, + "learning_rate": 1.6447383961261369e-07, + "logits/chosen": -2.4099628925323486, + "logits/rejected": -2.640214681625366, + "logps/chosen": -150.7002716064453, + "logps/rejected": -159.5825653076172, + "loss": 0.3707, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8288857340812683, + "rewards/margins": 1.8333728313446045, + "rewards/rejected": -2.6622586250305176, + "step": 3954 + }, + { + "epoch": 0.46, + "learning_rate": 1.644384079366954e-07, + "logits/chosen": -2.637030601501465, + "logits/rejected": -2.2473363876342773, + "logps/chosen": -88.22962188720703, + "logps/rejected": -226.756591796875, + "loss": 0.866, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9672958850860596, + "rewards/margins": 1.7017645835876465, + "rewards/rejected": -3.669060468673706, + "step": 3955 + }, + { + "epoch": 0.46, + "learning_rate": 1.6440297626077713e-07, + "logits/chosen": -2.813427686691284, + "logits/rejected": -2.8487465381622314, + "logps/chosen": -121.28565979003906, + "logps/rejected": -201.32872009277344, + "loss": 1.0488, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.074873447418213, + "rewards/margins": 1.1152749061584473, + "rewards/rejected": -3.1901485919952393, + "step": 3956 + }, + { + "epoch": 0.46, + "learning_rate": 1.6436754458485885e-07, + "logits/chosen": -2.063302516937256, + "logits/rejected": -2.305544853210449, + "logps/chosen": -258.9646911621094, + "logps/rejected": -160.5577850341797, + "loss": 0.2897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2232809215784073, + "rewards/margins": 1.6698435544967651, + "rewards/rejected": -1.8931243419647217, + "step": 3957 + }, + { + "epoch": 0.46, + "learning_rate": 1.643321129089406e-07, + "logits/chosen": -2.4128925800323486, + "logits/rejected": -2.6066646575927734, + "logps/chosen": -262.2380676269531, + "logps/rejected": -278.1166687011719, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0172700881958008, + "rewards/margins": 1.6816158294677734, + "rewards/rejected": -2.698885917663574, + "step": 3958 + }, + { + "epoch": 0.46, + "learning_rate": 1.6429668123302232e-07, + "logits/chosen": -2.2732763290405273, + "logits/rejected": -2.3244009017944336, + "logps/chosen": -201.82440185546875, + "logps/rejected": -236.677490234375, + "loss": 0.3603, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6881381869316101, + "rewards/margins": 1.4232892990112305, + "rewards/rejected": -2.1114275455474854, + "step": 3959 + }, + { + "epoch": 0.46, + "learning_rate": 1.6426124955710405e-07, + "logits/chosen": -1.8601810932159424, + "logits/rejected": -2.181622266769409, + "logps/chosen": -626.0748901367188, + "logps/rejected": -449.293212890625, + "loss": 0.467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7602765560150146, + "rewards/margins": 1.8340922594070435, + "rewards/rejected": -2.5943689346313477, + "step": 3960 + }, + { + "epoch": 0.46, + "learning_rate": 1.6422581788118577e-07, + "logits/chosen": -2.671356201171875, + "logits/rejected": -2.751289129257202, + "logps/chosen": -184.88131713867188, + "logps/rejected": -280.28546142578125, + "loss": 0.6385, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0858287811279297, + "rewards/margins": 2.511718511581421, + "rewards/rejected": -3.5975472927093506, + "step": 3961 + }, + { + "epoch": 0.46, + "learning_rate": 1.641903862052675e-07, + "logits/chosen": -2.3870863914489746, + "logits/rejected": -2.158416748046875, + "logps/chosen": -412.8333740234375, + "logps/rejected": -390.12017822265625, + "loss": 0.7054, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2829443216323853, + "rewards/margins": 1.6606245040893555, + "rewards/rejected": -2.943568706512451, + "step": 3962 + }, + { + "epoch": 0.46, + "learning_rate": 1.641549545293492e-07, + "logits/chosen": -2.340913772583008, + "logits/rejected": -2.1604578495025635, + "logps/chosen": -206.87167358398438, + "logps/rejected": -278.2321472167969, + "loss": 0.2374, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.433334469795227, + "rewards/margins": 3.97241473197937, + "rewards/rejected": -5.405749320983887, + "step": 3963 + }, + { + "epoch": 0.46, + "learning_rate": 1.6411952285343093e-07, + "logits/chosen": -2.6634902954101562, + "logits/rejected": -2.224545955657959, + "logps/chosen": -224.65077209472656, + "logps/rejected": -189.88925170898438, + "loss": 0.3033, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.794945240020752, + "rewards/margins": 3.1383814811706543, + "rewards/rejected": -3.9333269596099854, + "step": 3964 + }, + { + "epoch": 0.46, + "learning_rate": 1.6408409117751268e-07, + "logits/chosen": -1.9960612058639526, + "logits/rejected": -2.2806451320648193, + "logps/chosen": -326.1009521484375, + "logps/rejected": -243.41036987304688, + "loss": 0.2488, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0011952295899391174, + "rewards/margins": 2.0707128047943115, + "rewards/rejected": -2.0695176124572754, + "step": 3965 + }, + { + "epoch": 0.46, + "learning_rate": 1.6404865950159443e-07, + "logits/chosen": -2.4857444763183594, + "logits/rejected": -2.0633797645568848, + "logps/chosen": -213.16876220703125, + "logps/rejected": -275.69049072265625, + "loss": 0.186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09581833332777023, + "rewards/margins": 2.343961715698242, + "rewards/rejected": -2.248143196105957, + "step": 3966 + }, + { + "epoch": 0.46, + "learning_rate": 1.6401322782567615e-07, + "logits/chosen": -1.9964426755905151, + "logits/rejected": -2.133298397064209, + "logps/chosen": -258.18109130859375, + "logps/rejected": -334.46783447265625, + "loss": 0.3318, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8978146314620972, + "rewards/margins": 3.4809908866882324, + "rewards/rejected": -4.378805160522461, + "step": 3967 + }, + { + "epoch": 0.46, + "learning_rate": 1.6397779614975788e-07, + "logits/chosen": -2.0834102630615234, + "logits/rejected": -2.2590813636779785, + "logps/chosen": -514.2142333984375, + "logps/rejected": -407.45263671875, + "loss": 0.1985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47627344727516174, + "rewards/margins": 2.7707161903381348, + "rewards/rejected": -3.2469894886016846, + "step": 3968 + }, + { + "epoch": 0.46, + "learning_rate": 1.6394236447383962e-07, + "logits/chosen": -2.384265899658203, + "logits/rejected": -2.5301554203033447, + "logps/chosen": -262.34918212890625, + "logps/rejected": -281.2230529785156, + "loss": 0.2304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5596275925636292, + "rewards/margins": 2.131875514984131, + "rewards/rejected": -2.691502809524536, + "step": 3969 + }, + { + "epoch": 0.46, + "learning_rate": 1.6390693279792135e-07, + "logits/chosen": -1.831903100013733, + "logits/rejected": -2.494029998779297, + "logps/chosen": -374.9416198730469, + "logps/rejected": -189.73104858398438, + "loss": 0.3027, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9706882238388062, + "rewards/margins": 2.267364263534546, + "rewards/rejected": -3.2380526065826416, + "step": 3970 + }, + { + "epoch": 0.46, + "learning_rate": 1.6387150112200307e-07, + "logits/chosen": -2.2646617889404297, + "logits/rejected": -2.242034673690796, + "logps/chosen": -423.46746826171875, + "logps/rejected": -329.1251525878906, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8615195751190186, + "rewards/margins": 2.820004463195801, + "rewards/rejected": -3.6815240383148193, + "step": 3971 + }, + { + "epoch": 0.46, + "learning_rate": 1.638360694460848e-07, + "logits/chosen": -2.369971513748169, + "logits/rejected": -2.3231699466705322, + "logps/chosen": -140.62388610839844, + "logps/rejected": -196.24606323242188, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23546014726161957, + "rewards/margins": 3.0574488639831543, + "rewards/rejected": -2.821988821029663, + "step": 3972 + }, + { + "epoch": 0.46, + "learning_rate": 1.638006377701665e-07, + "logits/chosen": -1.8697021007537842, + "logits/rejected": -1.9884072542190552, + "logps/chosen": -368.5655822753906, + "logps/rejected": -383.21734619140625, + "loss": 1.0667, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8240963220596313, + "rewards/margins": -0.5172037482261658, + "rewards/rejected": -1.3068926334381104, + "step": 3973 + }, + { + "epoch": 0.46, + "learning_rate": 1.6376520609424823e-07, + "logits/chosen": -2.50384521484375, + "logits/rejected": -2.335244655609131, + "logps/chosen": -205.60092163085938, + "logps/rejected": -285.5077209472656, + "loss": 0.2135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3143959045410156, + "rewards/margins": 3.9873692989349365, + "rewards/rejected": -5.301765441894531, + "step": 3974 + }, + { + "epoch": 0.46, + "learning_rate": 1.6372977441832996e-07, + "logits/chosen": -1.9997432231903076, + "logits/rejected": -2.0570549964904785, + "logps/chosen": -313.7476501464844, + "logps/rejected": -355.53228759765625, + "loss": 0.2643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7654287815093994, + "rewards/margins": 2.790038585662842, + "rewards/rejected": -3.555467128753662, + "step": 3975 + }, + { + "epoch": 0.46, + "learning_rate": 1.636943427424117e-07, + "logits/chosen": -2.423778533935547, + "logits/rejected": -2.629275321960449, + "logps/chosen": -353.6055603027344, + "logps/rejected": -295.0767517089844, + "loss": 0.1995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2444050908088684, + "rewards/margins": 2.5270917415618896, + "rewards/rejected": -2.7714967727661133, + "step": 3976 + }, + { + "epoch": 0.46, + "learning_rate": 1.6365891106649343e-07, + "logits/chosen": -2.3694143295288086, + "logits/rejected": -2.3338515758514404, + "logps/chosen": -216.8291778564453, + "logps/rejected": -205.60296630859375, + "loss": 0.6496, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2938556671142578, + "rewards/margins": 0.5532680749893188, + "rewards/rejected": -1.8471238613128662, + "step": 3977 + }, + { + "epoch": 0.46, + "learning_rate": 1.6362347939057518e-07, + "logits/chosen": -1.847383975982666, + "logits/rejected": -1.8584564924240112, + "logps/chosen": -289.0009765625, + "logps/rejected": -408.8981628417969, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.825747549533844, + "rewards/margins": 3.1812713146209717, + "rewards/rejected": -4.00701904296875, + "step": 3978 + }, + { + "epoch": 0.46, + "learning_rate": 1.635880477146569e-07, + "logits/chosen": -2.7667036056518555, + "logits/rejected": -2.4772160053253174, + "logps/chosen": -203.95765686035156, + "logps/rejected": -288.8656921386719, + "loss": 0.4296, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0858700275421143, + "rewards/margins": 1.9554939270019531, + "rewards/rejected": -3.0413639545440674, + "step": 3979 + }, + { + "epoch": 0.46, + "learning_rate": 1.6355261603873862e-07, + "logits/chosen": -2.5727648735046387, + "logits/rejected": -2.4634881019592285, + "logps/chosen": -170.88404846191406, + "logps/rejected": -319.9769287109375, + "loss": 0.5968, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2404130697250366, + "rewards/margins": 1.5059418678283691, + "rewards/rejected": -2.7463550567626953, + "step": 3980 + }, + { + "epoch": 0.46, + "learning_rate": 1.6351718436282037e-07, + "logits/chosen": -2.6819450855255127, + "logits/rejected": -2.6617238521575928, + "logps/chosen": -184.51055908203125, + "logps/rejected": -146.26290893554688, + "loss": 0.3572, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4780151844024658, + "rewards/margins": 1.1833654642105103, + "rewards/rejected": -2.6613807678222656, + "step": 3981 + }, + { + "epoch": 0.46, + "learning_rate": 1.634817526869021e-07, + "logits/chosen": -2.5229270458221436, + "logits/rejected": -2.326030969619751, + "logps/chosen": -141.95672607421875, + "logps/rejected": -308.0113220214844, + "loss": 0.2517, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2896267175674438, + "rewards/margins": 1.9409444332122803, + "rewards/rejected": -3.2305712699890137, + "step": 3982 + }, + { + "epoch": 0.46, + "learning_rate": 1.6344632101098381e-07, + "logits/chosen": -2.587704658508301, + "logits/rejected": -2.8659188747406006, + "logps/chosen": -302.96575927734375, + "logps/rejected": -321.80291748046875, + "loss": 0.406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1690610647201538, + "rewards/margins": 2.596224546432495, + "rewards/rejected": -3.7652854919433594, + "step": 3983 + }, + { + "epoch": 0.46, + "learning_rate": 1.6341088933506554e-07, + "logits/chosen": -2.233531951904297, + "logits/rejected": -1.998093605041504, + "logps/chosen": -349.4864501953125, + "logps/rejected": -276.64874267578125, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20434747636318207, + "rewards/margins": 2.5251004695892334, + "rewards/rejected": -2.3207528591156006, + "step": 3984 + }, + { + "epoch": 0.46, + "learning_rate": 1.6337545765914726e-07, + "logits/chosen": -1.958150029182434, + "logits/rejected": -1.9585254192352295, + "logps/chosen": -341.4512023925781, + "logps/rejected": -237.99676513671875, + "loss": 0.3805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6409279704093933, + "rewards/margins": 1.2214231491088867, + "rewards/rejected": -1.8623511791229248, + "step": 3985 + }, + { + "epoch": 0.46, + "learning_rate": 1.6334002598322898e-07, + "logits/chosen": -2.1044516563415527, + "logits/rejected": -2.0845272541046143, + "logps/chosen": -341.22723388671875, + "logps/rejected": -256.0553894042969, + "loss": 0.3522, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6865931749343872, + "rewards/margins": 2.3085055351257324, + "rewards/rejected": -2.9950990676879883, + "step": 3986 + }, + { + "epoch": 0.46, + "learning_rate": 1.6330459430731073e-07, + "logits/chosen": -1.354891061782837, + "logits/rejected": -1.3291728496551514, + "logps/chosen": -388.2460632324219, + "logps/rejected": -345.3612060546875, + "loss": 0.8292, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0791282653808594, + "rewards/margins": 0.8187131881713867, + "rewards/rejected": -1.897841453552246, + "step": 3987 + }, + { + "epoch": 0.46, + "learning_rate": 1.6326916263139245e-07, + "logits/chosen": -2.8874642848968506, + "logits/rejected": -2.9945285320281982, + "logps/chosen": -246.10214233398438, + "logps/rejected": -215.39735412597656, + "loss": 0.3906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8647282123565674, + "rewards/margins": 1.9278221130371094, + "rewards/rejected": -2.7925500869750977, + "step": 3988 + }, + { + "epoch": 0.46, + "learning_rate": 1.632337309554742e-07, + "logits/chosen": -1.7505451440811157, + "logits/rejected": -2.356766939163208, + "logps/chosen": -469.513671875, + "logps/rejected": -173.0124969482422, + "loss": 0.3861, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017610162496566772, + "rewards/margins": 1.6261199712753296, + "rewards/rejected": -1.6085097789764404, + "step": 3989 + }, + { + "epoch": 0.46, + "learning_rate": 1.6319829927955592e-07, + "logits/chosen": -2.2524514198303223, + "logits/rejected": -2.233635663986206, + "logps/chosen": -185.6726531982422, + "logps/rejected": -206.17462158203125, + "loss": 0.4314, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4770694077014923, + "rewards/margins": 1.2357823848724365, + "rewards/rejected": -1.712851881980896, + "step": 3990 + }, + { + "epoch": 0.46, + "learning_rate": 1.6316286760363764e-07, + "logits/chosen": -2.6384668350219727, + "logits/rejected": -2.7295751571655273, + "logps/chosen": -211.89797973632812, + "logps/rejected": -157.75331115722656, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7456852793693542, + "rewards/margins": 0.7141335010528564, + "rewards/rejected": -1.459818720817566, + "step": 3991 + }, + { + "epoch": 0.46, + "learning_rate": 1.631274359277194e-07, + "logits/chosen": -2.5034780502319336, + "logits/rejected": -2.6355082988739014, + "logps/chosen": -458.5139465332031, + "logps/rejected": -365.8216247558594, + "loss": 0.413, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2951167821884155, + "rewards/margins": 2.463738203048706, + "rewards/rejected": -3.758854866027832, + "step": 3992 + }, + { + "epoch": 0.46, + "learning_rate": 1.6309200425180111e-07, + "logits/chosen": -2.2143714427948, + "logits/rejected": -1.9750316143035889, + "logps/chosen": -236.6873016357422, + "logps/rejected": -306.2986755371094, + "loss": 0.4948, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8285441398620605, + "rewards/margins": 0.9282800555229187, + "rewards/rejected": -1.756824254989624, + "step": 3993 + }, + { + "epoch": 0.46, + "learning_rate": 1.6305657257588284e-07, + "logits/chosen": -2.1235861778259277, + "logits/rejected": -2.258441686630249, + "logps/chosen": -251.4058837890625, + "logps/rejected": -255.95050048828125, + "loss": 0.2592, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5137506723403931, + "rewards/margins": 2.896247625350952, + "rewards/rejected": -3.4099984169006348, + "step": 3994 + }, + { + "epoch": 0.46, + "learning_rate": 1.6302114089996456e-07, + "logits/chosen": -2.4660308361053467, + "logits/rejected": -2.448822498321533, + "logps/chosen": -462.6173095703125, + "logps/rejected": -464.00909423828125, + "loss": 0.2487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06790051609277725, + "rewards/margins": 3.669581413269043, + "rewards/rejected": -3.7374818325042725, + "step": 3995 + }, + { + "epoch": 0.46, + "learning_rate": 1.6298570922404628e-07, + "logits/chosen": -2.359591007232666, + "logits/rejected": -2.1683249473571777, + "logps/chosen": -205.99949645996094, + "logps/rejected": -257.1788330078125, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3679707050323486, + "rewards/margins": 1.4290827512741089, + "rewards/rejected": -3.797053098678589, + "step": 3996 + }, + { + "epoch": 0.46, + "learning_rate": 1.62950277548128e-07, + "logits/chosen": -2.207282543182373, + "logits/rejected": -2.295748710632324, + "logps/chosen": -243.1979522705078, + "logps/rejected": -279.11920166015625, + "loss": 1.4723, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.282073974609375, + "rewards/margins": -0.4423837661743164, + "rewards/rejected": -1.8396903276443481, + "step": 3997 + }, + { + "epoch": 0.47, + "learning_rate": 1.6291484587220975e-07, + "logits/chosen": -1.6296473741531372, + "logits/rejected": -1.9392175674438477, + "logps/chosen": -398.195556640625, + "logps/rejected": -332.976806640625, + "loss": 0.2, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1722580194473267, + "rewards/margins": 2.830291986465454, + "rewards/rejected": -4.00255012512207, + "step": 3998 + }, + { + "epoch": 0.47, + "learning_rate": 1.6287941419629147e-07, + "logits/chosen": -2.768129825592041, + "logits/rejected": -2.815741539001465, + "logps/chosen": -319.0478820800781, + "logps/rejected": -249.6585693359375, + "loss": 0.5352, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5207188129425049, + "rewards/margins": 0.9085509181022644, + "rewards/rejected": -2.429269790649414, + "step": 3999 + }, + { + "epoch": 0.47, + "learning_rate": 1.628439825203732e-07, + "logits/chosen": -2.1106624603271484, + "logits/rejected": -2.1956934928894043, + "logps/chosen": -253.6958770751953, + "logps/rejected": -309.3354797363281, + "loss": 0.4074, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4726953506469727, + "rewards/margins": 2.2677605152130127, + "rewards/rejected": -3.7404556274414062, + "step": 4000 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -1.7543429136276245, + "eval_logits/rejected": -1.756016492843628, + "eval_logps/chosen": -279.1766662597656, + "eval_logps/rejected": -278.8044738769531, + "eval_loss": 0.3739631474018097, + "eval_rewards/accuracies": 0.8390804529190063, + "eval_rewards/chosen": -0.7011234164237976, + "eval_rewards/margins": 2.0786702632904053, + "eval_rewards/rejected": -2.7797935009002686, + "eval_runtime": 265.312, + "eval_samples_per_second": 2.62, + "eval_steps_per_second": 1.312, + "step": 4000 + }, + { + "epoch": 0.47, + "learning_rate": 1.6280855084445494e-07, + "logits/chosen": -2.379930257797241, + "logits/rejected": -2.430798292160034, + "logps/chosen": -296.9425354003906, + "logps/rejected": -471.7687072753906, + "loss": 0.524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2584030628204346, + "rewards/margins": 2.2124128341674805, + "rewards/rejected": -3.470816135406494, + "step": 4001 + }, + { + "epoch": 0.47, + "learning_rate": 1.6277311916853667e-07, + "logits/chosen": -2.2328290939331055, + "logits/rejected": -2.102674961090088, + "logps/chosen": -243.14926147460938, + "logps/rejected": -316.7289123535156, + "loss": 0.6844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2766386270523071, + "rewards/margins": 1.322392225265503, + "rewards/rejected": -2.5990309715270996, + "step": 4002 + }, + { + "epoch": 0.47, + "learning_rate": 1.6273768749261841e-07, + "logits/chosen": -2.6471667289733887, + "logits/rejected": -2.703214645385742, + "logps/chosen": -133.6107635498047, + "logps/rejected": -128.947509765625, + "loss": 0.267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.002619832754135132, + "rewards/margins": 1.599892258644104, + "rewards/rejected": -1.5972723960876465, + "step": 4003 + }, + { + "epoch": 0.47, + "learning_rate": 1.6270225581670014e-07, + "logits/chosen": -2.3063712120056152, + "logits/rejected": -2.1044745445251465, + "logps/chosen": -530.57666015625, + "logps/rejected": -378.1014404296875, + "loss": 0.1876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38555681705474854, + "rewards/margins": 2.2903881072998047, + "rewards/rejected": -2.6759450435638428, + "step": 4004 + }, + { + "epoch": 0.47, + "learning_rate": 1.6266682414078186e-07, + "logits/chosen": -1.9888043403625488, + "logits/rejected": -2.1537792682647705, + "logps/chosen": -323.05401611328125, + "logps/rejected": -311.66796875, + "loss": 0.7458, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8847439885139465, + "rewards/margins": 0.5592479109764099, + "rewards/rejected": -1.4439918994903564, + "step": 4005 + }, + { + "epoch": 0.47, + "learning_rate": 1.6263139246486358e-07, + "logits/chosen": -2.0223121643066406, + "logits/rejected": -1.7573957443237305, + "logps/chosen": -235.67868041992188, + "logps/rejected": -324.9088134765625, + "loss": 0.7484, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4342806339263916, + "rewards/margins": 0.368630588054657, + "rewards/rejected": -1.8029110431671143, + "step": 4006 + }, + { + "epoch": 0.47, + "learning_rate": 1.625959607889453e-07, + "logits/chosen": -2.4409708976745605, + "logits/rejected": -2.517169952392578, + "logps/chosen": -314.3721923828125, + "logps/rejected": -290.0794677734375, + "loss": 0.1322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5039801001548767, + "rewards/margins": 3.0455732345581055, + "rewards/rejected": -3.549553394317627, + "step": 4007 + }, + { + "epoch": 0.47, + "learning_rate": 1.6256052911302703e-07, + "logits/chosen": -1.6862294673919678, + "logits/rejected": -1.5986900329589844, + "logps/chosen": -420.53533935546875, + "logps/rejected": -433.80242919921875, + "loss": 0.2507, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2712683379650116, + "rewards/margins": 3.392768383026123, + "rewards/rejected": -3.664036750793457, + "step": 4008 + }, + { + "epoch": 0.47, + "learning_rate": 1.6252509743710875e-07, + "logits/chosen": -2.7687180042266846, + "logits/rejected": -2.7921576499938965, + "logps/chosen": -264.06744384765625, + "logps/rejected": -234.9364471435547, + "loss": 0.2674, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.845574140548706, + "rewards/margins": 1.548851490020752, + "rewards/rejected": -2.394425630569458, + "step": 4009 + }, + { + "epoch": 0.47, + "learning_rate": 1.624896657611905e-07, + "logits/chosen": -2.2160019874572754, + "logits/rejected": -1.870922565460205, + "logps/chosen": -162.74537658691406, + "logps/rejected": -244.1003875732422, + "loss": 0.2429, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.959694504737854, + "rewards/margins": 1.8619040250778198, + "rewards/rejected": -3.821598529815674, + "step": 4010 + }, + { + "epoch": 0.47, + "learning_rate": 1.6245423408527222e-07, + "logits/chosen": -2.296600341796875, + "logits/rejected": -2.544523239135742, + "logps/chosen": -322.9901123046875, + "logps/rejected": -284.6758117675781, + "loss": 0.3879, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1900469809770584, + "rewards/margins": 2.621302366256714, + "rewards/rejected": -2.811349391937256, + "step": 4011 + }, + { + "epoch": 0.47, + "learning_rate": 1.6241880240935394e-07, + "logits/chosen": -2.365767002105713, + "logits/rejected": -2.323242425918579, + "logps/chosen": -409.7174072265625, + "logps/rejected": -527.154296875, + "loss": 0.3994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7204976081848145, + "rewards/margins": 2.4530062675476074, + "rewards/rejected": -3.173503875732422, + "step": 4012 + }, + { + "epoch": 0.47, + "learning_rate": 1.623833707334357e-07, + "logits/chosen": -2.03511118888855, + "logits/rejected": -2.220942735671997, + "logps/chosen": -379.9289855957031, + "logps/rejected": -317.8266296386719, + "loss": 0.2949, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5238538980484009, + "rewards/margins": 2.7420990467071533, + "rewards/rejected": -3.2659528255462646, + "step": 4013 + }, + { + "epoch": 0.47, + "learning_rate": 1.6234793905751744e-07, + "logits/chosen": -2.1484086513519287, + "logits/rejected": -2.218473196029663, + "logps/chosen": -293.8561706542969, + "logps/rejected": -332.3392028808594, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9928566217422485, + "rewards/margins": 2.919572114944458, + "rewards/rejected": -4.912428855895996, + "step": 4014 + }, + { + "epoch": 0.47, + "learning_rate": 1.6231250738159916e-07, + "logits/chosen": -2.280313014984131, + "logits/rejected": -2.080308437347412, + "logps/chosen": -256.4566650390625, + "logps/rejected": -292.1198425292969, + "loss": 0.2401, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.376746654510498, + "rewards/margins": 2.990668535232544, + "rewards/rejected": -4.367414951324463, + "step": 4015 + }, + { + "epoch": 0.47, + "learning_rate": 1.6227707570568088e-07, + "logits/chosen": -2.017169237136841, + "logits/rejected": -2.0995397567749023, + "logps/chosen": -162.23745727539062, + "logps/rejected": -257.9078674316406, + "loss": 0.3163, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6523802280426025, + "rewards/margins": 2.626143217086792, + "rewards/rejected": -3.2785236835479736, + "step": 4016 + }, + { + "epoch": 0.47, + "learning_rate": 1.622416440297626e-07, + "logits/chosen": -2.786778688430786, + "logits/rejected": -2.7719759941101074, + "logps/chosen": -323.81829833984375, + "logps/rejected": -327.05670166015625, + "loss": 0.3937, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.182190179824829, + "rewards/margins": 3.739973545074463, + "rewards/rejected": -4.922163963317871, + "step": 4017 + }, + { + "epoch": 0.47, + "learning_rate": 1.6220621235384433e-07, + "logits/chosen": -2.2916903495788574, + "logits/rejected": -2.186828136444092, + "logps/chosen": -242.97518920898438, + "logps/rejected": -259.56756591796875, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6445357799530029, + "rewards/margins": 1.6932637691497803, + "rewards/rejected": -2.3377997875213623, + "step": 4018 + }, + { + "epoch": 0.47, + "learning_rate": 1.6217078067792605e-07, + "logits/chosen": -2.265206813812256, + "logits/rejected": -2.304734230041504, + "logps/chosen": -319.0094299316406, + "logps/rejected": -331.67279052734375, + "loss": 0.3195, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7656305432319641, + "rewards/margins": 1.9768619537353516, + "rewards/rejected": -2.742492198944092, + "step": 4019 + }, + { + "epoch": 0.47, + "learning_rate": 1.6213534900200777e-07, + "logits/chosen": -2.470607042312622, + "logits/rejected": -2.414360523223877, + "logps/chosen": -269.27008056640625, + "logps/rejected": -349.06365966796875, + "loss": 0.3613, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.120682954788208, + "rewards/margins": 2.846076011657715, + "rewards/rejected": -3.966758966445923, + "step": 4020 + }, + { + "epoch": 0.47, + "learning_rate": 1.6209991732608952e-07, + "logits/chosen": -2.4460277557373047, + "logits/rejected": -2.6244146823883057, + "logps/chosen": -230.62615966796875, + "logps/rejected": -254.40890502929688, + "loss": 0.1826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.789209246635437, + "rewards/margins": 2.248028516769409, + "rewards/rejected": -3.0372378826141357, + "step": 4021 + }, + { + "epoch": 0.47, + "learning_rate": 1.6206448565017124e-07, + "logits/chosen": -1.6611381769180298, + "logits/rejected": -2.2368080615997314, + "logps/chosen": -403.5237121582031, + "logps/rejected": -284.08453369140625, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1248193979263306, + "rewards/margins": 3.752411365509033, + "rewards/rejected": -4.877230644226074, + "step": 4022 + }, + { + "epoch": 0.47, + "learning_rate": 1.6202905397425296e-07, + "logits/chosen": -2.951423406600952, + "logits/rejected": -2.8697543144226074, + "logps/chosen": -335.9925842285156, + "logps/rejected": -313.7688293457031, + "loss": 0.5323, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5580132007598877, + "rewards/margins": 2.8239974975585938, + "rewards/rejected": -4.382010459899902, + "step": 4023 + }, + { + "epoch": 0.47, + "learning_rate": 1.619936222983347e-07, + "logits/chosen": -2.9665780067443848, + "logits/rejected": -2.9253180027008057, + "logps/chosen": -212.20179748535156, + "logps/rejected": -164.77145385742188, + "loss": 0.1098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.747307538986206, + "rewards/margins": 2.8341691493988037, + "rewards/rejected": -3.5814766883850098, + "step": 4024 + }, + { + "epoch": 0.47, + "learning_rate": 1.6195819062241643e-07, + "logits/chosen": -2.271594524383545, + "logits/rejected": -2.4642539024353027, + "logps/chosen": -204.7490692138672, + "logps/rejected": -216.47128295898438, + "loss": 0.2789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9359766244888306, + "rewards/margins": 1.6395865678787231, + "rewards/rejected": -2.5755631923675537, + "step": 4025 + }, + { + "epoch": 0.47, + "learning_rate": 1.6192275894649818e-07, + "logits/chosen": -2.7245378494262695, + "logits/rejected": -2.629551887512207, + "logps/chosen": -186.0338134765625, + "logps/rejected": -227.85263061523438, + "loss": 0.2886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5112806558609009, + "rewards/margins": 2.917816162109375, + "rewards/rejected": -3.4290971755981445, + "step": 4026 + }, + { + "epoch": 0.47, + "learning_rate": 1.618873272705799e-07, + "logits/chosen": -2.020596981048584, + "logits/rejected": -2.4954113960266113, + "logps/chosen": -400.50286865234375, + "logps/rejected": -185.51773071289062, + "loss": 0.4373, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0377007722854614, + "rewards/margins": 1.5791009664535522, + "rewards/rejected": -2.6168017387390137, + "step": 4027 + }, + { + "epoch": 0.47, + "learning_rate": 1.6185189559466163e-07, + "logits/chosen": -2.401359796524048, + "logits/rejected": -2.3842291831970215, + "logps/chosen": -283.03997802734375, + "logps/rejected": -235.09490966796875, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3585634231567383, + "rewards/margins": 1.1304881572723389, + "rewards/rejected": -2.4890518188476562, + "step": 4028 + }, + { + "epoch": 0.47, + "learning_rate": 1.6181646391874335e-07, + "logits/chosen": -2.611067771911621, + "logits/rejected": -2.8740954399108887, + "logps/chosen": -274.0760803222656, + "logps/rejected": -225.32354736328125, + "loss": 0.4676, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0485639572143555, + "rewards/margins": 2.136234760284424, + "rewards/rejected": -3.1847987174987793, + "step": 4029 + }, + { + "epoch": 0.47, + "learning_rate": 1.6178103224282507e-07, + "logits/chosen": -2.02046275138855, + "logits/rejected": -2.1551530361175537, + "logps/chosen": -363.42169189453125, + "logps/rejected": -458.1724853515625, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.396560400724411, + "rewards/margins": 5.026959419250488, + "rewards/rejected": -5.423519611358643, + "step": 4030 + }, + { + "epoch": 0.47, + "learning_rate": 1.617456005669068e-07, + "logits/chosen": -2.393760919570923, + "logits/rejected": -2.4512062072753906, + "logps/chosen": -159.62484741210938, + "logps/rejected": -175.38662719726562, + "loss": 0.2828, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8472874760627747, + "rewards/margins": 2.7341525554656982, + "rewards/rejected": -3.581439733505249, + "step": 4031 + }, + { + "epoch": 0.47, + "learning_rate": 1.6171016889098854e-07, + "logits/chosen": -2.2262816429138184, + "logits/rejected": -2.1051530838012695, + "logps/chosen": -315.9046630859375, + "logps/rejected": -330.21575927734375, + "loss": 0.2684, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9145344495773315, + "rewards/margins": 1.866511583328247, + "rewards/rejected": -2.781046152114868, + "step": 4032 + }, + { + "epoch": 0.47, + "learning_rate": 1.6167473721507026e-07, + "logits/chosen": -1.9460225105285645, + "logits/rejected": -2.028550624847412, + "logps/chosen": -381.16546630859375, + "logps/rejected": -288.1831970214844, + "loss": 0.6179, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0359983444213867, + "rewards/margins": 0.8157005310058594, + "rewards/rejected": -1.8516989946365356, + "step": 4033 + }, + { + "epoch": 0.47, + "learning_rate": 1.6163930553915199e-07, + "logits/chosen": -2.157963752746582, + "logits/rejected": -2.2237064838409424, + "logps/chosen": -233.5794677734375, + "logps/rejected": -245.76197814941406, + "loss": 0.625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5960309505462646, + "rewards/margins": 1.6215870380401611, + "rewards/rejected": -2.217617988586426, + "step": 4034 + }, + { + "epoch": 0.47, + "learning_rate": 1.616038738632337e-07, + "logits/chosen": -2.289233684539795, + "logits/rejected": -2.1838035583496094, + "logps/chosen": -342.93853759765625, + "logps/rejected": -457.87554931640625, + "loss": 0.4264, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35494521260261536, + "rewards/margins": 1.5172770023345947, + "rewards/rejected": -1.8722221851348877, + "step": 4035 + }, + { + "epoch": 0.47, + "learning_rate": 1.6156844218731546e-07, + "logits/chosen": -2.4482433795928955, + "logits/rejected": -2.6665053367614746, + "logps/chosen": -303.666259765625, + "logps/rejected": -302.1606750488281, + "loss": 0.3323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5726577043533325, + "rewards/margins": 2.3571507930755615, + "rewards/rejected": -2.9298083782196045, + "step": 4036 + }, + { + "epoch": 0.47, + "learning_rate": 1.615330105113972e-07, + "logits/chosen": -2.490072250366211, + "logits/rejected": -2.8387701511383057, + "logps/chosen": -169.9790802001953, + "logps/rejected": -199.3741912841797, + "loss": 0.1145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4634712338447571, + "rewards/margins": 3.913933277130127, + "rewards/rejected": -4.37740421295166, + "step": 4037 + }, + { + "epoch": 0.47, + "learning_rate": 1.6149757883547893e-07, + "logits/chosen": -1.8796353340148926, + "logits/rejected": -2.210146903991699, + "logps/chosen": -304.7650451660156, + "logps/rejected": -210.4641876220703, + "loss": 0.491, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0575072765350342, + "rewards/margins": 0.9071345329284668, + "rewards/rejected": -1.964641809463501, + "step": 4038 + }, + { + "epoch": 0.47, + "learning_rate": 1.6146214715956065e-07, + "logits/chosen": -2.542036771774292, + "logits/rejected": -2.674370765686035, + "logps/chosen": -286.69110107421875, + "logps/rejected": -243.807861328125, + "loss": 0.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1560657024383545, + "rewards/margins": 3.1867973804473877, + "rewards/rejected": -4.342863082885742, + "step": 4039 + }, + { + "epoch": 0.47, + "learning_rate": 1.6142671548364237e-07, + "logits/chosen": -2.040658712387085, + "logits/rejected": -2.1781704425811768, + "logps/chosen": -408.151611328125, + "logps/rejected": -346.5936279296875, + "loss": 0.4058, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6145296096801758, + "rewards/margins": 2.5092685222625732, + "rewards/rejected": -4.123798370361328, + "step": 4040 + }, + { + "epoch": 0.47, + "learning_rate": 1.613912838077241e-07, + "logits/chosen": -2.2706711292266846, + "logits/rejected": -2.2583773136138916, + "logps/chosen": -281.1419982910156, + "logps/rejected": -217.33255004882812, + "loss": 0.5457, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9789040684700012, + "rewards/margins": 1.506022572517395, + "rewards/rejected": -2.484926700592041, + "step": 4041 + }, + { + "epoch": 0.47, + "learning_rate": 1.6135585213180582e-07, + "logits/chosen": -2.363219738006592, + "logits/rejected": -2.7590720653533936, + "logps/chosen": -337.92718505859375, + "logps/rejected": -150.04483032226562, + "loss": 0.3396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47705310583114624, + "rewards/margins": 1.2572790384292603, + "rewards/rejected": -1.7343320846557617, + "step": 4042 + }, + { + "epoch": 0.47, + "learning_rate": 1.6132042045588756e-07, + "logits/chosen": -2.3465445041656494, + "logits/rejected": -2.5133755207061768, + "logps/chosen": -339.67559814453125, + "logps/rejected": -287.059326171875, + "loss": 0.4527, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5442863702774048, + "rewards/margins": 1.576307773590088, + "rewards/rejected": -3.120594024658203, + "step": 4043 + }, + { + "epoch": 0.47, + "learning_rate": 1.6128498877996929e-07, + "logits/chosen": -2.4933533668518066, + "logits/rejected": -2.4782967567443848, + "logps/chosen": -320.68670654296875, + "logps/rejected": -287.8603210449219, + "loss": 0.193, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1807019710540771, + "rewards/margins": 2.167166233062744, + "rewards/rejected": -3.347867965698242, + "step": 4044 + }, + { + "epoch": 0.47, + "learning_rate": 1.61249557104051e-07, + "logits/chosen": -2.078871488571167, + "logits/rejected": -2.25752329826355, + "logps/chosen": -307.66552734375, + "logps/rejected": -325.8763427734375, + "loss": 0.6708, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.390794038772583, + "rewards/margins": 1.597348928451538, + "rewards/rejected": -2.988142967224121, + "step": 4045 + }, + { + "epoch": 0.47, + "learning_rate": 1.6121412542813273e-07, + "logits/chosen": -2.2558836936950684, + "logits/rejected": -2.182671546936035, + "logps/chosen": -374.37921142578125, + "logps/rejected": -371.9806823730469, + "loss": 0.3568, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3891608715057373, + "rewards/margins": 1.6558938026428223, + "rewards/rejected": -3.0450546741485596, + "step": 4046 + }, + { + "epoch": 0.47, + "learning_rate": 1.6117869375221445e-07, + "logits/chosen": -2.5904879570007324, + "logits/rejected": -2.591486930847168, + "logps/chosen": -168.71810913085938, + "logps/rejected": -211.0722198486328, + "loss": 0.3098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5889180302619934, + "rewards/margins": 2.0322959423065186, + "rewards/rejected": -2.6212141513824463, + "step": 4047 + }, + { + "epoch": 0.47, + "learning_rate": 1.6114326207629623e-07, + "logits/chosen": -2.299152374267578, + "logits/rejected": -2.5416386127471924, + "logps/chosen": -449.2962646484375, + "logps/rejected": -310.5745849609375, + "loss": 0.3183, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6246856451034546, + "rewards/margins": 2.300295829772949, + "rewards/rejected": -3.9249815940856934, + "step": 4048 + }, + { + "epoch": 0.47, + "learning_rate": 1.6110783040037795e-07, + "logits/chosen": -2.633873462677002, + "logits/rejected": -2.4719269275665283, + "logps/chosen": -178.69044494628906, + "logps/rejected": -172.24151611328125, + "loss": 0.3892, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.914673089981079, + "rewards/margins": 2.1106061935424805, + "rewards/rejected": -4.0252790451049805, + "step": 4049 + }, + { + "epoch": 0.47, + "learning_rate": 1.6107239872445967e-07, + "logits/chosen": -2.881516933441162, + "logits/rejected": -2.8404793739318848, + "logps/chosen": -152.82778930664062, + "logps/rejected": -167.233154296875, + "loss": 0.5245, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4401497840881348, + "rewards/margins": 1.0958932638168335, + "rewards/rejected": -2.5360429286956787, + "step": 4050 + }, + { + "epoch": 0.47, + "learning_rate": 1.610369670485414e-07, + "logits/chosen": -2.2415738105773926, + "logits/rejected": -2.407860279083252, + "logps/chosen": -380.449951171875, + "logps/rejected": -198.9984588623047, + "loss": 0.5611, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6974926590919495, + "rewards/margins": 0.98536616563797, + "rewards/rejected": -1.6828587055206299, + "step": 4051 + }, + { + "epoch": 0.47, + "learning_rate": 1.6100153537262312e-07, + "logits/chosen": -2.1791763305664062, + "logits/rejected": -1.7162070274353027, + "logps/chosen": -139.60792541503906, + "logps/rejected": -235.7552947998047, + "loss": 0.3149, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2683665752410889, + "rewards/margins": 1.5227947235107422, + "rewards/rejected": -2.791161298751831, + "step": 4052 + }, + { + "epoch": 0.47, + "learning_rate": 1.6096610369670484e-07, + "logits/chosen": -2.2600324153900146, + "logits/rejected": -2.2509241104125977, + "logps/chosen": -395.8753967285156, + "logps/rejected": -469.9883117675781, + "loss": 0.2143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1634414196014404, + "rewards/margins": 3.703289031982422, + "rewards/rejected": -4.866730213165283, + "step": 4053 + }, + { + "epoch": 0.47, + "learning_rate": 1.6093067202078656e-07, + "logits/chosen": -2.444830894470215, + "logits/rejected": -2.54936146736145, + "logps/chosen": -269.7178955078125, + "logps/rejected": -260.8194580078125, + "loss": 1.0573, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.498363494873047, + "rewards/margins": 0.09302544593811035, + "rewards/rejected": -2.5913891792297363, + "step": 4054 + }, + { + "epoch": 0.47, + "learning_rate": 1.608952403448683e-07, + "logits/chosen": -2.7121853828430176, + "logits/rejected": -2.7707598209381104, + "logps/chosen": -262.52447509765625, + "logps/rejected": -308.056640625, + "loss": 0.396, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2341034412384033, + "rewards/margins": 3.3376901149749756, + "rewards/rejected": -4.571793556213379, + "step": 4055 + }, + { + "epoch": 0.47, + "learning_rate": 1.6085980866895003e-07, + "logits/chosen": -2.401360273361206, + "logits/rejected": -2.4464221000671387, + "logps/chosen": -441.3764953613281, + "logps/rejected": -197.8928985595703, + "loss": 0.2244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35251086950302124, + "rewards/margins": 2.2192182540893555, + "rewards/rejected": -2.5717291831970215, + "step": 4056 + }, + { + "epoch": 0.47, + "learning_rate": 1.6082437699303175e-07, + "logits/chosen": -2.2656633853912354, + "logits/rejected": -1.8890137672424316, + "logps/chosen": -175.5037078857422, + "logps/rejected": -265.0185546875, + "loss": 0.1933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.972220778465271, + "rewards/margins": 2.718860626220703, + "rewards/rejected": -3.6910812854766846, + "step": 4057 + }, + { + "epoch": 0.47, + "learning_rate": 1.6078894531711348e-07, + "logits/chosen": -2.0339388847351074, + "logits/rejected": -1.7547874450683594, + "logps/chosen": -382.5373229980469, + "logps/rejected": -415.5988464355469, + "loss": 0.5108, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6526508927345276, + "rewards/margins": 0.6679400205612183, + "rewards/rejected": -1.3205909729003906, + "step": 4058 + }, + { + "epoch": 0.47, + "learning_rate": 1.6075351364119525e-07, + "logits/chosen": -2.1825973987579346, + "logits/rejected": -2.397676467895508, + "logps/chosen": -401.91998291015625, + "logps/rejected": -306.63262939453125, + "loss": 0.2271, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04859080910682678, + "rewards/margins": 2.8722405433654785, + "rewards/rejected": -2.8236498832702637, + "step": 4059 + }, + { + "epoch": 0.47, + "learning_rate": 1.6071808196527697e-07, + "logits/chosen": -2.354980230331421, + "logits/rejected": -2.0915491580963135, + "logps/chosen": -425.04241943359375, + "logps/rejected": -349.2367858886719, + "loss": 0.6154, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7031338214874268, + "rewards/margins": 0.9441513419151306, + "rewards/rejected": -1.6472852230072021, + "step": 4060 + }, + { + "epoch": 0.47, + "learning_rate": 1.606826502893587e-07, + "logits/chosen": -2.585381031036377, + "logits/rejected": -2.640963554382324, + "logps/chosen": -232.6954345703125, + "logps/rejected": -179.76553344726562, + "loss": 0.7165, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6204997897148132, + "rewards/margins": 0.6255846619606018, + "rewards/rejected": -1.2460843324661255, + "step": 4061 + }, + { + "epoch": 0.47, + "learning_rate": 1.6064721861344042e-07, + "logits/chosen": -1.9641785621643066, + "logits/rejected": -2.026552677154541, + "logps/chosen": -245.78564453125, + "logps/rejected": -314.75531005859375, + "loss": 0.1802, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6634852886199951, + "rewards/margins": 2.6505494117736816, + "rewards/rejected": -4.314034461975098, + "step": 4062 + }, + { + "epoch": 0.47, + "learning_rate": 1.6061178693752214e-07, + "logits/chosen": -3.1083731651306152, + "logits/rejected": -2.98187255859375, + "logps/chosen": -331.98712158203125, + "logps/rejected": -261.7937316894531, + "loss": 0.5732, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6876945495605469, + "rewards/margins": 1.7389991283416748, + "rewards/rejected": -3.4266934394836426, + "step": 4063 + }, + { + "epoch": 0.47, + "learning_rate": 1.6057635526160386e-07, + "logits/chosen": -2.554898500442505, + "logits/rejected": -2.4133505821228027, + "logps/chosen": -208.2342071533203, + "logps/rejected": -204.43707275390625, + "loss": 0.5257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8244565725326538, + "rewards/margins": 1.4461679458618164, + "rewards/rejected": -2.2706246376037598, + "step": 4064 + }, + { + "epoch": 0.47, + "learning_rate": 1.6054092358568558e-07, + "logits/chosen": -1.821702480316162, + "logits/rejected": -2.0414624214172363, + "logps/chosen": -290.4552001953125, + "logps/rejected": -256.58544921875, + "loss": 0.6059, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5079951286315918, + "rewards/margins": 1.7242575883865356, + "rewards/rejected": -3.232252836227417, + "step": 4065 + }, + { + "epoch": 0.47, + "learning_rate": 1.6050549190976733e-07, + "logits/chosen": -2.4960572719573975, + "logits/rejected": -2.640667676925659, + "logps/chosen": -139.92697143554688, + "logps/rejected": -181.45993041992188, + "loss": 0.1263, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49396538734436035, + "rewards/margins": 3.050597906112671, + "rewards/rejected": -3.5445632934570312, + "step": 4066 + }, + { + "epoch": 0.47, + "learning_rate": 1.6047006023384905e-07, + "logits/chosen": -2.3991169929504395, + "logits/rejected": -2.4583399295806885, + "logps/chosen": -395.4876708984375, + "logps/rejected": -282.66326904296875, + "loss": 0.0929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.769955039024353, + "rewards/margins": 3.436938762664795, + "rewards/rejected": -4.2068939208984375, + "step": 4067 + }, + { + "epoch": 0.47, + "learning_rate": 1.6043462855793078e-07, + "logits/chosen": -2.444474220275879, + "logits/rejected": -2.571960210800171, + "logps/chosen": -295.9311218261719, + "logps/rejected": -317.5608825683594, + "loss": 0.1707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20013374090194702, + "rewards/margins": 3.0900442600250244, + "rewards/rejected": -3.290178060531616, + "step": 4068 + }, + { + "epoch": 0.47, + "learning_rate": 1.603991968820125e-07, + "logits/chosen": -2.5156807899475098, + "logits/rejected": -2.5111725330352783, + "logps/chosen": -198.70034790039062, + "logps/rejected": -290.62298583984375, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8097915053367615, + "rewards/margins": 1.1327472925186157, + "rewards/rejected": -1.9425387382507324, + "step": 4069 + }, + { + "epoch": 0.47, + "learning_rate": 1.6036376520609422e-07, + "logits/chosen": -2.0453906059265137, + "logits/rejected": -2.0513157844543457, + "logps/chosen": -274.03863525390625, + "logps/rejected": -367.76739501953125, + "loss": 0.7074, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3181802034378052, + "rewards/margins": 2.529111623764038, + "rewards/rejected": -3.8472914695739746, + "step": 4070 + }, + { + "epoch": 0.47, + "learning_rate": 1.60328333530176e-07, + "logits/chosen": -1.9644604921340942, + "logits/rejected": -1.909451961517334, + "logps/chosen": -193.8740234375, + "logps/rejected": -261.7751159667969, + "loss": 0.2701, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5871630907058716, + "rewards/margins": 2.0218992233276367, + "rewards/rejected": -2.6090621948242188, + "step": 4071 + }, + { + "epoch": 0.47, + "learning_rate": 1.6029290185425772e-07, + "logits/chosen": -2.579878807067871, + "logits/rejected": -2.503446340560913, + "logps/chosen": -220.50244140625, + "logps/rejected": -192.82559204101562, + "loss": 0.6029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6051865816116333, + "rewards/margins": 0.7177762985229492, + "rewards/rejected": -2.322962760925293, + "step": 4072 + }, + { + "epoch": 0.47, + "learning_rate": 1.6025747017833944e-07, + "logits/chosen": -1.7583746910095215, + "logits/rejected": -1.6010172367095947, + "logps/chosen": -183.11566162109375, + "logps/rejected": -325.06402587890625, + "loss": 0.6041, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6765381693840027, + "rewards/margins": 1.2048523426055908, + "rewards/rejected": -1.8813905715942383, + "step": 4073 + }, + { + "epoch": 0.47, + "learning_rate": 1.6022203850242116e-07, + "logits/chosen": -2.0507805347442627, + "logits/rejected": -1.8608496189117432, + "logps/chosen": -138.75413513183594, + "logps/rejected": -261.38214111328125, + "loss": 0.7084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5281267166137695, + "rewards/margins": 1.9847915172576904, + "rewards/rejected": -2.512917995452881, + "step": 4074 + }, + { + "epoch": 0.47, + "learning_rate": 1.6018660682650288e-07, + "logits/chosen": -2.4153058528900146, + "logits/rejected": -2.544142246246338, + "logps/chosen": -447.7703857421875, + "logps/rejected": -368.1720886230469, + "loss": 0.3818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8595754504203796, + "rewards/margins": 1.5147007703781128, + "rewards/rejected": -2.3742763996124268, + "step": 4075 + }, + { + "epoch": 0.47, + "learning_rate": 1.601511751505846e-07, + "logits/chosen": -1.6977249383926392, + "logits/rejected": -1.8011817932128906, + "logps/chosen": -472.72723388671875, + "logps/rejected": -351.3356018066406, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4079134166240692, + "rewards/margins": 2.84122896194458, + "rewards/rejected": -3.2491424083709717, + "step": 4076 + }, + { + "epoch": 0.47, + "learning_rate": 1.6011574347466635e-07, + "logits/chosen": -2.35325288772583, + "logits/rejected": -2.475297451019287, + "logps/chosen": -141.10743713378906, + "logps/rejected": -267.9314270019531, + "loss": 0.5449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9425958395004272, + "rewards/margins": 2.1664507389068604, + "rewards/rejected": -3.109046697616577, + "step": 4077 + }, + { + "epoch": 0.47, + "learning_rate": 1.6008031179874808e-07, + "logits/chosen": -1.7343977689743042, + "logits/rejected": -1.6708441972732544, + "logps/chosen": -221.2890625, + "logps/rejected": -274.0637512207031, + "loss": 0.9271, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9744048118591309, + "rewards/margins": 1.1185492277145386, + "rewards/rejected": -2.09295392036438, + "step": 4078 + }, + { + "epoch": 0.47, + "learning_rate": 1.600448801228298e-07, + "logits/chosen": -1.8908538818359375, + "logits/rejected": -2.3978636264801025, + "logps/chosen": -457.4634704589844, + "logps/rejected": -279.3254699707031, + "loss": 0.6214, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2218141555786133, + "rewards/margins": 1.3700751066207886, + "rewards/rejected": -2.5918893814086914, + "step": 4079 + }, + { + "epoch": 0.47, + "learning_rate": 1.6000944844691152e-07, + "logits/chosen": -2.451700448989868, + "logits/rejected": -2.6494975090026855, + "logps/chosen": -253.4405517578125, + "logps/rejected": -237.72079467773438, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.920867919921875, + "rewards/margins": 2.4679102897644043, + "rewards/rejected": -3.3887784481048584, + "step": 4080 + }, + { + "epoch": 0.47, + "learning_rate": 1.5997401677099324e-07, + "logits/chosen": -2.4073398113250732, + "logits/rejected": -2.6285667419433594, + "logps/chosen": -224.21771240234375, + "logps/rejected": -178.3167724609375, + "loss": 1.4038, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1904518604278564, + "rewards/margins": 1.5713820457458496, + "rewards/rejected": -3.761833906173706, + "step": 4081 + }, + { + "epoch": 0.47, + "learning_rate": 1.5993858509507497e-07, + "logits/chosen": -2.6058974266052246, + "logits/rejected": -2.5536632537841797, + "logps/chosen": -338.1208190917969, + "logps/rejected": -302.50103759765625, + "loss": 1.5742, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.7505910396575928, + "rewards/margins": 0.7087265253067017, + "rewards/rejected": -3.459317684173584, + "step": 4082 + }, + { + "epoch": 0.47, + "learning_rate": 1.5990315341915674e-07, + "logits/chosen": -2.1105403900146484, + "logits/rejected": -2.048630714416504, + "logps/chosen": -329.95867919921875, + "logps/rejected": -260.979736328125, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.794643759727478, + "rewards/margins": 1.9263921976089478, + "rewards/rejected": -2.721035957336426, + "step": 4083 + }, + { + "epoch": 0.48, + "learning_rate": 1.5986772174323846e-07, + "logits/chosen": -2.377633810043335, + "logits/rejected": -2.227105140686035, + "logps/chosen": -237.14263916015625, + "logps/rejected": -223.47979736328125, + "loss": 0.3091, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7461654543876648, + "rewards/margins": 2.60564923286438, + "rewards/rejected": -3.3518147468566895, + "step": 4084 + }, + { + "epoch": 0.48, + "learning_rate": 1.5983229006732018e-07, + "logits/chosen": -2.4711110591888428, + "logits/rejected": -2.490372657775879, + "logps/chosen": -279.9700927734375, + "logps/rejected": -370.2623291015625, + "loss": 0.4187, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9257657527923584, + "rewards/margins": 2.35088849067688, + "rewards/rejected": -3.2766542434692383, + "step": 4085 + }, + { + "epoch": 0.48, + "learning_rate": 1.597968583914019e-07, + "logits/chosen": -2.845172166824341, + "logits/rejected": -2.797882556915283, + "logps/chosen": -378.9098205566406, + "logps/rejected": -258.28448486328125, + "loss": 0.2672, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5431761741638184, + "rewards/margins": 2.036651849746704, + "rewards/rejected": -3.5798280239105225, + "step": 4086 + }, + { + "epoch": 0.48, + "learning_rate": 1.5976142671548363e-07, + "logits/chosen": -1.8797632455825806, + "logits/rejected": -2.002561330795288, + "logps/chosen": -183.81027221679688, + "logps/rejected": -206.70887756347656, + "loss": 0.261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6311947107315063, + "rewards/margins": 2.5990540981292725, + "rewards/rejected": -3.2302486896514893, + "step": 4087 + }, + { + "epoch": 0.48, + "learning_rate": 1.5972599503956538e-07, + "logits/chosen": -2.213416576385498, + "logits/rejected": -2.044665575027466, + "logps/chosen": -259.0989685058594, + "logps/rejected": -218.49151611328125, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5532823801040649, + "rewards/margins": 1.7879688739776611, + "rewards/rejected": -2.3412511348724365, + "step": 4088 + }, + { + "epoch": 0.48, + "learning_rate": 1.596905633636471e-07, + "logits/chosen": -2.1516809463500977, + "logits/rejected": -2.295790672302246, + "logps/chosen": -256.18621826171875, + "logps/rejected": -427.3019104003906, + "loss": 0.2658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7278546094894409, + "rewards/margins": 2.1314985752105713, + "rewards/rejected": -2.8593530654907227, + "step": 4089 + }, + { + "epoch": 0.48, + "learning_rate": 1.5965513168772882e-07, + "logits/chosen": -2.514441967010498, + "logits/rejected": -2.64673113822937, + "logps/chosen": -296.86993408203125, + "logps/rejected": -305.88250732421875, + "loss": 0.2891, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6698538064956665, + "rewards/margins": 2.2777507305145264, + "rewards/rejected": -2.9476046562194824, + "step": 4090 + }, + { + "epoch": 0.48, + "learning_rate": 1.5961970001181054e-07, + "logits/chosen": -2.4124927520751953, + "logits/rejected": -2.173229455947876, + "logps/chosen": -278.9635314941406, + "logps/rejected": -202.28460693359375, + "loss": 0.8594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8075892329216003, + "rewards/margins": 0.6399751901626587, + "rewards/rejected": -1.4475644826889038, + "step": 4091 + }, + { + "epoch": 0.48, + "learning_rate": 1.5958426833589227e-07, + "logits/chosen": -2.3530056476593018, + "logits/rejected": -2.238673686981201, + "logps/chosen": -447.05438232421875, + "logps/rejected": -315.23736572265625, + "loss": 0.6116, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0672606229782104, + "rewards/margins": 0.6063184142112732, + "rewards/rejected": -1.6735789775848389, + "step": 4092 + }, + { + "epoch": 0.48, + "learning_rate": 1.59548836659974e-07, + "logits/chosen": -1.638364553451538, + "logits/rejected": -1.8158390522003174, + "logps/chosen": -623.6826782226562, + "logps/rejected": -539.3189697265625, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9681775569915771, + "rewards/margins": 1.3104493618011475, + "rewards/rejected": -2.2786269187927246, + "step": 4093 + }, + { + "epoch": 0.48, + "learning_rate": 1.5951340498405576e-07, + "logits/chosen": -3.0527052879333496, + "logits/rejected": -3.051730155944824, + "logps/chosen": -354.683837890625, + "logps/rejected": -333.9454650878906, + "loss": 0.5658, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9278634786605835, + "rewards/margins": 1.8784294128417969, + "rewards/rejected": -3.80629301071167, + "step": 4094 + }, + { + "epoch": 0.48, + "learning_rate": 1.5947797330813749e-07, + "logits/chosen": -2.32759690284729, + "logits/rejected": -2.39555025100708, + "logps/chosen": -192.4286651611328, + "logps/rejected": -249.15121459960938, + "loss": 0.387, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2137242555618286, + "rewards/margins": 1.6946457624435425, + "rewards/rejected": -2.908370018005371, + "step": 4095 + }, + { + "epoch": 0.48, + "learning_rate": 1.594425416322192e-07, + "logits/chosen": -2.1371660232543945, + "logits/rejected": -2.4133615493774414, + "logps/chosen": -466.2069091796875, + "logps/rejected": -310.89532470703125, + "loss": 0.4371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.050594180822372437, + "rewards/margins": 1.6599353551864624, + "rewards/rejected": -1.6093411445617676, + "step": 4096 + }, + { + "epoch": 0.48, + "learning_rate": 1.5940710995630093e-07, + "logits/chosen": -2.536724090576172, + "logits/rejected": -2.2177071571350098, + "logps/chosen": -216.52041625976562, + "logps/rejected": -206.5153350830078, + "loss": 0.3491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17656826972961426, + "rewards/margins": 1.8213014602661133, + "rewards/rejected": -1.997869610786438, + "step": 4097 + }, + { + "epoch": 0.48, + "learning_rate": 1.5937167828038265e-07, + "logits/chosen": -2.7953038215637207, + "logits/rejected": -2.7435901165008545, + "logps/chosen": -159.49229431152344, + "logps/rejected": -181.7948455810547, + "loss": 0.9458, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6305214166641235, + "rewards/margins": -0.2839755713939667, + "rewards/rejected": -1.346545696258545, + "step": 4098 + }, + { + "epoch": 0.48, + "learning_rate": 1.5933624660446437e-07, + "logits/chosen": -1.999406337738037, + "logits/rejected": -1.912480115890503, + "logps/chosen": -356.0205078125, + "logps/rejected": -362.4521789550781, + "loss": 0.2611, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.212924599647522, + "rewards/margins": 3.8814027309417725, + "rewards/rejected": -5.094327449798584, + "step": 4099 + }, + { + "epoch": 0.48, + "learning_rate": 1.5930081492854612e-07, + "logits/chosen": -2.679166316986084, + "logits/rejected": -2.674785852432251, + "logps/chosen": -166.195556640625, + "logps/rejected": -218.36532592773438, + "loss": 0.4153, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6651751399040222, + "rewards/margins": 0.944736659526825, + "rewards/rejected": -1.6099117994308472, + "step": 4100 + }, + { + "epoch": 0.48, + "learning_rate": 1.5926538325262784e-07, + "logits/chosen": -2.332929849624634, + "logits/rejected": -2.505073070526123, + "logps/chosen": -280.653564453125, + "logps/rejected": -260.27862548828125, + "loss": 1.727, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.04426646232605, + "rewards/margins": -0.043112486600875854, + "rewards/rejected": -2.0011541843414307, + "step": 4101 + }, + { + "epoch": 0.48, + "learning_rate": 1.5922995157670957e-07, + "logits/chosen": -2.8159780502319336, + "logits/rejected": -2.74099063873291, + "logps/chosen": -162.16282653808594, + "logps/rejected": -133.0289306640625, + "loss": 1.6611, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.957200288772583, + "rewards/margins": -0.46663814783096313, + "rewards/rejected": -1.4905623197555542, + "step": 4102 + }, + { + "epoch": 0.48, + "learning_rate": 1.591945199007913e-07, + "logits/chosen": -2.060239315032959, + "logits/rejected": -2.409789562225342, + "logps/chosen": -339.5497741699219, + "logps/rejected": -237.48089599609375, + "loss": 0.262, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6689355969429016, + "rewards/margins": 3.102839231491089, + "rewards/rejected": -3.7717747688293457, + "step": 4103 + }, + { + "epoch": 0.48, + "learning_rate": 1.59159088224873e-07, + "logits/chosen": -2.528618574142456, + "logits/rejected": -2.521982431411743, + "logps/chosen": -451.046630859375, + "logps/rejected": -358.0485534667969, + "loss": 0.25, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5276907682418823, + "rewards/margins": 2.8991539478302, + "rewards/rejected": -3.426844835281372, + "step": 4104 + }, + { + "epoch": 0.48, + "learning_rate": 1.5912365654895473e-07, + "logits/chosen": -1.9094429016113281, + "logits/rejected": -1.9235280752182007, + "logps/chosen": -239.66941833496094, + "logps/rejected": -322.7490539550781, + "loss": 1.1488, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6217199563980103, + "rewards/margins": 0.2931760549545288, + "rewards/rejected": -1.914896011352539, + "step": 4105 + }, + { + "epoch": 0.48, + "learning_rate": 1.590882248730365e-07, + "logits/chosen": -2.2975056171417236, + "logits/rejected": -1.9994398355484009, + "logps/chosen": -250.85430908203125, + "logps/rejected": -336.4378662109375, + "loss": 0.1773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6649555563926697, + "rewards/margins": 3.0838699340820312, + "rewards/rejected": -3.7488255500793457, + "step": 4106 + }, + { + "epoch": 0.48, + "learning_rate": 1.5905279319711823e-07, + "logits/chosen": -2.58534836769104, + "logits/rejected": -2.168038845062256, + "logps/chosen": -102.24021911621094, + "logps/rejected": -228.93296813964844, + "loss": 0.2939, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9426016211509705, + "rewards/margins": 2.437493324279785, + "rewards/rejected": -3.3800950050354004, + "step": 4107 + }, + { + "epoch": 0.48, + "learning_rate": 1.5901736152119995e-07, + "logits/chosen": -2.071634292602539, + "logits/rejected": -2.1887335777282715, + "logps/chosen": -284.48193359375, + "logps/rejected": -284.4460144042969, + "loss": 0.5139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7219847440719604, + "rewards/margins": 1.8136142492294312, + "rewards/rejected": -2.5355989933013916, + "step": 4108 + }, + { + "epoch": 0.48, + "learning_rate": 1.5898192984528167e-07, + "logits/chosen": -2.9952168464660645, + "logits/rejected": -3.003607988357544, + "logps/chosen": -132.6145477294922, + "logps/rejected": -196.7735595703125, + "loss": 0.4071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9975700378417969, + "rewards/margins": 2.131071090698242, + "rewards/rejected": -3.128641128540039, + "step": 4109 + }, + { + "epoch": 0.48, + "learning_rate": 1.589464981693634e-07, + "logits/chosen": -2.0340311527252197, + "logits/rejected": -1.9708693027496338, + "logps/chosen": -295.36181640625, + "logps/rejected": -339.752685546875, + "loss": 0.4462, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7982165217399597, + "rewards/margins": 1.0012586116790771, + "rewards/rejected": -1.799475073814392, + "step": 4110 + }, + { + "epoch": 0.48, + "learning_rate": 1.5891106649344515e-07, + "logits/chosen": -1.9854971170425415, + "logits/rejected": -2.1044232845306396, + "logps/chosen": -179.58389282226562, + "logps/rejected": -238.39712524414062, + "loss": 0.3881, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8694287538528442, + "rewards/margins": 3.079129695892334, + "rewards/rejected": -3.9485583305358887, + "step": 4111 + }, + { + "epoch": 0.48, + "learning_rate": 1.5887563481752687e-07, + "logits/chosen": -2.178144693374634, + "logits/rejected": -2.2932960987091064, + "logps/chosen": -400.01080322265625, + "logps/rejected": -340.9632873535156, + "loss": 0.3896, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8133999109268188, + "rewards/margins": 1.5972542762756348, + "rewards/rejected": -2.410654067993164, + "step": 4112 + }, + { + "epoch": 0.48, + "learning_rate": 1.588402031416086e-07, + "logits/chosen": -2.5651512145996094, + "logits/rejected": -2.5871524810791016, + "logps/chosen": -261.5509033203125, + "logps/rejected": -305.1123046875, + "loss": 0.6294, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9217718839645386, + "rewards/margins": 1.7191444635391235, + "rewards/rejected": -3.640916109085083, + "step": 4113 + }, + { + "epoch": 0.48, + "learning_rate": 1.588047714656903e-07, + "logits/chosen": -2.842311382293701, + "logits/rejected": -2.912226438522339, + "logps/chosen": -350.75164794921875, + "logps/rejected": -340.19842529296875, + "loss": 0.3128, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7577451467514038, + "rewards/margins": 2.4276111125946045, + "rewards/rejected": -4.185356140136719, + "step": 4114 + }, + { + "epoch": 0.48, + "learning_rate": 1.5876933978977203e-07, + "logits/chosen": -2.1172144412994385, + "logits/rejected": -2.4964070320129395, + "logps/chosen": -241.9571533203125, + "logps/rejected": -245.68923950195312, + "loss": 0.4407, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0332794189453125, + "rewards/margins": 4.227479934692383, + "rewards/rejected": -5.260759353637695, + "step": 4115 + }, + { + "epoch": 0.48, + "learning_rate": 1.5873390811385376e-07, + "logits/chosen": -1.5568325519561768, + "logits/rejected": -1.7480478286743164, + "logps/chosen": -393.9427185058594, + "logps/rejected": -322.64398193359375, + "loss": 0.449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7210593223571777, + "rewards/margins": 2.0277280807495117, + "rewards/rejected": -2.7487874031066895, + "step": 4116 + }, + { + "epoch": 0.48, + "learning_rate": 1.5869847643793548e-07, + "logits/chosen": -2.088031530380249, + "logits/rejected": -2.176208734512329, + "logps/chosen": -573.229736328125, + "logps/rejected": -345.135498046875, + "loss": 0.2559, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8664940595626831, + "rewards/margins": 2.7311201095581055, + "rewards/rejected": -3.597614288330078, + "step": 4117 + }, + { + "epoch": 0.48, + "learning_rate": 1.5866304476201725e-07, + "logits/chosen": -2.3836605548858643, + "logits/rejected": -2.455425500869751, + "logps/chosen": -275.0123291015625, + "logps/rejected": -281.6199035644531, + "loss": 0.2118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23598292469978333, + "rewards/margins": 2.445526123046875, + "rewards/rejected": -2.681509017944336, + "step": 4118 + }, + { + "epoch": 0.48, + "learning_rate": 1.5862761308609898e-07, + "logits/chosen": -2.4276106357574463, + "logits/rejected": -2.5549001693725586, + "logps/chosen": -298.1375427246094, + "logps/rejected": -156.0191650390625, + "loss": 0.5989, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3395609855651855, + "rewards/margins": 0.7980355620384216, + "rewards/rejected": -2.137596607208252, + "step": 4119 + }, + { + "epoch": 0.48, + "learning_rate": 1.585921814101807e-07, + "logits/chosen": -2.9116568565368652, + "logits/rejected": -2.7068023681640625, + "logps/chosen": -616.156494140625, + "logps/rejected": -286.8794250488281, + "loss": 0.3279, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8033970594406128, + "rewards/margins": 2.395587682723999, + "rewards/rejected": -4.198984622955322, + "step": 4120 + }, + { + "epoch": 0.48, + "learning_rate": 1.5855674973426242e-07, + "logits/chosen": -2.0914127826690674, + "logits/rejected": -2.4580025672912598, + "logps/chosen": -394.1136169433594, + "logps/rejected": -217.47097778320312, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8979743123054504, + "rewards/margins": 1.8563799858093262, + "rewards/rejected": -2.754354476928711, + "step": 4121 + }, + { + "epoch": 0.48, + "learning_rate": 1.5852131805834417e-07, + "logits/chosen": -1.9182684421539307, + "logits/rejected": -2.076828956604004, + "logps/chosen": -334.1504821777344, + "logps/rejected": -262.4606628417969, + "loss": 0.2653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2752124071121216, + "rewards/margins": 1.6516660451889038, + "rewards/rejected": -2.9268784523010254, + "step": 4122 + }, + { + "epoch": 0.48, + "learning_rate": 1.584858863824259e-07, + "logits/chosen": -2.4182639122009277, + "logits/rejected": -2.109921455383301, + "logps/chosen": -290.3524475097656, + "logps/rejected": -365.86029052734375, + "loss": 0.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.711117148399353, + "rewards/margins": 2.268132209777832, + "rewards/rejected": -2.9792490005493164, + "step": 4123 + }, + { + "epoch": 0.48, + "learning_rate": 1.584504547065076e-07, + "logits/chosen": -2.4011192321777344, + "logits/rejected": -2.567085027694702, + "logps/chosen": -213.08224487304688, + "logps/rejected": -247.150146484375, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1518572568893433, + "rewards/margins": 2.173279047012329, + "rewards/rejected": -3.325136423110962, + "step": 4124 + }, + { + "epoch": 0.48, + "learning_rate": 1.5841502303058933e-07, + "logits/chosen": -2.5838942527770996, + "logits/rejected": -2.3286519050598145, + "logps/chosen": -199.90118408203125, + "logps/rejected": -423.9735107421875, + "loss": 0.1645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9944974184036255, + "rewards/margins": 2.408916711807251, + "rewards/rejected": -3.403414011001587, + "step": 4125 + }, + { + "epoch": 0.48, + "learning_rate": 1.5837959135467106e-07, + "logits/chosen": -2.046470880508423, + "logits/rejected": -1.9612995386123657, + "logps/chosen": -279.3263854980469, + "logps/rejected": -294.11248779296875, + "loss": 0.3631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8511017560958862, + "rewards/margins": 2.7475969791412354, + "rewards/rejected": -3.5986990928649902, + "step": 4126 + }, + { + "epoch": 0.48, + "learning_rate": 1.5834415967875278e-07, + "logits/chosen": -1.8600528240203857, + "logits/rejected": -1.7321062088012695, + "logps/chosen": -350.84381103515625, + "logps/rejected": -332.35809326171875, + "loss": 0.4289, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3021193742752075, + "rewards/margins": 2.328277587890625, + "rewards/rejected": -3.630396842956543, + "step": 4127 + }, + { + "epoch": 0.48, + "learning_rate": 1.583087280028345e-07, + "logits/chosen": -2.6455893516540527, + "logits/rejected": -2.674375295639038, + "logps/chosen": -324.5091247558594, + "logps/rejected": -201.71070861816406, + "loss": 0.4566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7309709787368774, + "rewards/margins": 0.9964444637298584, + "rewards/rejected": -1.7274155616760254, + "step": 4128 + }, + { + "epoch": 0.48, + "learning_rate": 1.5827329632691628e-07, + "logits/chosen": -2.0027313232421875, + "logits/rejected": -2.3337321281433105, + "logps/chosen": -365.75262451171875, + "logps/rejected": -234.27976989746094, + "loss": 0.24, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5847543478012085, + "rewards/margins": 1.6078301668167114, + "rewards/rejected": -2.19258451461792, + "step": 4129 + }, + { + "epoch": 0.48, + "learning_rate": 1.58237864650998e-07, + "logits/chosen": -2.267455577850342, + "logits/rejected": -2.1600022315979004, + "logps/chosen": -188.8294677734375, + "logps/rejected": -262.0408935546875, + "loss": 0.2313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8660821914672852, + "rewards/margins": 3.101630687713623, + "rewards/rejected": -3.967712640762329, + "step": 4130 + }, + { + "epoch": 0.48, + "learning_rate": 1.5820243297507972e-07, + "logits/chosen": -1.854107141494751, + "logits/rejected": -2.101840019226074, + "logps/chosen": -271.6152038574219, + "logps/rejected": -181.65872192382812, + "loss": 0.4207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6665902733802795, + "rewards/margins": 1.7577829360961914, + "rewards/rejected": -2.424373149871826, + "step": 4131 + }, + { + "epoch": 0.48, + "learning_rate": 1.5816700129916144e-07, + "logits/chosen": -2.2030980587005615, + "logits/rejected": -2.2924413681030273, + "logps/chosen": -308.7123718261719, + "logps/rejected": -295.85333251953125, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08946412801742554, + "rewards/margins": 4.345136642456055, + "rewards/rejected": -4.255672454833984, + "step": 4132 + }, + { + "epoch": 0.48, + "learning_rate": 1.5813156962324316e-07, + "logits/chosen": -2.695335865020752, + "logits/rejected": -2.5826609134674072, + "logps/chosen": -172.8690948486328, + "logps/rejected": -188.6759796142578, + "loss": 0.4017, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2125296592712402, + "rewards/margins": 1.624316930770874, + "rewards/rejected": -2.836846351623535, + "step": 4133 + }, + { + "epoch": 0.48, + "learning_rate": 1.580961379473249e-07, + "logits/chosen": -2.611358404159546, + "logits/rejected": -2.7531533241271973, + "logps/chosen": -252.48388671875, + "logps/rejected": -139.24212646484375, + "loss": 0.1829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39181241393089294, + "rewards/margins": 2.7404918670654297, + "rewards/rejected": -3.1323046684265137, + "step": 4134 + }, + { + "epoch": 0.48, + "learning_rate": 1.5806070627140664e-07, + "logits/chosen": -2.60723614692688, + "logits/rejected": -2.730010986328125, + "logps/chosen": -276.92657470703125, + "logps/rejected": -236.3046112060547, + "loss": 0.2371, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2439155876636505, + "rewards/margins": 2.052091360092163, + "rewards/rejected": -2.2960071563720703, + "step": 4135 + }, + { + "epoch": 0.48, + "learning_rate": 1.5802527459548836e-07, + "logits/chosen": -2.461400270462036, + "logits/rejected": -2.0530707836151123, + "logps/chosen": -259.42596435546875, + "logps/rejected": -479.298095703125, + "loss": 0.5264, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4358010292053223, + "rewards/margins": 1.455265998840332, + "rewards/rejected": -2.8910670280456543, + "step": 4136 + }, + { + "epoch": 0.48, + "learning_rate": 1.5798984291957008e-07, + "logits/chosen": -2.2360522747039795, + "logits/rejected": -2.1889970302581787, + "logps/chosen": -196.19100952148438, + "logps/rejected": -318.1798095703125, + "loss": 0.227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.723287045955658, + "rewards/margins": 3.6495487689971924, + "rewards/rejected": -4.372836112976074, + "step": 4137 + }, + { + "epoch": 0.48, + "learning_rate": 1.579544112436518e-07, + "logits/chosen": -1.7125202417373657, + "logits/rejected": -2.2954537868499756, + "logps/chosen": -376.6832275390625, + "logps/rejected": -325.34698486328125, + "loss": 0.2094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4437797963619232, + "rewards/margins": 1.9978491067886353, + "rewards/rejected": -2.441628932952881, + "step": 4138 + }, + { + "epoch": 0.48, + "learning_rate": 1.5791897956773352e-07, + "logits/chosen": -2.237619161605835, + "logits/rejected": -1.9758802652359009, + "logps/chosen": -234.68589782714844, + "logps/rejected": -300.09912109375, + "loss": 0.5729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5957270264625549, + "rewards/margins": 1.801027774810791, + "rewards/rejected": -2.396754741668701, + "step": 4139 + }, + { + "epoch": 0.48, + "learning_rate": 1.5788354789181527e-07, + "logits/chosen": -2.1142961978912354, + "logits/rejected": -2.160720109939575, + "logps/chosen": -289.5484924316406, + "logps/rejected": -194.80850219726562, + "loss": 0.4188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11683648824691772, + "rewards/margins": 1.3440399169921875, + "rewards/rejected": -1.46087646484375, + "step": 4140 + }, + { + "epoch": 0.48, + "learning_rate": 1.5784811621589702e-07, + "logits/chosen": -2.1995062828063965, + "logits/rejected": -2.1393966674804688, + "logps/chosen": -362.2760009765625, + "logps/rejected": -277.73358154296875, + "loss": 0.7324, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5141454935073853, + "rewards/margins": 1.1892544031143188, + "rewards/rejected": -2.703399658203125, + "step": 4141 + }, + { + "epoch": 0.48, + "learning_rate": 1.5781268453997874e-07, + "logits/chosen": -2.1861867904663086, + "logits/rejected": -1.904154896736145, + "logps/chosen": -286.0841979980469, + "logps/rejected": -327.94866943359375, + "loss": 0.3459, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0573030710220337, + "rewards/margins": 1.0182193517684937, + "rewards/rejected": -2.0755224227905273, + "step": 4142 + }, + { + "epoch": 0.48, + "learning_rate": 1.5777725286406046e-07, + "logits/chosen": -2.6749157905578613, + "logits/rejected": -2.546511173248291, + "logps/chosen": -224.00933837890625, + "logps/rejected": -326.5504150390625, + "loss": 0.359, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0685029029846191, + "rewards/margins": 3.0021939277648926, + "rewards/rejected": -4.070696830749512, + "step": 4143 + }, + { + "epoch": 0.48, + "learning_rate": 1.577418211881422e-07, + "logits/chosen": -2.2558960914611816, + "logits/rejected": -2.281399726867676, + "logps/chosen": -259.3851013183594, + "logps/rejected": -226.3347625732422, + "loss": 0.4838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8091917037963867, + "rewards/margins": 1.419957160949707, + "rewards/rejected": -2.229149103164673, + "step": 4144 + }, + { + "epoch": 0.48, + "learning_rate": 1.5770638951222394e-07, + "logits/chosen": -2.080216884613037, + "logits/rejected": -1.7466087341308594, + "logps/chosen": -274.3110656738281, + "logps/rejected": -319.20849609375, + "loss": 0.6257, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1373041868209839, + "rewards/margins": 0.7189969420433044, + "rewards/rejected": -1.8563013076782227, + "step": 4145 + }, + { + "epoch": 0.48, + "learning_rate": 1.5767095783630566e-07, + "logits/chosen": -2.1442840099334717, + "logits/rejected": -1.6273415088653564, + "logps/chosen": -298.6656188964844, + "logps/rejected": -490.8800048828125, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8488000631332397, + "rewards/margins": 1.258759617805481, + "rewards/rejected": -2.1075596809387207, + "step": 4146 + }, + { + "epoch": 0.48, + "learning_rate": 1.5763552616038738e-07, + "logits/chosen": -2.706585168838501, + "logits/rejected": -2.980534076690674, + "logps/chosen": -304.9236145019531, + "logps/rejected": -270.6559753417969, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7622177600860596, + "rewards/margins": 3.7300093173980713, + "rewards/rejected": -5.492227077484131, + "step": 4147 + }, + { + "epoch": 0.48, + "learning_rate": 1.576000944844691e-07, + "logits/chosen": -2.0274767875671387, + "logits/rejected": -2.4374520778656006, + "logps/chosen": -405.0201110839844, + "logps/rejected": -247.704833984375, + "loss": 0.3328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3348127603530884, + "rewards/margins": 1.4745140075683594, + "rewards/rejected": -2.8093268871307373, + "step": 4148 + }, + { + "epoch": 0.48, + "learning_rate": 1.5756466280855082e-07, + "logits/chosen": -2.0397393703460693, + "logits/rejected": -1.807332992553711, + "logps/chosen": -270.2458801269531, + "logps/rejected": -342.57452392578125, + "loss": 0.4034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4827522933483124, + "rewards/margins": 1.4381444454193115, + "rewards/rejected": -1.9208967685699463, + "step": 4149 + }, + { + "epoch": 0.48, + "learning_rate": 1.5752923113263255e-07, + "logits/chosen": -2.0220909118652344, + "logits/rejected": -2.1483194828033447, + "logps/chosen": -427.98199462890625, + "logps/rejected": -390.4346008300781, + "loss": 0.4177, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7513718605041504, + "rewards/margins": 2.4837427139282227, + "rewards/rejected": -3.235114574432373, + "step": 4150 + }, + { + "epoch": 0.48, + "learning_rate": 1.574937994567143e-07, + "logits/chosen": -2.6240487098693848, + "logits/rejected": -2.6398673057556152, + "logps/chosen": -194.1738739013672, + "logps/rejected": -287.0479431152344, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.160439133644104, + "rewards/margins": 2.7707040309906006, + "rewards/rejected": -3.931143283843994, + "step": 4151 + }, + { + "epoch": 0.48, + "learning_rate": 1.5745836778079602e-07, + "logits/chosen": -2.2501888275146484, + "logits/rejected": -2.395749807357788, + "logps/chosen": -316.8781433105469, + "logps/rejected": -320.2548828125, + "loss": 0.324, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20750093460083008, + "rewards/margins": 2.4453327655792236, + "rewards/rejected": -2.6528334617614746, + "step": 4152 + }, + { + "epoch": 0.48, + "learning_rate": 1.5742293610487777e-07, + "logits/chosen": -2.6261677742004395, + "logits/rejected": -2.5325379371643066, + "logps/chosen": -253.69210815429688, + "logps/rejected": -237.56460571289062, + "loss": 0.4457, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8740341663360596, + "rewards/margins": 1.1637897491455078, + "rewards/rejected": -3.0378239154815674, + "step": 4153 + }, + { + "epoch": 0.48, + "learning_rate": 1.573875044289595e-07, + "logits/chosen": -2.260078191757202, + "logits/rejected": -2.154541254043579, + "logps/chosen": -241.07449340820312, + "logps/rejected": -356.5749816894531, + "loss": 0.2393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6519558429718018, + "rewards/margins": 2.1383867263793945, + "rewards/rejected": -2.7903428077697754, + "step": 4154 + }, + { + "epoch": 0.48, + "learning_rate": 1.573520727530412e-07, + "logits/chosen": -2.6148130893707275, + "logits/rejected": -2.574190139770508, + "logps/chosen": -248.4049072265625, + "logps/rejected": -290.5125732421875, + "loss": 0.2695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9471781253814697, + "rewards/margins": 2.7451679706573486, + "rewards/rejected": -3.6923460960388184, + "step": 4155 + }, + { + "epoch": 0.48, + "learning_rate": 1.5731664107712296e-07, + "logits/chosen": -2.799419641494751, + "logits/rejected": -2.7524819374084473, + "logps/chosen": -230.13363647460938, + "logps/rejected": -192.35418701171875, + "loss": 0.5201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5818525552749634, + "rewards/margins": 1.5065317153930664, + "rewards/rejected": -2.0883843898773193, + "step": 4156 + }, + { + "epoch": 0.48, + "learning_rate": 1.5728120940120468e-07, + "logits/chosen": -2.4714205265045166, + "logits/rejected": -2.495512008666992, + "logps/chosen": -191.78375244140625, + "logps/rejected": -246.6481475830078, + "loss": 0.8582, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9701004028320312, + "rewards/margins": 0.6615644097328186, + "rewards/rejected": -2.631664752960205, + "step": 4157 + }, + { + "epoch": 0.48, + "learning_rate": 1.572457777252864e-07, + "logits/chosen": -2.223565101623535, + "logits/rejected": -2.2625820636749268, + "logps/chosen": -158.71435546875, + "logps/rejected": -229.4991455078125, + "loss": 0.4696, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.460983395576477, + "rewards/margins": 1.9164379835128784, + "rewards/rejected": -3.3774213790893555, + "step": 4158 + }, + { + "epoch": 0.48, + "learning_rate": 1.5721034604936812e-07, + "logits/chosen": -2.4221091270446777, + "logits/rejected": -2.606306552886963, + "logps/chosen": -254.51182556152344, + "logps/rejected": -415.460693359375, + "loss": 0.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2438970804214478, + "rewards/margins": 0.9935634136199951, + "rewards/rejected": -2.2374606132507324, + "step": 4159 + }, + { + "epoch": 0.48, + "learning_rate": 1.5717491437344985e-07, + "logits/chosen": -1.8214982748031616, + "logits/rejected": -2.2387146949768066, + "logps/chosen": -465.3339538574219, + "logps/rejected": -340.72564697265625, + "loss": 0.3403, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0367165803909302, + "rewards/margins": 1.889784812927246, + "rewards/rejected": -2.9265012741088867, + "step": 4160 + }, + { + "epoch": 0.48, + "learning_rate": 1.5713948269753157e-07, + "logits/chosen": -1.5033838748931885, + "logits/rejected": -1.6752598285675049, + "logps/chosen": -345.5698547363281, + "logps/rejected": -322.150390625, + "loss": 0.2213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4426257908344269, + "rewards/margins": 2.2727503776550293, + "rewards/rejected": -2.715376138687134, + "step": 4161 + }, + { + "epoch": 0.48, + "learning_rate": 1.571040510216133e-07, + "logits/chosen": -2.3698413372039795, + "logits/rejected": -2.273284673690796, + "logps/chosen": -183.62930297851562, + "logps/rejected": -240.69412231445312, + "loss": 0.259, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8423725962638855, + "rewards/margins": 2.6861393451690674, + "rewards/rejected": -3.5285120010375977, + "step": 4162 + }, + { + "epoch": 0.48, + "learning_rate": 1.5706861934569504e-07, + "logits/chosen": -2.5375421047210693, + "logits/rejected": -2.6934731006622314, + "logps/chosen": -296.34173583984375, + "logps/rejected": -322.0248107910156, + "loss": 0.4092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8352963924407959, + "rewards/margins": 2.5348501205444336, + "rewards/rejected": -3.3701465129852295, + "step": 4163 + }, + { + "epoch": 0.48, + "learning_rate": 1.570331876697768e-07, + "logits/chosen": -2.068100929260254, + "logits/rejected": -2.138357162475586, + "logps/chosen": -307.0281982421875, + "logps/rejected": -287.1894226074219, + "loss": 0.7942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8522269129753113, + "rewards/margins": 0.883599579334259, + "rewards/rejected": -1.7358264923095703, + "step": 4164 + }, + { + "epoch": 0.48, + "learning_rate": 1.569977559938585e-07, + "logits/chosen": -2.755688428878784, + "logits/rejected": -2.6583595275878906, + "logps/chosen": -311.52374267578125, + "logps/rejected": -204.33914184570312, + "loss": 1.0426, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6276205778121948, + "rewards/margins": 1.5248422622680664, + "rewards/rejected": -3.1524627208709717, + "step": 4165 + }, + { + "epoch": 0.48, + "learning_rate": 1.5696232431794023e-07, + "logits/chosen": -2.16886043548584, + "logits/rejected": -2.1241908073425293, + "logps/chosen": -201.84625244140625, + "logps/rejected": -233.87863159179688, + "loss": 0.6916, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2230740785598755, + "rewards/margins": 1.1839728355407715, + "rewards/rejected": -2.4070467948913574, + "step": 4166 + }, + { + "epoch": 0.48, + "learning_rate": 1.5692689264202198e-07, + "logits/chosen": -2.7029006481170654, + "logits/rejected": -2.643820285797119, + "logps/chosen": -332.34967041015625, + "logps/rejected": -276.01239013671875, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7206541299819946, + "rewards/margins": 2.4563968181610107, + "rewards/rejected": -3.177050828933716, + "step": 4167 + }, + { + "epoch": 0.48, + "learning_rate": 1.568914609661037e-07, + "logits/chosen": -1.797261357307434, + "logits/rejected": -2.235586166381836, + "logps/chosen": -223.82232666015625, + "logps/rejected": -209.6043243408203, + "loss": 0.6156, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.105198621749878, + "rewards/margins": 1.9258440732955933, + "rewards/rejected": -3.0310428142547607, + "step": 4168 + }, + { + "epoch": 0.48, + "learning_rate": 1.5685602929018543e-07, + "logits/chosen": -2.632432222366333, + "logits/rejected": -2.5465149879455566, + "logps/chosen": -178.07418823242188, + "logps/rejected": -260.344970703125, + "loss": 0.5325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18315337598323822, + "rewards/margins": 1.485952377319336, + "rewards/rejected": -1.6691056489944458, + "step": 4169 + }, + { + "epoch": 0.49, + "learning_rate": 1.5682059761426715e-07, + "logits/chosen": -2.151028633117676, + "logits/rejected": -2.2559762001037598, + "logps/chosen": -321.5174865722656, + "logps/rejected": -207.8048095703125, + "loss": 0.5687, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7346638441085815, + "rewards/margins": 0.9496192336082458, + "rewards/rejected": -2.6842830181121826, + "step": 4170 + }, + { + "epoch": 0.49, + "learning_rate": 1.5678516593834887e-07, + "logits/chosen": -2.592406749725342, + "logits/rejected": -2.3248939514160156, + "logps/chosen": -221.30752563476562, + "logps/rejected": -205.48233032226562, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0589115619659424, + "rewards/margins": 1.013702630996704, + "rewards/rejected": -2.0726141929626465, + "step": 4171 + }, + { + "epoch": 0.49, + "learning_rate": 1.567497342624306e-07, + "logits/chosen": -2.9331204891204834, + "logits/rejected": -2.8310275077819824, + "logps/chosen": -171.8787384033203, + "logps/rejected": -287.83935546875, + "loss": 0.1835, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1742606163024902, + "rewards/margins": 2.7964389324188232, + "rewards/rejected": -3.9706995487213135, + "step": 4172 + }, + { + "epoch": 0.49, + "learning_rate": 1.5671430258651231e-07, + "logits/chosen": -2.329993724822998, + "logits/rejected": -2.3867180347442627, + "logps/chosen": -462.095458984375, + "logps/rejected": -252.86248779296875, + "loss": 0.8832, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6894361972808838, + "rewards/margins": 0.31260111927986145, + "rewards/rejected": -2.002037286758423, + "step": 4173 + }, + { + "epoch": 0.49, + "learning_rate": 1.5667887091059406e-07, + "logits/chosen": -2.70963191986084, + "logits/rejected": -2.8010990619659424, + "logps/chosen": -439.6603088378906, + "logps/rejected": -278.14447021484375, + "loss": 0.2254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47038334608078003, + "rewards/margins": 2.2186856269836426, + "rewards/rejected": -2.6890687942504883, + "step": 4174 + }, + { + "epoch": 0.49, + "learning_rate": 1.5664343923467578e-07, + "logits/chosen": -2.080286979675293, + "logits/rejected": -2.2805774211883545, + "logps/chosen": -304.99261474609375, + "logps/rejected": -224.95518493652344, + "loss": 0.5609, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6011609435081482, + "rewards/margins": 1.2242482900619507, + "rewards/rejected": -1.8254092931747437, + "step": 4175 + }, + { + "epoch": 0.49, + "learning_rate": 1.5660800755875753e-07, + "logits/chosen": -2.211413860321045, + "logits/rejected": -2.255418539047241, + "logps/chosen": -278.8165283203125, + "logps/rejected": -318.9987487792969, + "loss": 0.4567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0322880744934082, + "rewards/margins": 1.083324670791626, + "rewards/rejected": -2.115612745285034, + "step": 4176 + }, + { + "epoch": 0.49, + "learning_rate": 1.5657257588283926e-07, + "logits/chosen": -2.354156255722046, + "logits/rejected": -2.682738780975342, + "logps/chosen": -574.1495361328125, + "logps/rejected": -373.4350891113281, + "loss": 0.2529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6412387490272522, + "rewards/margins": 1.6676678657531738, + "rewards/rejected": -2.3089065551757812, + "step": 4177 + }, + { + "epoch": 0.49, + "learning_rate": 1.5653714420692098e-07, + "logits/chosen": -1.8615033626556396, + "logits/rejected": -2.2277145385742188, + "logps/chosen": -489.3848876953125, + "logps/rejected": -266.11309814453125, + "loss": 0.3086, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45478957891464233, + "rewards/margins": 1.8817253112792969, + "rewards/rejected": -1.4269357919692993, + "step": 4178 + }, + { + "epoch": 0.49, + "learning_rate": 1.5650171253100273e-07, + "logits/chosen": -2.7533535957336426, + "logits/rejected": -2.787672996520996, + "logps/chosen": -273.18682861328125, + "logps/rejected": -213.01812744140625, + "loss": 0.3894, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.62388277053833, + "rewards/margins": 1.7281194925308228, + "rewards/rejected": -3.3520023822784424, + "step": 4179 + }, + { + "epoch": 0.49, + "learning_rate": 1.5646628085508445e-07, + "logits/chosen": -1.9549129009246826, + "logits/rejected": -2.45469069480896, + "logps/chosen": -457.2225341796875, + "logps/rejected": -286.02703857421875, + "loss": 0.5814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7884546518325806, + "rewards/margins": 0.9379298686981201, + "rewards/rejected": -1.7263846397399902, + "step": 4180 + }, + { + "epoch": 0.49, + "learning_rate": 1.5643084917916617e-07, + "logits/chosen": -2.4998726844787598, + "logits/rejected": -2.3361010551452637, + "logps/chosen": -239.412109375, + "logps/rejected": -254.39218139648438, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6998356580734253, + "rewards/margins": 2.188354969024658, + "rewards/rejected": -2.888190269470215, + "step": 4181 + }, + { + "epoch": 0.49, + "learning_rate": 1.563954175032479e-07, + "logits/chosen": -2.622525453567505, + "logits/rejected": -2.7748355865478516, + "logps/chosen": -162.9314422607422, + "logps/rejected": -152.78211975097656, + "loss": 0.1348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5086824893951416, + "rewards/margins": 3.160824775695801, + "rewards/rejected": -3.6695075035095215, + "step": 4182 + }, + { + "epoch": 0.49, + "learning_rate": 1.5635998582732961e-07, + "logits/chosen": -2.2013843059539795, + "logits/rejected": -2.3301632404327393, + "logps/chosen": -381.08221435546875, + "logps/rejected": -308.3149108886719, + "loss": 0.9379, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.129316806793213, + "rewards/margins": 0.7018431425094604, + "rewards/rejected": -1.8311598300933838, + "step": 4183 + }, + { + "epoch": 0.49, + "learning_rate": 1.5632455415141134e-07, + "logits/chosen": -2.5435256958007812, + "logits/rejected": -2.4566032886505127, + "logps/chosen": -309.3577575683594, + "logps/rejected": -262.3311767578125, + "loss": 0.8936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7115654945373535, + "rewards/margins": 1.763122797012329, + "rewards/rejected": -3.4746882915496826, + "step": 4184 + }, + { + "epoch": 0.49, + "learning_rate": 1.5628912247549309e-07, + "logits/chosen": -2.1664795875549316, + "logits/rejected": -2.3976573944091797, + "logps/chosen": -270.93133544921875, + "logps/rejected": -300.8971862792969, + "loss": 0.4149, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8059624433517456, + "rewards/margins": 4.1478471755981445, + "rewards/rejected": -4.95380973815918, + "step": 4185 + }, + { + "epoch": 0.49, + "learning_rate": 1.562536907995748e-07, + "logits/chosen": -2.7821292877197266, + "logits/rejected": -2.4968490600585938, + "logps/chosen": -203.15623474121094, + "logps/rejected": -301.80682373046875, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9515830874443054, + "rewards/margins": 3.129225730895996, + "rewards/rejected": -4.080808639526367, + "step": 4186 + }, + { + "epoch": 0.49, + "learning_rate": 1.5621825912365653e-07, + "logits/chosen": -2.0574231147766113, + "logits/rejected": -1.9697329998016357, + "logps/chosen": -359.9841003417969, + "logps/rejected": -399.0179138183594, + "loss": 0.4878, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5525528788566589, + "rewards/margins": 2.3682796955108643, + "rewards/rejected": -2.920832395553589, + "step": 4187 + }, + { + "epoch": 0.49, + "learning_rate": 1.5618282744773828e-07, + "logits/chosen": -2.276045322418213, + "logits/rejected": -2.499728202819824, + "logps/chosen": -389.3753967285156, + "logps/rejected": -294.92919921875, + "loss": 0.4174, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3425109386444092, + "rewards/margins": 1.598778486251831, + "rewards/rejected": -2.9412894248962402, + "step": 4188 + }, + { + "epoch": 0.49, + "learning_rate": 1.5614739577182e-07, + "logits/chosen": -2.2978153228759766, + "logits/rejected": -2.251972198486328, + "logps/chosen": -106.7737808227539, + "logps/rejected": -218.66073608398438, + "loss": 0.3172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36114704608917236, + "rewards/margins": 2.5171573162078857, + "rewards/rejected": -2.8783042430877686, + "step": 4189 + }, + { + "epoch": 0.49, + "learning_rate": 1.5611196409590175e-07, + "logits/chosen": -2.1270995140075684, + "logits/rejected": -2.393162727355957, + "logps/chosen": -248.9272918701172, + "logps/rejected": -202.97271728515625, + "loss": 0.226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39361095428466797, + "rewards/margins": 2.41978120803833, + "rewards/rejected": -2.813392400741577, + "step": 4190 + }, + { + "epoch": 0.49, + "learning_rate": 1.5607653241998347e-07, + "logits/chosen": -1.7374160289764404, + "logits/rejected": -2.2144100666046143, + "logps/chosen": -345.683349609375, + "logps/rejected": -242.41519165039062, + "loss": 0.6332, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2654688358306885, + "rewards/margins": 0.6928931474685669, + "rewards/rejected": -1.958362102508545, + "step": 4191 + }, + { + "epoch": 0.49, + "learning_rate": 1.560411007440652e-07, + "logits/chosen": -1.8285834789276123, + "logits/rejected": -2.087998390197754, + "logps/chosen": -367.35260009765625, + "logps/rejected": -230.97195434570312, + "loss": 0.3991, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6413822770118713, + "rewards/margins": 1.4651374816894531, + "rewards/rejected": -2.1065196990966797, + "step": 4192 + }, + { + "epoch": 0.49, + "learning_rate": 1.5600566906814692e-07, + "logits/chosen": -2.4793336391448975, + "logits/rejected": -2.39030385017395, + "logps/chosen": -294.58233642578125, + "logps/rejected": -314.3695983886719, + "loss": 0.3441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9479573369026184, + "rewards/margins": 1.3553906679153442, + "rewards/rejected": -2.3033480644226074, + "step": 4193 + }, + { + "epoch": 0.49, + "learning_rate": 1.5597023739222864e-07, + "logits/chosen": -2.1798839569091797, + "logits/rejected": -2.373538017272949, + "logps/chosen": -434.94158935546875, + "logps/rejected": -276.64691162109375, + "loss": 0.3779, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7101126909255981, + "rewards/margins": 1.8202152252197266, + "rewards/rejected": -2.530327796936035, + "step": 4194 + }, + { + "epoch": 0.49, + "learning_rate": 1.5593480571631036e-07, + "logits/chosen": -2.0496816635131836, + "logits/rejected": -2.1611108779907227, + "logps/chosen": -282.61993408203125, + "logps/rejected": -208.932373046875, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.06795072555542, + "rewards/margins": 0.9938252568244934, + "rewards/rejected": -2.0617761611938477, + "step": 4195 + }, + { + "epoch": 0.49, + "learning_rate": 1.558993740403921e-07, + "logits/chosen": -1.8750758171081543, + "logits/rejected": -2.384117841720581, + "logps/chosen": -344.7598571777344, + "logps/rejected": -205.66404724121094, + "loss": 0.1731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18671998381614685, + "rewards/margins": 2.0612125396728516, + "rewards/rejected": -2.2479326725006104, + "step": 4196 + }, + { + "epoch": 0.49, + "learning_rate": 1.5586394236447383e-07, + "logits/chosen": -1.8473994731903076, + "logits/rejected": -2.2152717113494873, + "logps/chosen": -345.2440490722656, + "logps/rejected": -263.2420654296875, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4963598847389221, + "rewards/margins": 3.757162094116211, + "rewards/rejected": -4.253521919250488, + "step": 4197 + }, + { + "epoch": 0.49, + "learning_rate": 1.5582851068855555e-07, + "logits/chosen": -1.9456243515014648, + "logits/rejected": -2.1886649131774902, + "logps/chosen": -451.83697509765625, + "logps/rejected": -452.882568359375, + "loss": 1.0008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3961187601089478, + "rewards/margins": 1.994165301322937, + "rewards/rejected": -3.3902840614318848, + "step": 4198 + }, + { + "epoch": 0.49, + "learning_rate": 1.557930790126373e-07, + "logits/chosen": -2.069958209991455, + "logits/rejected": -2.069769859313965, + "logps/chosen": -400.955810546875, + "logps/rejected": -371.6534423828125, + "loss": 0.6835, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7135387659072876, + "rewards/margins": 1.782979965209961, + "rewards/rejected": -3.496519088745117, + "step": 4199 + }, + { + "epoch": 0.49, + "learning_rate": 1.5575764733671902e-07, + "logits/chosen": -2.3866684436798096, + "logits/rejected": -2.308103561401367, + "logps/chosen": -201.48760986328125, + "logps/rejected": -269.93402099609375, + "loss": 0.5399, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0937325954437256, + "rewards/margins": 1.3600457906723022, + "rewards/rejected": -2.4537785053253174, + "step": 4200 + }, + { + "epoch": 0.49, + "learning_rate": 1.5572221566080077e-07, + "logits/chosen": -2.1886377334594727, + "logits/rejected": -2.0658164024353027, + "logps/chosen": -247.9052734375, + "logps/rejected": -273.84002685546875, + "loss": 0.7834, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1680951118469238, + "rewards/margins": 1.1702641248703003, + "rewards/rejected": -2.3383591175079346, + "step": 4201 + }, + { + "epoch": 0.49, + "learning_rate": 1.556867839848825e-07, + "logits/chosen": -2.33994722366333, + "logits/rejected": -2.2882912158966064, + "logps/chosen": -290.715087890625, + "logps/rejected": -357.2547607421875, + "loss": 0.1872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2784448564052582, + "rewards/margins": 4.278828144073486, + "rewards/rejected": -4.557272911071777, + "step": 4202 + }, + { + "epoch": 0.49, + "learning_rate": 1.5565135230896422e-07, + "logits/chosen": -1.9057643413543701, + "logits/rejected": -1.6760625839233398, + "logps/chosen": -162.06646728515625, + "logps/rejected": -604.0193481445312, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35089200735092163, + "rewards/margins": 3.8521459102630615, + "rewards/rejected": -4.203037738800049, + "step": 4203 + }, + { + "epoch": 0.49, + "learning_rate": 1.5561592063304594e-07, + "logits/chosen": -2.204662799835205, + "logits/rejected": -2.542515516281128, + "logps/chosen": -375.55401611328125, + "logps/rejected": -143.68309020996094, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8651166558265686, + "rewards/margins": 1.170993685722351, + "rewards/rejected": -2.0361101627349854, + "step": 4204 + }, + { + "epoch": 0.49, + "learning_rate": 1.5558048895712766e-07, + "logits/chosen": -2.539971113204956, + "logits/rejected": -2.4846532344818115, + "logps/chosen": -270.4479064941406, + "logps/rejected": -263.5343322753906, + "loss": 0.2834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8396803140640259, + "rewards/margins": 2.8850934505462646, + "rewards/rejected": -3.72477388381958, + "step": 4205 + }, + { + "epoch": 0.49, + "learning_rate": 1.5554505728120938e-07, + "logits/chosen": -2.2344443798065186, + "logits/rejected": -2.551969051361084, + "logps/chosen": -395.16827392578125, + "logps/rejected": -268.52093505859375, + "loss": 0.2452, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5827472805976868, + "rewards/margins": 2.541327953338623, + "rewards/rejected": -3.124075174331665, + "step": 4206 + }, + { + "epoch": 0.49, + "learning_rate": 1.555096256052911e-07, + "logits/chosen": -2.7171030044555664, + "logits/rejected": -2.8946352005004883, + "logps/chosen": -148.84063720703125, + "logps/rejected": -175.6888427734375, + "loss": 0.5884, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2476805448532104, + "rewards/margins": 1.5699149370193481, + "rewards/rejected": -2.8175952434539795, + "step": 4207 + }, + { + "epoch": 0.49, + "learning_rate": 1.5547419392937285e-07, + "logits/chosen": -1.9031076431274414, + "logits/rejected": -2.1907944679260254, + "logps/chosen": -531.751220703125, + "logps/rejected": -388.6005859375, + "loss": 0.218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6631029844284058, + "rewards/margins": 1.9835073947906494, + "rewards/rejected": -2.6466104984283447, + "step": 4208 + }, + { + "epoch": 0.49, + "learning_rate": 1.5543876225345458e-07, + "logits/chosen": -1.951372504234314, + "logits/rejected": -2.36936616897583, + "logps/chosen": -346.507568359375, + "logps/rejected": -148.57513427734375, + "loss": 0.6355, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4825596809387207, + "rewards/margins": 0.8358340859413147, + "rewards/rejected": -2.3183937072753906, + "step": 4209 + }, + { + "epoch": 0.49, + "learning_rate": 1.554033305775363e-07, + "logits/chosen": -2.337108850479126, + "logits/rejected": -1.8362160921096802, + "logps/chosen": -74.19679260253906, + "logps/rejected": -226.6238250732422, + "loss": 0.5865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.653024435043335, + "rewards/margins": 1.8003034591674805, + "rewards/rejected": -2.4533281326293945, + "step": 4210 + }, + { + "epoch": 0.49, + "learning_rate": 1.5536789890161805e-07, + "logits/chosen": -2.4479455947875977, + "logits/rejected": -2.4788005352020264, + "logps/chosen": -225.18568420410156, + "logps/rejected": -279.6341247558594, + "loss": 0.2967, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6062159538269043, + "rewards/margins": 2.6959404945373535, + "rewards/rejected": -3.302156448364258, + "step": 4211 + }, + { + "epoch": 0.49, + "learning_rate": 1.553324672256998e-07, + "logits/chosen": -1.8374024629592896, + "logits/rejected": -2.0174384117126465, + "logps/chosen": -353.2993469238281, + "logps/rejected": -298.183349609375, + "loss": 0.2056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21171113848686218, + "rewards/margins": 2.379586696624756, + "rewards/rejected": -2.5912978649139404, + "step": 4212 + }, + { + "epoch": 0.49, + "learning_rate": 1.5529703554978152e-07, + "logits/chosen": -2.0530803203582764, + "logits/rejected": -2.0405948162078857, + "logps/chosen": -388.78680419921875, + "logps/rejected": -314.1803894042969, + "loss": 0.4966, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1562906503677368, + "rewards/margins": 2.565918445587158, + "rewards/rejected": -3.7222089767456055, + "step": 4213 + }, + { + "epoch": 0.49, + "learning_rate": 1.5526160387386324e-07, + "logits/chosen": -2.235501527786255, + "logits/rejected": -2.1123273372650146, + "logps/chosen": -362.783935546875, + "logps/rejected": -330.083251953125, + "loss": 0.358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1011303663253784, + "rewards/margins": 1.2766927480697632, + "rewards/rejected": -2.3778231143951416, + "step": 4214 + }, + { + "epoch": 0.49, + "learning_rate": 1.5522617219794496e-07, + "logits/chosen": -2.780184745788574, + "logits/rejected": -2.807072877883911, + "logps/chosen": -138.5372772216797, + "logps/rejected": -178.1685791015625, + "loss": 0.1168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6500250101089478, + "rewards/margins": 3.2305593490600586, + "rewards/rejected": -3.880584239959717, + "step": 4215 + }, + { + "epoch": 0.49, + "learning_rate": 1.5519074052202668e-07, + "logits/chosen": -2.601388454437256, + "logits/rejected": -2.641458749771118, + "logps/chosen": -211.64303588867188, + "logps/rejected": -240.1005096435547, + "loss": 0.4351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.974123477935791, + "rewards/margins": 1.745782732963562, + "rewards/rejected": -2.7199063301086426, + "step": 4216 + }, + { + "epoch": 0.49, + "learning_rate": 1.551553088461084e-07, + "logits/chosen": -2.091341495513916, + "logits/rejected": -1.9680372476577759, + "logps/chosen": -373.9072265625, + "logps/rejected": -366.6389465332031, + "loss": 0.4754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6733686923980713, + "rewards/margins": 2.289907455444336, + "rewards/rejected": -2.9632761478424072, + "step": 4217 + }, + { + "epoch": 0.49, + "learning_rate": 1.5511987717019013e-07, + "logits/chosen": -2.360729217529297, + "logits/rejected": -2.2917418479919434, + "logps/chosen": -483.8581848144531, + "logps/rejected": -440.9471740722656, + "loss": 0.164, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3209574222564697, + "rewards/margins": 3.651742696762085, + "rewards/rejected": -3.9727001190185547, + "step": 4218 + }, + { + "epoch": 0.49, + "learning_rate": 1.5508444549427188e-07, + "logits/chosen": -2.3913514614105225, + "logits/rejected": -2.2192699909210205, + "logps/chosen": -162.5697479248047, + "logps/rejected": -177.00511169433594, + "loss": 0.238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3811604976654053, + "rewards/margins": 1.5962038040161133, + "rewards/rejected": -1.9773643016815186, + "step": 4219 + }, + { + "epoch": 0.49, + "learning_rate": 1.550490138183536e-07, + "logits/chosen": -2.2838666439056396, + "logits/rejected": -2.416447639465332, + "logps/chosen": -250.20236206054688, + "logps/rejected": -213.37826538085938, + "loss": 0.2526, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21300038695335388, + "rewards/margins": 3.25825834274292, + "rewards/rejected": -3.4712586402893066, + "step": 4220 + }, + { + "epoch": 0.49, + "learning_rate": 1.5501358214243532e-07, + "logits/chosen": -2.152167797088623, + "logits/rejected": -2.3065335750579834, + "logps/chosen": -392.2319030761719, + "logps/rejected": -257.9918518066406, + "loss": 0.6436, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.463146448135376, + "rewards/margins": 1.1819086074829102, + "rewards/rejected": -1.6450550556182861, + "step": 4221 + }, + { + "epoch": 0.49, + "learning_rate": 1.5497815046651704e-07, + "logits/chosen": -1.6172950267791748, + "logits/rejected": -1.8373398780822754, + "logps/chosen": -487.0157470703125, + "logps/rejected": -464.99652099609375, + "loss": 0.3284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6367915868759155, + "rewards/margins": 1.657346248626709, + "rewards/rejected": -2.294137954711914, + "step": 4222 + }, + { + "epoch": 0.49, + "learning_rate": 1.549427187905988e-07, + "logits/chosen": -1.9449739456176758, + "logits/rejected": -2.0185787677764893, + "logps/chosen": -158.22862243652344, + "logps/rejected": -205.89923095703125, + "loss": 0.4544, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7416872978210449, + "rewards/margins": 1.414862036705017, + "rewards/rejected": -2.1565492153167725, + "step": 4223 + }, + { + "epoch": 0.49, + "learning_rate": 1.5490728711468054e-07, + "logits/chosen": -2.492893695831299, + "logits/rejected": -2.1953775882720947, + "logps/chosen": -155.6333465576172, + "logps/rejected": -250.72544860839844, + "loss": 0.3481, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5742707252502441, + "rewards/margins": 1.864371418952942, + "rewards/rejected": -3.4386420249938965, + "step": 4224 + }, + { + "epoch": 0.49, + "learning_rate": 1.5487185543876226e-07, + "logits/chosen": -1.9641962051391602, + "logits/rejected": -1.933701753616333, + "logps/chosen": -325.01434326171875, + "logps/rejected": -335.8203125, + "loss": 0.4156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6353504657745361, + "rewards/margins": 2.295146942138672, + "rewards/rejected": -2.930497646331787, + "step": 4225 + }, + { + "epoch": 0.49, + "learning_rate": 1.5483642376284398e-07, + "logits/chosen": -2.008098840713501, + "logits/rejected": -2.1107101440429688, + "logps/chosen": -263.98382568359375, + "logps/rejected": -286.8031005859375, + "loss": 0.4547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7853267788887024, + "rewards/margins": 1.1948570013046265, + "rewards/rejected": -1.9801836013793945, + "step": 4226 + }, + { + "epoch": 0.49, + "learning_rate": 1.548009920869257e-07, + "logits/chosen": -2.7829713821411133, + "logits/rejected": -2.796865701675415, + "logps/chosen": -198.48440551757812, + "logps/rejected": -240.46401977539062, + "loss": 0.1052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12219832092523575, + "rewards/margins": 4.2068891525268555, + "rewards/rejected": -4.329087257385254, + "step": 4227 + }, + { + "epoch": 0.49, + "learning_rate": 1.5476556041100743e-07, + "logits/chosen": -2.8691625595092773, + "logits/rejected": -2.90232253074646, + "logps/chosen": -259.3534240722656, + "logps/rejected": -187.81802368164062, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2154710292816162, + "rewards/margins": 2.3499207496643066, + "rewards/rejected": -2.565391778945923, + "step": 4228 + }, + { + "epoch": 0.49, + "learning_rate": 1.5473012873508915e-07, + "logits/chosen": -2.8094186782836914, + "logits/rejected": -2.8581786155700684, + "logps/chosen": -147.36273193359375, + "logps/rejected": -115.0368881225586, + "loss": 0.4565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014792539179325104, + "rewards/margins": 1.1600532531738281, + "rewards/rejected": -1.1748459339141846, + "step": 4229 + }, + { + "epoch": 0.49, + "learning_rate": 1.546946970591709e-07, + "logits/chosen": -2.225023031234741, + "logits/rejected": -1.918619155883789, + "logps/chosen": -170.2281951904297, + "logps/rejected": -236.80023193359375, + "loss": 0.3079, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0115596055984497, + "rewards/margins": 1.7705206871032715, + "rewards/rejected": -2.7820801734924316, + "step": 4230 + }, + { + "epoch": 0.49, + "learning_rate": 1.5465926538325262e-07, + "logits/chosen": -2.6455230712890625, + "logits/rejected": -2.620805263519287, + "logps/chosen": -242.54000854492188, + "logps/rejected": -371.3762512207031, + "loss": 0.4212, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.157359004020691, + "rewards/margins": 1.5229735374450684, + "rewards/rejected": -2.6803324222564697, + "step": 4231 + }, + { + "epoch": 0.49, + "learning_rate": 1.5462383370733434e-07, + "logits/chosen": -2.147968292236328, + "logits/rejected": -2.2863383293151855, + "logps/chosen": -489.2957763671875, + "logps/rejected": -332.1770935058594, + "loss": 0.3851, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0014922618865967, + "rewards/margins": 1.895164966583252, + "rewards/rejected": -2.8966572284698486, + "step": 4232 + }, + { + "epoch": 0.49, + "learning_rate": 1.5458840203141606e-07, + "logits/chosen": -2.0664100646972656, + "logits/rejected": -2.5880985260009766, + "logps/chosen": -236.2112579345703, + "logps/rejected": -217.5749053955078, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22217904031276703, + "rewards/margins": 2.874112844467163, + "rewards/rejected": -3.0962917804718018, + "step": 4233 + }, + { + "epoch": 0.49, + "learning_rate": 1.545529703554978e-07, + "logits/chosen": -2.154740810394287, + "logits/rejected": -2.2495694160461426, + "logps/chosen": -223.77037048339844, + "logps/rejected": -263.9287109375, + "loss": 0.1402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31739386916160583, + "rewards/margins": 3.3459737300872803, + "rewards/rejected": -3.663367509841919, + "step": 4234 + }, + { + "epoch": 0.49, + "learning_rate": 1.5451753867957956e-07, + "logits/chosen": -1.8370375633239746, + "logits/rejected": -1.6753730773925781, + "logps/chosen": -233.3326873779297, + "logps/rejected": -267.75634765625, + "loss": 0.283, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9684141874313354, + "rewards/margins": 2.9151928424835205, + "rewards/rejected": -3.8836069107055664, + "step": 4235 + }, + { + "epoch": 0.49, + "learning_rate": 1.5448210700366128e-07, + "logits/chosen": -2.000152587890625, + "logits/rejected": -2.1005642414093018, + "logps/chosen": -285.3634338378906, + "logps/rejected": -356.9669494628906, + "loss": 0.2536, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7302247285842896, + "rewards/margins": 3.0643539428710938, + "rewards/rejected": -3.7945785522460938, + "step": 4236 + }, + { + "epoch": 0.49, + "learning_rate": 1.54446675327743e-07, + "logits/chosen": -1.9809647798538208, + "logits/rejected": -2.2148256301879883, + "logps/chosen": -403.2225341796875, + "logps/rejected": -360.6706848144531, + "loss": 0.4555, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.049302101135253906, + "rewards/margins": 1.2935963869094849, + "rewards/rejected": -1.3428986072540283, + "step": 4237 + }, + { + "epoch": 0.49, + "learning_rate": 1.5441124365182473e-07, + "logits/chosen": -2.1211373805999756, + "logits/rejected": -2.3110873699188232, + "logps/chosen": -296.46356201171875, + "logps/rejected": -260.5908508300781, + "loss": 0.3092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8240818977355957, + "rewards/margins": 1.2839206457138062, + "rewards/rejected": -2.1080026626586914, + "step": 4238 + }, + { + "epoch": 0.49, + "learning_rate": 1.5437581197590645e-07, + "logits/chosen": -2.1035172939300537, + "logits/rejected": -2.0641393661499023, + "logps/chosen": -356.18536376953125, + "logps/rejected": -260.9963073730469, + "loss": 0.6869, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8862773180007935, + "rewards/margins": 1.2397949695587158, + "rewards/rejected": -2.1260721683502197, + "step": 4239 + }, + { + "epoch": 0.49, + "learning_rate": 1.5434038029998817e-07, + "logits/chosen": -2.2825767993927, + "logits/rejected": -1.6988734006881714, + "logps/chosen": -344.5052795410156, + "logps/rejected": -399.63690185546875, + "loss": 0.4786, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2615984678268433, + "rewards/margins": 1.814473032951355, + "rewards/rejected": -3.0760715007781982, + "step": 4240 + }, + { + "epoch": 0.49, + "learning_rate": 1.5430494862406992e-07, + "logits/chosen": -2.268984794616699, + "logits/rejected": -2.490394115447998, + "logps/chosen": -345.7473449707031, + "logps/rejected": -179.95140075683594, + "loss": 0.2446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34967711567878723, + "rewards/margins": 2.02929425239563, + "rewards/rejected": -2.3789713382720947, + "step": 4241 + }, + { + "epoch": 0.49, + "learning_rate": 1.5426951694815164e-07, + "logits/chosen": -2.028651714324951, + "logits/rejected": -1.8188753128051758, + "logps/chosen": -318.8696594238281, + "logps/rejected": -366.59368896484375, + "loss": 0.6116, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06175708770751953, + "rewards/margins": 0.4544529914855957, + "rewards/rejected": -0.5162100791931152, + "step": 4242 + }, + { + "epoch": 0.49, + "learning_rate": 1.5423408527223337e-07, + "logits/chosen": -1.949172019958496, + "logits/rejected": -2.052313804626465, + "logps/chosen": -155.8046875, + "logps/rejected": -164.6605682373047, + "loss": 0.316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4880596995353699, + "rewards/margins": 1.4798177480697632, + "rewards/rejected": -1.9678773880004883, + "step": 4243 + }, + { + "epoch": 0.49, + "learning_rate": 1.541986535963151e-07, + "logits/chosen": -2.3810529708862305, + "logits/rejected": -2.328674077987671, + "logps/chosen": -164.00653076171875, + "logps/rejected": -309.40423583984375, + "loss": 0.2108, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5997473001480103, + "rewards/margins": 3.881175994873047, + "rewards/rejected": -5.480923175811768, + "step": 4244 + }, + { + "epoch": 0.49, + "learning_rate": 1.541632219203968e-07, + "logits/chosen": -2.868957996368408, + "logits/rejected": -2.790341854095459, + "logps/chosen": -196.22256469726562, + "logps/rejected": -313.3121032714844, + "loss": 0.7041, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.494431495666504, + "rewards/margins": 2.2776315212249756, + "rewards/rejected": -3.7720630168914795, + "step": 4245 + }, + { + "epoch": 0.49, + "learning_rate": 1.5412779024447858e-07, + "logits/chosen": -2.7504994869232178, + "logits/rejected": -2.8103044033050537, + "logps/chosen": -220.88858032226562, + "logps/rejected": -320.0758972167969, + "loss": 0.1679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2829967141151428, + "rewards/margins": 2.1074719429016113, + "rewards/rejected": -2.3904685974121094, + "step": 4246 + }, + { + "epoch": 0.49, + "learning_rate": 1.540923585685603e-07, + "logits/chosen": -2.6229262351989746, + "logits/rejected": -2.5707273483276367, + "logps/chosen": -241.9980926513672, + "logps/rejected": -267.77801513671875, + "loss": 0.2288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5068312287330627, + "rewards/margins": 3.1791698932647705, + "rewards/rejected": -3.6860010623931885, + "step": 4247 + }, + { + "epoch": 0.49, + "learning_rate": 1.5405692689264203e-07, + "logits/chosen": -2.339515209197998, + "logits/rejected": -2.3078110218048096, + "logps/chosen": -315.0824279785156, + "logps/rejected": -359.0888671875, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.257819652557373, + "rewards/margins": 0.9051704406738281, + "rewards/rejected": -2.162990093231201, + "step": 4248 + }, + { + "epoch": 0.49, + "learning_rate": 1.5402149521672375e-07, + "logits/chosen": -2.543381452560425, + "logits/rejected": -2.650830030441284, + "logps/chosen": -246.1514892578125, + "logps/rejected": -257.54052734375, + "loss": 0.2973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5963013172149658, + "rewards/margins": 2.495551109313965, + "rewards/rejected": -3.0918524265289307, + "step": 4249 + }, + { + "epoch": 0.49, + "learning_rate": 1.5398606354080547e-07, + "logits/chosen": -2.1643595695495605, + "logits/rejected": -2.356955051422119, + "logps/chosen": -295.4329833984375, + "logps/rejected": -198.77330017089844, + "loss": 0.8915, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2637898921966553, + "rewards/margins": 0.5414870381355286, + "rewards/rejected": -1.8052769899368286, + "step": 4250 + }, + { + "epoch": 0.49, + "learning_rate": 1.539506318648872e-07, + "logits/chosen": -2.211146354675293, + "logits/rejected": -2.2021608352661133, + "logps/chosen": -102.38069152832031, + "logps/rejected": -225.507568359375, + "loss": 0.5021, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6190791130065918, + "rewards/margins": 2.0115549564361572, + "rewards/rejected": -3.630634069442749, + "step": 4251 + }, + { + "epoch": 0.49, + "learning_rate": 1.5391520018896892e-07, + "logits/chosen": -1.987952709197998, + "logits/rejected": -1.972144365310669, + "logps/chosen": -251.96258544921875, + "logps/rejected": -364.8106689453125, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1091543436050415, + "rewards/margins": 2.4591972827911377, + "rewards/rejected": -3.5683517456054688, + "step": 4252 + }, + { + "epoch": 0.49, + "learning_rate": 1.5387976851305067e-07, + "logits/chosen": -2.130178451538086, + "logits/rejected": -2.0667145252227783, + "logps/chosen": -268.03118896484375, + "logps/rejected": -377.71929931640625, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8018983602523804, + "rewards/margins": 2.1182358264923096, + "rewards/rejected": -2.9201340675354004, + "step": 4253 + }, + { + "epoch": 0.49, + "learning_rate": 1.538443368371324e-07, + "logits/chosen": -1.9856798648834229, + "logits/rejected": -2.466815948486328, + "logps/chosen": -427.8575134277344, + "logps/rejected": -270.6650390625, + "loss": 0.1404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02171120047569275, + "rewards/margins": 2.6592845916748047, + "rewards/rejected": -2.6809957027435303, + "step": 4254 + }, + { + "epoch": 0.49, + "learning_rate": 1.538089051612141e-07, + "logits/chosen": -2.3740897178649902, + "logits/rejected": -2.353344202041626, + "logps/chosen": -217.17514038085938, + "logps/rejected": -310.5447998046875, + "loss": 0.2201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5423228740692139, + "rewards/margins": 2.296145439147949, + "rewards/rejected": -3.838468313217163, + "step": 4255 + }, + { + "epoch": 0.5, + "learning_rate": 1.5377347348529583e-07, + "logits/chosen": -1.9890837669372559, + "logits/rejected": -1.9718835353851318, + "logps/chosen": -411.16162109375, + "logps/rejected": -361.01019287109375, + "loss": 0.4568, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.001009225845337, + "rewards/margins": 1.7641944885253906, + "rewards/rejected": -2.7652037143707275, + "step": 4256 + }, + { + "epoch": 0.5, + "learning_rate": 1.5373804180937755e-07, + "logits/chosen": -1.9421712160110474, + "logits/rejected": -1.934262752532959, + "logps/chosen": -340.1602783203125, + "logps/rejected": -348.1224365234375, + "loss": 0.3906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1256228685379028, + "rewards/margins": 1.5955533981323242, + "rewards/rejected": -2.7211761474609375, + "step": 4257 + }, + { + "epoch": 0.5, + "learning_rate": 1.5370261013345933e-07, + "logits/chosen": -2.2184653282165527, + "logits/rejected": -2.3386740684509277, + "logps/chosen": -181.63800048828125, + "logps/rejected": -248.48541259765625, + "loss": 0.2732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6168058514595032, + "rewards/margins": 1.8257722854614258, + "rewards/rejected": -2.442578077316284, + "step": 4258 + }, + { + "epoch": 0.5, + "learning_rate": 1.5366717845754105e-07, + "logits/chosen": -2.403015613555908, + "logits/rejected": -2.0070316791534424, + "logps/chosen": -289.396484375, + "logps/rejected": -358.3860778808594, + "loss": 0.1974, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5178790092468262, + "rewards/margins": 2.545314311981201, + "rewards/rejected": -3.0631933212280273, + "step": 4259 + }, + { + "epoch": 0.5, + "learning_rate": 1.5363174678162277e-07, + "logits/chosen": -2.5418899059295654, + "logits/rejected": -2.352541208267212, + "logps/chosen": -319.2630920410156, + "logps/rejected": -313.2183532714844, + "loss": 0.2209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4045635461807251, + "rewards/margins": 3.0415146350860596, + "rewards/rejected": -3.446078300476074, + "step": 4260 + }, + { + "epoch": 0.5, + "learning_rate": 1.535963151057045e-07, + "logits/chosen": -2.1589102745056152, + "logits/rejected": -2.0008838176727295, + "logps/chosen": -375.5217590332031, + "logps/rejected": -358.9851989746094, + "loss": 0.4773, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6098263263702393, + "rewards/margins": 1.4764554500579834, + "rewards/rejected": -2.0862817764282227, + "step": 4261 + }, + { + "epoch": 0.5, + "learning_rate": 1.5356088342978622e-07, + "logits/chosen": -2.1733298301696777, + "logits/rejected": -2.0634188652038574, + "logps/chosen": -229.7484130859375, + "logps/rejected": -291.6719055175781, + "loss": 0.4286, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3729580640792847, + "rewards/margins": 1.4149776697158813, + "rewards/rejected": -2.787935733795166, + "step": 4262 + }, + { + "epoch": 0.5, + "learning_rate": 1.5352545175386794e-07, + "logits/chosen": -2.0859203338623047, + "logits/rejected": -2.4530980587005615, + "logps/chosen": -251.95108032226562, + "logps/rejected": -252.70628356933594, + "loss": 0.6782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5641568899154663, + "rewards/margins": 1.3612463474273682, + "rewards/rejected": -1.9254032373428345, + "step": 4263 + }, + { + "epoch": 0.5, + "learning_rate": 1.534900200779497e-07, + "logits/chosen": -2.016542434692383, + "logits/rejected": -2.209221601486206, + "logps/chosen": -214.40408325195312, + "logps/rejected": -244.93080139160156, + "loss": 1.1063, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9619942903518677, + "rewards/margins": 0.07737934589385986, + "rewards/rejected": -2.0393736362457275, + "step": 4264 + }, + { + "epoch": 0.5, + "learning_rate": 1.534545884020314e-07, + "logits/chosen": -2.290271043777466, + "logits/rejected": -2.236297607421875, + "logps/chosen": -267.8720397949219, + "logps/rejected": -325.80584716796875, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17568451166152954, + "rewards/margins": 2.413940906524658, + "rewards/rejected": -2.589625358581543, + "step": 4265 + }, + { + "epoch": 0.5, + "learning_rate": 1.5341915672611313e-07, + "logits/chosen": -2.362647533416748, + "logits/rejected": -2.214596748352051, + "logps/chosen": -406.8388671875, + "logps/rejected": -354.97894287109375, + "loss": 0.3288, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1725614070892334, + "rewards/margins": 2.5254924297332764, + "rewards/rejected": -3.698054075241089, + "step": 4266 + }, + { + "epoch": 0.5, + "learning_rate": 1.5338372505019486e-07, + "logits/chosen": -2.920297861099243, + "logits/rejected": -2.980464458465576, + "logps/chosen": -157.0888214111328, + "logps/rejected": -225.21641540527344, + "loss": 0.2012, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5910956859588623, + "rewards/margins": 2.5340590476989746, + "rewards/rejected": -3.125154495239258, + "step": 4267 + }, + { + "epoch": 0.5, + "learning_rate": 1.5334829337427658e-07, + "logits/chosen": -2.490616798400879, + "logits/rejected": -2.14863657951355, + "logps/chosen": -233.7051239013672, + "logps/rejected": -330.0008544921875, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1076128482818604, + "rewards/margins": 3.4714250564575195, + "rewards/rejected": -4.579038143157959, + "step": 4268 + }, + { + "epoch": 0.5, + "learning_rate": 1.533128616983583e-07, + "logits/chosen": -2.8016772270202637, + "logits/rejected": -2.7487833499908447, + "logps/chosen": -246.37582397460938, + "logps/rejected": -213.29348754882812, + "loss": 0.651, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8931890726089478, + "rewards/margins": 0.5293000936508179, + "rewards/rejected": -2.4224891662597656, + "step": 4269 + }, + { + "epoch": 0.5, + "learning_rate": 1.5327743002244007e-07, + "logits/chosen": -2.752962112426758, + "logits/rejected": -2.6458139419555664, + "logps/chosen": -211.2667999267578, + "logps/rejected": -336.0350341796875, + "loss": 0.1565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8676900267601013, + "rewards/margins": 3.5591137409210205, + "rewards/rejected": -4.4268035888671875, + "step": 4270 + }, + { + "epoch": 0.5, + "learning_rate": 1.532419983465218e-07, + "logits/chosen": -2.3950929641723633, + "logits/rejected": -2.710789918899536, + "logps/chosen": -198.43841552734375, + "logps/rejected": -225.40200805664062, + "loss": 0.1764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6378539204597473, + "rewards/margins": 2.739706039428711, + "rewards/rejected": -3.3775601387023926, + "step": 4271 + }, + { + "epoch": 0.5, + "learning_rate": 1.5320656667060352e-07, + "logits/chosen": -2.662505626678467, + "logits/rejected": -2.634950637817383, + "logps/chosen": -206.844482421875, + "logps/rejected": -257.8430480957031, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48766660690307617, + "rewards/margins": 2.2955124378204346, + "rewards/rejected": -2.78317928314209, + "step": 4272 + }, + { + "epoch": 0.5, + "learning_rate": 1.5317113499468524e-07, + "logits/chosen": -2.4974470138549805, + "logits/rejected": -2.734520435333252, + "logps/chosen": -296.371826171875, + "logps/rejected": -186.68031311035156, + "loss": 0.2344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6894539594650269, + "rewards/margins": 1.7265090942382812, + "rewards/rejected": -2.4159629344940186, + "step": 4273 + }, + { + "epoch": 0.5, + "learning_rate": 1.5313570331876696e-07, + "logits/chosen": -2.2007408142089844, + "logits/rejected": -2.4097177982330322, + "logps/chosen": -382.18585205078125, + "logps/rejected": -311.4286804199219, + "loss": 1.0448, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0980472564697266, + "rewards/margins": 2.0328850746154785, + "rewards/rejected": -3.130932331085205, + "step": 4274 + }, + { + "epoch": 0.5, + "learning_rate": 1.531002716428487e-07, + "logits/chosen": -1.8832228183746338, + "logits/rejected": -1.780751347541809, + "logps/chosen": -309.95599365234375, + "logps/rejected": -387.327392578125, + "loss": 0.5056, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1864901781082153, + "rewards/margins": 2.0453972816467285, + "rewards/rejected": -3.2318875789642334, + "step": 4275 + }, + { + "epoch": 0.5, + "learning_rate": 1.5306483996693043e-07, + "logits/chosen": -2.6675634384155273, + "logits/rejected": -2.880091905593872, + "logps/chosen": -301.7516174316406, + "logps/rejected": -199.12220764160156, + "loss": 0.2024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7444517612457275, + "rewards/margins": 2.3038105964660645, + "rewards/rejected": -3.048262596130371, + "step": 4276 + }, + { + "epoch": 0.5, + "learning_rate": 1.5302940829101216e-07, + "logits/chosen": -2.2409582138061523, + "logits/rejected": -2.323681354522705, + "logps/chosen": -318.89453125, + "logps/rejected": -600.5203247070312, + "loss": 0.1329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9868684411048889, + "rewards/margins": 3.8234286308288574, + "rewards/rejected": -4.810297012329102, + "step": 4277 + }, + { + "epoch": 0.5, + "learning_rate": 1.5299397661509388e-07, + "logits/chosen": -2.3955976963043213, + "logits/rejected": -2.700786590576172, + "logps/chosen": -288.8055114746094, + "logps/rejected": -182.29733276367188, + "loss": 0.8295, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8316454887390137, + "rewards/margins": 0.45133861899375916, + "rewards/rejected": -2.2829842567443848, + "step": 4278 + }, + { + "epoch": 0.5, + "learning_rate": 1.529585449391756e-07, + "logits/chosen": -2.7779595851898193, + "logits/rejected": -2.877051830291748, + "logps/chosen": -260.39215087890625, + "logps/rejected": -281.03155517578125, + "loss": 0.4675, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.139236569404602, + "rewards/margins": 1.7268693447113037, + "rewards/rejected": -2.866105794906616, + "step": 4279 + }, + { + "epoch": 0.5, + "learning_rate": 1.5292311326325732e-07, + "logits/chosen": -2.23439359664917, + "logits/rejected": -2.4839017391204834, + "logps/chosen": -236.51361083984375, + "logps/rejected": -312.40667724609375, + "loss": 0.3296, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3737927675247192, + "rewards/margins": 3.157532215118408, + "rewards/rejected": -4.531325340270996, + "step": 4280 + }, + { + "epoch": 0.5, + "learning_rate": 1.528876815873391e-07, + "logits/chosen": -1.8599714040756226, + "logits/rejected": -2.1752562522888184, + "logps/chosen": -378.04608154296875, + "logps/rejected": -410.76947021484375, + "loss": 0.6835, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2734568119049072, + "rewards/margins": 0.7233983278274536, + "rewards/rejected": -1.9968551397323608, + "step": 4281 + }, + { + "epoch": 0.5, + "learning_rate": 1.5285224991142082e-07, + "logits/chosen": -2.559166193008423, + "logits/rejected": -2.6843860149383545, + "logps/chosen": -154.65261840820312, + "logps/rejected": -276.34112548828125, + "loss": 0.3374, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5990694761276245, + "rewards/margins": 3.390658378601074, + "rewards/rejected": -3.9897279739379883, + "step": 4282 + }, + { + "epoch": 0.5, + "learning_rate": 1.5281681823550254e-07, + "logits/chosen": -2.544830799102783, + "logits/rejected": -2.8360440731048584, + "logps/chosen": -294.1865539550781, + "logps/rejected": -204.4533233642578, + "loss": 0.2522, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8448627591133118, + "rewards/margins": 2.4339609146118164, + "rewards/rejected": -3.2788236141204834, + "step": 4283 + }, + { + "epoch": 0.5, + "learning_rate": 1.5278138655958426e-07, + "logits/chosen": -2.765310049057007, + "logits/rejected": -2.611494541168213, + "logps/chosen": -355.063232421875, + "logps/rejected": -397.748046875, + "loss": 0.2973, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2748652696609497, + "rewards/margins": 2.362043857574463, + "rewards/rejected": -3.6369094848632812, + "step": 4284 + }, + { + "epoch": 0.5, + "learning_rate": 1.5274595488366599e-07, + "logits/chosen": -1.9682056903839111, + "logits/rejected": -1.7485270500183105, + "logps/chosen": -248.74639892578125, + "logps/rejected": -302.92681884765625, + "loss": 0.3395, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1154016256332397, + "rewards/margins": 1.7855627536773682, + "rewards/rejected": -2.9009644985198975, + "step": 4285 + }, + { + "epoch": 0.5, + "learning_rate": 1.527105232077477e-07, + "logits/chosen": -1.521147608757019, + "logits/rejected": -1.682644009590149, + "logps/chosen": -415.6841735839844, + "logps/rejected": -354.0915222167969, + "loss": 0.2539, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2118538618087769, + "rewards/margins": 2.22906756401062, + "rewards/rejected": -3.4409215450286865, + "step": 4286 + }, + { + "epoch": 0.5, + "learning_rate": 1.5267509153182946e-07, + "logits/chosen": -1.7822232246398926, + "logits/rejected": -2.1787190437316895, + "logps/chosen": -442.8018798828125, + "logps/rejected": -217.33682250976562, + "loss": 0.5106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1929667592048645, + "rewards/margins": 1.4088464975357056, + "rewards/rejected": -1.6018133163452148, + "step": 4287 + }, + { + "epoch": 0.5, + "learning_rate": 1.5263965985591118e-07, + "logits/chosen": -2.2087419033050537, + "logits/rejected": -2.0138778686523438, + "logps/chosen": -293.10589599609375, + "logps/rejected": -243.96463012695312, + "loss": 0.4307, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6133863925933838, + "rewards/margins": 1.2432156801223755, + "rewards/rejected": -1.8566019535064697, + "step": 4288 + }, + { + "epoch": 0.5, + "learning_rate": 1.526042281799929e-07, + "logits/chosen": -2.241809606552124, + "logits/rejected": -2.357722520828247, + "logps/chosen": -194.8080596923828, + "logps/rejected": -257.8428039550781, + "loss": 0.4751, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1622743606567383, + "rewards/margins": 2.955214262008667, + "rewards/rejected": -4.117488861083984, + "step": 4289 + }, + { + "epoch": 0.5, + "learning_rate": 1.5256879650407462e-07, + "logits/chosen": -2.113306999206543, + "logits/rejected": -2.2567076683044434, + "logps/chosen": -385.4450988769531, + "logps/rejected": -252.20614624023438, + "loss": 0.4274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6423571705818176, + "rewards/margins": 1.3067986965179443, + "rewards/rejected": -1.9491558074951172, + "step": 4290 + }, + { + "epoch": 0.5, + "learning_rate": 1.5253336482815635e-07, + "logits/chosen": -2.92417573928833, + "logits/rejected": -2.781428813934326, + "logps/chosen": -370.0450744628906, + "logps/rejected": -304.86083984375, + "loss": 0.3689, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5746009349822998, + "rewards/margins": 3.243821620941162, + "rewards/rejected": -4.818422317504883, + "step": 4291 + }, + { + "epoch": 0.5, + "learning_rate": 1.5249793315223807e-07, + "logits/chosen": -2.130568027496338, + "logits/rejected": -2.474905014038086, + "logps/chosen": -342.45050048828125, + "logps/rejected": -217.0756378173828, + "loss": 0.6094, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0228044986724854, + "rewards/margins": 1.1828935146331787, + "rewards/rejected": -2.205698013305664, + "step": 4292 + }, + { + "epoch": 0.5, + "learning_rate": 1.5246250147631984e-07, + "logits/chosen": -2.3553173542022705, + "logits/rejected": -2.137087821960449, + "logps/chosen": -278.73541259765625, + "logps/rejected": -270.9977111816406, + "loss": 0.1765, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2739839553833008, + "rewards/margins": 3.074730157852173, + "rewards/rejected": -4.3487138748168945, + "step": 4293 + }, + { + "epoch": 0.5, + "learning_rate": 1.5242706980040156e-07, + "logits/chosen": -2.162520408630371, + "logits/rejected": -1.8134863376617432, + "logps/chosen": -192.6844482421875, + "logps/rejected": -314.5346374511719, + "loss": 0.2077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3970511555671692, + "rewards/margins": 3.701812267303467, + "rewards/rejected": -4.09886360168457, + "step": 4294 + }, + { + "epoch": 0.5, + "learning_rate": 1.5239163812448329e-07, + "logits/chosen": -2.7187843322753906, + "logits/rejected": -2.7966771125793457, + "logps/chosen": -209.6675262451172, + "logps/rejected": -219.25469970703125, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9427275657653809, + "rewards/margins": 1.8578760623931885, + "rewards/rejected": -2.8006036281585693, + "step": 4295 + }, + { + "epoch": 0.5, + "learning_rate": 1.52356206448565e-07, + "logits/chosen": -2.3302390575408936, + "logits/rejected": -2.1605334281921387, + "logps/chosen": -111.98717498779297, + "logps/rejected": -207.28973388671875, + "loss": 0.2261, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5030367970466614, + "rewards/margins": 2.518113851547241, + "rewards/rejected": -3.0211503505706787, + "step": 4296 + }, + { + "epoch": 0.5, + "learning_rate": 1.5232077477264673e-07, + "logits/chosen": -2.2052013874053955, + "logits/rejected": -2.083253860473633, + "logps/chosen": -113.82157897949219, + "logps/rejected": -227.1693115234375, + "loss": 0.2351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15822666883468628, + "rewards/margins": 3.32332181930542, + "rewards/rejected": -3.48154878616333, + "step": 4297 + }, + { + "epoch": 0.5, + "learning_rate": 1.5228534309672848e-07, + "logits/chosen": -2.181292772293091, + "logits/rejected": -1.952545404434204, + "logps/chosen": -142.2515411376953, + "logps/rejected": -185.93263244628906, + "loss": 0.3966, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.958098292350769, + "rewards/margins": 2.0660760402679443, + "rewards/rejected": -3.024174451828003, + "step": 4298 + }, + { + "epoch": 0.5, + "learning_rate": 1.522499114208102e-07, + "logits/chosen": -2.4294776916503906, + "logits/rejected": -2.37638258934021, + "logps/chosen": -154.66912841796875, + "logps/rejected": -226.6961669921875, + "loss": 0.7055, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5065608024597168, + "rewards/margins": 2.1269843578338623, + "rewards/rejected": -3.633544921875, + "step": 4299 + }, + { + "epoch": 0.5, + "learning_rate": 1.5221447974489192e-07, + "logits/chosen": -2.714020013809204, + "logits/rejected": -2.652264356613159, + "logps/chosen": -299.5177001953125, + "logps/rejected": -310.5702819824219, + "loss": 0.2176, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32908204197883606, + "rewards/margins": 2.784719467163086, + "rewards/rejected": -3.1138014793395996, + "step": 4300 + }, + { + "epoch": 0.5, + "learning_rate": 1.5217904806897365e-07, + "logits/chosen": -2.453956127166748, + "logits/rejected": -2.624359607696533, + "logps/chosen": -331.8744201660156, + "logps/rejected": -269.48162841796875, + "loss": 0.2403, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8841144442558289, + "rewards/margins": 2.9946963787078857, + "rewards/rejected": -3.8788108825683594, + "step": 4301 + }, + { + "epoch": 0.5, + "learning_rate": 1.5214361639305537e-07, + "logits/chosen": -2.4486608505249023, + "logits/rejected": -2.586564540863037, + "logps/chosen": -411.9715881347656, + "logps/rejected": -282.53778076171875, + "loss": 0.4723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4530561864376068, + "rewards/margins": 1.5842235088348389, + "rewards/rejected": -2.0372796058654785, + "step": 4302 + }, + { + "epoch": 0.5, + "learning_rate": 1.521081847171371e-07, + "logits/chosen": -2.1105005741119385, + "logits/rejected": -2.1378371715545654, + "logps/chosen": -321.71551513671875, + "logps/rejected": -283.59991455078125, + "loss": 0.3345, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8658463954925537, + "rewards/margins": 1.7376617193222046, + "rewards/rejected": -2.603508234024048, + "step": 4303 + }, + { + "epoch": 0.5, + "learning_rate": 1.5207275304121884e-07, + "logits/chosen": -2.7593157291412354, + "logits/rejected": -2.7036802768707275, + "logps/chosen": -353.3402099609375, + "logps/rejected": -270.9657897949219, + "loss": 0.4833, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.824008584022522, + "rewards/margins": 3.386936664581299, + "rewards/rejected": -4.210945129394531, + "step": 4304 + }, + { + "epoch": 0.5, + "learning_rate": 1.520373213653006e-07, + "logits/chosen": -2.701078414916992, + "logits/rejected": -2.4819047451019287, + "logps/chosen": -403.67633056640625, + "logps/rejected": -249.9696044921875, + "loss": 0.4415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8156898021697998, + "rewards/margins": 1.8898921012878418, + "rewards/rejected": -2.7055821418762207, + "step": 4305 + }, + { + "epoch": 0.5, + "learning_rate": 1.520018896893823e-07, + "logits/chosen": -1.8966426849365234, + "logits/rejected": -2.2716963291168213, + "logps/chosen": -561.7974853515625, + "logps/rejected": -248.41207885742188, + "loss": 0.6988, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.078560471534729, + "rewards/margins": 1.3422222137451172, + "rewards/rejected": -2.4207825660705566, + "step": 4306 + }, + { + "epoch": 0.5, + "learning_rate": 1.5196645801346403e-07, + "logits/chosen": -2.3306541442871094, + "logits/rejected": -2.763977527618408, + "logps/chosen": -325.7136535644531, + "logps/rejected": -264.84893798828125, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9100849628448486, + "rewards/margins": 5.037590980529785, + "rewards/rejected": -5.947675704956055, + "step": 4307 + }, + { + "epoch": 0.5, + "learning_rate": 1.5193102633754575e-07, + "logits/chosen": -1.6247892379760742, + "logits/rejected": -1.7137699127197266, + "logps/chosen": -417.07354736328125, + "logps/rejected": -409.0220947265625, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4209728240966797, + "rewards/margins": 2.9849705696105957, + "rewards/rejected": -3.4059433937072754, + "step": 4308 + }, + { + "epoch": 0.5, + "learning_rate": 1.518955946616275e-07, + "logits/chosen": -2.698357343673706, + "logits/rejected": -2.264911413192749, + "logps/chosen": -358.6046142578125, + "logps/rejected": -266.30230712890625, + "loss": 0.2344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6404500603675842, + "rewards/margins": 3.1649279594421387, + "rewards/rejected": -3.805377960205078, + "step": 4309 + }, + { + "epoch": 0.5, + "learning_rate": 1.5186016298570922e-07, + "logits/chosen": -2.084332227706909, + "logits/rejected": -2.0502638816833496, + "logps/chosen": -247.8494873046875, + "logps/rejected": -284.5571594238281, + "loss": 0.6324, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.420001745223999, + "rewards/margins": 0.6811508536338806, + "rewards/rejected": -2.1011524200439453, + "step": 4310 + }, + { + "epoch": 0.5, + "learning_rate": 1.5182473130979095e-07, + "logits/chosen": -2.403601884841919, + "logits/rejected": -2.3329012393951416, + "logps/chosen": -211.25611877441406, + "logps/rejected": -217.35263061523438, + "loss": 0.6067, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3527064323425293, + "rewards/margins": 1.0846030712127686, + "rewards/rejected": -2.437309741973877, + "step": 4311 + }, + { + "epoch": 0.5, + "learning_rate": 1.5178929963387267e-07, + "logits/chosen": -2.3866121768951416, + "logits/rejected": -2.626359224319458, + "logps/chosen": -329.42840576171875, + "logps/rejected": -222.88406372070312, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04774979501962662, + "rewards/margins": 3.0765109062194824, + "rewards/rejected": -3.124260425567627, + "step": 4312 + }, + { + "epoch": 0.5, + "learning_rate": 1.517538679579544e-07, + "logits/chosen": -2.5660974979400635, + "logits/rejected": -2.4182331562042236, + "logps/chosen": -186.1149139404297, + "logps/rejected": -166.7367401123047, + "loss": 0.3587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5715744495391846, + "rewards/margins": 1.1570802927017212, + "rewards/rejected": -1.7286546230316162, + "step": 4313 + }, + { + "epoch": 0.5, + "learning_rate": 1.517184362820361e-07, + "logits/chosen": -1.7928569316864014, + "logits/rejected": -2.027052402496338, + "logps/chosen": -167.12301635742188, + "logps/rejected": -238.40406799316406, + "loss": 0.3288, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8394443392753601, + "rewards/margins": 2.688851833343506, + "rewards/rejected": -3.5282962322235107, + "step": 4314 + }, + { + "epoch": 0.5, + "learning_rate": 1.5168300460611783e-07, + "logits/chosen": -2.561551094055176, + "logits/rejected": -2.8854410648345947, + "logps/chosen": -488.9709777832031, + "logps/rejected": -257.8424377441406, + "loss": 0.3089, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.989282488822937, + "rewards/margins": 2.291337013244629, + "rewards/rejected": -3.2806191444396973, + "step": 4315 + }, + { + "epoch": 0.5, + "learning_rate": 1.516475729301996e-07, + "logits/chosen": -2.6186091899871826, + "logits/rejected": -2.6712095737457275, + "logps/chosen": -118.32412719726562, + "logps/rejected": -271.76690673828125, + "loss": 0.1932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1257067173719406, + "rewards/margins": 3.8512825965881348, + "rewards/rejected": -3.976989507675171, + "step": 4316 + }, + { + "epoch": 0.5, + "learning_rate": 1.5161214125428133e-07, + "logits/chosen": -2.5843405723571777, + "logits/rejected": -2.538076877593994, + "logps/chosen": -104.80885314941406, + "logps/rejected": -149.2321014404297, + "loss": 0.3903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8238985538482666, + "rewards/margins": 1.055555820465088, + "rewards/rejected": -1.8794543743133545, + "step": 4317 + }, + { + "epoch": 0.5, + "learning_rate": 1.5157670957836305e-07, + "logits/chosen": -2.530078411102295, + "logits/rejected": -2.4148571491241455, + "logps/chosen": -242.86953735351562, + "logps/rejected": -237.532470703125, + "loss": 0.2974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7064067125320435, + "rewards/margins": 1.3039573431015015, + "rewards/rejected": -2.010364055633545, + "step": 4318 + }, + { + "epoch": 0.5, + "learning_rate": 1.5154127790244478e-07, + "logits/chosen": -2.362574577331543, + "logits/rejected": -2.3392527103424072, + "logps/chosen": -281.58203125, + "logps/rejected": -280.62469482421875, + "loss": 0.8528, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3266464471817017, + "rewards/margins": 0.6573824882507324, + "rewards/rejected": -1.984028935432434, + "step": 4319 + }, + { + "epoch": 0.5, + "learning_rate": 1.5150584622652653e-07, + "logits/chosen": -1.9832206964492798, + "logits/rejected": -2.1096866130828857, + "logps/chosen": -112.9889907836914, + "logps/rejected": -166.43572998046875, + "loss": 0.5653, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1626429557800293, + "rewards/margins": 1.6260852813720703, + "rewards/rejected": -2.7887282371520996, + "step": 4320 + }, + { + "epoch": 0.5, + "learning_rate": 1.5147041455060825e-07, + "logits/chosen": -2.2844033241271973, + "logits/rejected": -2.5145881175994873, + "logps/chosen": -370.1441345214844, + "logps/rejected": -331.8449401855469, + "loss": 0.8432, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2318850755691528, + "rewards/margins": 1.7976487874984741, + "rewards/rejected": -3.029533863067627, + "step": 4321 + }, + { + "epoch": 0.5, + "learning_rate": 1.5143498287468997e-07, + "logits/chosen": -2.946277618408203, + "logits/rejected": -2.8118672370910645, + "logps/chosen": -190.28897094726562, + "logps/rejected": -156.46151733398438, + "loss": 0.6061, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6267460584640503, + "rewards/margins": 1.3819265365600586, + "rewards/rejected": -3.0086727142333984, + "step": 4322 + }, + { + "epoch": 0.5, + "learning_rate": 1.513995511987717e-07, + "logits/chosen": -2.6600160598754883, + "logits/rejected": -2.845024347305298, + "logps/chosen": -251.10476684570312, + "logps/rejected": -264.8751220703125, + "loss": 0.7116, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7866148352622986, + "rewards/margins": 2.1639723777770996, + "rewards/rejected": -2.9505867958068848, + "step": 4323 + }, + { + "epoch": 0.5, + "learning_rate": 1.5136411952285341e-07, + "logits/chosen": -2.060102939605713, + "logits/rejected": -2.5831716060638428, + "logps/chosen": -360.67413330078125, + "logps/rejected": -242.4019775390625, + "loss": 0.1068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17287471890449524, + "rewards/margins": 2.8815369606018066, + "rewards/rejected": -3.0544118881225586, + "step": 4324 + }, + { + "epoch": 0.5, + "learning_rate": 1.5132868784693514e-07, + "logits/chosen": -2.660162925720215, + "logits/rejected": -2.615445613861084, + "logps/chosen": -285.03900146484375, + "logps/rejected": -338.6811218261719, + "loss": 0.2661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5105436444282532, + "rewards/margins": 3.0604135990142822, + "rewards/rejected": -3.5709569454193115, + "step": 4325 + }, + { + "epoch": 0.5, + "learning_rate": 1.5129325617101686e-07, + "logits/chosen": -2.029813289642334, + "logits/rejected": -2.363755941390991, + "logps/chosen": -285.2708740234375, + "logps/rejected": -256.00823974609375, + "loss": 0.4949, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0765560865402222, + "rewards/margins": 1.385362148284912, + "rewards/rejected": -2.461918354034424, + "step": 4326 + }, + { + "epoch": 0.5, + "learning_rate": 1.512578244950986e-07, + "logits/chosen": -2.7684950828552246, + "logits/rejected": -2.7302324771881104, + "logps/chosen": -179.16163635253906, + "logps/rejected": -250.98574829101562, + "loss": 0.4748, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5737764239311218, + "rewards/margins": 1.9058587551116943, + "rewards/rejected": -2.479635238647461, + "step": 4327 + }, + { + "epoch": 0.5, + "learning_rate": 1.5122239281918035e-07, + "logits/chosen": -2.225161075592041, + "logits/rejected": -2.1888508796691895, + "logps/chosen": -250.87237548828125, + "logps/rejected": -346.39227294921875, + "loss": 0.193, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5558536052703857, + "rewards/margins": 2.6224405765533447, + "rewards/rejected": -4.1782941818237305, + "step": 4328 + }, + { + "epoch": 0.5, + "learning_rate": 1.5118696114326208e-07, + "logits/chosen": -2.0678539276123047, + "logits/rejected": -2.03318190574646, + "logps/chosen": -193.02719116210938, + "logps/rejected": -209.3085479736328, + "loss": 0.6492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3768494129180908, + "rewards/margins": 2.378671407699585, + "rewards/rejected": -3.755520820617676, + "step": 4329 + }, + { + "epoch": 0.5, + "learning_rate": 1.511515294673438e-07, + "logits/chosen": -2.3158762454986572, + "logits/rejected": -2.334256410598755, + "logps/chosen": -293.54150390625, + "logps/rejected": -356.881591796875, + "loss": 0.3303, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3904640674591064, + "rewards/margins": 3.342280626296997, + "rewards/rejected": -4.732744216918945, + "step": 4330 + }, + { + "epoch": 0.5, + "learning_rate": 1.5111609779142552e-07, + "logits/chosen": -2.3744165897369385, + "logits/rejected": -2.518599033355713, + "logps/chosen": -288.6755065917969, + "logps/rejected": -232.468505859375, + "loss": 0.5281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.306557536125183, + "rewards/margins": 2.2666637897491455, + "rewards/rejected": -3.573221445083618, + "step": 4331 + }, + { + "epoch": 0.5, + "learning_rate": 1.5108066611550727e-07, + "logits/chosen": -1.9691746234893799, + "logits/rejected": -2.1620397567749023, + "logps/chosen": -304.16387939453125, + "logps/rejected": -257.9756164550781, + "loss": 0.4329, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6263852119445801, + "rewards/margins": 2.8525993824005127, + "rewards/rejected": -3.478984832763672, + "step": 4332 + }, + { + "epoch": 0.5, + "learning_rate": 1.51045234439589e-07, + "logits/chosen": -1.9688916206359863, + "logits/rejected": -1.6499273777008057, + "logps/chosen": -211.80352783203125, + "logps/rejected": -318.6793518066406, + "loss": 0.2711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6547039747238159, + "rewards/margins": 2.51863694190979, + "rewards/rejected": -3.1733407974243164, + "step": 4333 + }, + { + "epoch": 0.5, + "learning_rate": 1.5100980276367071e-07, + "logits/chosen": -2.2902441024780273, + "logits/rejected": -2.2905776500701904, + "logps/chosen": -171.33584594726562, + "logps/rejected": -261.6302185058594, + "loss": 0.6412, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7318154573440552, + "rewards/margins": 1.3136053085327148, + "rewards/rejected": -3.0454208850860596, + "step": 4334 + }, + { + "epoch": 0.5, + "learning_rate": 1.5097437108775244e-07, + "logits/chosen": -1.7281849384307861, + "logits/rejected": -1.8013017177581787, + "logps/chosen": -307.65008544921875, + "logps/rejected": -346.1015625, + "loss": 0.3856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6965197324752808, + "rewards/margins": 2.1779327392578125, + "rewards/rejected": -2.8744523525238037, + "step": 4335 + }, + { + "epoch": 0.5, + "learning_rate": 1.5093893941183416e-07, + "logits/chosen": -2.8501462936401367, + "logits/rejected": -2.8830363750457764, + "logps/chosen": -104.16026306152344, + "logps/rejected": -134.36285400390625, + "loss": 0.4108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9449959993362427, + "rewards/margins": 0.7604342699050903, + "rewards/rejected": -1.7054301500320435, + "step": 4336 + }, + { + "epoch": 0.5, + "learning_rate": 1.5090350773591588e-07, + "logits/chosen": -2.0073204040527344, + "logits/rejected": -2.323707103729248, + "logps/chosen": -470.0325927734375, + "logps/rejected": -294.54913330078125, + "loss": 0.422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2827104330062866, + "rewards/margins": 2.0566630363464355, + "rewards/rejected": -3.3393735885620117, + "step": 4337 + }, + { + "epoch": 0.5, + "learning_rate": 1.5086807605999763e-07, + "logits/chosen": -2.486642360687256, + "logits/rejected": -2.4480013847351074, + "logps/chosen": -102.0362777709961, + "logps/rejected": -188.59942626953125, + "loss": 0.3011, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07007420063018799, + "rewards/margins": 2.4947991371154785, + "rewards/rejected": -2.564873218536377, + "step": 4338 + }, + { + "epoch": 0.5, + "learning_rate": 1.5083264438407935e-07, + "logits/chosen": -2.397627830505371, + "logits/rejected": -2.5381393432617188, + "logps/chosen": -217.27732849121094, + "logps/rejected": -283.69842529296875, + "loss": 0.2487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8965975642204285, + "rewards/margins": 1.9952188730239868, + "rewards/rejected": -2.8918166160583496, + "step": 4339 + }, + { + "epoch": 0.5, + "learning_rate": 1.507972127081611e-07, + "logits/chosen": -2.234879732131958, + "logits/rejected": -2.138388156890869, + "logps/chosen": -252.5362548828125, + "logps/rejected": -264.9149169921875, + "loss": 0.5781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.392747163772583, + "rewards/margins": 2.000958204269409, + "rewards/rejected": -2.393705129623413, + "step": 4340 + }, + { + "epoch": 0.5, + "learning_rate": 1.5076178103224282e-07, + "logits/chosen": -1.8806730508804321, + "logits/rejected": -1.704809308052063, + "logps/chosen": -543.2327880859375, + "logps/rejected": -658.5618896484375, + "loss": 0.7936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0469233989715576, + "rewards/margins": 1.0340230464935303, + "rewards/rejected": -2.080946445465088, + "step": 4341 + }, + { + "epoch": 0.51, + "learning_rate": 1.5072634935632454e-07, + "logits/chosen": -2.2330739498138428, + "logits/rejected": -2.3611886501312256, + "logps/chosen": -332.44549560546875, + "logps/rejected": -313.6429443359375, + "loss": 0.1205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4973880648612976, + "rewards/margins": 2.983412742614746, + "rewards/rejected": -3.4808008670806885, + "step": 4342 + }, + { + "epoch": 0.51, + "learning_rate": 1.506909176804063e-07, + "logits/chosen": -2.603106737136841, + "logits/rejected": -2.329122543334961, + "logps/chosen": -359.9208984375, + "logps/rejected": -405.07568359375, + "loss": 0.5659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4199979305267334, + "rewards/margins": 1.9746787548065186, + "rewards/rejected": -2.394676685333252, + "step": 4343 + }, + { + "epoch": 0.51, + "learning_rate": 1.5065548600448801e-07, + "logits/chosen": -2.4392142295837402, + "logits/rejected": -2.474942684173584, + "logps/chosen": -165.6344451904297, + "logps/rejected": -204.64625549316406, + "loss": 0.5239, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3761308193206787, + "rewards/margins": 2.425549030303955, + "rewards/rejected": -3.8016796112060547, + "step": 4344 + }, + { + "epoch": 0.51, + "learning_rate": 1.5062005432856974e-07, + "logits/chosen": -2.1699328422546387, + "logits/rejected": -2.277768611907959, + "logps/chosen": -276.71563720703125, + "logps/rejected": -281.0272521972656, + "loss": 0.8349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9548039436340332, + "rewards/margins": 1.033818244934082, + "rewards/rejected": -1.9886221885681152, + "step": 4345 + }, + { + "epoch": 0.51, + "learning_rate": 1.5058462265265146e-07, + "logits/chosen": -2.006967544555664, + "logits/rejected": -2.0948214530944824, + "logps/chosen": -393.83221435546875, + "logps/rejected": -394.51702880859375, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5407195091247559, + "rewards/margins": 2.1803205013275146, + "rewards/rejected": -3.7210402488708496, + "step": 4346 + }, + { + "epoch": 0.51, + "learning_rate": 1.5054919097673318e-07, + "logits/chosen": -2.2900214195251465, + "logits/rejected": -2.2937660217285156, + "logps/chosen": -327.0431823730469, + "logps/rejected": -348.63165283203125, + "loss": 0.4819, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1783134937286377, + "rewards/margins": 1.6505095958709717, + "rewards/rejected": -2.8288230895996094, + "step": 4347 + }, + { + "epoch": 0.51, + "learning_rate": 1.505137593008149e-07, + "logits/chosen": -2.187046766281128, + "logits/rejected": -2.378138542175293, + "logps/chosen": -345.8704833984375, + "logps/rejected": -278.2528076171875, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1015912294387817, + "rewards/margins": 2.045687675476074, + "rewards/rejected": -3.1472787857055664, + "step": 4348 + }, + { + "epoch": 0.51, + "learning_rate": 1.5047832762489665e-07, + "logits/chosen": -2.5904488563537598, + "logits/rejected": -2.6615853309631348, + "logps/chosen": -236.78591918945312, + "logps/rejected": -144.87545776367188, + "loss": 0.81, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2676069736480713, + "rewards/margins": 1.2389637231826782, + "rewards/rejected": -2.506570816040039, + "step": 4349 + }, + { + "epoch": 0.51, + "learning_rate": 1.5044289594897837e-07, + "logits/chosen": -2.5963644981384277, + "logits/rejected": -2.5106048583984375, + "logps/chosen": -210.99586486816406, + "logps/rejected": -260.60223388671875, + "loss": 0.3051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7144404053688049, + "rewards/margins": 3.857468605041504, + "rewards/rejected": -4.571908950805664, + "step": 4350 + }, + { + "epoch": 0.51, + "learning_rate": 1.5040746427306012e-07, + "logits/chosen": -1.7436047792434692, + "logits/rejected": -2.170863628387451, + "logps/chosen": -430.507568359375, + "logps/rejected": -254.6909942626953, + "loss": 0.4358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5758795738220215, + "rewards/margins": 0.9057127833366394, + "rewards/rejected": -1.4815922975540161, + "step": 4351 + }, + { + "epoch": 0.51, + "learning_rate": 1.5037203259714184e-07, + "logits/chosen": -1.9794533252716064, + "logits/rejected": -2.282529592514038, + "logps/chosen": -358.34979248046875, + "logps/rejected": -335.7793884277344, + "loss": 0.5172, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.638986349105835, + "rewards/margins": 0.9403858184814453, + "rewards/rejected": -2.5793724060058594, + "step": 4352 + }, + { + "epoch": 0.51, + "learning_rate": 1.5033660092122357e-07, + "logits/chosen": -2.3798699378967285, + "logits/rejected": -2.5058705806732178, + "logps/chosen": -274.5151672363281, + "logps/rejected": -346.3312072753906, + "loss": 0.3919, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5503833293914795, + "rewards/margins": 1.507620096206665, + "rewards/rejected": -3.0580034255981445, + "step": 4353 + }, + { + "epoch": 0.51, + "learning_rate": 1.5030116924530532e-07, + "logits/chosen": -1.9549859762191772, + "logits/rejected": -2.54410719871521, + "logps/chosen": -278.669189453125, + "logps/rejected": -287.09942626953125, + "loss": 0.1684, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6237742900848389, + "rewards/margins": 3.6327781677246094, + "rewards/rejected": -4.256552219390869, + "step": 4354 + }, + { + "epoch": 0.51, + "learning_rate": 1.5026573756938704e-07, + "logits/chosen": -2.3094558715820312, + "logits/rejected": -2.359281063079834, + "logps/chosen": -216.14822387695312, + "logps/rejected": -372.9872741699219, + "loss": 0.1787, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7349266409873962, + "rewards/margins": 3.896794557571411, + "rewards/rejected": -4.631720542907715, + "step": 4355 + }, + { + "epoch": 0.51, + "learning_rate": 1.5023030589346876e-07, + "logits/chosen": -1.9102256298065186, + "logits/rejected": -1.9716451168060303, + "logps/chosen": -174.44606018066406, + "logps/rejected": -179.79747009277344, + "loss": 0.975, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0311594009399414, + "rewards/margins": 0.7269107103347778, + "rewards/rejected": -2.7580699920654297, + "step": 4356 + }, + { + "epoch": 0.51, + "learning_rate": 1.5019487421755048e-07, + "logits/chosen": -1.8558202981948853, + "logits/rejected": -1.714080572128296, + "logps/chosen": -280.8904113769531, + "logps/rejected": -355.36968994140625, + "loss": 0.4609, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.142195701599121, + "rewards/margins": 1.453489065170288, + "rewards/rejected": -2.59568452835083, + "step": 4357 + }, + { + "epoch": 0.51, + "learning_rate": 1.501594425416322e-07, + "logits/chosen": -2.3113255500793457, + "logits/rejected": -2.1292450428009033, + "logps/chosen": -381.748046875, + "logps/rejected": -215.47662353515625, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3703558444976807, + "rewards/margins": 2.1256959438323975, + "rewards/rejected": -3.496051788330078, + "step": 4358 + }, + { + "epoch": 0.51, + "learning_rate": 1.5012401086571393e-07, + "logits/chosen": -2.406862735748291, + "logits/rejected": -2.487161159515381, + "logps/chosen": -254.95794677734375, + "logps/rejected": -344.4727783203125, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06246212124824524, + "rewards/margins": 3.110140800476074, + "rewards/rejected": -3.0476784706115723, + "step": 4359 + }, + { + "epoch": 0.51, + "learning_rate": 1.5008857918979565e-07, + "logits/chosen": -2.265291690826416, + "logits/rejected": -2.060199737548828, + "logps/chosen": -194.21383666992188, + "logps/rejected": -244.22549438476562, + "loss": 0.4298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9951977729797363, + "rewards/margins": 1.6636674404144287, + "rewards/rejected": -2.658865213394165, + "step": 4360 + }, + { + "epoch": 0.51, + "learning_rate": 1.500531475138774e-07, + "logits/chosen": -1.9943825006484985, + "logits/rejected": -1.9677015542984009, + "logps/chosen": -496.36578369140625, + "logps/rejected": -272.0693359375, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6931813359260559, + "rewards/margins": 2.0923101902008057, + "rewards/rejected": -2.785491466522217, + "step": 4361 + }, + { + "epoch": 0.51, + "learning_rate": 1.5001771583795912e-07, + "logits/chosen": -2.1320672035217285, + "logits/rejected": -2.006071090698242, + "logps/chosen": -220.083251953125, + "logps/rejected": -283.9471130371094, + "loss": 0.4156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47227972745895386, + "rewards/margins": 1.1195602416992188, + "rewards/rejected": -1.5918400287628174, + "step": 4362 + }, + { + "epoch": 0.51, + "learning_rate": 1.4998228416204087e-07, + "logits/chosen": -2.6416006088256836, + "logits/rejected": -2.790005683898926, + "logps/chosen": -261.884521484375, + "logps/rejected": -266.50653076171875, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1440531611442566, + "rewards/margins": 3.558241128921509, + "rewards/rejected": -3.70229434967041, + "step": 4363 + }, + { + "epoch": 0.51, + "learning_rate": 1.499468524861226e-07, + "logits/chosen": -2.9722530841827393, + "logits/rejected": -2.9671471118927, + "logps/chosen": -206.3912811279297, + "logps/rejected": -178.916015625, + "loss": 0.5078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5252015590667725, + "rewards/margins": 1.6576200723648071, + "rewards/rejected": -2.182821750640869, + "step": 4364 + }, + { + "epoch": 0.51, + "learning_rate": 1.499114208102043e-07, + "logits/chosen": -2.6765661239624023, + "logits/rejected": -2.647322416305542, + "logps/chosen": -225.3341827392578, + "logps/rejected": -263.080810546875, + "loss": 0.2772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9254167675971985, + "rewards/margins": 2.201167106628418, + "rewards/rejected": -3.1265838146209717, + "step": 4365 + }, + { + "epoch": 0.51, + "learning_rate": 1.4987598913428606e-07, + "logits/chosen": -2.1692962646484375, + "logits/rejected": -2.252351760864258, + "logps/chosen": -307.15081787109375, + "logps/rejected": -212.50759887695312, + "loss": 0.4129, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5721405148506165, + "rewards/margins": 3.0663905143737793, + "rewards/rejected": -3.638530969619751, + "step": 4366 + }, + { + "epoch": 0.51, + "learning_rate": 1.4984055745836778e-07, + "logits/chosen": -2.7623291015625, + "logits/rejected": -2.8632280826568604, + "logps/chosen": -217.71620178222656, + "logps/rejected": -228.7214813232422, + "loss": 0.2426, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0911099910736084, + "rewards/margins": 2.347404956817627, + "rewards/rejected": -3.4385149478912354, + "step": 4367 + }, + { + "epoch": 0.51, + "learning_rate": 1.498051257824495e-07, + "logits/chosen": -2.310675859451294, + "logits/rejected": -2.348601818084717, + "logps/chosen": -225.42730712890625, + "logps/rejected": -225.019775390625, + "loss": 0.2782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36424386501312256, + "rewards/margins": 2.0844473838806152, + "rewards/rejected": -2.4486913681030273, + "step": 4368 + }, + { + "epoch": 0.51, + "learning_rate": 1.4976969410653123e-07, + "logits/chosen": -2.5015225410461426, + "logits/rejected": -2.5293760299682617, + "logps/chosen": -541.6838989257812, + "logps/rejected": -507.81488037109375, + "loss": 0.315, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2868618965148926, + "rewards/margins": 2.9761388301849365, + "rewards/rejected": -4.26300048828125, + "step": 4369 + }, + { + "epoch": 0.51, + "learning_rate": 1.4973426243061295e-07, + "logits/chosen": -2.5372400283813477, + "logits/rejected": -2.498185396194458, + "logps/chosen": -418.7004089355469, + "logps/rejected": -464.050537109375, + "loss": 0.3025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9781321287155151, + "rewards/margins": 3.6465299129486084, + "rewards/rejected": -4.624661922454834, + "step": 4370 + }, + { + "epoch": 0.51, + "learning_rate": 1.4969883075469467e-07, + "logits/chosen": -2.75283145904541, + "logits/rejected": -2.5639398097991943, + "logps/chosen": -294.8426208496094, + "logps/rejected": -266.1232604980469, + "loss": 0.2143, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4081199169158936, + "rewards/margins": 1.7354803085327148, + "rewards/rejected": -3.1436002254486084, + "step": 4371 + }, + { + "epoch": 0.51, + "learning_rate": 1.4966339907877642e-07, + "logits/chosen": -2.316554546356201, + "logits/rejected": -2.0293965339660645, + "logps/chosen": -406.33233642578125, + "logps/rejected": -498.1932067871094, + "loss": 0.5817, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1302292346954346, + "rewards/margins": 1.7246274948120117, + "rewards/rejected": -2.854856491088867, + "step": 4372 + }, + { + "epoch": 0.51, + "learning_rate": 1.4962796740285814e-07, + "logits/chosen": -1.9122366905212402, + "logits/rejected": -2.1462059020996094, + "logps/chosen": -390.9963684082031, + "logps/rejected": -290.6221923828125, + "loss": 1.1691, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8259488344192505, + "rewards/margins": 0.15067905187606812, + "rewards/rejected": -1.9766278266906738, + "step": 4373 + }, + { + "epoch": 0.51, + "learning_rate": 1.495925357269399e-07, + "logits/chosen": -2.0770342350006104, + "logits/rejected": -2.274329662322998, + "logps/chosen": -452.0794677734375, + "logps/rejected": -383.7308349609375, + "loss": 0.0807, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4660171270370483, + "rewards/margins": 3.7271156311035156, + "rewards/rejected": -5.1931328773498535, + "step": 4374 + }, + { + "epoch": 0.51, + "learning_rate": 1.495571040510216e-07, + "logits/chosen": -1.9078186750411987, + "logits/rejected": -1.688300609588623, + "logps/chosen": -308.55194091796875, + "logps/rejected": -286.681884765625, + "loss": 0.5419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6246774196624756, + "rewards/margins": 1.6940176486968994, + "rewards/rejected": -2.318695068359375, + "step": 4375 + }, + { + "epoch": 0.51, + "learning_rate": 1.4952167237510333e-07, + "logits/chosen": -2.62184476852417, + "logits/rejected": -2.839738368988037, + "logps/chosen": -353.78607177734375, + "logps/rejected": -205.7127685546875, + "loss": 0.2973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9544470310211182, + "rewards/margins": 2.1276042461395264, + "rewards/rejected": -3.0820512771606445, + "step": 4376 + }, + { + "epoch": 0.51, + "learning_rate": 1.4948624069918506e-07, + "logits/chosen": -2.182478189468384, + "logits/rejected": -2.4130959510803223, + "logps/chosen": -299.7957763671875, + "logps/rejected": -209.65316772460938, + "loss": 0.2684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.726485013961792, + "rewards/margins": 2.0807764530181885, + "rewards/rejected": -2.8072617053985596, + "step": 4377 + }, + { + "epoch": 0.51, + "learning_rate": 1.494508090232668e-07, + "logits/chosen": -2.3520045280456543, + "logits/rejected": -2.3126676082611084, + "logps/chosen": -255.115234375, + "logps/rejected": -276.49151611328125, + "loss": 0.3904, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7623100280761719, + "rewards/margins": 2.470048666000366, + "rewards/rejected": -3.232358694076538, + "step": 4378 + }, + { + "epoch": 0.51, + "learning_rate": 1.4941537734734853e-07, + "logits/chosen": -2.218036651611328, + "logits/rejected": -2.0261313915252686, + "logps/chosen": -233.1011962890625, + "logps/rejected": -399.63427734375, + "loss": 0.5243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.193806767463684, + "rewards/margins": 1.6197278499603271, + "rewards/rejected": -2.8135344982147217, + "step": 4379 + }, + { + "epoch": 0.51, + "learning_rate": 1.4937994567143025e-07, + "logits/chosen": -2.521239757537842, + "logits/rejected": -2.2199630737304688, + "logps/chosen": -218.27894592285156, + "logps/rejected": -271.52215576171875, + "loss": 0.2735, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9013968706130981, + "rewards/margins": 2.6468002796173096, + "rewards/rejected": -3.5481972694396973, + "step": 4380 + }, + { + "epoch": 0.51, + "learning_rate": 1.4934451399551197e-07, + "logits/chosen": -2.236525297164917, + "logits/rejected": -2.069474220275879, + "logps/chosen": -226.61947631835938, + "logps/rejected": -200.11383056640625, + "loss": 1.4552, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.448176860809326, + "rewards/margins": 0.2535504698753357, + "rewards/rejected": -2.7017273902893066, + "step": 4381 + }, + { + "epoch": 0.51, + "learning_rate": 1.493090823195937e-07, + "logits/chosen": -2.589669704437256, + "logits/rejected": -2.5461068153381348, + "logps/chosen": -339.33685302734375, + "logps/rejected": -250.9718780517578, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37482020258903503, + "rewards/margins": 3.1432723999023438, + "rewards/rejected": -3.518092632293701, + "step": 4382 + }, + { + "epoch": 0.51, + "learning_rate": 1.4927365064367544e-07, + "logits/chosen": -2.2279052734375, + "logits/rejected": -2.6070244312286377, + "logps/chosen": -253.30596923828125, + "logps/rejected": -228.40545654296875, + "loss": 0.6172, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5758841037750244, + "rewards/margins": 1.280324101448059, + "rewards/rejected": -2.856208324432373, + "step": 4383 + }, + { + "epoch": 0.51, + "learning_rate": 1.4923821896775716e-07, + "logits/chosen": -2.454094648361206, + "logits/rejected": -2.856034755706787, + "logps/chosen": -446.74072265625, + "logps/rejected": -245.69976806640625, + "loss": 0.5204, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0648690462112427, + "rewards/margins": 1.5602524280548096, + "rewards/rejected": -2.625121593475342, + "step": 4384 + }, + { + "epoch": 0.51, + "learning_rate": 1.4920278729183889e-07, + "logits/chosen": -2.4626951217651367, + "logits/rejected": -2.562631368637085, + "logps/chosen": -234.2470245361328, + "logps/rejected": -256.95556640625, + "loss": 0.5188, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9999050498008728, + "rewards/margins": 1.2314295768737793, + "rewards/rejected": -2.231334686279297, + "step": 4385 + }, + { + "epoch": 0.51, + "learning_rate": 1.4916735561592064e-07, + "logits/chosen": -2.3105568885803223, + "logits/rejected": -2.1869077682495117, + "logps/chosen": -333.7840881347656, + "logps/rejected": -324.4123229980469, + "loss": 1.0571, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9736287593841553, + "rewards/margins": 0.6410728096961975, + "rewards/rejected": -2.614701509475708, + "step": 4386 + }, + { + "epoch": 0.51, + "learning_rate": 1.4913192394000236e-07, + "logits/chosen": -2.420621871948242, + "logits/rejected": -2.5317068099975586, + "logps/chosen": -210.7329864501953, + "logps/rejected": -255.68417358398438, + "loss": 0.5233, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2133358716964722, + "rewards/margins": 1.0521609783172607, + "rewards/rejected": -2.2654967308044434, + "step": 4387 + }, + { + "epoch": 0.51, + "learning_rate": 1.4909649226408408e-07, + "logits/chosen": -2.8436591625213623, + "logits/rejected": -2.812608003616333, + "logps/chosen": -200.7388458251953, + "logps/rejected": -187.2425537109375, + "loss": 0.244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3373832702636719, + "rewards/margins": 1.6355093717575073, + "rewards/rejected": -2.9728925228118896, + "step": 4388 + }, + { + "epoch": 0.51, + "learning_rate": 1.4906106058816583e-07, + "logits/chosen": -2.505821466445923, + "logits/rejected": -2.4162914752960205, + "logps/chosen": -119.76532745361328, + "logps/rejected": -171.96142578125, + "loss": 0.6685, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2128705978393555, + "rewards/margins": 1.1468853950500488, + "rewards/rejected": -2.3597559928894043, + "step": 4389 + }, + { + "epoch": 0.51, + "learning_rate": 1.4902562891224755e-07, + "logits/chosen": -1.8850551843643188, + "logits/rejected": -2.1123485565185547, + "logps/chosen": -312.9224548339844, + "logps/rejected": -252.53005981445312, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5672345757484436, + "rewards/margins": 1.911978006362915, + "rewards/rejected": -2.479212522506714, + "step": 4390 + }, + { + "epoch": 0.51, + "learning_rate": 1.4899019723632927e-07, + "logits/chosen": -2.2002546787261963, + "logits/rejected": -2.296029806137085, + "logps/chosen": -425.8719787597656, + "logps/rejected": -395.3763122558594, + "loss": 0.3651, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2021698951721191, + "rewards/margins": 2.863576889038086, + "rewards/rejected": -4.065746784210205, + "step": 4391 + }, + { + "epoch": 0.51, + "learning_rate": 1.48954765560411e-07, + "logits/chosen": -1.8527302742004395, + "logits/rejected": -1.6333009004592896, + "logps/chosen": -221.0084686279297, + "logps/rejected": -291.35211181640625, + "loss": 0.2237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4222627878189087, + "rewards/margins": 3.366766929626465, + "rewards/rejected": -3.789029836654663, + "step": 4392 + }, + { + "epoch": 0.51, + "learning_rate": 1.4891933388449272e-07, + "logits/chosen": -2.5589468479156494, + "logits/rejected": -2.4643869400024414, + "logps/chosen": -170.20748901367188, + "logps/rejected": -152.09963989257812, + "loss": 0.2349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28161168098449707, + "rewards/margins": 2.009808301925659, + "rewards/rejected": -2.2914199829101562, + "step": 4393 + }, + { + "epoch": 0.51, + "learning_rate": 1.4888390220857447e-07, + "logits/chosen": -2.6228833198547363, + "logits/rejected": -2.5463998317718506, + "logps/chosen": -133.10140991210938, + "logps/rejected": -128.49986267089844, + "loss": 0.5215, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37436443567276, + "rewards/margins": 0.9123578071594238, + "rewards/rejected": -1.286722183227539, + "step": 4394 + }, + { + "epoch": 0.51, + "learning_rate": 1.488484705326562e-07, + "logits/chosen": -2.649916172027588, + "logits/rejected": -2.5899627208709717, + "logps/chosen": -186.29647827148438, + "logps/rejected": -350.657958984375, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7304617166519165, + "rewards/margins": 2.3380022048950195, + "rewards/rejected": -3.0684638023376465, + "step": 4395 + }, + { + "epoch": 0.51, + "learning_rate": 1.488130388567379e-07, + "logits/chosen": -1.977373480796814, + "logits/rejected": -1.6695935726165771, + "logps/chosen": -342.6892395019531, + "logps/rejected": -328.6878662109375, + "loss": 0.5453, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7726660966873169, + "rewards/margins": 1.2011258602142334, + "rewards/rejected": -1.9737920761108398, + "step": 4396 + }, + { + "epoch": 0.51, + "learning_rate": 1.4877760718081966e-07, + "logits/chosen": -2.2806236743927, + "logits/rejected": -2.449937105178833, + "logps/chosen": -289.75189208984375, + "logps/rejected": -220.74557495117188, + "loss": 0.2903, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6180912256240845, + "rewards/margins": 2.3723080158233643, + "rewards/rejected": -2.990399122238159, + "step": 4397 + }, + { + "epoch": 0.51, + "learning_rate": 1.4874217550490138e-07, + "logits/chosen": -2.4363765716552734, + "logits/rejected": -2.4653265476226807, + "logps/chosen": -263.3052978515625, + "logps/rejected": -339.00994873046875, + "loss": 0.4572, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.726157546043396, + "rewards/margins": 1.8498708009719849, + "rewards/rejected": -3.576028347015381, + "step": 4398 + }, + { + "epoch": 0.51, + "learning_rate": 1.487067438289831e-07, + "logits/chosen": -1.8131859302520752, + "logits/rejected": -2.1827306747436523, + "logps/chosen": -344.66522216796875, + "logps/rejected": -271.425048828125, + "loss": 0.2509, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.028161644935608, + "rewards/margins": 2.6561880111694336, + "rewards/rejected": -3.684349536895752, + "step": 4399 + }, + { + "epoch": 0.51, + "learning_rate": 1.4867131215306482e-07, + "logits/chosen": -2.0924620628356934, + "logits/rejected": -2.1976428031921387, + "logps/chosen": -265.1179504394531, + "logps/rejected": -318.68109130859375, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9029265642166138, + "rewards/margins": 2.494323492050171, + "rewards/rejected": -3.397249937057495, + "step": 4400 + }, + { + "epoch": 0.51, + "learning_rate": 1.4863588047714657e-07, + "logits/chosen": -2.968360662460327, + "logits/rejected": -2.9503936767578125, + "logps/chosen": -182.49832153320312, + "logps/rejected": -177.2061309814453, + "loss": 0.2897, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1017954349517822, + "rewards/margins": 2.0853538513183594, + "rewards/rejected": -3.1871492862701416, + "step": 4401 + }, + { + "epoch": 0.51, + "learning_rate": 1.486004488012283e-07, + "logits/chosen": -1.8406156301498413, + "logits/rejected": -2.0943050384521484, + "logps/chosen": -295.34283447265625, + "logps/rejected": -261.4505920410156, + "loss": 0.6362, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.893394947052002, + "rewards/margins": 1.4385179281234741, + "rewards/rejected": -3.3319129943847656, + "step": 4402 + }, + { + "epoch": 0.51, + "learning_rate": 1.4856501712531002e-07, + "logits/chosen": -2.2200498580932617, + "logits/rejected": -2.3146586418151855, + "logps/chosen": -244.54991149902344, + "logps/rejected": -291.24224853515625, + "loss": 0.3311, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3029348850250244, + "rewards/margins": 2.7298858165740967, + "rewards/rejected": -4.032820701599121, + "step": 4403 + }, + { + "epoch": 0.51, + "learning_rate": 1.4852958544939174e-07, + "logits/chosen": -2.4234228134155273, + "logits/rejected": -2.4797098636627197, + "logps/chosen": -295.424072265625, + "logps/rejected": -202.92066955566406, + "loss": 0.5093, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8641774654388428, + "rewards/margins": 1.2280616760253906, + "rewards/rejected": -3.0922389030456543, + "step": 4404 + }, + { + "epoch": 0.51, + "learning_rate": 1.4849415377347346e-07, + "logits/chosen": -2.034693956375122, + "logits/rejected": -2.1512560844421387, + "logps/chosen": -396.93121337890625, + "logps/rejected": -347.546142578125, + "loss": 0.2333, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4289146661758423, + "rewards/margins": 1.821908712387085, + "rewards/rejected": -3.2508232593536377, + "step": 4405 + }, + { + "epoch": 0.51, + "learning_rate": 1.484587220975552e-07, + "logits/chosen": -2.6328794956207275, + "logits/rejected": -2.6797876358032227, + "logps/chosen": -166.774658203125, + "logps/rejected": -209.33944702148438, + "loss": 0.2129, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5647311806678772, + "rewards/margins": 3.3435425758361816, + "rewards/rejected": -3.908273696899414, + "step": 4406 + }, + { + "epoch": 0.51, + "learning_rate": 1.4842329042163693e-07, + "logits/chosen": -2.381096601486206, + "logits/rejected": -2.547952890396118, + "logps/chosen": -295.4427795410156, + "logps/rejected": -223.9435577392578, + "loss": 0.1637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3597848117351532, + "rewards/margins": 2.4268367290496826, + "rewards/rejected": -2.786621570587158, + "step": 4407 + }, + { + "epoch": 0.51, + "learning_rate": 1.4838785874571868e-07, + "logits/chosen": -2.4607720375061035, + "logits/rejected": -2.2779037952423096, + "logps/chosen": -189.45005798339844, + "logps/rejected": -266.0784912109375, + "loss": 0.7671, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7229597568511963, + "rewards/margins": 1.09784734249115, + "rewards/rejected": -2.8208069801330566, + "step": 4408 + }, + { + "epoch": 0.51, + "learning_rate": 1.483524270698004e-07, + "logits/chosen": -2.4146833419799805, + "logits/rejected": -2.670118808746338, + "logps/chosen": -327.0577392578125, + "logps/rejected": -250.93777465820312, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9281965494155884, + "rewards/margins": 2.2258803844451904, + "rewards/rejected": -3.1540770530700684, + "step": 4409 + }, + { + "epoch": 0.51, + "learning_rate": 1.4831699539388212e-07, + "logits/chosen": -2.5412850379943848, + "logits/rejected": -2.3235068321228027, + "logps/chosen": -214.45562744140625, + "logps/rejected": -299.60821533203125, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3515143096446991, + "rewards/margins": 2.426133155822754, + "rewards/rejected": -2.7776474952697754, + "step": 4410 + }, + { + "epoch": 0.51, + "learning_rate": 1.4828156371796385e-07, + "logits/chosen": -2.643850803375244, + "logits/rejected": -2.778143882751465, + "logps/chosen": -317.8985900878906, + "logps/rejected": -266.9397277832031, + "loss": 0.9565, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8946664333343506, + "rewards/margins": 1.142998456954956, + "rewards/rejected": -3.0376646518707275, + "step": 4411 + }, + { + "epoch": 0.51, + "learning_rate": 1.4824613204204557e-07, + "logits/chosen": -1.8103883266448975, + "logits/rejected": -1.7367323637008667, + "logps/chosen": -161.3958740234375, + "logps/rejected": -241.13726806640625, + "loss": 0.1599, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0110852718353271, + "rewards/margins": 1.9242312908172607, + "rewards/rejected": -2.935316562652588, + "step": 4412 + }, + { + "epoch": 0.51, + "learning_rate": 1.4821070036612732e-07, + "logits/chosen": -2.3891959190368652, + "logits/rejected": -2.6364450454711914, + "logps/chosen": -238.0491943359375, + "logps/rejected": -217.40872192382812, + "loss": 0.2476, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0949835777282715, + "rewards/margins": 2.981060028076172, + "rewards/rejected": -4.076043128967285, + "step": 4413 + }, + { + "epoch": 0.51, + "learning_rate": 1.4817526869020904e-07, + "logits/chosen": -2.5794312953948975, + "logits/rejected": -2.549661874771118, + "logps/chosen": -116.14244079589844, + "logps/rejected": -149.17257690429688, + "loss": 0.7781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9410846829414368, + "rewards/margins": 2.013380765914917, + "rewards/rejected": -2.954465389251709, + "step": 4414 + }, + { + "epoch": 0.51, + "learning_rate": 1.4813983701429076e-07, + "logits/chosen": -2.1912097930908203, + "logits/rejected": -2.1807496547698975, + "logps/chosen": -250.3771514892578, + "logps/rejected": -289.1433410644531, + "loss": 0.4719, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.304254949092865, + "rewards/margins": 1.1198935508728027, + "rewards/rejected": -1.4241485595703125, + "step": 4415 + }, + { + "epoch": 0.51, + "learning_rate": 1.4810440533837248e-07, + "logits/chosen": -2.538388252258301, + "logits/rejected": -2.57651424407959, + "logps/chosen": -218.90145874023438, + "logps/rejected": -254.50723266601562, + "loss": 0.2231, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0254383087158203, + "rewards/margins": 2.687488079071045, + "rewards/rejected": -3.712926149368286, + "step": 4416 + }, + { + "epoch": 0.51, + "learning_rate": 1.4806897366245423e-07, + "logits/chosen": -2.8534984588623047, + "logits/rejected": -2.792548179626465, + "logps/chosen": -134.83261108398438, + "logps/rejected": -221.30496215820312, + "loss": 0.088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.060146212577819824, + "rewards/margins": 5.51931095123291, + "rewards/rejected": -5.459164619445801, + "step": 4417 + }, + { + "epoch": 0.51, + "learning_rate": 1.4803354198653595e-07, + "logits/chosen": -2.43074107170105, + "logits/rejected": -2.2295186519622803, + "logps/chosen": -218.0709228515625, + "logps/rejected": -277.57666015625, + "loss": 0.4934, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8720704317092896, + "rewards/margins": 3.302969217300415, + "rewards/rejected": -5.175039768218994, + "step": 4418 + }, + { + "epoch": 0.51, + "learning_rate": 1.479981103106177e-07, + "logits/chosen": -2.0074965953826904, + "logits/rejected": -1.83356773853302, + "logps/chosen": -345.2950439453125, + "logps/rejected": -298.92523193359375, + "loss": 0.3687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9536505341529846, + "rewards/margins": 2.594271659851074, + "rewards/rejected": -3.547922134399414, + "step": 4419 + }, + { + "epoch": 0.51, + "learning_rate": 1.4796267863469943e-07, + "logits/chosen": -2.479872465133667, + "logits/rejected": -2.3713173866271973, + "logps/chosen": -270.0908508300781, + "logps/rejected": -262.513427734375, + "loss": 0.361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39090344309806824, + "rewards/margins": 1.1342356204986572, + "rewards/rejected": -1.5251390933990479, + "step": 4420 + }, + { + "epoch": 0.51, + "learning_rate": 1.4792724695878115e-07, + "logits/chosen": -2.397322654724121, + "logits/rejected": -2.4213998317718506, + "logps/chosen": -303.75518798828125, + "logps/rejected": -256.4100036621094, + "loss": 0.7796, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7788350582122803, + "rewards/margins": 0.48534438014030457, + "rewards/rejected": -2.264179229736328, + "step": 4421 + }, + { + "epoch": 0.51, + "learning_rate": 1.4789181528286287e-07, + "logits/chosen": -2.20271372795105, + "logits/rejected": -2.0609002113342285, + "logps/chosen": -251.00485229492188, + "logps/rejected": -272.7604675292969, + "loss": 0.3942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9944652318954468, + "rewards/margins": 3.0154619216918945, + "rewards/rejected": -4.009926795959473, + "step": 4422 + }, + { + "epoch": 0.51, + "learning_rate": 1.478563836069446e-07, + "logits/chosen": -1.7998576164245605, + "logits/rejected": -2.386542320251465, + "logps/chosen": -406.647216796875, + "logps/rejected": -297.8609619140625, + "loss": 0.2157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5721414089202881, + "rewards/margins": 3.177448034286499, + "rewards/rejected": -3.749589681625366, + "step": 4423 + }, + { + "epoch": 0.51, + "learning_rate": 1.4782095193102634e-07, + "logits/chosen": -1.617243766784668, + "logits/rejected": -2.1043570041656494, + "logps/chosen": -391.259033203125, + "logps/rejected": -318.3969421386719, + "loss": 0.2043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8498345017433167, + "rewards/margins": 2.8927087783813477, + "rewards/rejected": -3.7425432205200195, + "step": 4424 + }, + { + "epoch": 0.51, + "learning_rate": 1.4778552025510806e-07, + "logits/chosen": -2.149286985397339, + "logits/rejected": -2.4479012489318848, + "logps/chosen": -375.099853515625, + "logps/rejected": -217.349853515625, + "loss": 0.7245, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0832116603851318, + "rewards/margins": 1.632344126701355, + "rewards/rejected": -2.7155556678771973, + "step": 4425 + }, + { + "epoch": 0.51, + "learning_rate": 1.4775008857918978e-07, + "logits/chosen": -2.6721880435943604, + "logits/rejected": -2.7408366203308105, + "logps/chosen": -339.01959228515625, + "logps/rejected": -261.10089111328125, + "loss": 0.3962, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6538711190223694, + "rewards/margins": 1.8800548315048218, + "rewards/rejected": -2.533926010131836, + "step": 4426 + }, + { + "epoch": 0.51, + "learning_rate": 1.477146569032715e-07, + "logits/chosen": -2.9296205043792725, + "logits/rejected": -2.9541945457458496, + "logps/chosen": -251.08499145507812, + "logps/rejected": -216.32086181640625, + "loss": 0.4743, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.308314323425293, + "rewards/margins": 1.8914165496826172, + "rewards/rejected": -3.19973087310791, + "step": 4427 + }, + { + "epoch": 0.52, + "learning_rate": 1.4767922522735326e-07, + "logits/chosen": -2.1725010871887207, + "logits/rejected": -2.2895779609680176, + "logps/chosen": -275.2348327636719, + "logps/rejected": -251.96725463867188, + "loss": 0.2433, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5583562850952148, + "rewards/margins": 2.2301840782165527, + "rewards/rejected": -3.7885403633117676, + "step": 4428 + }, + { + "epoch": 0.52, + "learning_rate": 1.4764379355143498e-07, + "logits/chosen": -1.9671940803527832, + "logits/rejected": -1.940125584602356, + "logps/chosen": -391.9891357421875, + "logps/rejected": -355.71856689453125, + "loss": 0.501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.09218168258667, + "rewards/margins": 1.0796053409576416, + "rewards/rejected": -2.1717872619628906, + "step": 4429 + }, + { + "epoch": 0.52, + "learning_rate": 1.476083618755167e-07, + "logits/chosen": -2.0464701652526855, + "logits/rejected": -1.9772510528564453, + "logps/chosen": -318.3564147949219, + "logps/rejected": -268.5296630859375, + "loss": 0.434, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5642836689949036, + "rewards/margins": 0.9671038389205933, + "rewards/rejected": -1.531387448310852, + "step": 4430 + }, + { + "epoch": 0.52, + "learning_rate": 1.4757293019959845e-07, + "logits/chosen": -2.512221574783325, + "logits/rejected": -2.619988441467285, + "logps/chosen": -248.73089599609375, + "logps/rejected": -254.08273315429688, + "loss": 0.2285, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9480062127113342, + "rewards/margins": 2.524616241455078, + "rewards/rejected": -3.4726223945617676, + "step": 4431 + }, + { + "epoch": 0.52, + "learning_rate": 1.4753749852368017e-07, + "logits/chosen": -2.988757848739624, + "logits/rejected": -3.057882070541382, + "logps/chosen": -137.3933563232422, + "logps/rejected": -209.21343994140625, + "loss": 0.2188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7389181852340698, + "rewards/margins": 2.2509453296661377, + "rewards/rejected": -2.989863634109497, + "step": 4432 + }, + { + "epoch": 0.52, + "learning_rate": 1.475020668477619e-07, + "logits/chosen": -2.1609950065612793, + "logits/rejected": -2.303598165512085, + "logps/chosen": -278.7253112792969, + "logps/rejected": -179.31640625, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08576703071594238, + "rewards/margins": 2.3501505851745605, + "rewards/rejected": -2.435917615890503, + "step": 4433 + }, + { + "epoch": 0.52, + "learning_rate": 1.4746663517184361e-07, + "logits/chosen": -2.4267539978027344, + "logits/rejected": -2.0118002891540527, + "logps/chosen": -270.8675537109375, + "logps/rejected": -290.1181945800781, + "loss": 0.2195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43265408277511597, + "rewards/margins": 2.4803638458251953, + "rewards/rejected": -2.913018226623535, + "step": 4434 + }, + { + "epoch": 0.52, + "learning_rate": 1.4743120349592534e-07, + "logits/chosen": -2.362781047821045, + "logits/rejected": -2.33396577835083, + "logps/chosen": -341.7763671875, + "logps/rejected": -331.458740234375, + "loss": 0.2306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8991145491600037, + "rewards/margins": 1.691605806350708, + "rewards/rejected": -2.5907204151153564, + "step": 4435 + }, + { + "epoch": 0.52, + "learning_rate": 1.4739577182000709e-07, + "logits/chosen": -2.4533236026763916, + "logits/rejected": -2.331653594970703, + "logps/chosen": -338.546630859375, + "logps/rejected": -323.87261962890625, + "loss": 0.4362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7489994764328003, + "rewards/margins": 1.180591106414795, + "rewards/rejected": -1.9295908212661743, + "step": 4436 + }, + { + "epoch": 0.52, + "learning_rate": 1.473603401440888e-07, + "logits/chosen": -2.4877233505249023, + "logits/rejected": -2.321103096008301, + "logps/chosen": -132.04811096191406, + "logps/rejected": -275.34332275390625, + "loss": 0.1804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7433214783668518, + "rewards/margins": 4.282045364379883, + "rewards/rejected": -5.02536678314209, + "step": 4437 + }, + { + "epoch": 0.52, + "learning_rate": 1.4732490846817053e-07, + "logits/chosen": -1.9070383310317993, + "logits/rejected": -2.264163017272949, + "logps/chosen": -512.89208984375, + "logps/rejected": -258.14520263671875, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8968722224235535, + "rewards/margins": 1.2877230644226074, + "rewards/rejected": -2.1845953464508057, + "step": 4438 + }, + { + "epoch": 0.52, + "learning_rate": 1.4728947679225228e-07, + "logits/chosen": -2.262068033218384, + "logits/rejected": -2.207988739013672, + "logps/chosen": -318.7604675292969, + "logps/rejected": -328.91162109375, + "loss": 0.3935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6034828424453735, + "rewards/margins": 3.05527400970459, + "rewards/rejected": -3.658756732940674, + "step": 4439 + }, + { + "epoch": 0.52, + "learning_rate": 1.47254045116334e-07, + "logits/chosen": -2.025137424468994, + "logits/rejected": -1.870969533920288, + "logps/chosen": -315.0353698730469, + "logps/rejected": -304.2069091796875, + "loss": 0.4461, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.385631799697876, + "rewards/margins": 0.9987223744392395, + "rewards/rejected": -2.3843541145324707, + "step": 4440 + }, + { + "epoch": 0.52, + "learning_rate": 1.4721861344041572e-07, + "logits/chosen": -2.0687079429626465, + "logits/rejected": -2.2213711738586426, + "logps/chosen": -292.3248291015625, + "logps/rejected": -275.29608154296875, + "loss": 1.4275, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.931654930114746, + "rewards/margins": -0.6592932939529419, + "rewards/rejected": -1.2723616361618042, + "step": 4441 + }, + { + "epoch": 0.52, + "learning_rate": 1.4718318176449747e-07, + "logits/chosen": -1.7691656351089478, + "logits/rejected": -2.208822727203369, + "logps/chosen": -556.1945190429688, + "logps/rejected": -302.4568786621094, + "loss": 0.2766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6165403127670288, + "rewards/margins": 1.6032326221466064, + "rewards/rejected": -2.2197728157043457, + "step": 4442 + }, + { + "epoch": 0.52, + "learning_rate": 1.471477500885792e-07, + "logits/chosen": -2.3519840240478516, + "logits/rejected": -2.558540105819702, + "logps/chosen": -389.3567810058594, + "logps/rejected": -326.2566833496094, + "loss": 0.3025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0227785110473633, + "rewards/margins": 1.8923583030700684, + "rewards/rejected": -2.9151368141174316, + "step": 4443 + }, + { + "epoch": 0.52, + "learning_rate": 1.4711231841266092e-07, + "logits/chosen": -2.569638729095459, + "logits/rejected": -2.523052215576172, + "logps/chosen": -205.23114013671875, + "logps/rejected": -353.7296142578125, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6614469885826111, + "rewards/margins": 3.8840219974517822, + "rewards/rejected": -4.545469284057617, + "step": 4444 + }, + { + "epoch": 0.52, + "learning_rate": 1.4707688673674264e-07, + "logits/chosen": -1.8673027753829956, + "logits/rejected": -2.145742654800415, + "logps/chosen": -298.67291259765625, + "logps/rejected": -344.64013671875, + "loss": 0.5412, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.011865496635437, + "rewards/margins": 3.21443510055542, + "rewards/rejected": -4.2263007164001465, + "step": 4445 + }, + { + "epoch": 0.52, + "learning_rate": 1.4704145506082436e-07, + "logits/chosen": -2.2536473274230957, + "logits/rejected": -2.128931760787964, + "logps/chosen": -321.917236328125, + "logps/rejected": -322.8045654296875, + "loss": 0.4554, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0958231687545776, + "rewards/margins": 1.1885547637939453, + "rewards/rejected": -2.2843778133392334, + "step": 4446 + }, + { + "epoch": 0.52, + "learning_rate": 1.4700602338490608e-07, + "logits/chosen": -2.766233205795288, + "logits/rejected": -2.8095147609710693, + "logps/chosen": -245.27239990234375, + "logps/rejected": -294.1472473144531, + "loss": 0.2607, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7898604869842529, + "rewards/margins": 2.7628350257873535, + "rewards/rejected": -3.5526952743530273, + "step": 4447 + }, + { + "epoch": 0.52, + "learning_rate": 1.4697059170898783e-07, + "logits/chosen": -2.4857375621795654, + "logits/rejected": -2.342219591140747, + "logps/chosen": -318.2737121582031, + "logps/rejected": -278.203857421875, + "loss": 0.1923, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6695927381515503, + "rewards/margins": 3.376140594482422, + "rewards/rejected": -4.045733451843262, + "step": 4448 + }, + { + "epoch": 0.52, + "learning_rate": 1.4693516003306955e-07, + "logits/chosen": -2.0723586082458496, + "logits/rejected": -2.108640432357788, + "logps/chosen": -201.29833984375, + "logps/rejected": -223.54495239257812, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5666395425796509, + "rewards/margins": 2.223055839538574, + "rewards/rejected": -3.7896952629089355, + "step": 4449 + }, + { + "epoch": 0.52, + "learning_rate": 1.4689972835715127e-07, + "logits/chosen": -2.2349987030029297, + "logits/rejected": -2.4069197177886963, + "logps/chosen": -450.22802734375, + "logps/rejected": -362.393310546875, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1170331239700317, + "rewards/margins": 2.380876064300537, + "rewards/rejected": -3.4979093074798584, + "step": 4450 + }, + { + "epoch": 0.52, + "learning_rate": 1.4686429668123302e-07, + "logits/chosen": -2.5749642848968506, + "logits/rejected": -2.612610101699829, + "logps/chosen": -285.2718200683594, + "logps/rejected": -244.83592224121094, + "loss": 0.3427, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0477691888809204, + "rewards/margins": 2.3544816970825195, + "rewards/rejected": -3.4022510051727295, + "step": 4451 + }, + { + "epoch": 0.52, + "learning_rate": 1.4682886500531475e-07, + "logits/chosen": -2.2493345737457275, + "logits/rejected": -2.1813173294067383, + "logps/chosen": -245.28187561035156, + "logps/rejected": -329.4098205566406, + "loss": 0.4165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3334898352622986, + "rewards/margins": 2.6428403854370117, + "rewards/rejected": -2.976330041885376, + "step": 4452 + }, + { + "epoch": 0.52, + "learning_rate": 1.4679343332939647e-07, + "logits/chosen": -2.0255753993988037, + "logits/rejected": -2.0662360191345215, + "logps/chosen": -304.1136169433594, + "logps/rejected": -213.90328979492188, + "loss": 0.5954, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.030884027481079, + "rewards/margins": 1.6108708381652832, + "rewards/rejected": -2.6417548656463623, + "step": 4453 + }, + { + "epoch": 0.52, + "learning_rate": 1.4675800165347822e-07, + "logits/chosen": -2.625333309173584, + "logits/rejected": -2.5561375617980957, + "logps/chosen": -99.22433471679688, + "logps/rejected": -215.77012634277344, + "loss": 0.4811, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8753434419631958, + "rewards/margins": 2.069636821746826, + "rewards/rejected": -2.9449803829193115, + "step": 4454 + }, + { + "epoch": 0.52, + "learning_rate": 1.4672256997755994e-07, + "logits/chosen": -2.2792840003967285, + "logits/rejected": -2.1142284870147705, + "logps/chosen": -287.46429443359375, + "logps/rejected": -327.88873291015625, + "loss": 0.3179, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4376065731048584, + "rewards/margins": 2.2809529304504395, + "rewards/rejected": -3.718559741973877, + "step": 4455 + }, + { + "epoch": 0.52, + "learning_rate": 1.4668713830164166e-07, + "logits/chosen": -2.701112985610962, + "logits/rejected": -2.6729681491851807, + "logps/chosen": -222.4083709716797, + "logps/rejected": -388.5052185058594, + "loss": 0.295, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7814595699310303, + "rewards/margins": 3.6106553077697754, + "rewards/rejected": -4.392114639282227, + "step": 4456 + }, + { + "epoch": 0.52, + "learning_rate": 1.4665170662572338e-07, + "logits/chosen": -2.605877161026001, + "logits/rejected": -2.68992280960083, + "logps/chosen": -307.8085021972656, + "logps/rejected": -307.5794677734375, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3876197338104248, + "rewards/margins": 5.3514580726623535, + "rewards/rejected": -5.739077568054199, + "step": 4457 + }, + { + "epoch": 0.52, + "learning_rate": 1.466162749498051e-07, + "logits/chosen": -2.809152841567993, + "logits/rejected": -2.6513051986694336, + "logps/chosen": -197.27479553222656, + "logps/rejected": -157.20521545410156, + "loss": 0.1316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18291398882865906, + "rewards/margins": 2.7612855434417725, + "rewards/rejected": -2.944199323654175, + "step": 4458 + }, + { + "epoch": 0.52, + "learning_rate": 1.4658084327388685e-07, + "logits/chosen": -2.5619490146636963, + "logits/rejected": -2.363804340362549, + "logps/chosen": -190.09835815429688, + "logps/rejected": -348.0328063964844, + "loss": 0.3735, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13806763291358948, + "rewards/margins": 2.8524105548858643, + "rewards/rejected": -2.990478038787842, + "step": 4459 + }, + { + "epoch": 0.52, + "learning_rate": 1.4654541159796858e-07, + "logits/chosen": -2.495720624923706, + "logits/rejected": -2.3665590286254883, + "logps/chosen": -138.03118896484375, + "logps/rejected": -193.0414276123047, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4779880046844482, + "rewards/margins": 2.1500084400177, + "rewards/rejected": -3.6279964447021484, + "step": 4460 + }, + { + "epoch": 0.52, + "learning_rate": 1.465099799220503e-07, + "logits/chosen": -2.2872748374938965, + "logits/rejected": -2.305588483810425, + "logps/chosen": -250.34957885742188, + "logps/rejected": -186.43698120117188, + "loss": 0.5969, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0724120140075684, + "rewards/margins": 0.6916897296905518, + "rewards/rejected": -1.7641019821166992, + "step": 4461 + }, + { + "epoch": 0.52, + "learning_rate": 1.4647454824613205e-07, + "logits/chosen": -1.1987195014953613, + "logits/rejected": -2.02854061126709, + "logps/chosen": -445.2873229980469, + "logps/rejected": -198.29734802246094, + "loss": 0.5596, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6525984406471252, + "rewards/margins": 1.8891656398773193, + "rewards/rejected": -2.541764259338379, + "step": 4462 + }, + { + "epoch": 0.52, + "learning_rate": 1.4643911657021377e-07, + "logits/chosen": -2.0466513633728027, + "logits/rejected": -1.8301678895950317, + "logps/chosen": -178.20452880859375, + "logps/rejected": -253.1067352294922, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0872677564620972, + "rewards/margins": 1.8799058198928833, + "rewards/rejected": -2.9671733379364014, + "step": 4463 + }, + { + "epoch": 0.52, + "learning_rate": 1.464036848942955e-07, + "logits/chosen": -2.5822713375091553, + "logits/rejected": -2.498356819152832, + "logps/chosen": -208.46078491210938, + "logps/rejected": -286.2212829589844, + "loss": 0.4677, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3653315305709839, + "rewards/margins": 1.6777690649032593, + "rewards/rejected": -3.043100357055664, + "step": 4464 + }, + { + "epoch": 0.52, + "learning_rate": 1.4636825321837724e-07, + "logits/chosen": -2.2921411991119385, + "logits/rejected": -2.2969882488250732, + "logps/chosen": -188.29824829101562, + "logps/rejected": -204.5272216796875, + "loss": 0.3132, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3106204271316528, + "rewards/margins": 1.6569228172302246, + "rewards/rejected": -2.967543601989746, + "step": 4465 + }, + { + "epoch": 0.52, + "learning_rate": 1.4633282154245896e-07, + "logits/chosen": -1.8771297931671143, + "logits/rejected": -2.386168956756592, + "logps/chosen": -513.5070190429688, + "logps/rejected": -304.9075927734375, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9737687110900879, + "rewards/margins": 2.774693012237549, + "rewards/rejected": -3.7484617233276367, + "step": 4466 + }, + { + "epoch": 0.52, + "learning_rate": 1.4629738986654068e-07, + "logits/chosen": -2.164501667022705, + "logits/rejected": -2.154564380645752, + "logps/chosen": -377.2845458984375, + "logps/rejected": -317.6806335449219, + "loss": 0.291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7112680673599243, + "rewards/margins": 3.057116746902466, + "rewards/rejected": -3.7683846950531006, + "step": 4467 + }, + { + "epoch": 0.52, + "learning_rate": 1.462619581906224e-07, + "logits/chosen": -2.7060515880584717, + "logits/rejected": -2.898784875869751, + "logps/chosen": -225.28872680664062, + "logps/rejected": -237.53672790527344, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2169175148010254, + "rewards/margins": 1.9486417770385742, + "rewards/rejected": -3.1655595302581787, + "step": 4468 + }, + { + "epoch": 0.52, + "learning_rate": 1.4622652651470413e-07, + "logits/chosen": -2.081125259399414, + "logits/rejected": -2.3295023441314697, + "logps/chosen": -90.39495849609375, + "logps/rejected": -78.12962341308594, + "loss": 0.3544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5589986443519592, + "rewards/margins": 1.2361222505569458, + "rewards/rejected": -1.7951208353042603, + "step": 4469 + }, + { + "epoch": 0.52, + "learning_rate": 1.4619109483878585e-07, + "logits/chosen": -2.8612313270568848, + "logits/rejected": -2.902712821960449, + "logps/chosen": -129.88836669921875, + "logps/rejected": -233.85845947265625, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.039818525314331, + "rewards/margins": 3.191118001937866, + "rewards/rejected": -4.230936527252197, + "step": 4470 + }, + { + "epoch": 0.52, + "learning_rate": 1.461556631628676e-07, + "logits/chosen": -1.7133870124816895, + "logits/rejected": -2.015822410583496, + "logps/chosen": -348.4404602050781, + "logps/rejected": -243.8916015625, + "loss": 0.7743, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3400354385375977, + "rewards/margins": 1.526307225227356, + "rewards/rejected": -2.866342544555664, + "step": 4471 + }, + { + "epoch": 0.52, + "learning_rate": 1.4612023148694932e-07, + "logits/chosen": -2.9060301780700684, + "logits/rejected": -2.969766616821289, + "logps/chosen": -90.10565185546875, + "logps/rejected": -144.2926025390625, + "loss": 0.5102, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7778643369674683, + "rewards/margins": 1.5235484838485718, + "rewards/rejected": -2.30141282081604, + "step": 4472 + }, + { + "epoch": 0.52, + "learning_rate": 1.4608479981103107e-07, + "logits/chosen": -2.423031806945801, + "logits/rejected": -2.318917989730835, + "logps/chosen": -319.1260986328125, + "logps/rejected": -259.3294372558594, + "loss": 0.3037, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7504431009292603, + "rewards/margins": 2.212116241455078, + "rewards/rejected": -2.962559223175049, + "step": 4473 + }, + { + "epoch": 0.52, + "learning_rate": 1.460493681351128e-07, + "logits/chosen": -2.179124355316162, + "logits/rejected": -2.1011312007904053, + "logps/chosen": -242.96009826660156, + "logps/rejected": -248.56988525390625, + "loss": 0.7599, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0776407718658447, + "rewards/margins": 2.032346248626709, + "rewards/rejected": -3.1099870204925537, + "step": 4474 + }, + { + "epoch": 0.52, + "learning_rate": 1.460139364591945e-07, + "logits/chosen": -2.8879988193511963, + "logits/rejected": -2.660626173019409, + "logps/chosen": -208.4625244140625, + "logps/rejected": -293.02532958984375, + "loss": 0.2156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5916134715080261, + "rewards/margins": 2.238893985748291, + "rewards/rejected": -2.830507278442383, + "step": 4475 + }, + { + "epoch": 0.52, + "learning_rate": 1.4597850478327624e-07, + "logits/chosen": -1.5811975002288818, + "logits/rejected": -1.9255549907684326, + "logps/chosen": -517.3548583984375, + "logps/rejected": -401.31170654296875, + "loss": 0.6448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6135041117668152, + "rewards/margins": 0.8250095248222351, + "rewards/rejected": -1.4385137557983398, + "step": 4476 + }, + { + "epoch": 0.52, + "learning_rate": 1.4594307310735798e-07, + "logits/chosen": -1.8996312618255615, + "logits/rejected": -1.7134811878204346, + "logps/chosen": -181.67945861816406, + "logps/rejected": -250.56008911132812, + "loss": 0.3931, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5300594568252563, + "rewards/margins": 1.6830741167068481, + "rewards/rejected": -2.2131335735321045, + "step": 4477 + }, + { + "epoch": 0.52, + "learning_rate": 1.459076414314397e-07, + "logits/chosen": -1.8532767295837402, + "logits/rejected": -1.7179992198944092, + "logps/chosen": -219.01376342773438, + "logps/rejected": -284.02264404296875, + "loss": 0.333, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8556686639785767, + "rewards/margins": 1.981954574584961, + "rewards/rejected": -2.837623119354248, + "step": 4478 + }, + { + "epoch": 0.52, + "learning_rate": 1.4587220975552143e-07, + "logits/chosen": -2.4246692657470703, + "logits/rejected": -2.4030208587646484, + "logps/chosen": -160.03680419921875, + "logps/rejected": -246.8701629638672, + "loss": 0.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25532275438308716, + "rewards/margins": 2.757906675338745, + "rewards/rejected": -3.0132293701171875, + "step": 4479 + }, + { + "epoch": 0.52, + "learning_rate": 1.4583677807960315e-07, + "logits/chosen": -1.6362032890319824, + "logits/rejected": -1.732161521911621, + "logps/chosen": -249.58319091796875, + "logps/rejected": -300.4599304199219, + "loss": 0.2851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7609021663665771, + "rewards/margins": 1.6288145780563354, + "rewards/rejected": -2.389716625213623, + "step": 4480 + }, + { + "epoch": 0.52, + "learning_rate": 1.4580134640368487e-07, + "logits/chosen": -2.402101516723633, + "logits/rejected": -2.631615400314331, + "logps/chosen": -481.0220947265625, + "logps/rejected": -290.1775207519531, + "loss": 0.4889, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.410093069076538, + "rewards/margins": 1.709699273109436, + "rewards/rejected": -3.1197924613952637, + "step": 4481 + }, + { + "epoch": 0.52, + "learning_rate": 1.4576591472776662e-07, + "logits/chosen": -2.4820668697357178, + "logits/rejected": -2.071434259414673, + "logps/chosen": -81.98477935791016, + "logps/rejected": -273.60540771484375, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019898168742656708, + "rewards/margins": 2.942033290863037, + "rewards/rejected": -2.922135353088379, + "step": 4482 + }, + { + "epoch": 0.52, + "learning_rate": 1.4573048305184834e-07, + "logits/chosen": -2.4995055198669434, + "logits/rejected": -2.6168086528778076, + "logps/chosen": -285.3377990722656, + "logps/rejected": -166.1121368408203, + "loss": 0.5506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9135602712631226, + "rewards/margins": 1.1943471431732178, + "rewards/rejected": -2.10790753364563, + "step": 4483 + }, + { + "epoch": 0.52, + "learning_rate": 1.4569505137593007e-07, + "logits/chosen": -2.2843873500823975, + "logits/rejected": -2.759885787963867, + "logps/chosen": -331.755615234375, + "logps/rejected": -216.98208618164062, + "loss": 0.8224, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3159453868865967, + "rewards/margins": 0.15904884040355682, + "rewards/rejected": -1.4749943017959595, + "step": 4484 + }, + { + "epoch": 0.52, + "learning_rate": 1.4565961970001181e-07, + "logits/chosen": -2.084843397140503, + "logits/rejected": -2.456906318664551, + "logps/chosen": -267.0753173828125, + "logps/rejected": -247.46316528320312, + "loss": 0.4102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7693420052528381, + "rewards/margins": 1.8761694431304932, + "rewards/rejected": -2.6455113887786865, + "step": 4485 + }, + { + "epoch": 0.52, + "learning_rate": 1.4562418802409354e-07, + "logits/chosen": -2.6055572032928467, + "logits/rejected": -2.7297492027282715, + "logps/chosen": -351.4302673339844, + "logps/rejected": -348.66851806640625, + "loss": 0.3002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4020070433616638, + "rewards/margins": 1.6331117153167725, + "rewards/rejected": -2.035118818283081, + "step": 4486 + }, + { + "epoch": 0.52, + "learning_rate": 1.4558875634817526e-07, + "logits/chosen": -2.4523234367370605, + "logits/rejected": -2.682985782623291, + "logps/chosen": -685.8344116210938, + "logps/rejected": -258.84661865234375, + "loss": 0.4713, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0630455017089844, + "rewards/margins": 2.051617383956909, + "rewards/rejected": -3.1146628856658936, + "step": 4487 + }, + { + "epoch": 0.52, + "learning_rate": 1.4555332467225698e-07, + "logits/chosen": -2.6613357067108154, + "logits/rejected": -2.6569604873657227, + "logps/chosen": -301.30035400390625, + "logps/rejected": -367.19561767578125, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6537983417510986, + "rewards/margins": 2.7344160079956055, + "rewards/rejected": -3.388214111328125, + "step": 4488 + }, + { + "epoch": 0.52, + "learning_rate": 1.4551789299633873e-07, + "logits/chosen": -1.8501089811325073, + "logits/rejected": -2.242534637451172, + "logps/chosen": -236.48529052734375, + "logps/rejected": -188.29867553710938, + "loss": 0.1387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2772791087627411, + "rewards/margins": 2.63201642036438, + "rewards/rejected": -2.9092957973480225, + "step": 4489 + }, + { + "epoch": 0.52, + "learning_rate": 1.4548246132042045e-07, + "logits/chosen": -1.9406241178512573, + "logits/rejected": -1.9414557218551636, + "logps/chosen": -154.046630859375, + "logps/rejected": -223.7883758544922, + "loss": 0.6037, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2056920528411865, + "rewards/margins": 0.9932038187980652, + "rewards/rejected": -2.1988959312438965, + "step": 4490 + }, + { + "epoch": 0.52, + "learning_rate": 1.4544702964450217e-07, + "logits/chosen": -2.4831719398498535, + "logits/rejected": -2.5373644828796387, + "logps/chosen": -288.54156494140625, + "logps/rejected": -162.93429565429688, + "loss": 0.2924, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3155889511108398, + "rewards/margins": 2.4275221824645996, + "rewards/rejected": -3.7431111335754395, + "step": 4491 + }, + { + "epoch": 0.52, + "learning_rate": 1.454115979685839e-07, + "logits/chosen": -2.6326756477355957, + "logits/rejected": -2.6019070148468018, + "logps/chosen": -322.994873046875, + "logps/rejected": -321.88037109375, + "loss": 0.6734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8376266360282898, + "rewards/margins": 1.9719176292419434, + "rewards/rejected": -2.809544324874878, + "step": 4492 + }, + { + "epoch": 0.52, + "learning_rate": 1.4537616629266564e-07, + "logits/chosen": -2.6733832359313965, + "logits/rejected": -2.63567852973938, + "logps/chosen": -350.3821105957031, + "logps/rejected": -275.8485107421875, + "loss": 0.4188, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.144783616065979, + "rewards/margins": 1.749941110610962, + "rewards/rejected": -2.8947248458862305, + "step": 4493 + }, + { + "epoch": 0.52, + "learning_rate": 1.4534073461674737e-07, + "logits/chosen": -2.6723110675811768, + "logits/rejected": -2.6978647708892822, + "logps/chosen": -277.2783203125, + "logps/rejected": -321.64306640625, + "loss": 0.1699, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7720358371734619, + "rewards/margins": 4.211016654968262, + "rewards/rejected": -4.9830522537231445, + "step": 4494 + }, + { + "epoch": 0.52, + "learning_rate": 1.453053029408291e-07, + "logits/chosen": -2.287447452545166, + "logits/rejected": -1.9264509677886963, + "logps/chosen": -255.82891845703125, + "logps/rejected": -374.95501708984375, + "loss": 0.3699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4010649025440216, + "rewards/margins": 2.2390036582946777, + "rewards/rejected": -2.640068531036377, + "step": 4495 + }, + { + "epoch": 0.52, + "learning_rate": 1.4526987126491084e-07, + "logits/chosen": -2.0508456230163574, + "logits/rejected": -2.5995490550994873, + "logps/chosen": -473.290771484375, + "logps/rejected": -195.6824951171875, + "loss": 0.3038, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29383036494255066, + "rewards/margins": 1.712847113609314, + "rewards/rejected": -2.0066773891448975, + "step": 4496 + }, + { + "epoch": 0.52, + "learning_rate": 1.4523443958899256e-07, + "logits/chosen": -1.8114819526672363, + "logits/rejected": -1.8963158130645752, + "logps/chosen": -249.40232849121094, + "logps/rejected": -203.25222778320312, + "loss": 0.7529, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.933061957359314, + "rewards/margins": 0.19214923679828644, + "rewards/rejected": -1.1252111196517944, + "step": 4497 + }, + { + "epoch": 0.52, + "learning_rate": 1.4519900791307428e-07, + "logits/chosen": -2.3821778297424316, + "logits/rejected": -2.6239259243011475, + "logps/chosen": -479.08837890625, + "logps/rejected": -271.6213073730469, + "loss": 0.7108, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.279365062713623, + "rewards/margins": 1.2666561603546143, + "rewards/rejected": -2.5460212230682373, + "step": 4498 + }, + { + "epoch": 0.52, + "learning_rate": 1.45163576237156e-07, + "logits/chosen": -3.1678967475891113, + "logits/rejected": -2.9324545860290527, + "logps/chosen": -244.92601013183594, + "logps/rejected": -169.06808471679688, + "loss": 0.2878, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.077366828918457, + "rewards/margins": 1.9760448932647705, + "rewards/rejected": -3.0534117221832275, + "step": 4499 + }, + { + "epoch": 0.52, + "learning_rate": 1.4512814456123775e-07, + "logits/chosen": -2.3210928440093994, + "logits/rejected": -1.9703279733657837, + "logps/chosen": -236.15118408203125, + "logps/rejected": -333.5020751953125, + "loss": 0.1707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6681629419326782, + "rewards/margins": 3.728099822998047, + "rewards/rejected": -4.396262168884277, + "step": 4500 + }, + { + "epoch": 0.52, + "learning_rate": 1.4509271288531947e-07, + "logits/chosen": -2.6626482009887695, + "logits/rejected": -2.9056429862976074, + "logps/chosen": -328.17657470703125, + "logps/rejected": -297.45220947265625, + "loss": 0.6198, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.637305498123169, + "rewards/margins": 2.253784418106079, + "rewards/rejected": -3.891089916229248, + "step": 4501 + }, + { + "epoch": 0.52, + "learning_rate": 1.450572812094012e-07, + "logits/chosen": -2.532075881958008, + "logits/rejected": -2.2468795776367188, + "logps/chosen": -230.837646484375, + "logps/rejected": -175.48361206054688, + "loss": 0.5205, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9369468092918396, + "rewards/margins": 0.6457905769348145, + "rewards/rejected": -1.5827374458312988, + "step": 4502 + }, + { + "epoch": 0.52, + "learning_rate": 1.4502184953348292e-07, + "logits/chosen": -2.6798038482666016, + "logits/rejected": -2.317324638366699, + "logps/chosen": -198.99826049804688, + "logps/rejected": -425.55682373046875, + "loss": 0.3911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.397950679063797, + "rewards/margins": 1.8767015933990479, + "rewards/rejected": -2.2746522426605225, + "step": 4503 + }, + { + "epoch": 0.52, + "learning_rate": 1.4498641785756464e-07, + "logits/chosen": -2.2641334533691406, + "logits/rejected": -2.2100231647491455, + "logps/chosen": -285.7616271972656, + "logps/rejected": -288.3416748046875, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9303148984909058, + "rewards/margins": 1.9492205381393433, + "rewards/rejected": -2.879535436630249, + "step": 4504 + }, + { + "epoch": 0.52, + "learning_rate": 1.449509861816464e-07, + "logits/chosen": -2.534092426300049, + "logits/rejected": -2.621316432952881, + "logps/chosen": -415.91070556640625, + "logps/rejected": -248.5422821044922, + "loss": 0.6313, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2406803369522095, + "rewards/margins": 1.3138843774795532, + "rewards/rejected": -2.554564952850342, + "step": 4505 + }, + { + "epoch": 0.52, + "learning_rate": 1.449155545057281e-07, + "logits/chosen": -2.1466965675354004, + "logits/rejected": -2.2590575218200684, + "logps/chosen": -377.6200866699219, + "logps/rejected": -338.7649841308594, + "loss": 0.1773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29249611496925354, + "rewards/margins": 3.055487632751465, + "rewards/rejected": -3.3479838371276855, + "step": 4506 + }, + { + "epoch": 0.52, + "learning_rate": 1.4488012282980986e-07, + "logits/chosen": -1.8711618185043335, + "logits/rejected": -2.2112176418304443, + "logps/chosen": -424.2223815917969, + "logps/rejected": -454.1842346191406, + "loss": 0.3042, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15294623374938965, + "rewards/margins": 1.6848701238632202, + "rewards/rejected": -1.8378162384033203, + "step": 4507 + }, + { + "epoch": 0.52, + "learning_rate": 1.4484469115389158e-07, + "logits/chosen": -1.8297598361968994, + "logits/rejected": -2.1137871742248535, + "logps/chosen": -552.373291015625, + "logps/rejected": -322.032958984375, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.377629816532135, + "rewards/margins": 2.665693759918213, + "rewards/rejected": -3.043323516845703, + "step": 4508 + }, + { + "epoch": 0.52, + "learning_rate": 1.448092594779733e-07, + "logits/chosen": -2.17305326461792, + "logits/rejected": -2.3085408210754395, + "logps/chosen": -215.98678588867188, + "logps/rejected": -167.53207397460938, + "loss": 0.6224, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7037274241447449, + "rewards/margins": 0.3869680166244507, + "rewards/rejected": -1.0906953811645508, + "step": 4509 + }, + { + "epoch": 0.52, + "learning_rate": 1.4477382780205503e-07, + "logits/chosen": -2.492035388946533, + "logits/rejected": -2.3766770362854004, + "logps/chosen": -96.54740905761719, + "logps/rejected": -123.30646514892578, + "loss": 0.4338, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1493990421295166, + "rewards/margins": 1.530091404914856, + "rewards/rejected": -2.679490566253662, + "step": 4510 + }, + { + "epoch": 0.52, + "learning_rate": 1.4473839612613675e-07, + "logits/chosen": -2.7520909309387207, + "logits/rejected": -2.6018526554107666, + "logps/chosen": -228.30950927734375, + "logps/rejected": -279.6526794433594, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4812743663787842, + "rewards/margins": 1.1138497591018677, + "rewards/rejected": -2.5951240062713623, + "step": 4511 + }, + { + "epoch": 0.52, + "learning_rate": 1.447029644502185e-07, + "logits/chosen": -2.7976064682006836, + "logits/rejected": -2.594771385192871, + "logps/chosen": -276.5714111328125, + "logps/rejected": -221.60467529296875, + "loss": 0.2525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21855135262012482, + "rewards/margins": 3.214864730834961, + "rewards/rejected": -3.433415651321411, + "step": 4512 + }, + { + "epoch": 0.52, + "learning_rate": 1.4466753277430022e-07, + "logits/chosen": -2.695167064666748, + "logits/rejected": -2.552050828933716, + "logps/chosen": -127.60615539550781, + "logps/rejected": -241.47325134277344, + "loss": 0.2782, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7626305818557739, + "rewards/margins": 2.4334850311279297, + "rewards/rejected": -3.196115732192993, + "step": 4513 + }, + { + "epoch": 0.53, + "learning_rate": 1.4463210109838194e-07, + "logits/chosen": -2.5713653564453125, + "logits/rejected": -2.457108736038208, + "logps/chosen": -234.27789306640625, + "logps/rejected": -367.5045471191406, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4797108173370361, + "rewards/margins": 2.2798445224761963, + "rewards/rejected": -3.7595553398132324, + "step": 4514 + }, + { + "epoch": 0.53, + "learning_rate": 1.4459666942246366e-07, + "logits/chosen": -2.323906898498535, + "logits/rejected": -2.237969160079956, + "logps/chosen": -218.26930236816406, + "logps/rejected": -295.8759460449219, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.545875608921051, + "rewards/margins": 2.273963689804077, + "rewards/rejected": -2.8198394775390625, + "step": 4515 + }, + { + "epoch": 0.53, + "learning_rate": 1.445612377465454e-07, + "logits/chosen": -2.5516295433044434, + "logits/rejected": -2.4767520427703857, + "logps/chosen": -217.02427673339844, + "logps/rejected": -250.64633178710938, + "loss": 0.5553, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0402499437332153, + "rewards/margins": 2.0265557765960693, + "rewards/rejected": -3.066805839538574, + "step": 4516 + }, + { + "epoch": 0.53, + "learning_rate": 1.4452580607062713e-07, + "logits/chosen": -2.4409875869750977, + "logits/rejected": -2.286607503890991, + "logps/chosen": -223.46481323242188, + "logps/rejected": -322.7868347167969, + "loss": 0.1258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7761311531066895, + "rewards/margins": 3.540062189102173, + "rewards/rejected": -4.316193580627441, + "step": 4517 + }, + { + "epoch": 0.53, + "learning_rate": 1.4449037439470888e-07, + "logits/chosen": -2.1901299953460693, + "logits/rejected": -2.258115768432617, + "logps/chosen": -190.00509643554688, + "logps/rejected": -183.37911987304688, + "loss": 0.478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8091549277305603, + "rewards/margins": 1.242027759552002, + "rewards/rejected": -2.051182508468628, + "step": 4518 + }, + { + "epoch": 0.53, + "learning_rate": 1.444549427187906e-07, + "logits/chosen": -2.595181465148926, + "logits/rejected": -2.555992603302002, + "logps/chosen": -112.34892272949219, + "logps/rejected": -113.49689483642578, + "loss": 0.3377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5503670573234558, + "rewards/margins": 1.5663617849349976, + "rewards/rejected": -2.1167287826538086, + "step": 4519 + }, + { + "epoch": 0.53, + "learning_rate": 1.4441951104287233e-07, + "logits/chosen": -1.6047509908676147, + "logits/rejected": -2.100574016571045, + "logps/chosen": -320.12371826171875, + "logps/rejected": -213.81773376464844, + "loss": 0.5428, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6118432283401489, + "rewards/margins": 1.4327383041381836, + "rewards/rejected": -2.044581651687622, + "step": 4520 + }, + { + "epoch": 0.53, + "learning_rate": 1.4438407936695405e-07, + "logits/chosen": -1.3728859424591064, + "logits/rejected": -1.734853982925415, + "logps/chosen": -554.4150390625, + "logps/rejected": -481.1383056640625, + "loss": 0.421, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31647610664367676, + "rewards/margins": 1.1784223318099976, + "rewards/rejected": -1.4948984384536743, + "step": 4521 + }, + { + "epoch": 0.53, + "learning_rate": 1.4434864769103577e-07, + "logits/chosen": -2.1794750690460205, + "logits/rejected": -2.0429024696350098, + "logps/chosen": -391.2835998535156, + "logps/rejected": -379.21368408203125, + "loss": 1.075, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.484217643737793, + "rewards/margins": 0.5887270569801331, + "rewards/rejected": -2.0729446411132812, + "step": 4522 + }, + { + "epoch": 0.53, + "learning_rate": 1.443132160151175e-07, + "logits/chosen": -1.181962013244629, + "logits/rejected": -1.734419822692871, + "logps/chosen": -562.5978393554688, + "logps/rejected": -291.0789489746094, + "loss": 0.5154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6563250422477722, + "rewards/margins": 1.4066563844680786, + "rewards/rejected": -2.062981367111206, + "step": 4523 + }, + { + "epoch": 0.53, + "learning_rate": 1.4427778433919924e-07, + "logits/chosen": -2.309689521789551, + "logits/rejected": -2.1552557945251465, + "logps/chosen": -203.62754821777344, + "logps/rejected": -229.5712127685547, + "loss": 0.2164, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08582200109958649, + "rewards/margins": 2.134716272354126, + "rewards/rejected": -2.048894166946411, + "step": 4524 + }, + { + "epoch": 0.53, + "learning_rate": 1.4424235266328096e-07, + "logits/chosen": -2.534682512283325, + "logits/rejected": -2.4873056411743164, + "logps/chosen": -122.90055847167969, + "logps/rejected": -207.57847595214844, + "loss": 0.5359, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1138291358947754, + "rewards/margins": 1.6452456712722778, + "rewards/rejected": -2.7590749263763428, + "step": 4525 + }, + { + "epoch": 0.53, + "learning_rate": 1.4420692098736269e-07, + "logits/chosen": -2.8869314193725586, + "logits/rejected": -3.0136208534240723, + "logps/chosen": -184.37977600097656, + "logps/rejected": -291.44024658203125, + "loss": 0.5312, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.41671884059906, + "rewards/margins": 2.3138339519500732, + "rewards/rejected": -3.7305526733398438, + "step": 4526 + }, + { + "epoch": 0.53, + "learning_rate": 1.4417148931144443e-07, + "logits/chosen": -2.4688076972961426, + "logits/rejected": -2.436426877975464, + "logps/chosen": -101.77072143554688, + "logps/rejected": -163.4958038330078, + "loss": 0.3662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8609261512756348, + "rewards/margins": 2.0007665157318115, + "rewards/rejected": -2.8616926670074463, + "step": 4527 + }, + { + "epoch": 0.53, + "learning_rate": 1.4413605763552616e-07, + "logits/chosen": -1.7943674325942993, + "logits/rejected": -2.15969181060791, + "logps/chosen": -460.53460693359375, + "logps/rejected": -393.0439453125, + "loss": 0.4543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.407240390777588, + "rewards/margins": 1.0326753854751587, + "rewards/rejected": -2.439915895462036, + "step": 4528 + }, + { + "epoch": 0.53, + "learning_rate": 1.4410062595960788e-07, + "logits/chosen": -2.8639392852783203, + "logits/rejected": -2.7998557090759277, + "logps/chosen": -217.3195343017578, + "logps/rejected": -258.15679931640625, + "loss": 0.36, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40182435512542725, + "rewards/margins": 1.919093370437622, + "rewards/rejected": -2.3209176063537598, + "step": 4529 + }, + { + "epoch": 0.53, + "learning_rate": 1.4406519428368963e-07, + "logits/chosen": -2.055154323577881, + "logits/rejected": -2.263939619064331, + "logps/chosen": -222.68838500976562, + "logps/rejected": -175.53009033203125, + "loss": 0.288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2472681850194931, + "rewards/margins": 1.9792189598083496, + "rewards/rejected": -2.226487159729004, + "step": 4530 + }, + { + "epoch": 0.53, + "learning_rate": 1.4402976260777135e-07, + "logits/chosen": -2.527340888977051, + "logits/rejected": -2.203871965408325, + "logps/chosen": -157.57061767578125, + "logps/rejected": -161.94886779785156, + "loss": 0.3919, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.559215784072876, + "rewards/margins": 1.808696985244751, + "rewards/rejected": -2.367912769317627, + "step": 4531 + }, + { + "epoch": 0.53, + "learning_rate": 1.4399433093185307e-07, + "logits/chosen": -2.4067320823669434, + "logits/rejected": -2.3116726875305176, + "logps/chosen": -225.63128662109375, + "logps/rejected": -309.308349609375, + "loss": 0.2273, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.132466435432434, + "rewards/margins": 2.7170751094818115, + "rewards/rejected": -3.849541664123535, + "step": 4532 + }, + { + "epoch": 0.53, + "learning_rate": 1.439588992559348e-07, + "logits/chosen": -1.8923847675323486, + "logits/rejected": -2.040466785430908, + "logps/chosen": -331.92962646484375, + "logps/rejected": -275.8675537109375, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3218657970428467, + "rewards/margins": 1.3280314207077026, + "rewards/rejected": -2.6498970985412598, + "step": 4533 + }, + { + "epoch": 0.53, + "learning_rate": 1.4392346758001652e-07, + "logits/chosen": -1.9332072734832764, + "logits/rejected": -2.268585443496704, + "logps/chosen": -331.6237487792969, + "logps/rejected": -237.0109100341797, + "loss": 1.2664, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5782623291015625, + "rewards/margins": 0.922237753868103, + "rewards/rejected": -3.500500202178955, + "step": 4534 + }, + { + "epoch": 0.53, + "learning_rate": 1.4388803590409826e-07, + "logits/chosen": -2.3668246269226074, + "logits/rejected": -2.5182385444641113, + "logps/chosen": -298.1815490722656, + "logps/rejected": -338.0044860839844, + "loss": 0.4519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9222475290298462, + "rewards/margins": 3.586411952972412, + "rewards/rejected": -4.508659362792969, + "step": 4535 + }, + { + "epoch": 0.53, + "learning_rate": 1.4385260422817999e-07, + "logits/chosen": -1.7934093475341797, + "logits/rejected": -1.9206140041351318, + "logps/chosen": -637.3226318359375, + "logps/rejected": -405.039794921875, + "loss": 0.4205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7445183396339417, + "rewards/margins": 1.1738955974578857, + "rewards/rejected": -1.9184138774871826, + "step": 4536 + }, + { + "epoch": 0.53, + "learning_rate": 1.438171725522617e-07, + "logits/chosen": -2.119317054748535, + "logits/rejected": -2.1920840740203857, + "logps/chosen": -153.34149169921875, + "logps/rejected": -165.95285034179688, + "loss": 0.5006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7456852197647095, + "rewards/margins": 1.7846500873565674, + "rewards/rejected": -2.5303354263305664, + "step": 4537 + }, + { + "epoch": 0.53, + "learning_rate": 1.4378174087634346e-07, + "logits/chosen": -1.9258193969726562, + "logits/rejected": -2.2234652042388916, + "logps/chosen": -403.2442626953125, + "logps/rejected": -284.0052490234375, + "loss": 0.3428, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08641433715820312, + "rewards/margins": 2.7570629119873047, + "rewards/rejected": -2.6706485748291016, + "step": 4538 + }, + { + "epoch": 0.53, + "learning_rate": 1.4374630920042518e-07, + "logits/chosen": -2.1453492641448975, + "logits/rejected": -2.4140048027038574, + "logps/chosen": -276.9625244140625, + "logps/rejected": -274.12261962890625, + "loss": 0.3794, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9709456562995911, + "rewards/margins": 3.295370578765869, + "rewards/rejected": -4.2663164138793945, + "step": 4539 + }, + { + "epoch": 0.53, + "learning_rate": 1.437108775245069e-07, + "logits/chosen": -2.187670946121216, + "logits/rejected": -1.9425549507141113, + "logps/chosen": -378.585693359375, + "logps/rejected": -300.03009033203125, + "loss": 0.5154, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0079045295715332, + "rewards/margins": 0.5125000476837158, + "rewards/rejected": -1.520404577255249, + "step": 4540 + }, + { + "epoch": 0.53, + "learning_rate": 1.4367544584858865e-07, + "logits/chosen": -2.5555591583251953, + "logits/rejected": -2.438849449157715, + "logps/chosen": -183.1067352294922, + "logps/rejected": -143.8574676513672, + "loss": 0.4778, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0507951974868774, + "rewards/margins": 1.8909821510314941, + "rewards/rejected": -2.941777229309082, + "step": 4541 + }, + { + "epoch": 0.53, + "learning_rate": 1.4364001417267037e-07, + "logits/chosen": -2.7133073806762695, + "logits/rejected": -2.596078872680664, + "logps/chosen": -163.97145080566406, + "logps/rejected": -263.69207763671875, + "loss": 0.5348, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0817997455596924, + "rewards/margins": 1.4727386236190796, + "rewards/rejected": -2.5545387268066406, + "step": 4542 + }, + { + "epoch": 0.53, + "learning_rate": 1.436045824967521e-07, + "logits/chosen": -1.5911260843276978, + "logits/rejected": -1.4432694911956787, + "logps/chosen": -305.38671875, + "logps/rejected": -438.78729248046875, + "loss": 0.4665, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3665088415145874, + "rewards/margins": 1.8471590280532837, + "rewards/rejected": -3.213667631149292, + "step": 4543 + }, + { + "epoch": 0.53, + "learning_rate": 1.4356915082083382e-07, + "logits/chosen": -2.4066531658172607, + "logits/rejected": -2.2318778038024902, + "logps/chosen": -146.27267456054688, + "logps/rejected": -181.6082763671875, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6682428121566772, + "rewards/margins": 1.59196138381958, + "rewards/rejected": -3.2602040767669678, + "step": 4544 + }, + { + "epoch": 0.53, + "learning_rate": 1.4353371914491554e-07, + "logits/chosen": -2.579958915710449, + "logits/rejected": -2.521435260772705, + "logps/chosen": -284.2271728515625, + "logps/rejected": -334.385009765625, + "loss": 0.1399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4357292652130127, + "rewards/margins": 3.7329325675964355, + "rewards/rejected": -4.168662071228027, + "step": 4545 + }, + { + "epoch": 0.53, + "learning_rate": 1.4349828746899726e-07, + "logits/chosen": -2.657261371612549, + "logits/rejected": -2.6024065017700195, + "logps/chosen": -416.35235595703125, + "logps/rejected": -299.7491455078125, + "loss": 0.1677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8758676052093506, + "rewards/margins": 2.643146276473999, + "rewards/rejected": -3.5190141201019287, + "step": 4546 + }, + { + "epoch": 0.53, + "learning_rate": 1.43462855793079e-07, + "logits/chosen": -2.011324405670166, + "logits/rejected": -1.9053081274032593, + "logps/chosen": -246.220703125, + "logps/rejected": -231.98468017578125, + "loss": 0.4304, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8850646018981934, + "rewards/margins": 1.8854163885116577, + "rewards/rejected": -2.7704808712005615, + "step": 4547 + }, + { + "epoch": 0.53, + "learning_rate": 1.4342742411716073e-07, + "logits/chosen": -2.2863762378692627, + "logits/rejected": -2.182316541671753, + "logps/chosen": -175.74551391601562, + "logps/rejected": -281.2192687988281, + "loss": 0.2808, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2007759064435959, + "rewards/margins": 3.655919313430786, + "rewards/rejected": -3.8566954135894775, + "step": 4548 + }, + { + "epoch": 0.53, + "learning_rate": 1.4339199244124245e-07, + "logits/chosen": -2.1655240058898926, + "logits/rejected": -2.4422080516815186, + "logps/chosen": -340.9500427246094, + "logps/rejected": -290.5348815917969, + "loss": 0.3238, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14776934683322906, + "rewards/margins": 1.8168530464172363, + "rewards/rejected": -1.9646224975585938, + "step": 4549 + }, + { + "epoch": 0.53, + "learning_rate": 1.433565607653242e-07, + "logits/chosen": -1.4708178043365479, + "logits/rejected": -2.040053606033325, + "logps/chosen": -497.3683776855469, + "logps/rejected": -358.2477111816406, + "loss": 0.5847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8377424478530884, + "rewards/margins": 1.123195767402649, + "rewards/rejected": -1.9609382152557373, + "step": 4550 + }, + { + "epoch": 0.53, + "learning_rate": 1.4332112908940592e-07, + "logits/chosen": -1.9071755409240723, + "logits/rejected": -1.8008759021759033, + "logps/chosen": -438.569580078125, + "logps/rejected": -374.8845520019531, + "loss": 0.4042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9436010122299194, + "rewards/margins": 2.160256862640381, + "rewards/rejected": -3.1038577556610107, + "step": 4551 + }, + { + "epoch": 0.53, + "learning_rate": 1.4328569741348765e-07, + "logits/chosen": -2.0289788246154785, + "logits/rejected": -1.8425214290618896, + "logps/chosen": -377.4927978515625, + "logps/rejected": -387.34515380859375, + "loss": 0.9871, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4708130359649658, + "rewards/margins": 0.5777369737625122, + "rewards/rejected": -2.0485498905181885, + "step": 4552 + }, + { + "epoch": 0.53, + "learning_rate": 1.432502657375694e-07, + "logits/chosen": -1.9287724494934082, + "logits/rejected": -1.932666301727295, + "logps/chosen": -203.9404754638672, + "logps/rejected": -348.5835266113281, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39140504598617554, + "rewards/margins": 5.071662425994873, + "rewards/rejected": -5.463068008422852, + "step": 4553 + }, + { + "epoch": 0.53, + "learning_rate": 1.4321483406165112e-07, + "logits/chosen": -1.6115461587905884, + "logits/rejected": -2.1434059143066406, + "logps/chosen": -453.5948791503906, + "logps/rejected": -257.0216369628906, + "loss": 0.3281, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6525439620018005, + "rewards/margins": 1.3091933727264404, + "rewards/rejected": -1.9617373943328857, + "step": 4554 + }, + { + "epoch": 0.53, + "learning_rate": 1.4317940238573284e-07, + "logits/chosen": -2.5339245796203613, + "logits/rejected": -2.7047104835510254, + "logps/chosen": -200.31126403808594, + "logps/rejected": -189.361572265625, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6265925765037537, + "rewards/margins": 3.195383071899414, + "rewards/rejected": -2.5687904357910156, + "step": 4555 + }, + { + "epoch": 0.53, + "learning_rate": 1.4314397070981456e-07, + "logits/chosen": -2.073204517364502, + "logits/rejected": -2.312370777130127, + "logps/chosen": -493.60052490234375, + "logps/rejected": -383.4974060058594, + "loss": 0.4152, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9416556358337402, + "rewards/margins": 1.3919603824615479, + "rewards/rejected": -2.333616018295288, + "step": 4556 + }, + { + "epoch": 0.53, + "learning_rate": 1.4310853903389628e-07, + "logits/chosen": -2.1404306888580322, + "logits/rejected": -1.9542174339294434, + "logps/chosen": -324.094482421875, + "logps/rejected": -447.409912109375, + "loss": 0.8349, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.15049147605896, + "rewards/margins": 0.7797902226448059, + "rewards/rejected": -1.930281639099121, + "step": 4557 + }, + { + "epoch": 0.53, + "learning_rate": 1.43073107357978e-07, + "logits/chosen": -1.772026777267456, + "logits/rejected": -2.270484447479248, + "logps/chosen": -646.3619995117188, + "logps/rejected": -470.0079345703125, + "loss": 0.6181, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1211533546447754, + "rewards/margins": 0.42548900842666626, + "rewards/rejected": -1.5466423034667969, + "step": 4558 + }, + { + "epoch": 0.53, + "learning_rate": 1.4303767568205975e-07, + "logits/chosen": -2.7580230236053467, + "logits/rejected": -2.7416677474975586, + "logps/chosen": -259.8551025390625, + "logps/rejected": -291.7745666503906, + "loss": 0.3658, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0203540325164795, + "rewards/margins": 2.654365301132202, + "rewards/rejected": -3.6747193336486816, + "step": 4559 + }, + { + "epoch": 0.53, + "learning_rate": 1.4300224400614148e-07, + "logits/chosen": -2.2695071697235107, + "logits/rejected": -2.325585126876831, + "logps/chosen": -187.5009307861328, + "logps/rejected": -250.90591430664062, + "loss": 0.4016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7131271958351135, + "rewards/margins": 1.8361079692840576, + "rewards/rejected": -2.5492351055145264, + "step": 4560 + }, + { + "epoch": 0.53, + "learning_rate": 1.4296681233022322e-07, + "logits/chosen": -2.750105857849121, + "logits/rejected": -2.4683191776275635, + "logps/chosen": -270.112548828125, + "logps/rejected": -304.95965576171875, + "loss": 0.4085, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6515185832977295, + "rewards/margins": 1.5852755308151245, + "rewards/rejected": -2.2367942333221436, + "step": 4561 + }, + { + "epoch": 0.53, + "learning_rate": 1.4293138065430495e-07, + "logits/chosen": -2.3127176761627197, + "logits/rejected": -2.3834595680236816, + "logps/chosen": -371.79388427734375, + "logps/rejected": -259.0505065917969, + "loss": 0.7814, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9851155281066895, + "rewards/margins": 1.247124433517456, + "rewards/rejected": -3.2322397232055664, + "step": 4562 + }, + { + "epoch": 0.53, + "learning_rate": 1.4289594897838667e-07, + "logits/chosen": -2.1493802070617676, + "logits/rejected": -2.200692653656006, + "logps/chosen": -275.173583984375, + "logps/rejected": -332.397216796875, + "loss": 0.2149, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3591339588165283, + "rewards/margins": 2.6663687229156494, + "rewards/rejected": -3.0255024433135986, + "step": 4563 + }, + { + "epoch": 0.53, + "learning_rate": 1.4286051730246842e-07, + "logits/chosen": -2.340829372406006, + "logits/rejected": -2.2817506790161133, + "logps/chosen": -303.1248779296875, + "logps/rejected": -231.0667724609375, + "loss": 0.6783, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3850717544555664, + "rewards/margins": 1.4177653789520264, + "rewards/rejected": -2.802837371826172, + "step": 4564 + }, + { + "epoch": 0.53, + "learning_rate": 1.4282508562655014e-07, + "logits/chosen": -2.807140350341797, + "logits/rejected": -2.882100820541382, + "logps/chosen": -257.6672668457031, + "logps/rejected": -265.2479248046875, + "loss": 0.481, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.363244891166687, + "rewards/margins": 1.60715913772583, + "rewards/rejected": -2.9704041481018066, + "step": 4565 + }, + { + "epoch": 0.53, + "learning_rate": 1.4278965395063186e-07, + "logits/chosen": -2.681138038635254, + "logits/rejected": -2.7151107788085938, + "logps/chosen": -221.9573516845703, + "logps/rejected": -280.098876953125, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3648585379123688, + "rewards/margins": 2.921353578567505, + "rewards/rejected": -3.2862119674682617, + "step": 4566 + }, + { + "epoch": 0.53, + "learning_rate": 1.4275422227471358e-07, + "logits/chosen": -1.8168375492095947, + "logits/rejected": -2.171895742416382, + "logps/chosen": -390.2123718261719, + "logps/rejected": -309.4970397949219, + "loss": 1.4804, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6768441200256348, + "rewards/margins": 0.007744848728179932, + "rewards/rejected": -2.68458890914917, + "step": 4567 + }, + { + "epoch": 0.53, + "learning_rate": 1.427187905987953e-07, + "logits/chosen": -2.327075481414795, + "logits/rejected": -2.245744228363037, + "logps/chosen": -231.1329345703125, + "logps/rejected": -319.91912841796875, + "loss": 0.4647, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.734835147857666, + "rewards/margins": 0.8618031144142151, + "rewards/rejected": -1.5966382026672363, + "step": 4568 + }, + { + "epoch": 0.53, + "learning_rate": 1.4268335892287703e-07, + "logits/chosen": -2.2313027381896973, + "logits/rejected": -2.457914352416992, + "logps/chosen": -406.89227294921875, + "logps/rejected": -373.0696716308594, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1542465686798096, + "rewards/margins": 3.549875020980835, + "rewards/rejected": -4.7041215896606445, + "step": 4569 + }, + { + "epoch": 0.53, + "learning_rate": 1.4264792724695878e-07, + "logits/chosen": -2.6116902828216553, + "logits/rejected": -2.4388294219970703, + "logps/chosen": -169.36260986328125, + "logps/rejected": -192.33892822265625, + "loss": 0.8204, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3113291263580322, + "rewards/margins": 0.9894109964370728, + "rewards/rejected": -2.3007402420043945, + "step": 4570 + }, + { + "epoch": 0.53, + "learning_rate": 1.426124955710405e-07, + "logits/chosen": -2.2877373695373535, + "logits/rejected": -2.3286972045898438, + "logps/chosen": -414.1171875, + "logps/rejected": -380.396484375, + "loss": 0.3828, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19829264283180237, + "rewards/margins": 1.762611985206604, + "rewards/rejected": -1.5643192529678345, + "step": 4571 + }, + { + "epoch": 0.53, + "learning_rate": 1.4257706389512225e-07, + "logits/chosen": -2.172844409942627, + "logits/rejected": -2.2091777324676514, + "logps/chosen": -401.7303771972656, + "logps/rejected": -249.33340454101562, + "loss": 0.4236, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0799400806427002, + "rewards/margins": 1.86809504032135, + "rewards/rejected": -2.94803524017334, + "step": 4572 + }, + { + "epoch": 0.53, + "learning_rate": 1.4254163221920397e-07, + "logits/chosen": -2.473447322845459, + "logits/rejected": -2.342238426208496, + "logps/chosen": -287.98779296875, + "logps/rejected": -195.54818725585938, + "loss": 0.3523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5149829983711243, + "rewards/margins": 1.5939180850982666, + "rewards/rejected": -2.108901023864746, + "step": 4573 + }, + { + "epoch": 0.53, + "learning_rate": 1.425062005432857e-07, + "logits/chosen": -2.347651720046997, + "logits/rejected": -2.516204833984375, + "logps/chosen": -457.66217041015625, + "logps/rejected": -350.29998779296875, + "loss": 0.3574, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7880032062530518, + "rewards/margins": 2.1141960620880127, + "rewards/rejected": -2.9021992683410645, + "step": 4574 + }, + { + "epoch": 0.53, + "learning_rate": 1.4247076886736741e-07, + "logits/chosen": -2.8588175773620605, + "logits/rejected": -2.772818088531494, + "logps/chosen": -490.4708251953125, + "logps/rejected": -440.9217224121094, + "loss": 0.1517, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1680539846420288, + "rewards/margins": 4.840087890625, + "rewards/rejected": -6.00814151763916, + "step": 4575 + }, + { + "epoch": 0.53, + "learning_rate": 1.4243533719144916e-07, + "logits/chosen": -1.696918249130249, + "logits/rejected": -1.8250718116760254, + "logps/chosen": -586.201416015625, + "logps/rejected": -441.43475341796875, + "loss": 0.5173, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13461264967918396, + "rewards/margins": 2.304497480392456, + "rewards/rejected": -2.439110279083252, + "step": 4576 + }, + { + "epoch": 0.53, + "learning_rate": 1.4239990551553088e-07, + "logits/chosen": -2.141784906387329, + "logits/rejected": -1.6939605474472046, + "logps/chosen": -307.0180358886719, + "logps/rejected": -432.68695068359375, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0283546447753906, + "rewards/margins": 4.45158576965332, + "rewards/rejected": -5.479939937591553, + "step": 4577 + }, + { + "epoch": 0.53, + "learning_rate": 1.423644738396126e-07, + "logits/chosen": -2.4585256576538086, + "logits/rejected": -2.557363271713257, + "logps/chosen": -283.5745544433594, + "logps/rejected": -299.0815124511719, + "loss": 0.3576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5687925815582275, + "rewards/margins": 2.9264378547668457, + "rewards/rejected": -3.495230197906494, + "step": 4578 + }, + { + "epoch": 0.53, + "learning_rate": 1.4232904216369433e-07, + "logits/chosen": -2.364220142364502, + "logits/rejected": -2.2840168476104736, + "logps/chosen": -301.54241943359375, + "logps/rejected": -222.2979278564453, + "loss": 0.3328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1832081079483032, + "rewards/margins": 1.9876179695129395, + "rewards/rejected": -3.1708261966705322, + "step": 4579 + }, + { + "epoch": 0.53, + "learning_rate": 1.4229361048777605e-07, + "logits/chosen": -2.1571855545043945, + "logits/rejected": -2.1304352283477783, + "logps/chosen": -223.14669799804688, + "logps/rejected": -215.6138458251953, + "loss": 0.554, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4926740229129791, + "rewards/margins": 2.303830862045288, + "rewards/rejected": -2.7965049743652344, + "step": 4580 + }, + { + "epoch": 0.53, + "learning_rate": 1.422581788118578e-07, + "logits/chosen": -2.50447678565979, + "logits/rejected": -2.2331831455230713, + "logps/chosen": -210.81455993652344, + "logps/rejected": -391.03717041015625, + "loss": 0.3002, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0266237258911133, + "rewards/margins": 4.442818641662598, + "rewards/rejected": -6.469442367553711, + "step": 4581 + }, + { + "epoch": 0.53, + "learning_rate": 1.4222274713593952e-07, + "logits/chosen": -2.205030918121338, + "logits/rejected": -2.0796868801116943, + "logps/chosen": -351.6739807128906, + "logps/rejected": -240.228759765625, + "loss": 0.4903, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6667861938476562, + "rewards/margins": 1.360234022140503, + "rewards/rejected": -3.0270204544067383, + "step": 4582 + }, + { + "epoch": 0.53, + "learning_rate": 1.4218731546002124e-07, + "logits/chosen": -1.9994723796844482, + "logits/rejected": -2.1341073513031006, + "logps/chosen": -216.28082275390625, + "logps/rejected": -256.68499755859375, + "loss": 0.4098, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4084702730178833, + "rewards/margins": 2.0712666511535645, + "rewards/rejected": -3.479736804962158, + "step": 4583 + }, + { + "epoch": 0.53, + "learning_rate": 1.42151883784103e-07, + "logits/chosen": -1.9265532493591309, + "logits/rejected": -1.692497968673706, + "logps/chosen": -363.0285339355469, + "logps/rejected": -354.53936767578125, + "loss": 0.3624, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.044895052909851, + "rewards/margins": 2.8845739364624023, + "rewards/rejected": -3.9294686317443848, + "step": 4584 + }, + { + "epoch": 0.53, + "learning_rate": 1.4211645210818471e-07, + "logits/chosen": -2.676792621612549, + "logits/rejected": -2.5210492610931396, + "logps/chosen": -299.9329833984375, + "logps/rejected": -339.12322998046875, + "loss": 0.3146, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4944720268249512, + "rewards/margins": 1.8063539266586304, + "rewards/rejected": -3.300826072692871, + "step": 4585 + }, + { + "epoch": 0.53, + "learning_rate": 1.4208102043226644e-07, + "logits/chosen": -2.399466037750244, + "logits/rejected": -2.7145683765411377, + "logps/chosen": -222.4784698486328, + "logps/rejected": -162.9097900390625, + "loss": 0.5488, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3615578413009644, + "rewards/margins": 1.9946962594985962, + "rewards/rejected": -3.3562541007995605, + "step": 4586 + }, + { + "epoch": 0.53, + "learning_rate": 1.4204558875634816e-07, + "logits/chosen": -2.5588457584381104, + "logits/rejected": -2.474881410598755, + "logps/chosen": -405.8204650878906, + "logps/rejected": -335.6864013671875, + "loss": 0.1903, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6817061305046082, + "rewards/margins": 2.8251466751098633, + "rewards/rejected": -3.5068531036376953, + "step": 4587 + }, + { + "epoch": 0.53, + "learning_rate": 1.420101570804299e-07, + "logits/chosen": -2.753014326095581, + "logits/rejected": -2.9598236083984375, + "logps/chosen": -447.74066162109375, + "logps/rejected": -271.0645446777344, + "loss": 0.3964, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9950798153877258, + "rewards/margins": 3.0899975299835205, + "rewards/rejected": -4.085077285766602, + "step": 4588 + }, + { + "epoch": 0.53, + "learning_rate": 1.4197472540451163e-07, + "logits/chosen": -2.3595938682556152, + "logits/rejected": -2.1703238487243652, + "logps/chosen": -516.5152587890625, + "logps/rejected": -401.35333251953125, + "loss": 0.618, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0838663578033447, + "rewards/margins": 1.3270374536514282, + "rewards/rejected": -2.4109039306640625, + "step": 4589 + }, + { + "epoch": 0.53, + "learning_rate": 1.4193929372859335e-07, + "logits/chosen": -3.195132255554199, + "logits/rejected": -3.0004518032073975, + "logps/chosen": -404.9744873046875, + "logps/rejected": -344.565673828125, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2071678638458252, + "rewards/margins": 2.50435733795166, + "rewards/rejected": -3.7115249633789062, + "step": 4590 + }, + { + "epoch": 0.53, + "learning_rate": 1.4190386205267507e-07, + "logits/chosen": -2.6674184799194336, + "logits/rejected": -2.7029495239257812, + "logps/chosen": -196.17269897460938, + "logps/rejected": -201.492431640625, + "loss": 1.266, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6181941032409668, + "rewards/margins": 0.8772364854812622, + "rewards/rejected": -2.4954304695129395, + "step": 4591 + }, + { + "epoch": 0.53, + "learning_rate": 1.4186843037675682e-07, + "logits/chosen": -2.2206661701202393, + "logits/rejected": -2.1138124465942383, + "logps/chosen": -190.13931274414062, + "logps/rejected": -309.45965576171875, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2348183393478394, + "rewards/margins": 3.169175386428833, + "rewards/rejected": -4.403993606567383, + "step": 4592 + }, + { + "epoch": 0.53, + "learning_rate": 1.4183299870083854e-07, + "logits/chosen": -2.281700611114502, + "logits/rejected": -2.081501007080078, + "logps/chosen": -241.08177185058594, + "logps/rejected": -304.19610595703125, + "loss": 0.3404, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5322091579437256, + "rewards/margins": 3.235095977783203, + "rewards/rejected": -4.76730489730835, + "step": 4593 + }, + { + "epoch": 0.53, + "learning_rate": 1.4179756702492027e-07, + "logits/chosen": -2.2878401279449463, + "logits/rejected": -2.3796496391296387, + "logps/chosen": -261.31817626953125, + "logps/rejected": -415.4995422363281, + "loss": 0.4587, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3747403621673584, + "rewards/margins": 1.494633674621582, + "rewards/rejected": -2.8693740367889404, + "step": 4594 + }, + { + "epoch": 0.53, + "learning_rate": 1.4176213534900201e-07, + "logits/chosen": -1.9523630142211914, + "logits/rejected": -2.4192006587982178, + "logps/chosen": -432.78082275390625, + "logps/rejected": -225.52833557128906, + "loss": 0.6136, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.544884443283081, + "rewards/margins": 0.46388956904411316, + "rewards/rejected": -2.0087740421295166, + "step": 4595 + }, + { + "epoch": 0.53, + "learning_rate": 1.4172670367308374e-07, + "logits/chosen": -2.486020088195801, + "logits/rejected": -2.5494656562805176, + "logps/chosen": -252.11700439453125, + "logps/rejected": -252.0416717529297, + "loss": 0.8167, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.458574891090393, + "rewards/margins": 0.9344184398651123, + "rewards/rejected": -2.392993450164795, + "step": 4596 + }, + { + "epoch": 0.53, + "learning_rate": 1.4169127199716546e-07, + "logits/chosen": -2.645944356918335, + "logits/rejected": -2.810886859893799, + "logps/chosen": -442.96990966796875, + "logps/rejected": -246.04653930664062, + "loss": 0.3937, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1360650062561035, + "rewards/margins": 1.7514028549194336, + "rewards/rejected": -2.887467861175537, + "step": 4597 + }, + { + "epoch": 0.53, + "learning_rate": 1.4165584032124718e-07, + "logits/chosen": -2.5343689918518066, + "logits/rejected": -2.5346174240112305, + "logps/chosen": -378.7255859375, + "logps/rejected": -269.8512268066406, + "loss": 0.1363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3184240460395813, + "rewards/margins": 2.716642379760742, + "rewards/rejected": -3.0350663661956787, + "step": 4598 + }, + { + "epoch": 0.54, + "learning_rate": 1.4162040864532893e-07, + "logits/chosen": -2.649057626724243, + "logits/rejected": -2.9146320819854736, + "logps/chosen": -252.0757293701172, + "logps/rejected": -328.2381286621094, + "loss": 0.1329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4550088047981262, + "rewards/margins": 3.642380475997925, + "rewards/rejected": -4.097389221191406, + "step": 4599 + }, + { + "epoch": 0.54, + "learning_rate": 1.4158497696941065e-07, + "logits/chosen": -2.34460186958313, + "logits/rejected": -2.5735206604003906, + "logps/chosen": -246.5498046875, + "logps/rejected": -218.96018981933594, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3370246887207031, + "rewards/margins": 1.77339768409729, + "rewards/rejected": -3.1104226112365723, + "step": 4600 + }, + { + "epoch": 0.54, + "learning_rate": 1.4154954529349237e-07, + "logits/chosen": -2.1869618892669678, + "logits/rejected": -2.1773574352264404, + "logps/chosen": -438.0602722167969, + "logps/rejected": -352.83837890625, + "loss": 0.2702, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8134980201721191, + "rewards/margins": 2.5503172874450684, + "rewards/rejected": -3.3638153076171875, + "step": 4601 + }, + { + "epoch": 0.54, + "learning_rate": 1.415141136175741e-07, + "logits/chosen": -2.126478910446167, + "logits/rejected": -2.192349433898926, + "logps/chosen": -199.8409423828125, + "logps/rejected": -173.22769165039062, + "loss": 0.3361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4917775094509125, + "rewards/margins": 1.407465934753418, + "rewards/rejected": -1.8992433547973633, + "step": 4602 + }, + { + "epoch": 0.54, + "learning_rate": 1.4147868194165582e-07, + "logits/chosen": -3.004516839981079, + "logits/rejected": -2.9348459243774414, + "logps/chosen": -276.03460693359375, + "logps/rejected": -167.3862762451172, + "loss": 0.5129, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.775209903717041, + "rewards/margins": 0.9002853631973267, + "rewards/rejected": -2.675495147705078, + "step": 4603 + }, + { + "epoch": 0.54, + "learning_rate": 1.4144325026573757e-07, + "logits/chosen": -1.9522188901901245, + "logits/rejected": -2.4306392669677734, + "logps/chosen": -487.74267578125, + "logps/rejected": -278.35675048828125, + "loss": 0.5755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5592601895332336, + "rewards/margins": 2.151123523712158, + "rewards/rejected": -2.710383653640747, + "step": 4604 + }, + { + "epoch": 0.54, + "learning_rate": 1.414078185898193e-07, + "logits/chosen": -2.3156447410583496, + "logits/rejected": -2.3067805767059326, + "logps/chosen": -340.23406982421875, + "logps/rejected": -345.88482666015625, + "loss": 0.2681, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.124963641166687, + "rewards/margins": 2.833944797515869, + "rewards/rejected": -3.9589083194732666, + "step": 4605 + }, + { + "epoch": 0.54, + "learning_rate": 1.4137238691390104e-07, + "logits/chosen": -2.551785469055176, + "logits/rejected": -2.4820475578308105, + "logps/chosen": -125.06780242919922, + "logps/rejected": -146.87893676757812, + "loss": 0.3509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24461446702480316, + "rewards/margins": 2.1002471446990967, + "rewards/rejected": -2.3448615074157715, + "step": 4606 + }, + { + "epoch": 0.54, + "learning_rate": 1.4133695523798276e-07, + "logits/chosen": -2.117509365081787, + "logits/rejected": -1.907294750213623, + "logps/chosen": -291.6663818359375, + "logps/rejected": -308.48748779296875, + "loss": 0.232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009315751492977142, + "rewards/margins": 2.9148430824279785, + "rewards/rejected": -2.905527353286743, + "step": 4607 + }, + { + "epoch": 0.54, + "learning_rate": 1.4130152356206448e-07, + "logits/chosen": -2.799347400665283, + "logits/rejected": -2.826687812805176, + "logps/chosen": -168.2774200439453, + "logps/rejected": -243.99237060546875, + "loss": 0.4035, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7215653657913208, + "rewards/margins": 3.433682441711426, + "rewards/rejected": -4.155247211456299, + "step": 4608 + }, + { + "epoch": 0.54, + "learning_rate": 1.412660918861462e-07, + "logits/chosen": -2.273547410964966, + "logits/rejected": -2.332209587097168, + "logps/chosen": -258.91265869140625, + "logps/rejected": -195.6422119140625, + "loss": 0.4132, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.717590868473053, + "rewards/margins": 1.4587090015411377, + "rewards/rejected": -2.176299810409546, + "step": 4609 + }, + { + "epoch": 0.54, + "learning_rate": 1.4123066021022793e-07, + "logits/chosen": -2.2808103561401367, + "logits/rejected": -2.3269219398498535, + "logps/chosen": -109.20295715332031, + "logps/rejected": -170.42051696777344, + "loss": 0.3024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9899467825889587, + "rewards/margins": 2.274631977081299, + "rewards/rejected": -3.2645788192749023, + "step": 4610 + }, + { + "epoch": 0.54, + "learning_rate": 1.4119522853430967e-07, + "logits/chosen": -2.561711072921753, + "logits/rejected": -2.775409460067749, + "logps/chosen": -359.1471862792969, + "logps/rejected": -271.50225830078125, + "loss": 0.2375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.707338273525238, + "rewards/margins": 3.361579418182373, + "rewards/rejected": -4.068917751312256, + "step": 4611 + }, + { + "epoch": 0.54, + "learning_rate": 1.411597968583914e-07, + "logits/chosen": -2.586505651473999, + "logits/rejected": -2.336860179901123, + "logps/chosen": -319.4090576171875, + "logps/rejected": -272.719482421875, + "loss": 0.5567, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7085068225860596, + "rewards/margins": 1.4417999982833862, + "rewards/rejected": -3.1503071784973145, + "step": 4612 + }, + { + "epoch": 0.54, + "learning_rate": 1.4112436518247312e-07, + "logits/chosen": -2.1734559535980225, + "logits/rejected": -2.213317394256592, + "logps/chosen": -371.52569580078125, + "logps/rejected": -415.55364990234375, + "loss": 0.2925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3319993019104004, + "rewards/margins": 2.779252052307129, + "rewards/rejected": -3.1112515926361084, + "step": 4613 + }, + { + "epoch": 0.54, + "learning_rate": 1.4108893350655484e-07, + "logits/chosen": -2.28395676612854, + "logits/rejected": -2.529528856277466, + "logps/chosen": -212.99978637695312, + "logps/rejected": -212.356689453125, + "loss": 0.5409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9368150234222412, + "rewards/margins": 2.1434431076049805, + "rewards/rejected": -3.0802578926086426, + "step": 4614 + }, + { + "epoch": 0.54, + "learning_rate": 1.410535018306366e-07, + "logits/chosen": -2.139815092086792, + "logits/rejected": -1.7968776226043701, + "logps/chosen": -289.36297607421875, + "logps/rejected": -325.81439208984375, + "loss": 0.5681, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2703001499176025, + "rewards/margins": 2.5040411949157715, + "rewards/rejected": -3.774341344833374, + "step": 4615 + }, + { + "epoch": 0.54, + "learning_rate": 1.410180701547183e-07, + "logits/chosen": -1.8466485738754272, + "logits/rejected": -1.7690751552581787, + "logps/chosen": -411.89129638671875, + "logps/rejected": -319.6438293457031, + "loss": 0.4478, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4901155233383179, + "rewards/margins": 2.7898993492126465, + "rewards/rejected": -4.280014991760254, + "step": 4616 + }, + { + "epoch": 0.54, + "learning_rate": 1.4098263847880006e-07, + "logits/chosen": -1.8233273029327393, + "logits/rejected": -1.7715038061141968, + "logps/chosen": -186.27394104003906, + "logps/rejected": -228.92381286621094, + "loss": 1.0429, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3475981950759888, + "rewards/margins": 0.15446317195892334, + "rewards/rejected": -1.502061367034912, + "step": 4617 + }, + { + "epoch": 0.54, + "learning_rate": 1.4094720680288178e-07, + "logits/chosen": -2.3038926124572754, + "logits/rejected": -1.980542540550232, + "logps/chosen": -259.0091857910156, + "logps/rejected": -387.54620361328125, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36256974935531616, + "rewards/margins": 3.3528194427490234, + "rewards/rejected": -3.7153892517089844, + "step": 4618 + }, + { + "epoch": 0.54, + "learning_rate": 1.409117751269635e-07, + "logits/chosen": -2.442410469055176, + "logits/rejected": -2.2034428119659424, + "logps/chosen": -261.4752197265625, + "logps/rejected": -327.8642883300781, + "loss": 0.149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3813626766204834, + "rewards/margins": 3.0767712593078613, + "rewards/rejected": -3.458134174346924, + "step": 4619 + }, + { + "epoch": 0.54, + "learning_rate": 1.4087634345104523e-07, + "logits/chosen": -2.488736867904663, + "logits/rejected": -2.3907792568206787, + "logps/chosen": -170.3992156982422, + "logps/rejected": -167.36102294921875, + "loss": 0.3796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5916995406150818, + "rewards/margins": 1.6883834600448608, + "rewards/rejected": -2.280082941055298, + "step": 4620 + }, + { + "epoch": 0.54, + "learning_rate": 1.4084091177512695e-07, + "logits/chosen": -1.9242665767669678, + "logits/rejected": -1.6609541177749634, + "logps/chosen": -314.72540283203125, + "logps/rejected": -314.41571044921875, + "loss": 0.5002, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3276955783367157, + "rewards/margins": 1.068794846534729, + "rewards/rejected": -1.3964905738830566, + "step": 4621 + }, + { + "epoch": 0.54, + "learning_rate": 1.4080548009920867e-07, + "logits/chosen": -1.883446455001831, + "logits/rejected": -2.056710720062256, + "logps/chosen": -283.36773681640625, + "logps/rejected": -178.4051513671875, + "loss": 0.5106, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.092626690864563, + "rewards/margins": 1.982210397720337, + "rewards/rejected": -3.0748372077941895, + "step": 4622 + }, + { + "epoch": 0.54, + "learning_rate": 1.4077004842329042e-07, + "logits/chosen": -2.173339366912842, + "logits/rejected": -2.258835792541504, + "logps/chosen": -275.67449951171875, + "logps/rejected": -359.0852966308594, + "loss": 0.4557, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3687057495117188, + "rewards/margins": 1.614367961883545, + "rewards/rejected": -2.9830737113952637, + "step": 4623 + }, + { + "epoch": 0.54, + "learning_rate": 1.4073461674737214e-07, + "logits/chosen": -2.7102890014648438, + "logits/rejected": -2.9394314289093018, + "logps/chosen": -182.8770294189453, + "logps/rejected": -151.47421264648438, + "loss": 0.2996, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30483877658843994, + "rewards/margins": 1.905653715133667, + "rewards/rejected": -2.2104926109313965, + "step": 4624 + }, + { + "epoch": 0.54, + "learning_rate": 1.4069918507145386e-07, + "logits/chosen": -2.2822136878967285, + "logits/rejected": -2.544072389602661, + "logps/chosen": -244.72389221191406, + "logps/rejected": -256.6878967285156, + "loss": 0.6769, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.010319471359253, + "rewards/margins": 1.0113540887832642, + "rewards/rejected": -2.0216736793518066, + "step": 4625 + }, + { + "epoch": 0.54, + "learning_rate": 1.406637533955356e-07, + "logits/chosen": -2.387230634689331, + "logits/rejected": -2.511632204055786, + "logps/chosen": -270.7560119628906, + "logps/rejected": -293.37054443359375, + "loss": 0.5062, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7429796457290649, + "rewards/margins": 1.2903060913085938, + "rewards/rejected": -2.033285617828369, + "step": 4626 + }, + { + "epoch": 0.54, + "learning_rate": 1.4062832171961733e-07, + "logits/chosen": -1.650795817375183, + "logits/rejected": -1.7438623905181885, + "logps/chosen": -258.4410095214844, + "logps/rejected": -238.97073364257812, + "loss": 0.4304, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5347384214401245, + "rewards/margins": 2.169691801071167, + "rewards/rejected": -2.704430103302002, + "step": 4627 + }, + { + "epoch": 0.54, + "learning_rate": 1.4059289004369906e-07, + "logits/chosen": -2.37996506690979, + "logits/rejected": -2.5870180130004883, + "logps/chosen": -318.6277770996094, + "logps/rejected": -313.6494140625, + "loss": 0.258, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08037789165973663, + "rewards/margins": 2.271235704421997, + "rewards/rejected": -2.1908576488494873, + "step": 4628 + }, + { + "epoch": 0.54, + "learning_rate": 1.405574583677808e-07, + "logits/chosen": -2.2125773429870605, + "logits/rejected": -2.4222450256347656, + "logps/chosen": -283.5555725097656, + "logps/rejected": -285.8691711425781, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.113051176071167, + "rewards/margins": 1.6194679737091064, + "rewards/rejected": -2.7325193881988525, + "step": 4629 + }, + { + "epoch": 0.54, + "learning_rate": 1.4052202669186253e-07, + "logits/chosen": -2.072291135787964, + "logits/rejected": -2.268136501312256, + "logps/chosen": -505.2510681152344, + "logps/rejected": -397.8077697753906, + "loss": 0.3293, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1535784006118774, + "rewards/margins": 2.3244216442108154, + "rewards/rejected": -3.477999687194824, + "step": 4630 + }, + { + "epoch": 0.54, + "learning_rate": 1.4048659501594425e-07, + "logits/chosen": -2.4458773136138916, + "logits/rejected": -2.586926221847534, + "logps/chosen": -231.9804229736328, + "logps/rejected": -313.34405517578125, + "loss": 0.1751, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3741075992584229, + "rewards/margins": 5.375438690185547, + "rewards/rejected": -6.749546527862549, + "step": 4631 + }, + { + "epoch": 0.54, + "learning_rate": 1.4045116334002597e-07, + "logits/chosen": -1.819756031036377, + "logits/rejected": -1.9599452018737793, + "logps/chosen": -321.6615295410156, + "logps/rejected": -229.91293334960938, + "loss": 0.2858, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32386845350265503, + "rewards/margins": 3.019430160522461, + "rewards/rejected": -3.3432986736297607, + "step": 4632 + }, + { + "epoch": 0.54, + "learning_rate": 1.404157316641077e-07, + "logits/chosen": -2.6913838386535645, + "logits/rejected": -2.713521957397461, + "logps/chosen": -297.64520263671875, + "logps/rejected": -294.28460693359375, + "loss": 0.2131, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.054552413523197174, + "rewards/margins": 2.3052990436553955, + "rewards/rejected": -2.250746726989746, + "step": 4633 + }, + { + "epoch": 0.54, + "learning_rate": 1.4038029998818942e-07, + "logits/chosen": -2.0544722080230713, + "logits/rejected": -2.0543596744537354, + "logps/chosen": -301.5877685546875, + "logps/rejected": -407.3842468261719, + "loss": 0.3057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7007627487182617, + "rewards/margins": 2.8040943145751953, + "rewards/rejected": -3.504857301712036, + "step": 4634 + }, + { + "epoch": 0.54, + "learning_rate": 1.4034486831227116e-07, + "logits/chosen": -1.9697105884552002, + "logits/rejected": -2.2083659172058105, + "logps/chosen": -353.19879150390625, + "logps/rejected": -354.5985412597656, + "loss": 0.65, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9303594827651978, + "rewards/margins": 1.6507543325424194, + "rewards/rejected": -2.581113815307617, + "step": 4635 + }, + { + "epoch": 0.54, + "learning_rate": 1.4030943663635289e-07, + "logits/chosen": -2.281644821166992, + "logits/rejected": -2.199143171310425, + "logps/chosen": -99.43919372558594, + "logps/rejected": -201.25436401367188, + "loss": 0.3422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3843918442726135, + "rewards/margins": 1.9153754711151123, + "rewards/rejected": -2.299767255783081, + "step": 4636 + }, + { + "epoch": 0.54, + "learning_rate": 1.402740049604346e-07, + "logits/chosen": -2.4119224548339844, + "logits/rejected": -2.4123127460479736, + "logps/chosen": -291.9481506347656, + "logps/rejected": -280.2415466308594, + "loss": 0.4058, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13311628997325897, + "rewards/margins": 2.172309637069702, + "rewards/rejected": -2.3054258823394775, + "step": 4637 + }, + { + "epoch": 0.54, + "learning_rate": 1.4023857328451636e-07, + "logits/chosen": -1.960544466972351, + "logits/rejected": -2.323045492172241, + "logps/chosen": -338.75592041015625, + "logps/rejected": -333.9443359375, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0249533653259277, + "rewards/margins": 1.5529719591140747, + "rewards/rejected": -2.577925205230713, + "step": 4638 + }, + { + "epoch": 0.54, + "learning_rate": 1.4020314160859808e-07, + "logits/chosen": -2.006157875061035, + "logits/rejected": -2.267416477203369, + "logps/chosen": -610.21728515625, + "logps/rejected": -498.1329650878906, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.391754150390625, + "rewards/margins": 2.823577880859375, + "rewards/rejected": -3.21533203125, + "step": 4639 + }, + { + "epoch": 0.54, + "learning_rate": 1.4016770993267983e-07, + "logits/chosen": -2.5585439205169678, + "logits/rejected": -2.4318017959594727, + "logps/chosen": -200.657958984375, + "logps/rejected": -302.74822998046875, + "loss": 0.2177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2509542405605316, + "rewards/margins": 2.479598045349121, + "rewards/rejected": -2.7305524349212646, + "step": 4640 + }, + { + "epoch": 0.54, + "learning_rate": 1.4013227825676155e-07, + "logits/chosen": -2.2312424182891846, + "logits/rejected": -1.9213645458221436, + "logps/chosen": -193.17190551757812, + "logps/rejected": -327.869140625, + "loss": 0.2002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.567861795425415, + "rewards/margins": 2.085513114929199, + "rewards/rejected": -2.6533749103546143, + "step": 4641 + }, + { + "epoch": 0.54, + "learning_rate": 1.4009684658084327e-07, + "logits/chosen": -2.707710027694702, + "logits/rejected": -2.7420802116394043, + "logps/chosen": -356.7666931152344, + "logps/rejected": -231.7493896484375, + "loss": 0.316, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1910066157579422, + "rewards/margins": 1.9518628120422363, + "rewards/rejected": -2.142869472503662, + "step": 4642 + }, + { + "epoch": 0.54, + "learning_rate": 1.40061414904925e-07, + "logits/chosen": -2.6794207096099854, + "logits/rejected": -2.668992519378662, + "logps/chosen": -300.8631286621094, + "logps/rejected": -225.80633544921875, + "loss": 0.4465, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5382250547409058, + "rewards/margins": 1.927200436592102, + "rewards/rejected": -3.465425491333008, + "step": 4643 + }, + { + "epoch": 0.54, + "learning_rate": 1.4002598322900672e-07, + "logits/chosen": -2.4029369354248047, + "logits/rejected": -2.6959807872772217, + "logps/chosen": -396.9256286621094, + "logps/rejected": -257.730224609375, + "loss": 0.205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36849308013916016, + "rewards/margins": 2.134237766265869, + "rewards/rejected": -2.5027308464050293, + "step": 4644 + }, + { + "epoch": 0.54, + "learning_rate": 1.3999055155308844e-07, + "logits/chosen": -1.8774166107177734, + "logits/rejected": -2.235417366027832, + "logps/chosen": -432.1070251464844, + "logps/rejected": -348.6182861328125, + "loss": 0.7733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7194104194641113, + "rewards/margins": 2.225750684738159, + "rewards/rejected": -3.9451608657836914, + "step": 4645 + }, + { + "epoch": 0.54, + "learning_rate": 1.399551198771702e-07, + "logits/chosen": -2.5603129863739014, + "logits/rejected": -2.5383780002593994, + "logps/chosen": -239.2616729736328, + "logps/rejected": -214.18389892578125, + "loss": 0.3212, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0744171142578125, + "rewards/margins": 1.9511613845825195, + "rewards/rejected": -3.025578498840332, + "step": 4646 + }, + { + "epoch": 0.54, + "learning_rate": 1.399196882012519e-07, + "logits/chosen": -2.394824504852295, + "logits/rejected": -2.751594305038452, + "logps/chosen": -489.6965026855469, + "logps/rejected": -259.13055419921875, + "loss": 0.2797, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5138629674911499, + "rewards/margins": 2.649033546447754, + "rewards/rejected": -3.1628966331481934, + "step": 4647 + }, + { + "epoch": 0.54, + "learning_rate": 1.3988425652533363e-07, + "logits/chosen": -2.4699554443359375, + "logits/rejected": -2.6205644607543945, + "logps/chosen": -131.47962951660156, + "logps/rejected": -106.37858581542969, + "loss": 0.562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9913872480392456, + "rewards/margins": 1.586961030960083, + "rewards/rejected": -2.578348159790039, + "step": 4648 + }, + { + "epoch": 0.54, + "learning_rate": 1.3984882484941538e-07, + "logits/chosen": -2.3861100673675537, + "logits/rejected": -2.3119096755981445, + "logps/chosen": -201.8434600830078, + "logps/rejected": -240.46542358398438, + "loss": 0.6545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9000641703605652, + "rewards/margins": 0.5334414839744568, + "rewards/rejected": -1.4335055351257324, + "step": 4649 + }, + { + "epoch": 0.54, + "learning_rate": 1.398133931734971e-07, + "logits/chosen": -2.801379680633545, + "logits/rejected": -2.4593918323516846, + "logps/chosen": -176.1024169921875, + "logps/rejected": -420.041748046875, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6378292441368103, + "rewards/margins": 2.119774341583252, + "rewards/rejected": -2.757603168487549, + "step": 4650 + }, + { + "epoch": 0.54, + "learning_rate": 1.3977796149757882e-07, + "logits/chosen": -2.583631992340088, + "logits/rejected": -2.66263747215271, + "logps/chosen": -291.7658996582031, + "logps/rejected": -235.30886840820312, + "loss": 0.2924, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6669131517410278, + "rewards/margins": 2.2146811485290527, + "rewards/rejected": -2.881594181060791, + "step": 4651 + }, + { + "epoch": 0.54, + "learning_rate": 1.3974252982166057e-07, + "logits/chosen": -1.9886165857315063, + "logits/rejected": -2.325502634048462, + "logps/chosen": -229.90707397460938, + "logps/rejected": -199.75570678710938, + "loss": 1.1368, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4292023181915283, + "rewards/margins": 0.6366958618164062, + "rewards/rejected": -2.0658984184265137, + "step": 4652 + }, + { + "epoch": 0.54, + "learning_rate": 1.397070981457423e-07, + "logits/chosen": -2.815807342529297, + "logits/rejected": -2.6347432136535645, + "logps/chosen": -175.99752807617188, + "logps/rejected": -253.67666625976562, + "loss": 0.5253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6726453900337219, + "rewards/margins": 2.51926326751709, + "rewards/rejected": -3.191908359527588, + "step": 4653 + }, + { + "epoch": 0.54, + "learning_rate": 1.3967166646982402e-07, + "logits/chosen": -1.6682302951812744, + "logits/rejected": -1.4753551483154297, + "logps/chosen": -253.54891967773438, + "logps/rejected": -253.41029357910156, + "loss": 0.2703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5850856304168701, + "rewards/margins": 2.6897430419921875, + "rewards/rejected": -3.2748286724090576, + "step": 4654 + }, + { + "epoch": 0.54, + "learning_rate": 1.3963623479390574e-07, + "logits/chosen": -1.7669637203216553, + "logits/rejected": -2.0146374702453613, + "logps/chosen": -185.62783813476562, + "logps/rejected": -218.7101593017578, + "loss": 0.806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8467280864715576, + "rewards/margins": 2.182321071624756, + "rewards/rejected": -3.0290489196777344, + "step": 4655 + }, + { + "epoch": 0.54, + "learning_rate": 1.3960080311798746e-07, + "logits/chosen": -2.125117778778076, + "logits/rejected": -1.9678285121917725, + "logps/chosen": -237.483154296875, + "logps/rejected": -243.57662963867188, + "loss": 0.7358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8209328651428223, + "rewards/margins": 0.890250027179718, + "rewards/rejected": -2.7111828327178955, + "step": 4656 + }, + { + "epoch": 0.54, + "learning_rate": 1.3956537144206918e-07, + "logits/chosen": -2.0038907527923584, + "logits/rejected": -2.0493030548095703, + "logps/chosen": -457.79010009765625, + "logps/rejected": -313.58001708984375, + "loss": 0.1384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8392704725265503, + "rewards/margins": 2.8022499084472656, + "rewards/rejected": -3.6415202617645264, + "step": 4657 + }, + { + "epoch": 0.54, + "learning_rate": 1.3952993976615093e-07, + "logits/chosen": -2.310718297958374, + "logits/rejected": -2.179898977279663, + "logps/chosen": -281.24920654296875, + "logps/rejected": -340.8134460449219, + "loss": 0.5465, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.508171796798706, + "rewards/margins": 1.042060375213623, + "rewards/rejected": -1.550232172012329, + "step": 4658 + }, + { + "epoch": 0.54, + "learning_rate": 1.3949450809023265e-07, + "logits/chosen": -1.5177257061004639, + "logits/rejected": -1.9910404682159424, + "logps/chosen": -389.15533447265625, + "logps/rejected": -218.09645080566406, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6872767210006714, + "rewards/margins": 1.7604118585586548, + "rewards/rejected": -2.447688579559326, + "step": 4659 + }, + { + "epoch": 0.54, + "learning_rate": 1.394590764143144e-07, + "logits/chosen": -2.5335559844970703, + "logits/rejected": -2.4346776008605957, + "logps/chosen": -249.88473510742188, + "logps/rejected": -276.7055358886719, + "loss": 0.367, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6952215433120728, + "rewards/margins": 2.3285441398620605, + "rewards/rejected": -3.0237655639648438, + "step": 4660 + }, + { + "epoch": 0.54, + "learning_rate": 1.3942364473839613e-07, + "logits/chosen": -2.495677947998047, + "logits/rejected": -2.275684118270874, + "logps/chosen": -242.21231079101562, + "logps/rejected": -212.3388671875, + "loss": 0.3379, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8226564526557922, + "rewards/margins": 2.640901565551758, + "rewards/rejected": -3.463557720184326, + "step": 4661 + }, + { + "epoch": 0.54, + "learning_rate": 1.3938821306247785e-07, + "logits/chosen": -2.7891387939453125, + "logits/rejected": -2.728475332260132, + "logps/chosen": -292.6061096191406, + "logps/rejected": -174.230224609375, + "loss": 0.572, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2863845825195312, + "rewards/margins": 0.8544105887413025, + "rewards/rejected": -2.1407952308654785, + "step": 4662 + }, + { + "epoch": 0.54, + "learning_rate": 1.3935278138655957e-07, + "logits/chosen": -2.4309675693511963, + "logits/rejected": -2.6914401054382324, + "logps/chosen": -381.6159362792969, + "logps/rejected": -181.98321533203125, + "loss": 0.5442, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3593753576278687, + "rewards/margins": 1.077452301979065, + "rewards/rejected": -2.4368276596069336, + "step": 4663 + }, + { + "epoch": 0.54, + "learning_rate": 1.3931734971064132e-07, + "logits/chosen": -2.058439016342163, + "logits/rejected": -1.926108956336975, + "logps/chosen": -470.2954406738281, + "logps/rejected": -412.153564453125, + "loss": 0.2975, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5503692626953125, + "rewards/margins": 3.129492998123169, + "rewards/rejected": -3.6798622608184814, + "step": 4664 + }, + { + "epoch": 0.54, + "learning_rate": 1.3928191803472304e-07, + "logits/chosen": -1.5848381519317627, + "logits/rejected": -1.6899890899658203, + "logps/chosen": -250.3109893798828, + "logps/rejected": -230.82565307617188, + "loss": 0.5727, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1744331121444702, + "rewards/margins": 0.5719725489616394, + "rewards/rejected": -1.7464056015014648, + "step": 4665 + }, + { + "epoch": 0.54, + "learning_rate": 1.3924648635880476e-07, + "logits/chosen": -2.111471652984619, + "logits/rejected": -2.015836715698242, + "logps/chosen": -176.54747009277344, + "logps/rejected": -309.32293701171875, + "loss": 0.3586, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5898741483688354, + "rewards/margins": 7.034238815307617, + "rewards/rejected": -8.624113082885742, + "step": 4666 + }, + { + "epoch": 0.54, + "learning_rate": 1.3921105468288648e-07, + "logits/chosen": -2.110163927078247, + "logits/rejected": -1.9838898181915283, + "logps/chosen": -241.6524658203125, + "logps/rejected": -274.14068603515625, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2469930648803711, + "rewards/margins": 3.0673561096191406, + "rewards/rejected": -3.3143491744995117, + "step": 4667 + }, + { + "epoch": 0.54, + "learning_rate": 1.391756230069682e-07, + "logits/chosen": -1.9278613328933716, + "logits/rejected": -2.2332634925842285, + "logps/chosen": -288.4875793457031, + "logps/rejected": -289.66937255859375, + "loss": 0.3831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5456650853157043, + "rewards/margins": 2.013261556625366, + "rewards/rejected": -2.558926582336426, + "step": 4668 + }, + { + "epoch": 0.54, + "learning_rate": 1.3914019133104996e-07, + "logits/chosen": -3.0415592193603516, + "logits/rejected": -3.0117597579956055, + "logps/chosen": -269.69622802734375, + "logps/rejected": -287.282470703125, + "loss": 0.6016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.765359878540039, + "rewards/margins": 1.8982093334197998, + "rewards/rejected": -3.663569450378418, + "step": 4669 + }, + { + "epoch": 0.54, + "learning_rate": 1.3910475965513168e-07, + "logits/chosen": -2.3206262588500977, + "logits/rejected": -2.4040768146514893, + "logps/chosen": -436.53192138671875, + "logps/rejected": -506.7242431640625, + "loss": 0.3097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7087792158126831, + "rewards/margins": 3.074434757232666, + "rewards/rejected": -3.7832140922546387, + "step": 4670 + }, + { + "epoch": 0.54, + "learning_rate": 1.3906932797921343e-07, + "logits/chosen": -2.6716668605804443, + "logits/rejected": -2.3149075508117676, + "logps/chosen": -356.38958740234375, + "logps/rejected": -274.56005859375, + "loss": 0.1791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4026784896850586, + "rewards/margins": 3.229523181915283, + "rewards/rejected": -3.632201671600342, + "step": 4671 + }, + { + "epoch": 0.54, + "learning_rate": 1.3903389630329515e-07, + "logits/chosen": -1.9392061233520508, + "logits/rejected": -2.220991611480713, + "logps/chosen": -500.70428466796875, + "logps/rejected": -271.74481201171875, + "loss": 0.326, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2649695575237274, + "rewards/margins": 2.1823410987854004, + "rewards/rejected": -2.44731068611145, + "step": 4672 + }, + { + "epoch": 0.54, + "learning_rate": 1.3899846462737687e-07, + "logits/chosen": -2.6198482513427734, + "logits/rejected": -2.862626314163208, + "logps/chosen": -349.3892822265625, + "logps/rejected": -256.7392883300781, + "loss": 0.1228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8212947249412537, + "rewards/margins": 2.8251118659973145, + "rewards/rejected": -3.6464061737060547, + "step": 4673 + }, + { + "epoch": 0.54, + "learning_rate": 1.389630329514586e-07, + "logits/chosen": -1.9378951787948608, + "logits/rejected": -1.890894889831543, + "logps/chosen": -110.85163879394531, + "logps/rejected": -160.02059936523438, + "loss": 0.9412, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6994227766990662, + "rewards/margins": 0.14792358875274658, + "rewards/rejected": -0.8473464250564575, + "step": 4674 + }, + { + "epoch": 0.54, + "learning_rate": 1.3892760127554034e-07, + "logits/chosen": -2.189277410507202, + "logits/rejected": -2.0028505325317383, + "logps/chosen": -216.20535278320312, + "logps/rejected": -396.2156982421875, + "loss": 0.6757, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5002159476280212, + "rewards/margins": 0.8469866514205933, + "rewards/rejected": -1.3472027778625488, + "step": 4675 + }, + { + "epoch": 0.54, + "learning_rate": 1.3889216959962206e-07, + "logits/chosen": -2.6757733821868896, + "logits/rejected": -2.8256826400756836, + "logps/chosen": -168.033203125, + "logps/rejected": -135.13600158691406, + "loss": 0.4276, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9229838848114014, + "rewards/margins": 1.540279746055603, + "rewards/rejected": -2.463263511657715, + "step": 4676 + }, + { + "epoch": 0.54, + "learning_rate": 1.3885673792370378e-07, + "logits/chosen": -2.509420871734619, + "logits/rejected": -2.6747665405273438, + "logps/chosen": -218.97369384765625, + "logps/rejected": -202.3203887939453, + "loss": 0.3462, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9100318551063538, + "rewards/margins": 2.2310853004455566, + "rewards/rejected": -3.1411168575286865, + "step": 4677 + }, + { + "epoch": 0.54, + "learning_rate": 1.388213062477855e-07, + "logits/chosen": -2.481872320175171, + "logits/rejected": -2.381617307662964, + "logps/chosen": -302.3758850097656, + "logps/rejected": -365.77215576171875, + "loss": 0.4199, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.096962332725525, + "rewards/margins": 1.1807422637939453, + "rewards/rejected": -2.2777047157287598, + "step": 4678 + }, + { + "epoch": 0.54, + "learning_rate": 1.3878587457186723e-07, + "logits/chosen": -2.293200969696045, + "logits/rejected": -1.9653453826904297, + "logps/chosen": -179.04466247558594, + "logps/rejected": -383.2584533691406, + "loss": 0.3732, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.005512237548828, + "rewards/margins": 2.4436733722686768, + "rewards/rejected": -4.449185371398926, + "step": 4679 + }, + { + "epoch": 0.54, + "learning_rate": 1.3875044289594898e-07, + "logits/chosen": -2.8790366649627686, + "logits/rejected": -2.766206741333008, + "logps/chosen": -244.59999084472656, + "logps/rejected": -306.4901428222656, + "loss": 0.3218, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8741177916526794, + "rewards/margins": 1.7850703001022339, + "rewards/rejected": -2.6591880321502686, + "step": 4680 + }, + { + "epoch": 0.54, + "learning_rate": 1.387150112200307e-07, + "logits/chosen": -2.016005754470825, + "logits/rejected": -1.959578275680542, + "logps/chosen": -217.69479370117188, + "logps/rejected": -301.8997802734375, + "loss": 0.5459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8471136093139648, + "rewards/margins": 1.2172771692276, + "rewards/rejected": -2.0643906593322754, + "step": 4681 + }, + { + "epoch": 0.54, + "learning_rate": 1.3867957954411242e-07, + "logits/chosen": -2.341768980026245, + "logits/rejected": -2.155132293701172, + "logps/chosen": -264.2984313964844, + "logps/rejected": -386.89739990234375, + "loss": 0.2159, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1487504541873932, + "rewards/margins": 2.5707225799560547, + "rewards/rejected": -2.719472885131836, + "step": 4682 + }, + { + "epoch": 0.54, + "learning_rate": 1.3864414786819417e-07, + "logits/chosen": -2.7579574584960938, + "logits/rejected": -2.5138916969299316, + "logps/chosen": -239.72943115234375, + "logps/rejected": -270.1615295410156, + "loss": 0.1711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47561120986938477, + "rewards/margins": 2.6028003692626953, + "rewards/rejected": -3.07841157913208, + "step": 4683 + }, + { + "epoch": 0.54, + "learning_rate": 1.386087161922759e-07, + "logits/chosen": -2.7259068489074707, + "logits/rejected": -2.6352415084838867, + "logps/chosen": -407.44451904296875, + "logps/rejected": -316.0409240722656, + "loss": 0.2891, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.430635690689087, + "rewards/margins": 2.992611885070801, + "rewards/rejected": -4.423247337341309, + "step": 4684 + }, + { + "epoch": 0.55, + "learning_rate": 1.3857328451635761e-07, + "logits/chosen": -1.679327368736267, + "logits/rejected": -1.8333697319030762, + "logps/chosen": -228.38636779785156, + "logps/rejected": -256.81866455078125, + "loss": 0.3637, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03204267472028732, + "rewards/margins": 2.2337722778320312, + "rewards/rejected": -2.265814781188965, + "step": 4685 + }, + { + "epoch": 0.55, + "learning_rate": 1.3853785284043934e-07, + "logits/chosen": -2.3476147651672363, + "logits/rejected": -2.2785134315490723, + "logps/chosen": -134.52001953125, + "logps/rejected": -199.80426025390625, + "loss": 0.7286, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2154312133789062, + "rewards/margins": 1.8141987323760986, + "rewards/rejected": -3.029629707336426, + "step": 4686 + }, + { + "epoch": 0.55, + "learning_rate": 1.3850242116452109e-07, + "logits/chosen": -2.2354538440704346, + "logits/rejected": -1.8113107681274414, + "logps/chosen": -361.2423400878906, + "logps/rejected": -377.1839904785156, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7918009757995605, + "rewards/margins": 2.529996871948242, + "rewards/rejected": -3.3217978477478027, + "step": 4687 + }, + { + "epoch": 0.55, + "learning_rate": 1.384669894886028e-07, + "logits/chosen": -2.1865453720092773, + "logits/rejected": -2.2889394760131836, + "logps/chosen": -268.517333984375, + "logps/rejected": -435.1257019042969, + "loss": 0.6081, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2378592491149902, + "rewards/margins": 0.7339532375335693, + "rewards/rejected": -1.9718124866485596, + "step": 4688 + }, + { + "epoch": 0.55, + "learning_rate": 1.3843155781268453e-07, + "logits/chosen": -2.7144253253936768, + "logits/rejected": -2.900031566619873, + "logps/chosen": -371.5560607910156, + "logps/rejected": -180.7022247314453, + "loss": 0.2157, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1344684213399887, + "rewards/margins": 1.8678932189941406, + "rewards/rejected": -1.7334246635437012, + "step": 4689 + }, + { + "epoch": 0.55, + "learning_rate": 1.3839612613676625e-07, + "logits/chosen": -2.494483709335327, + "logits/rejected": -2.3249659538269043, + "logps/chosen": -207.7342071533203, + "logps/rejected": -250.6759033203125, + "loss": 0.2622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8450804948806763, + "rewards/margins": 2.7127411365509033, + "rewards/rejected": -3.557821750640869, + "step": 4690 + }, + { + "epoch": 0.55, + "learning_rate": 1.38360694460848e-07, + "logits/chosen": -2.713585615158081, + "logits/rejected": -2.4504356384277344, + "logps/chosen": -360.0636901855469, + "logps/rejected": -457.97808837890625, + "loss": 0.0938, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4330697059631348, + "rewards/margins": 4.008028984069824, + "rewards/rejected": -5.441098690032959, + "step": 4691 + }, + { + "epoch": 0.55, + "learning_rate": 1.3832526278492972e-07, + "logits/chosen": -2.8238022327423096, + "logits/rejected": -2.8606646060943604, + "logps/chosen": -124.88056182861328, + "logps/rejected": -162.61724853515625, + "loss": 0.5353, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.848009705543518, + "rewards/margins": 1.0346837043762207, + "rewards/rejected": -2.882693290710449, + "step": 4692 + }, + { + "epoch": 0.55, + "learning_rate": 1.3828983110901144e-07, + "logits/chosen": -2.189094066619873, + "logits/rejected": -2.449833631515503, + "logps/chosen": -208.824951171875, + "logps/rejected": -168.07872009277344, + "loss": 0.8181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0666577816009521, + "rewards/margins": 1.6400065422058105, + "rewards/rejected": -2.7066640853881836, + "step": 4693 + }, + { + "epoch": 0.55, + "learning_rate": 1.382543994330932e-07, + "logits/chosen": -1.7471110820770264, + "logits/rejected": -2.0172762870788574, + "logps/chosen": -298.93609619140625, + "logps/rejected": -297.5482482910156, + "loss": 0.8843, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0852224826812744, + "rewards/margins": 1.1522774696350098, + "rewards/rejected": -3.237499952316284, + "step": 4694 + }, + { + "epoch": 0.55, + "learning_rate": 1.3821896775717492e-07, + "logits/chosen": -2.2259209156036377, + "logits/rejected": -2.430196762084961, + "logps/chosen": -203.9559326171875, + "logps/rejected": -199.2470703125, + "loss": 0.4614, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1949338912963867, + "rewards/margins": 2.9847564697265625, + "rewards/rejected": -4.179690361022949, + "step": 4695 + }, + { + "epoch": 0.55, + "learning_rate": 1.3818353608125664e-07, + "logits/chosen": -1.8091020584106445, + "logits/rejected": -1.9365923404693604, + "logps/chosen": -361.91192626953125, + "logps/rejected": -263.3755187988281, + "loss": 0.1276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5637113451957703, + "rewards/margins": 3.1464719772338867, + "rewards/rejected": -3.7101831436157227, + "step": 4696 + }, + { + "epoch": 0.55, + "learning_rate": 1.3814810440533836e-07, + "logits/chosen": -1.8597486019134521, + "logits/rejected": -2.0216007232666016, + "logps/chosen": -285.0445861816406, + "logps/rejected": -253.76336669921875, + "loss": 0.3094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.599220871925354, + "rewards/margins": 3.3847367763519287, + "rewards/rejected": -3.9839577674865723, + "step": 4697 + }, + { + "epoch": 0.55, + "learning_rate": 1.3811267272942008e-07, + "logits/chosen": -2.0665149688720703, + "logits/rejected": -2.2510690689086914, + "logps/chosen": -484.36590576171875, + "logps/rejected": -386.00677490234375, + "loss": 0.4242, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7342631816864014, + "rewards/margins": 1.4957157373428345, + "rewards/rejected": -2.2299790382385254, + "step": 4698 + }, + { + "epoch": 0.55, + "learning_rate": 1.3807724105350183e-07, + "logits/chosen": -2.6513190269470215, + "logits/rejected": -2.3595383167266846, + "logps/chosen": -214.05511474609375, + "logps/rejected": -370.12371826171875, + "loss": 0.3292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8504027128219604, + "rewards/margins": 3.853813409805298, + "rewards/rejected": -4.704216480255127, + "step": 4699 + }, + { + "epoch": 0.55, + "learning_rate": 1.3804180937758355e-07, + "logits/chosen": -2.69779634475708, + "logits/rejected": -2.609147548675537, + "logps/chosen": -289.087158203125, + "logps/rejected": -250.0310821533203, + "loss": 0.1946, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4147191047668457, + "rewards/margins": 2.46034574508667, + "rewards/rejected": -3.8750648498535156, + "step": 4700 + }, + { + "epoch": 0.55, + "learning_rate": 1.3800637770166527e-07, + "logits/chosen": -2.40474009513855, + "logits/rejected": -2.6059038639068604, + "logps/chosen": -146.51187133789062, + "logps/rejected": -143.2216033935547, + "loss": 0.782, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0190911293029785, + "rewards/margins": 0.3009237051010132, + "rewards/rejected": -2.3200149536132812, + "step": 4701 + }, + { + "epoch": 0.55, + "learning_rate": 1.37970946025747e-07, + "logits/chosen": -1.8882046937942505, + "logits/rejected": -1.9263839721679688, + "logps/chosen": -469.96246337890625, + "logps/rejected": -328.43328857421875, + "loss": 0.5794, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.505752444267273, + "rewards/margins": 0.8482300043106079, + "rewards/rejected": -2.353982448577881, + "step": 4702 + }, + { + "epoch": 0.55, + "learning_rate": 1.3793551434982875e-07, + "logits/chosen": -2.121264696121216, + "logits/rejected": -2.2644078731536865, + "logps/chosen": -405.9455871582031, + "logps/rejected": -291.00872802734375, + "loss": 0.3004, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8322147130966187, + "rewards/margins": 2.7103993892669678, + "rewards/rejected": -3.542613983154297, + "step": 4703 + }, + { + "epoch": 0.55, + "learning_rate": 1.3790008267391047e-07, + "logits/chosen": -1.8543821573257446, + "logits/rejected": -1.9413037300109863, + "logps/chosen": -197.53298950195312, + "logps/rejected": -193.8294677734375, + "loss": 0.3714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4261317849159241, + "rewards/margins": 1.0919054746627808, + "rewards/rejected": -1.5180373191833496, + "step": 4704 + }, + { + "epoch": 0.55, + "learning_rate": 1.3786465099799222e-07, + "logits/chosen": -2.7397310733795166, + "logits/rejected": -2.6462512016296387, + "logps/chosen": -422.5434265136719, + "logps/rejected": -268.18060302734375, + "loss": 0.7928, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.715246319770813, + "rewards/margins": 1.0085017681121826, + "rewards/rejected": -1.7237482070922852, + "step": 4705 + }, + { + "epoch": 0.55, + "learning_rate": 1.3782921932207394e-07, + "logits/chosen": -2.4241180419921875, + "logits/rejected": -2.1593668460845947, + "logps/chosen": -154.4390106201172, + "logps/rejected": -309.258544921875, + "loss": 0.36, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7173805236816406, + "rewards/margins": 1.622138500213623, + "rewards/rejected": -2.3395190238952637, + "step": 4706 + }, + { + "epoch": 0.55, + "learning_rate": 1.3779378764615566e-07, + "logits/chosen": -2.3804335594177246, + "logits/rejected": -2.7067556381225586, + "logps/chosen": -326.6612243652344, + "logps/rejected": -178.73138427734375, + "loss": 0.9157, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6315735578536987, + "rewards/margins": 1.2318180799484253, + "rewards/rejected": -2.863391637802124, + "step": 4707 + }, + { + "epoch": 0.55, + "learning_rate": 1.3775835597023738e-07, + "logits/chosen": -1.770754337310791, + "logits/rejected": -1.8549132347106934, + "logps/chosen": -521.8173828125, + "logps/rejected": -479.09674072265625, + "loss": 0.262, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.039337396621704, + "rewards/margins": 2.142674207687378, + "rewards/rejected": -3.182011604309082, + "step": 4708 + }, + { + "epoch": 0.55, + "learning_rate": 1.377229242943191e-07, + "logits/chosen": -2.6086838245391846, + "logits/rejected": -2.4712724685668945, + "logps/chosen": -66.51902770996094, + "logps/rejected": -156.0189208984375, + "loss": 0.2868, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0162909030914307, + "rewards/margins": 1.5565400123596191, + "rewards/rejected": -2.57283091545105, + "step": 4709 + }, + { + "epoch": 0.55, + "learning_rate": 1.3768749261840085e-07, + "logits/chosen": -2.367035150527954, + "logits/rejected": -2.553280830383301, + "logps/chosen": -342.42669677734375, + "logps/rejected": -290.5960998535156, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1148009300231934, + "rewards/margins": 2.8048911094665527, + "rewards/rejected": -3.919692039489746, + "step": 4710 + }, + { + "epoch": 0.55, + "learning_rate": 1.3765206094248258e-07, + "logits/chosen": -2.0284957885742188, + "logits/rejected": -1.9715826511383057, + "logps/chosen": -243.28414916992188, + "logps/rejected": -251.28651428222656, + "loss": 0.2181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5676429271697998, + "rewards/margins": 1.7687158584594727, + "rewards/rejected": -2.3363590240478516, + "step": 4711 + }, + { + "epoch": 0.55, + "learning_rate": 1.376166292665643e-07, + "logits/chosen": -2.0973639488220215, + "logits/rejected": -2.218979835510254, + "logps/chosen": -269.1494445800781, + "logps/rejected": -254.00726318359375, + "loss": 0.4269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6902115345001221, + "rewards/margins": 1.4208730459213257, + "rewards/rejected": -2.111084461212158, + "step": 4712 + }, + { + "epoch": 0.55, + "learning_rate": 1.3758119759064602e-07, + "logits/chosen": -2.340134382247925, + "logits/rejected": -2.2144041061401367, + "logps/chosen": -252.20388793945312, + "logps/rejected": -304.486572265625, + "loss": 0.6198, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1328516006469727, + "rewards/margins": 1.602946162223816, + "rewards/rejected": -2.735797643661499, + "step": 4713 + }, + { + "epoch": 0.55, + "learning_rate": 1.3754576591472777e-07, + "logits/chosen": -2.6191177368164062, + "logits/rejected": -2.5225486755371094, + "logps/chosen": -202.28863525390625, + "logps/rejected": -244.33389282226562, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9176826477050781, + "rewards/margins": 2.280834913253784, + "rewards/rejected": -4.198517799377441, + "step": 4714 + }, + { + "epoch": 0.55, + "learning_rate": 1.375103342388095e-07, + "logits/chosen": -2.647246837615967, + "logits/rejected": -2.380121946334839, + "logps/chosen": -420.21881103515625, + "logps/rejected": -450.28741455078125, + "loss": 0.0914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055417682975530624, + "rewards/margins": 3.6382675170898438, + "rewards/rejected": -3.5828499794006348, + "step": 4715 + }, + { + "epoch": 0.55, + "learning_rate": 1.3747490256289124e-07, + "logits/chosen": -1.604865312576294, + "logits/rejected": -2.0989575386047363, + "logps/chosen": -501.2213439941406, + "logps/rejected": -253.29586791992188, + "loss": 0.7798, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2217791080474854, + "rewards/margins": 1.272356390953064, + "rewards/rejected": -2.4941353797912598, + "step": 4716 + }, + { + "epoch": 0.55, + "learning_rate": 1.3743947088697296e-07, + "logits/chosen": -2.3008460998535156, + "logits/rejected": -2.3037962913513184, + "logps/chosen": -292.83856201171875, + "logps/rejected": -360.1231689453125, + "loss": 0.2687, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9344025254249573, + "rewards/margins": 2.109858274459839, + "rewards/rejected": -3.0442609786987305, + "step": 4717 + }, + { + "epoch": 0.55, + "learning_rate": 1.3740403921105468e-07, + "logits/chosen": -1.9269311428070068, + "logits/rejected": -1.9583978652954102, + "logps/chosen": -271.16510009765625, + "logps/rejected": -293.09326171875, + "loss": 0.2718, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1010171175003052, + "rewards/margins": 2.7805399894714355, + "rewards/rejected": -3.8815574645996094, + "step": 4718 + }, + { + "epoch": 0.55, + "learning_rate": 1.373686075351364e-07, + "logits/chosen": -2.4832842350006104, + "logits/rejected": -2.166980504989624, + "logps/chosen": -187.32382202148438, + "logps/rejected": -352.697021484375, + "loss": 0.3458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8647884130477905, + "rewards/margins": 2.2715275287628174, + "rewards/rejected": -3.1363158226013184, + "step": 4719 + }, + { + "epoch": 0.55, + "learning_rate": 1.3733317585921813e-07, + "logits/chosen": -2.352123975753784, + "logits/rejected": -2.5097413063049316, + "logps/chosen": -181.93553161621094, + "logps/rejected": -152.21847534179688, + "loss": 1.1801, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9667631387710571, + "rewards/margins": 0.5230333805084229, + "rewards/rejected": -2.4897966384887695, + "step": 4720 + }, + { + "epoch": 0.55, + "learning_rate": 1.3729774418329985e-07, + "logits/chosen": -2.164426803588867, + "logits/rejected": -1.9944628477096558, + "logps/chosen": -359.66937255859375, + "logps/rejected": -393.1251220703125, + "loss": 0.5241, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1965500116348267, + "rewards/margins": 0.8995264172554016, + "rewards/rejected": -2.096076488494873, + "step": 4721 + }, + { + "epoch": 0.55, + "learning_rate": 1.372623125073816e-07, + "logits/chosen": -2.9935872554779053, + "logits/rejected": -3.0012893676757812, + "logps/chosen": -99.36637878417969, + "logps/rejected": -197.3230743408203, + "loss": 0.2159, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11535173654556274, + "rewards/margins": 3.1650686264038086, + "rewards/rejected": -3.0497169494628906, + "step": 4722 + }, + { + "epoch": 0.55, + "learning_rate": 1.3722688083146332e-07, + "logits/chosen": -2.070338249206543, + "logits/rejected": -2.2104783058166504, + "logps/chosen": -357.1840515136719, + "logps/rejected": -304.708251953125, + "loss": 0.4139, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4379235804080963, + "rewards/margins": 1.8970017433166504, + "rewards/rejected": -2.334925413131714, + "step": 4723 + }, + { + "epoch": 0.55, + "learning_rate": 1.3719144915554504e-07, + "logits/chosen": -2.229290008544922, + "logits/rejected": -2.1809616088867188, + "logps/chosen": -360.4341125488281, + "logps/rejected": -275.5603332519531, + "loss": 0.3426, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06838201731443405, + "rewards/margins": 2.1629765033721924, + "rewards/rejected": -2.0945944786071777, + "step": 4724 + }, + { + "epoch": 0.55, + "learning_rate": 1.371560174796268e-07, + "logits/chosen": -2.532602310180664, + "logits/rejected": -2.4986557960510254, + "logps/chosen": -221.0703125, + "logps/rejected": -182.41732788085938, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8399113416671753, + "rewards/margins": 1.9128930568695068, + "rewards/rejected": -2.7528042793273926, + "step": 4725 + }, + { + "epoch": 0.55, + "learning_rate": 1.371205858037085e-07, + "logits/chosen": -2.1773502826690674, + "logits/rejected": -2.338958740234375, + "logps/chosen": -322.22442626953125, + "logps/rejected": -322.6492919921875, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.862139105796814, + "rewards/margins": 1.6975926160812378, + "rewards/rejected": -2.5597317218780518, + "step": 4726 + }, + { + "epoch": 0.55, + "learning_rate": 1.3708515412779024e-07, + "logits/chosen": -2.2198712825775146, + "logits/rejected": -2.6037635803222656, + "logps/chosen": -304.070556640625, + "logps/rejected": -226.2404327392578, + "loss": 0.5697, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0426892042160034, + "rewards/margins": 2.4267349243164062, + "rewards/rejected": -3.46942400932312, + "step": 4727 + }, + { + "epoch": 0.55, + "learning_rate": 1.3704972245187198e-07, + "logits/chosen": -3.0108132362365723, + "logits/rejected": -2.9294087886810303, + "logps/chosen": -197.01809692382812, + "logps/rejected": -132.1771240234375, + "loss": 0.9249, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.764293909072876, + "rewards/margins": 0.0008790642023086548, + "rewards/rejected": -1.7651731967926025, + "step": 4728 + }, + { + "epoch": 0.55, + "learning_rate": 1.370142907759537e-07, + "logits/chosen": -1.5378971099853516, + "logits/rejected": -2.147143840789795, + "logps/chosen": -494.5244140625, + "logps/rejected": -325.88848876953125, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5163771510124207, + "rewards/margins": 1.7954797744750977, + "rewards/rejected": -2.311856985092163, + "step": 4729 + }, + { + "epoch": 0.55, + "learning_rate": 1.3697885910003543e-07, + "logits/chosen": -1.9862951040267944, + "logits/rejected": -1.9674913883209229, + "logps/chosen": -266.73162841796875, + "logps/rejected": -300.3538818359375, + "loss": 0.8311, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9404716491699219, + "rewards/margins": 0.46428078413009644, + "rewards/rejected": -2.404752254486084, + "step": 4730 + }, + { + "epoch": 0.55, + "learning_rate": 1.3694342742411715e-07, + "logits/chosen": -2.472410202026367, + "logits/rejected": -2.536156415939331, + "logps/chosen": -448.71917724609375, + "logps/rejected": -346.9501037597656, + "loss": 0.1916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9298628568649292, + "rewards/margins": 1.9345213174819946, + "rewards/rejected": -2.864384174346924, + "step": 4731 + }, + { + "epoch": 0.55, + "learning_rate": 1.3690799574819887e-07, + "logits/chosen": -2.6315619945526123, + "logits/rejected": -2.526231527328491, + "logps/chosen": -118.54723358154297, + "logps/rejected": -164.55337524414062, + "loss": 0.438, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24835769832134247, + "rewards/margins": 0.9508744478225708, + "rewards/rejected": -1.1992321014404297, + "step": 4732 + }, + { + "epoch": 0.55, + "learning_rate": 1.368725640722806e-07, + "logits/chosen": -2.6464016437530518, + "logits/rejected": -2.5927212238311768, + "logps/chosen": -124.32131958007812, + "logps/rejected": -206.15338134765625, + "loss": 0.4494, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8903191685676575, + "rewards/margins": 1.5845801830291748, + "rewards/rejected": -2.4748992919921875, + "step": 4733 + }, + { + "epoch": 0.55, + "learning_rate": 1.3683713239636234e-07, + "logits/chosen": -1.7996280193328857, + "logits/rejected": -1.994588017463684, + "logps/chosen": -335.42291259765625, + "logps/rejected": -233.84210205078125, + "loss": 1.1187, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6044812202453613, + "rewards/margins": 0.3401778042316437, + "rewards/rejected": -1.9446592330932617, + "step": 4734 + }, + { + "epoch": 0.55, + "learning_rate": 1.3680170072044407e-07, + "logits/chosen": -2.220944404602051, + "logits/rejected": -2.4758520126342773, + "logps/chosen": -356.01873779296875, + "logps/rejected": -304.5627136230469, + "loss": 0.1366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4546217620372772, + "rewards/margins": 2.6972200870513916, + "rewards/rejected": -3.151841878890991, + "step": 4735 + }, + { + "epoch": 0.55, + "learning_rate": 1.367662690445258e-07, + "logits/chosen": -2.905578136444092, + "logits/rejected": -2.778278350830078, + "logps/chosen": -343.2167053222656, + "logps/rejected": -269.4004821777344, + "loss": 0.4228, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3371495008468628, + "rewards/margins": 1.0278897285461426, + "rewards/rejected": -2.365039348602295, + "step": 4736 + }, + { + "epoch": 0.55, + "learning_rate": 1.3673083736860754e-07, + "logits/chosen": -2.912095308303833, + "logits/rejected": -2.8371334075927734, + "logps/chosen": -344.013671875, + "logps/rejected": -376.49749755859375, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.872443675994873, + "rewards/margins": 2.715012550354004, + "rewards/rejected": -3.587456226348877, + "step": 4737 + }, + { + "epoch": 0.55, + "learning_rate": 1.3669540569268926e-07, + "logits/chosen": -2.9281985759735107, + "logits/rejected": -2.8419225215911865, + "logps/chosen": -478.85797119140625, + "logps/rejected": -256.4886474609375, + "loss": 0.4489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0417009592056274, + "rewards/margins": 2.2816970348358154, + "rewards/rejected": -3.3233981132507324, + "step": 4738 + }, + { + "epoch": 0.55, + "learning_rate": 1.3665997401677098e-07, + "logits/chosen": -1.8992586135864258, + "logits/rejected": -1.7674548625946045, + "logps/chosen": -416.49981689453125, + "logps/rejected": -396.7330322265625, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38935524225234985, + "rewards/margins": 2.3906056880950928, + "rewards/rejected": -2.779960870742798, + "step": 4739 + }, + { + "epoch": 0.55, + "learning_rate": 1.3662454234085273e-07, + "logits/chosen": -2.078162670135498, + "logits/rejected": -2.069307327270508, + "logps/chosen": -545.8004150390625, + "logps/rejected": -420.8433532714844, + "loss": 0.2558, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8821578025817871, + "rewards/margins": 1.8744664192199707, + "rewards/rejected": -2.756624221801758, + "step": 4740 + }, + { + "epoch": 0.55, + "learning_rate": 1.3658911066493445e-07, + "logits/chosen": -2.3431458473205566, + "logits/rejected": -2.387803554534912, + "logps/chosen": -176.67857360839844, + "logps/rejected": -194.964111328125, + "loss": 0.2211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5938167572021484, + "rewards/margins": 2.395796298980713, + "rewards/rejected": -2.9896132946014404, + "step": 4741 + }, + { + "epoch": 0.55, + "learning_rate": 1.3655367898901617e-07, + "logits/chosen": -2.518853187561035, + "logits/rejected": -2.461524724960327, + "logps/chosen": -139.9394989013672, + "logps/rejected": -213.30068969726562, + "loss": 0.1163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2897334098815918, + "rewards/margins": 3.6248629093170166, + "rewards/rejected": -4.914596080780029, + "step": 4742 + }, + { + "epoch": 0.55, + "learning_rate": 1.365182473130979e-07, + "logits/chosen": -2.4313161373138428, + "logits/rejected": -2.3159656524658203, + "logps/chosen": -251.19412231445312, + "logps/rejected": -220.88414001464844, + "loss": 0.2752, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0568633079528809, + "rewards/margins": 2.02512788772583, + "rewards/rejected": -3.081991195678711, + "step": 4743 + }, + { + "epoch": 0.55, + "learning_rate": 1.3648281563717962e-07, + "logits/chosen": -2.3864357471466064, + "logits/rejected": -2.261124610900879, + "logps/chosen": -267.0919189453125, + "logps/rejected": -232.5522918701172, + "loss": 0.4799, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3057079315185547, + "rewards/margins": 3.514192581176758, + "rewards/rejected": -4.8199005126953125, + "step": 4744 + }, + { + "epoch": 0.55, + "learning_rate": 1.3644738396126137e-07, + "logits/chosen": -2.3787012100219727, + "logits/rejected": -2.3395094871520996, + "logps/chosen": -430.6835021972656, + "logps/rejected": -283.790283203125, + "loss": 0.2827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6828256845474243, + "rewards/margins": 2.3110737800598145, + "rewards/rejected": -2.993899345397949, + "step": 4745 + }, + { + "epoch": 0.55, + "learning_rate": 1.364119522853431e-07, + "logits/chosen": -1.7207090854644775, + "logits/rejected": -1.6952297687530518, + "logps/chosen": -426.17578125, + "logps/rejected": -274.68133544921875, + "loss": 0.5377, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6128113269805908, + "rewards/margins": 1.0564544200897217, + "rewards/rejected": -1.6692657470703125, + "step": 4746 + }, + { + "epoch": 0.55, + "learning_rate": 1.363765206094248e-07, + "logits/chosen": -2.077094793319702, + "logits/rejected": -2.339543581008911, + "logps/chosen": -223.3112030029297, + "logps/rejected": -96.54255676269531, + "loss": 0.7157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44476884603500366, + "rewards/margins": 0.5776172876358032, + "rewards/rejected": -1.022386074066162, + "step": 4747 + }, + { + "epoch": 0.55, + "learning_rate": 1.3634108893350656e-07, + "logits/chosen": -1.8230571746826172, + "logits/rejected": -1.8413864374160767, + "logps/chosen": -512.7545166015625, + "logps/rejected": -464.5118408203125, + "loss": 0.3476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9805612564086914, + "rewards/margins": 2.1053879261016846, + "rewards/rejected": -3.085949182510376, + "step": 4748 + }, + { + "epoch": 0.55, + "learning_rate": 1.3630565725758828e-07, + "logits/chosen": -2.8873627185821533, + "logits/rejected": -2.7339439392089844, + "logps/chosen": -261.1265869140625, + "logps/rejected": -279.05743408203125, + "loss": 0.4337, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8882686495780945, + "rewards/margins": 2.327861785888672, + "rewards/rejected": -3.216130495071411, + "step": 4749 + }, + { + "epoch": 0.55, + "learning_rate": 1.3627022558167e-07, + "logits/chosen": -2.30623197555542, + "logits/rejected": -2.5109710693359375, + "logps/chosen": -171.9878692626953, + "logps/rejected": -147.439208984375, + "loss": 0.6633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5696159601211548, + "rewards/margins": 0.6513950824737549, + "rewards/rejected": -1.2210111618041992, + "step": 4750 + }, + { + "epoch": 0.55, + "learning_rate": 1.3623479390575175e-07, + "logits/chosen": -2.2310564517974854, + "logits/rejected": -2.0480871200561523, + "logps/chosen": -239.11392211914062, + "logps/rejected": -265.633544921875, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5356967449188232, + "rewards/margins": 1.997672438621521, + "rewards/rejected": -2.5333690643310547, + "step": 4751 + }, + { + "epoch": 0.55, + "learning_rate": 1.3619936222983347e-07, + "logits/chosen": -2.2992911338806152, + "logits/rejected": -2.5622501373291016, + "logps/chosen": -367.2425842285156, + "logps/rejected": -227.37042236328125, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2531914710998535, + "rewards/margins": 2.030552864074707, + "rewards/rejected": -2.2837443351745605, + "step": 4752 + }, + { + "epoch": 0.55, + "learning_rate": 1.361639305539152e-07, + "logits/chosen": -2.5114846229553223, + "logits/rejected": -2.123502492904663, + "logps/chosen": -174.34194946289062, + "logps/rejected": -305.5697326660156, + "loss": 0.5361, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3875906467437744, + "rewards/margins": 1.8313299417495728, + "rewards/rejected": -3.2189204692840576, + "step": 4753 + }, + { + "epoch": 0.55, + "learning_rate": 1.3612849887799692e-07, + "logits/chosen": -2.238067865371704, + "logits/rejected": -2.1840944290161133, + "logps/chosen": -224.88929748535156, + "logps/rejected": -303.97052001953125, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4682645797729492, + "rewards/margins": 2.753000020980835, + "rewards/rejected": -3.2212648391723633, + "step": 4754 + }, + { + "epoch": 0.55, + "learning_rate": 1.3609306720207864e-07, + "logits/chosen": -2.536285877227783, + "logits/rejected": -2.688255786895752, + "logps/chosen": -292.4500427246094, + "logps/rejected": -170.01797485351562, + "loss": 0.4144, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0263276100158691, + "rewards/margins": 1.9013323783874512, + "rewards/rejected": -2.9276599884033203, + "step": 4755 + }, + { + "epoch": 0.55, + "learning_rate": 1.3605763552616036e-07, + "logits/chosen": -2.3432910442352295, + "logits/rejected": -2.406921863555908, + "logps/chosen": -202.25233459472656, + "logps/rejected": -272.17340087890625, + "loss": 0.2041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37929433584213257, + "rewards/margins": 3.773375988006592, + "rewards/rejected": -4.152669906616211, + "step": 4756 + }, + { + "epoch": 0.55, + "learning_rate": 1.360222038502421e-07, + "logits/chosen": -2.0680954456329346, + "logits/rejected": -2.0888071060180664, + "logps/chosen": -401.71368408203125, + "logps/rejected": -423.43536376953125, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6841655969619751, + "rewards/margins": 2.5926270484924316, + "rewards/rejected": -3.2767927646636963, + "step": 4757 + }, + { + "epoch": 0.55, + "learning_rate": 1.3598677217432383e-07, + "logits/chosen": -2.6374566555023193, + "logits/rejected": -2.5904271602630615, + "logps/chosen": -162.10430908203125, + "logps/rejected": -238.37954711914062, + "loss": 0.3869, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4259127378463745, + "rewards/margins": 2.817147970199585, + "rewards/rejected": -4.24306058883667, + "step": 4758 + }, + { + "epoch": 0.55, + "learning_rate": 1.3595134049840558e-07, + "logits/chosen": -2.7538092136383057, + "logits/rejected": -2.74617338180542, + "logps/chosen": -102.0278549194336, + "logps/rejected": -153.0843505859375, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0910463333129883, + "rewards/margins": 2.8075437545776367, + "rewards/rejected": -3.898589849472046, + "step": 4759 + }, + { + "epoch": 0.55, + "learning_rate": 1.359159088224873e-07, + "logits/chosen": -2.9400899410247803, + "logits/rejected": -2.6833057403564453, + "logps/chosen": -284.5726318359375, + "logps/rejected": -229.6984405517578, + "loss": 0.2192, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8604476451873779, + "rewards/margins": 2.701650381088257, + "rewards/rejected": -3.5620980262756348, + "step": 4760 + }, + { + "epoch": 0.55, + "learning_rate": 1.3588047714656903e-07, + "logits/chosen": -2.5903029441833496, + "logits/rejected": -2.21964168548584, + "logps/chosen": -93.775634765625, + "logps/rejected": -223.4354248046875, + "loss": 0.2285, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7259180545806885, + "rewards/margins": 3.0232691764831543, + "rewards/rejected": -3.749187469482422, + "step": 4761 + }, + { + "epoch": 0.55, + "learning_rate": 1.3584504547065075e-07, + "logits/chosen": -2.806459426879883, + "logits/rejected": -2.6126503944396973, + "logps/chosen": -295.15472412109375, + "logps/rejected": -313.27337646484375, + "loss": 0.269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7095727920532227, + "rewards/margins": 2.582773208618164, + "rewards/rejected": -3.292346239089966, + "step": 4762 + }, + { + "epoch": 0.55, + "learning_rate": 1.358096137947325e-07, + "logits/chosen": -2.0713870525360107, + "logits/rejected": -2.4536073207855225, + "logps/chosen": -366.06610107421875, + "logps/rejected": -269.1665344238281, + "loss": 0.4244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8157504796981812, + "rewards/margins": 2.530465602874756, + "rewards/rejected": -3.3462159633636475, + "step": 4763 + }, + { + "epoch": 0.55, + "learning_rate": 1.3577418211881422e-07, + "logits/chosen": -2.917710065841675, + "logits/rejected": -2.874675989151001, + "logps/chosen": -202.31622314453125, + "logps/rejected": -312.83831787109375, + "loss": 0.1906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.237585425376892, + "rewards/margins": 2.4143404960632324, + "rewards/rejected": -3.651926040649414, + "step": 4764 + }, + { + "epoch": 0.55, + "learning_rate": 1.3573875044289594e-07, + "logits/chosen": -2.169966220855713, + "logits/rejected": -2.217585563659668, + "logps/chosen": -247.94467163085938, + "logps/rejected": -266.3641357421875, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3053717613220215, + "rewards/margins": 0.8699508905410767, + "rewards/rejected": -2.1753227710723877, + "step": 4765 + }, + { + "epoch": 0.55, + "learning_rate": 1.3570331876697766e-07, + "logits/chosen": -2.398695707321167, + "logits/rejected": -2.5150675773620605, + "logps/chosen": -229.18109130859375, + "logps/rejected": -276.24615478515625, + "loss": 0.3697, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5252599120140076, + "rewards/margins": 2.5795392990112305, + "rewards/rejected": -3.1047987937927246, + "step": 4766 + }, + { + "epoch": 0.55, + "learning_rate": 1.3566788709105938e-07, + "logits/chosen": -1.8994649648666382, + "logits/rejected": -1.733847737312317, + "logps/chosen": -327.17645263671875, + "logps/rejected": -403.698486328125, + "loss": 0.1568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6792294979095459, + "rewards/margins": 2.6266674995422363, + "rewards/rejected": -3.3058969974517822, + "step": 4767 + }, + { + "epoch": 0.55, + "learning_rate": 1.3563245541514113e-07, + "logits/chosen": -2.5133297443389893, + "logits/rejected": -2.310063362121582, + "logps/chosen": -200.16807556152344, + "logps/rejected": -413.283203125, + "loss": 0.3174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5755150318145752, + "rewards/margins": 3.5376152992248535, + "rewards/rejected": -4.113130569458008, + "step": 4768 + }, + { + "epoch": 0.55, + "learning_rate": 1.3559702373922286e-07, + "logits/chosen": -2.288316249847412, + "logits/rejected": -2.5732479095458984, + "logps/chosen": -516.8251953125, + "logps/rejected": -342.52630615234375, + "loss": 0.2207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3692968487739563, + "rewards/margins": 3.4012653827667236, + "rewards/rejected": -3.770561933517456, + "step": 4769 + }, + { + "epoch": 0.55, + "learning_rate": 1.355615920633046e-07, + "logits/chosen": -2.344167470932007, + "logits/rejected": -2.4296417236328125, + "logps/chosen": -193.3877716064453, + "logps/rejected": -227.2036895751953, + "loss": 0.2942, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7753921747207642, + "rewards/margins": 1.6635266542434692, + "rewards/rejected": -3.4389188289642334, + "step": 4770 + }, + { + "epoch": 0.56, + "learning_rate": 1.3552616038738633e-07, + "logits/chosen": -1.6669954061508179, + "logits/rejected": -1.824260950088501, + "logps/chosen": -262.6709899902344, + "logps/rejected": -247.2725830078125, + "loss": 0.6974, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8396344184875488, + "rewards/margins": 1.2455623149871826, + "rewards/rejected": -3.0851964950561523, + "step": 4771 + }, + { + "epoch": 0.56, + "learning_rate": 1.3549072871146805e-07, + "logits/chosen": -2.3296360969543457, + "logits/rejected": -2.1609036922454834, + "logps/chosen": -272.47064208984375, + "logps/rejected": -259.47625732421875, + "loss": 0.42, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.517580270767212, + "rewards/margins": 1.9313591718673706, + "rewards/rejected": -3.448939561843872, + "step": 4772 + }, + { + "epoch": 0.56, + "learning_rate": 1.3545529703554977e-07, + "logits/chosen": -2.0885562896728516, + "logits/rejected": -2.075273036956787, + "logps/chosen": -419.1817321777344, + "logps/rejected": -517.1770629882812, + "loss": 0.1955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3954195976257324, + "rewards/margins": 3.0735950469970703, + "rewards/rejected": -3.4690146446228027, + "step": 4773 + }, + { + "epoch": 0.56, + "learning_rate": 1.354198653596315e-07, + "logits/chosen": -1.9208297729492188, + "logits/rejected": -1.8786579370498657, + "logps/chosen": -452.5075378417969, + "logps/rejected": -447.77264404296875, + "loss": 0.5056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6049402952194214, + "rewards/margins": 2.9190948009490967, + "rewards/rejected": -3.5240354537963867, + "step": 4774 + }, + { + "epoch": 0.56, + "learning_rate": 1.3538443368371324e-07, + "logits/chosen": -2.519357442855835, + "logits/rejected": -2.3466873168945312, + "logps/chosen": -233.63198852539062, + "logps/rejected": -249.32061767578125, + "loss": 0.129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7203342914581299, + "rewards/margins": 3.793567180633545, + "rewards/rejected": -4.513901710510254, + "step": 4775 + }, + { + "epoch": 0.56, + "learning_rate": 1.3534900200779496e-07, + "logits/chosen": -1.8717317581176758, + "logits/rejected": -2.5876221656799316, + "logps/chosen": -420.82080078125, + "logps/rejected": -180.8782958984375, + "loss": 0.4866, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7359131574630737, + "rewards/margins": 1.2096350193023682, + "rewards/rejected": -2.9455482959747314, + "step": 4776 + }, + { + "epoch": 0.56, + "learning_rate": 1.3531357033187669e-07, + "logits/chosen": -2.190742015838623, + "logits/rejected": -2.647852897644043, + "logps/chosen": -365.7681884765625, + "logps/rejected": -149.70355224609375, + "loss": 0.3357, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9764113426208496, + "rewards/margins": 1.548378825187683, + "rewards/rejected": -2.5247902870178223, + "step": 4777 + }, + { + "epoch": 0.56, + "learning_rate": 1.352781386559584e-07, + "logits/chosen": -2.2249386310577393, + "logits/rejected": -2.38775372505188, + "logps/chosen": -311.9693908691406, + "logps/rejected": -337.367919921875, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015069536864757538, + "rewards/margins": 2.728260040283203, + "rewards/rejected": -2.7433295249938965, + "step": 4778 + }, + { + "epoch": 0.56, + "learning_rate": 1.3524270698004016e-07, + "logits/chosen": -2.4999024868011475, + "logits/rejected": -2.50923490524292, + "logps/chosen": -427.73797607421875, + "logps/rejected": -306.4647521972656, + "loss": 0.543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5849019289016724, + "rewards/margins": 0.9016056060791016, + "rewards/rejected": -1.486507534980774, + "step": 4779 + }, + { + "epoch": 0.56, + "learning_rate": 1.3520727530412188e-07, + "logits/chosen": -2.5646355152130127, + "logits/rejected": -2.732895851135254, + "logps/chosen": -287.9400634765625, + "logps/rejected": -222.17672729492188, + "loss": 0.1746, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2416011095046997, + "rewards/margins": 2.3624701499938965, + "rewards/rejected": -3.6040711402893066, + "step": 4780 + }, + { + "epoch": 0.56, + "learning_rate": 1.351718436282036e-07, + "logits/chosen": -2.016615390777588, + "logits/rejected": -1.7967348098754883, + "logps/chosen": -378.30487060546875, + "logps/rejected": -256.3493347167969, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31932157278060913, + "rewards/margins": 3.3373537063598633, + "rewards/rejected": -3.656675338745117, + "step": 4781 + }, + { + "epoch": 0.56, + "learning_rate": 1.3513641195228535e-07, + "logits/chosen": -2.447911262512207, + "logits/rejected": -2.3912229537963867, + "logps/chosen": -186.66824340820312, + "logps/rejected": -194.14654541015625, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.300320029258728, + "rewards/margins": 3.0581483840942383, + "rewards/rejected": -3.3584680557250977, + "step": 4782 + }, + { + "epoch": 0.56, + "learning_rate": 1.3510098027636707e-07, + "logits/chosen": -2.2296741008758545, + "logits/rejected": -2.618868112564087, + "logps/chosen": -329.87921142578125, + "logps/rejected": -176.23382568359375, + "loss": 0.4824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5391423106193542, + "rewards/margins": 1.2522938251495361, + "rewards/rejected": -1.7914361953735352, + "step": 4783 + }, + { + "epoch": 0.56, + "learning_rate": 1.350655486004488e-07, + "logits/chosen": -2.0495851039886475, + "logits/rejected": -1.8757215738296509, + "logps/chosen": -206.16131591796875, + "logps/rejected": -252.47427368164062, + "loss": 0.3389, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2968755960464478, + "rewards/margins": 2.275651216506958, + "rewards/rejected": -3.572526693344116, + "step": 4784 + }, + { + "epoch": 0.56, + "learning_rate": 1.3503011692453052e-07, + "logits/chosen": -2.601973533630371, + "logits/rejected": -2.4654483795166016, + "logps/chosen": -182.08126831054688, + "logps/rejected": -233.82476806640625, + "loss": 0.4083, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5412182807922363, + "rewards/margins": 3.332303047180176, + "rewards/rejected": -3.873521327972412, + "step": 4785 + }, + { + "epoch": 0.56, + "learning_rate": 1.3499468524861226e-07, + "logits/chosen": -1.9240566492080688, + "logits/rejected": -2.2552785873413086, + "logps/chosen": -363.001953125, + "logps/rejected": -204.51611328125, + "loss": 0.4472, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6791923642158508, + "rewards/margins": 1.8439723253250122, + "rewards/rejected": -2.523164749145508, + "step": 4786 + }, + { + "epoch": 0.56, + "learning_rate": 1.3495925357269399e-07, + "logits/chosen": -2.605337381362915, + "logits/rejected": -2.6551809310913086, + "logps/chosen": -328.428955078125, + "logps/rejected": -267.11083984375, + "loss": 0.3106, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6002723574638367, + "rewards/margins": 2.5191709995269775, + "rewards/rejected": -3.119443416595459, + "step": 4787 + }, + { + "epoch": 0.56, + "learning_rate": 1.349238218967757e-07, + "logits/chosen": -2.5228638648986816, + "logits/rejected": -2.4802961349487305, + "logps/chosen": -201.95742797851562, + "logps/rejected": -264.07879638671875, + "loss": 0.8355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3274090886116028, + "rewards/margins": 2.3236615657806396, + "rewards/rejected": -2.6510705947875977, + "step": 4788 + }, + { + "epoch": 0.56, + "learning_rate": 1.3488839022085743e-07, + "logits/chosen": -2.6290879249572754, + "logits/rejected": -2.689253091812134, + "logps/chosen": -314.3952331542969, + "logps/rejected": -363.89154052734375, + "loss": 0.5544, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8368784189224243, + "rewards/margins": 2.710062026977539, + "rewards/rejected": -3.546940326690674, + "step": 4789 + }, + { + "epoch": 0.56, + "learning_rate": 1.3485295854493918e-07, + "logits/chosen": -2.65145206451416, + "logits/rejected": -2.515990734100342, + "logps/chosen": -100.15419006347656, + "logps/rejected": -181.13455200195312, + "loss": 0.24, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6860624551773071, + "rewards/margins": 3.311164140701294, + "rewards/rejected": -3.9972264766693115, + "step": 4790 + }, + { + "epoch": 0.56, + "learning_rate": 1.348175268690209e-07, + "logits/chosen": -2.135190010070801, + "logits/rejected": -2.3148250579833984, + "logps/chosen": -328.19061279296875, + "logps/rejected": -286.62579345703125, + "loss": 0.3627, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45264920592308044, + "rewards/margins": 1.4107916355133057, + "rewards/rejected": -1.863440990447998, + "step": 4791 + }, + { + "epoch": 0.56, + "learning_rate": 1.3478209519310262e-07, + "logits/chosen": -2.938114881515503, + "logits/rejected": -2.871096134185791, + "logps/chosen": -220.356689453125, + "logps/rejected": -168.50709533691406, + "loss": 0.6429, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.837807297706604, + "rewards/margins": 0.6550407409667969, + "rewards/rejected": -1.4928481578826904, + "step": 4792 + }, + { + "epoch": 0.56, + "learning_rate": 1.3474666351718437e-07, + "logits/chosen": -2.187373161315918, + "logits/rejected": -2.391545057296753, + "logps/chosen": -363.452880859375, + "logps/rejected": -248.85845947265625, + "loss": 0.7714, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3433905839920044, + "rewards/margins": 1.3724381923675537, + "rewards/rejected": -2.7158284187316895, + "step": 4793 + }, + { + "epoch": 0.56, + "learning_rate": 1.347112318412661e-07, + "logits/chosen": -2.000432014465332, + "logits/rejected": -1.950591802597046, + "logps/chosen": -194.1404571533203, + "logps/rejected": -262.76641845703125, + "loss": 0.5324, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1445708274841309, + "rewards/margins": 2.57920241355896, + "rewards/rejected": -3.723773241043091, + "step": 4794 + }, + { + "epoch": 0.56, + "learning_rate": 1.3467580016534782e-07, + "logits/chosen": -2.598007917404175, + "logits/rejected": -2.4690425395965576, + "logps/chosen": -495.1966552734375, + "logps/rejected": -379.8099365234375, + "loss": 0.204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9442806839942932, + "rewards/margins": 4.1000847816467285, + "rewards/rejected": -5.044365882873535, + "step": 4795 + }, + { + "epoch": 0.56, + "learning_rate": 1.3464036848942954e-07, + "logits/chosen": -2.4276440143585205, + "logits/rejected": -2.5586822032928467, + "logps/chosen": -243.9134063720703, + "logps/rejected": -232.67581176757812, + "loss": 0.4682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9979997873306274, + "rewards/margins": 1.949330449104309, + "rewards/rejected": -2.9473302364349365, + "step": 4796 + }, + { + "epoch": 0.56, + "learning_rate": 1.3460493681351126e-07, + "logits/chosen": -3.004756450653076, + "logits/rejected": -2.931865930557251, + "logps/chosen": -304.8893127441406, + "logps/rejected": -187.06324768066406, + "loss": 0.3823, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9813922643661499, + "rewards/margins": 1.6206156015396118, + "rewards/rejected": -2.6020078659057617, + "step": 4797 + }, + { + "epoch": 0.56, + "learning_rate": 1.34569505137593e-07, + "logits/chosen": -2.0442163944244385, + "logits/rejected": -1.897104024887085, + "logps/chosen": -396.56964111328125, + "logps/rejected": -377.3663330078125, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1797077655792236, + "rewards/margins": 3.4593849182128906, + "rewards/rejected": -4.639092922210693, + "step": 4798 + }, + { + "epoch": 0.56, + "learning_rate": 1.3453407346167473e-07, + "logits/chosen": -2.215677261352539, + "logits/rejected": -2.229221820831299, + "logps/chosen": -306.0623474121094, + "logps/rejected": -232.78497314453125, + "loss": 0.5427, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6663211584091187, + "rewards/margins": 1.0750013589859009, + "rewards/rejected": -2.7413225173950195, + "step": 4799 + }, + { + "epoch": 0.56, + "learning_rate": 1.3449864178575645e-07, + "logits/chosen": -1.8940424919128418, + "logits/rejected": -2.1272988319396973, + "logps/chosen": -351.45703125, + "logps/rejected": -211.61441040039062, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5950030088424683, + "rewards/margins": 1.6930532455444336, + "rewards/rejected": -3.2880563735961914, + "step": 4800 + }, + { + "epoch": 0.56, + "learning_rate": 1.3446321010983818e-07, + "logits/chosen": -1.9469788074493408, + "logits/rejected": -1.5836882591247559, + "logps/chosen": -257.44671630859375, + "logps/rejected": -432.7026672363281, + "loss": 0.3106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7608668208122253, + "rewards/margins": 1.80302894115448, + "rewards/rejected": -2.5638957023620605, + "step": 4801 + }, + { + "epoch": 0.56, + "learning_rate": 1.3442777843391992e-07, + "logits/chosen": -1.705657720565796, + "logits/rejected": -2.434072256088257, + "logps/chosen": -673.6923828125, + "logps/rejected": -386.9703369140625, + "loss": 0.5533, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3115854263305664, + "rewards/margins": 1.8436658382415771, + "rewards/rejected": -3.1552510261535645, + "step": 4802 + }, + { + "epoch": 0.56, + "learning_rate": 1.3439234675800165e-07, + "logits/chosen": -2.311337947845459, + "logits/rejected": -2.7397637367248535, + "logps/chosen": -347.7208251953125, + "logps/rejected": -285.2784729003906, + "loss": 0.5595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8334397077560425, + "rewards/margins": 1.8213317394256592, + "rewards/rejected": -3.654771327972412, + "step": 4803 + }, + { + "epoch": 0.56, + "learning_rate": 1.343569150820834e-07, + "logits/chosen": -1.9491355419158936, + "logits/rejected": -2.209559917449951, + "logps/chosen": -436.05426025390625, + "logps/rejected": -305.9840393066406, + "loss": 0.2113, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4000500738620758, + "rewards/margins": 2.8863682746887207, + "rewards/rejected": -3.2864181995391846, + "step": 4804 + }, + { + "epoch": 0.56, + "learning_rate": 1.3432148340616512e-07, + "logits/chosen": -1.976977825164795, + "logits/rejected": -1.8404932022094727, + "logps/chosen": -285.21246337890625, + "logps/rejected": -340.08251953125, + "loss": 0.4755, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.107237696647644, + "rewards/margins": 1.2450755834579468, + "rewards/rejected": -2.352313280105591, + "step": 4805 + }, + { + "epoch": 0.56, + "learning_rate": 1.3428605173024684e-07, + "logits/chosen": -2.2803890705108643, + "logits/rejected": -2.1740174293518066, + "logps/chosen": -198.27061462402344, + "logps/rejected": -295.6157531738281, + "loss": 0.5108, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5683337450027466, + "rewards/margins": 1.9810936450958252, + "rewards/rejected": -2.549427032470703, + "step": 4806 + }, + { + "epoch": 0.56, + "learning_rate": 1.3425062005432856e-07, + "logits/chosen": -2.4764933586120605, + "logits/rejected": -2.278775453567505, + "logps/chosen": -293.0487365722656, + "logps/rejected": -266.87969970703125, + "loss": 0.3205, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7209728956222534, + "rewards/margins": 1.8965500593185425, + "rewards/rejected": -2.617522954940796, + "step": 4807 + }, + { + "epoch": 0.56, + "learning_rate": 1.3421518837841028e-07, + "logits/chosen": -2.81520414352417, + "logits/rejected": -2.5558152198791504, + "logps/chosen": -125.82276916503906, + "logps/rejected": -222.4290771484375, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.679972231388092, + "rewards/margins": 2.7240750789642334, + "rewards/rejected": -3.4040474891662598, + "step": 4808 + }, + { + "epoch": 0.56, + "learning_rate": 1.34179756702492e-07, + "logits/chosen": -2.25893497467041, + "logits/rejected": -2.5644800662994385, + "logps/chosen": -239.9957275390625, + "logps/rejected": -234.2500762939453, + "loss": 0.323, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46591848134994507, + "rewards/margins": 1.7638959884643555, + "rewards/rejected": -2.229814291000366, + "step": 4809 + }, + { + "epoch": 0.56, + "learning_rate": 1.3414432502657375e-07, + "logits/chosen": -2.82350754737854, + "logits/rejected": -2.8753676414489746, + "logps/chosen": -278.1351013183594, + "logps/rejected": -324.8166198730469, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23239001631736755, + "rewards/margins": 2.864569664001465, + "rewards/rejected": -3.0969595909118652, + "step": 4810 + }, + { + "epoch": 0.56, + "learning_rate": 1.3410889335065548e-07, + "logits/chosen": -2.4252471923828125, + "logits/rejected": -2.318361282348633, + "logps/chosen": -227.4571533203125, + "logps/rejected": -326.63665771484375, + "loss": 0.4243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1358206272125244, + "rewards/margins": 2.5583138465881348, + "rewards/rejected": -3.694134473800659, + "step": 4811 + }, + { + "epoch": 0.56, + "learning_rate": 1.340734616747372e-07, + "logits/chosen": -2.5086593627929688, + "logits/rejected": -2.4786484241485596, + "logps/chosen": -269.8622741699219, + "logps/rejected": -278.1499938964844, + "loss": 0.5303, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7806077599525452, + "rewards/margins": 2.248934268951416, + "rewards/rejected": -3.0295422077178955, + "step": 4812 + }, + { + "epoch": 0.56, + "learning_rate": 1.3403802999881895e-07, + "logits/chosen": -2.1105756759643555, + "logits/rejected": -2.3248865604400635, + "logps/chosen": -411.33251953125, + "logps/rejected": -215.0321044921875, + "loss": 0.4388, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7294637560844421, + "rewards/margins": 1.6476856470108032, + "rewards/rejected": -2.3771495819091797, + "step": 4813 + }, + { + "epoch": 0.56, + "learning_rate": 1.3400259832290067e-07, + "logits/chosen": -2.7094483375549316, + "logits/rejected": -2.8354251384735107, + "logps/chosen": -122.00049591064453, + "logps/rejected": -134.54893493652344, + "loss": 0.2746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5770488977432251, + "rewards/margins": 1.9796063899993896, + "rewards/rejected": -2.556655168533325, + "step": 4814 + }, + { + "epoch": 0.56, + "learning_rate": 1.339671666469824e-07, + "logits/chosen": -2.7510125637054443, + "logits/rejected": -2.75443434715271, + "logps/chosen": -324.27734375, + "logps/rejected": -222.93609619140625, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40036338567733765, + "rewards/margins": 2.5495941638946533, + "rewards/rejected": -2.9499573707580566, + "step": 4815 + }, + { + "epoch": 0.56, + "learning_rate": 1.3393173497106414e-07, + "logits/chosen": -2.780465602874756, + "logits/rejected": -2.619685649871826, + "logps/chosen": -72.43958282470703, + "logps/rejected": -139.71014404296875, + "loss": 0.3446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4918566644191742, + "rewards/margins": 1.4393720626831055, + "rewards/rejected": -1.931228756904602, + "step": 4816 + }, + { + "epoch": 0.56, + "learning_rate": 1.3389630329514586e-07, + "logits/chosen": -2.473820209503174, + "logits/rejected": -2.506575345993042, + "logps/chosen": -180.6910400390625, + "logps/rejected": -273.0887145996094, + "loss": 0.301, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0271449089050293, + "rewards/margins": 1.8369653224945068, + "rewards/rejected": -2.8641104698181152, + "step": 4817 + }, + { + "epoch": 0.56, + "learning_rate": 1.3386087161922758e-07, + "logits/chosen": -2.257101535797119, + "logits/rejected": -2.4931561946868896, + "logps/chosen": -180.65956115722656, + "logps/rejected": -288.21527099609375, + "loss": 0.3151, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3197062015533447, + "rewards/margins": 3.4818365573883057, + "rewards/rejected": -4.80154275894165, + "step": 4818 + }, + { + "epoch": 0.56, + "learning_rate": 1.338254399433093e-07, + "logits/chosen": -2.418492317199707, + "logits/rejected": -2.545842170715332, + "logps/chosen": -227.50601196289062, + "logps/rejected": -240.3156280517578, + "loss": 0.399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2040722668170929, + "rewards/margins": 2.158360481262207, + "rewards/rejected": -2.3624327182769775, + "step": 4819 + }, + { + "epoch": 0.56, + "learning_rate": 1.3379000826739103e-07, + "logits/chosen": -2.0776755809783936, + "logits/rejected": -2.0328803062438965, + "logps/chosen": -154.8576202392578, + "logps/rejected": -282.4368591308594, + "loss": 0.7125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3417311906814575, + "rewards/margins": 2.869412660598755, + "rewards/rejected": -4.211143970489502, + "step": 4820 + }, + { + "epoch": 0.56, + "learning_rate": 1.3375457659147278e-07, + "logits/chosen": -2.480557441711426, + "logits/rejected": -2.1764285564422607, + "logps/chosen": -295.97210693359375, + "logps/rejected": -312.1411437988281, + "loss": 0.1127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5396378636360168, + "rewards/margins": 3.5547091960906982, + "rewards/rejected": -4.09434700012207, + "step": 4821 + }, + { + "epoch": 0.56, + "learning_rate": 1.337191449155545e-07, + "logits/chosen": -2.230681896209717, + "logits/rejected": -2.3632616996765137, + "logps/chosen": -169.21902465820312, + "logps/rejected": -159.06396484375, + "loss": 0.4943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6703957319259644, + "rewards/margins": 1.2191112041473389, + "rewards/rejected": -1.8895070552825928, + "step": 4822 + }, + { + "epoch": 0.56, + "learning_rate": 1.3368371323963622e-07, + "logits/chosen": -1.9889390468597412, + "logits/rejected": -2.282945156097412, + "logps/chosen": -345.9639892578125, + "logps/rejected": -224.836181640625, + "loss": 0.2828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23787352442741394, + "rewards/margins": 2.421614646911621, + "rewards/rejected": -2.6594882011413574, + "step": 4823 + }, + { + "epoch": 0.56, + "learning_rate": 1.3364828156371797e-07, + "logits/chosen": -2.402397394180298, + "logits/rejected": -2.666292667388916, + "logps/chosen": -304.6971130371094, + "logps/rejected": -269.91107177734375, + "loss": 0.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5197363495826721, + "rewards/margins": 2.6802635192871094, + "rewards/rejected": -3.1999998092651367, + "step": 4824 + }, + { + "epoch": 0.56, + "learning_rate": 1.336128498877997e-07, + "logits/chosen": -2.1652965545654297, + "logits/rejected": -1.5787073373794556, + "logps/chosen": -247.16275024414062, + "logps/rejected": -412.2457275390625, + "loss": 0.3239, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1093870401382446, + "rewards/margins": 2.5163068771362305, + "rewards/rejected": -3.6256937980651855, + "step": 4825 + }, + { + "epoch": 0.56, + "learning_rate": 1.3357741821188141e-07, + "logits/chosen": -2.1708731651306152, + "logits/rejected": -1.7457194328308105, + "logps/chosen": -249.27047729492188, + "logps/rejected": -324.54345703125, + "loss": 0.8201, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.219892144203186, + "rewards/margins": 2.578939199447632, + "rewards/rejected": -3.7988312244415283, + "step": 4826 + }, + { + "epoch": 0.56, + "learning_rate": 1.3354198653596316e-07, + "logits/chosen": -2.128981590270996, + "logits/rejected": -2.3160088062286377, + "logps/chosen": -405.2899475097656, + "logps/rejected": -197.04275512695312, + "loss": 0.3419, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2086031436920166, + "rewards/margins": 1.5092394351959229, + "rewards/rejected": -2.7178425788879395, + "step": 4827 + }, + { + "epoch": 0.56, + "learning_rate": 1.3350655486004488e-07, + "logits/chosen": -2.1194238662719727, + "logits/rejected": -2.510593891143799, + "logps/chosen": -425.111083984375, + "logps/rejected": -190.42495727539062, + "loss": 1.2198, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.964802086353302, + "rewards/margins": 0.17127111554145813, + "rewards/rejected": -1.1360732316970825, + "step": 4828 + }, + { + "epoch": 0.56, + "learning_rate": 1.334711231841266e-07, + "logits/chosen": -2.0685296058654785, + "logits/rejected": -1.7098164558410645, + "logps/chosen": -191.93795776367188, + "logps/rejected": -285.0353698730469, + "loss": 0.3717, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.195899248123169, + "rewards/margins": 2.1681838035583496, + "rewards/rejected": -3.3640830516815186, + "step": 4829 + }, + { + "epoch": 0.56, + "learning_rate": 1.3343569150820833e-07, + "logits/chosen": -3.022818088531494, + "logits/rejected": -2.7374424934387207, + "logps/chosen": -178.90403747558594, + "logps/rejected": -270.30865478515625, + "loss": 0.2087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7888580560684204, + "rewards/margins": 2.3844919204711914, + "rewards/rejected": -3.1733498573303223, + "step": 4830 + }, + { + "epoch": 0.56, + "learning_rate": 1.3340025983229005e-07, + "logits/chosen": -2.62302827835083, + "logits/rejected": -2.3632349967956543, + "logps/chosen": -127.32085418701172, + "logps/rejected": -195.50387573242188, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6348721981048584, + "rewards/margins": 1.4908071756362915, + "rewards/rejected": -3.1256794929504395, + "step": 4831 + }, + { + "epoch": 0.56, + "learning_rate": 1.3336482815637177e-07, + "logits/chosen": -2.259399890899658, + "logits/rejected": -2.3931002616882324, + "logps/chosen": -356.15362548828125, + "logps/rejected": -289.8577880859375, + "loss": 0.1294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29262447357177734, + "rewards/margins": 3.9053874015808105, + "rewards/rejected": -4.198011875152588, + "step": 4832 + }, + { + "epoch": 0.56, + "learning_rate": 1.3332939648045352e-07, + "logits/chosen": -1.8416881561279297, + "logits/rejected": -1.9172431230545044, + "logps/chosen": -181.92164611816406, + "logps/rejected": -177.23358154296875, + "loss": 1.6134, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1789870262145996, + "rewards/margins": 0.3244028091430664, + "rewards/rejected": -3.503389835357666, + "step": 4833 + }, + { + "epoch": 0.56, + "learning_rate": 1.3329396480453524e-07, + "logits/chosen": -2.12557053565979, + "logits/rejected": -2.4236819744110107, + "logps/chosen": -490.1539611816406, + "logps/rejected": -305.9359130859375, + "loss": 0.1106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5510066747665405, + "rewards/margins": 3.167400360107422, + "rewards/rejected": -3.718407154083252, + "step": 4834 + }, + { + "epoch": 0.56, + "learning_rate": 1.3325853312861697e-07, + "logits/chosen": -2.686554431915283, + "logits/rejected": -2.3159339427948, + "logps/chosen": -247.21380615234375, + "logps/rejected": -349.305908203125, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4346218407154083, + "rewards/margins": 2.3758535385131836, + "rewards/rejected": -2.8104751110076904, + "step": 4835 + }, + { + "epoch": 0.56, + "learning_rate": 1.3322310145269871e-07, + "logits/chosen": -2.0885813236236572, + "logits/rejected": -2.272174119949341, + "logps/chosen": -249.76986694335938, + "logps/rejected": -216.489990234375, + "loss": 0.324, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2069412469863892, + "rewards/margins": 2.692007541656494, + "rewards/rejected": -3.898948907852173, + "step": 4836 + }, + { + "epoch": 0.56, + "learning_rate": 1.3318766977678044e-07, + "logits/chosen": -2.343043804168701, + "logits/rejected": -2.4121859073638916, + "logps/chosen": -264.8946533203125, + "logps/rejected": -274.0367126464844, + "loss": 0.4428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5555753707885742, + "rewards/margins": 1.7139980792999268, + "rewards/rejected": -3.269573211669922, + "step": 4837 + }, + { + "epoch": 0.56, + "learning_rate": 1.3315223810086216e-07, + "logits/chosen": -2.117399215698242, + "logits/rejected": -2.284200429916382, + "logps/chosen": -455.9658203125, + "logps/rejected": -248.62051391601562, + "loss": 0.357, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0770058631896973, + "rewards/margins": 2.6469650268554688, + "rewards/rejected": -3.723970890045166, + "step": 4838 + }, + { + "epoch": 0.56, + "learning_rate": 1.331168064249439e-07, + "logits/chosen": -2.541111707687378, + "logits/rejected": -2.5157470703125, + "logps/chosen": -205.6208038330078, + "logps/rejected": -294.0926513671875, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5124972462654114, + "rewards/margins": 3.315567970275879, + "rewards/rejected": -3.8280653953552246, + "step": 4839 + }, + { + "epoch": 0.56, + "learning_rate": 1.3308137474902563e-07, + "logits/chosen": -2.4285888671875, + "logits/rejected": -2.3759918212890625, + "logps/chosen": -177.8372039794922, + "logps/rejected": -229.3604736328125, + "loss": 0.5861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8160719275474548, + "rewards/margins": 1.2868075370788574, + "rewards/rejected": -2.102879524230957, + "step": 4840 + }, + { + "epoch": 0.56, + "learning_rate": 1.3304594307310735e-07, + "logits/chosen": -2.774801015853882, + "logits/rejected": -2.7022886276245117, + "logps/chosen": -269.7401123046875, + "logps/rejected": -330.5428466796875, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2108013927936554, + "rewards/margins": 3.0106611251831055, + "rewards/rejected": -3.2214624881744385, + "step": 4841 + }, + { + "epoch": 0.56, + "learning_rate": 1.3301051139718907e-07, + "logits/chosen": -2.108685255050659, + "logits/rejected": -2.2624566555023193, + "logps/chosen": -334.28363037109375, + "logps/rejected": -306.8539733886719, + "loss": 0.3423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8335934281349182, + "rewards/margins": 1.175866961479187, + "rewards/rejected": -2.00946044921875, + "step": 4842 + }, + { + "epoch": 0.56, + "learning_rate": 1.329750797212708e-07, + "logits/chosen": -2.9858222007751465, + "logits/rejected": -2.8743252754211426, + "logps/chosen": -126.69331359863281, + "logps/rejected": -106.56228637695312, + "loss": 0.4461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9517567753791809, + "rewards/margins": 0.8791329264640808, + "rewards/rejected": -1.8308898210525513, + "step": 4843 + }, + { + "epoch": 0.56, + "learning_rate": 1.3293964804535254e-07, + "logits/chosen": -2.4178996086120605, + "logits/rejected": -2.299884796142578, + "logps/chosen": -305.438232421875, + "logps/rejected": -422.0162048339844, + "loss": 0.1978, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2280575037002563, + "rewards/margins": 2.228874683380127, + "rewards/rejected": -3.4569315910339355, + "step": 4844 + }, + { + "epoch": 0.56, + "learning_rate": 1.3290421636943427e-07, + "logits/chosen": -2.4443774223327637, + "logits/rejected": -2.4611291885375977, + "logps/chosen": -174.09104919433594, + "logps/rejected": -258.4566650390625, + "loss": 0.3092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15403929352760315, + "rewards/margins": 1.492767095565796, + "rewards/rejected": -1.646806240081787, + "step": 4845 + }, + { + "epoch": 0.56, + "learning_rate": 1.32868784693516e-07, + "logits/chosen": -2.3672800064086914, + "logits/rejected": -2.4519002437591553, + "logps/chosen": -461.7724609375, + "logps/rejected": -310.1210632324219, + "loss": 0.1368, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2834198474884033, + "rewards/margins": 3.6446213722229004, + "rewards/rejected": -4.928041458129883, + "step": 4846 + }, + { + "epoch": 0.56, + "learning_rate": 1.3283335301759774e-07, + "logits/chosen": -2.8313148021698, + "logits/rejected": -2.7516262531280518, + "logps/chosen": -251.89149475097656, + "logps/rejected": -222.3126220703125, + "loss": 0.2957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6945430636405945, + "rewards/margins": 2.3216400146484375, + "rewards/rejected": -3.0161828994750977, + "step": 4847 + }, + { + "epoch": 0.56, + "learning_rate": 1.3279792134167946e-07, + "logits/chosen": -3.0157265663146973, + "logits/rejected": -3.0970239639282227, + "logps/chosen": -145.85411071777344, + "logps/rejected": -221.014892578125, + "loss": 0.396, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3150124549865723, + "rewards/margins": 3.540337085723877, + "rewards/rejected": -4.855349540710449, + "step": 4848 + }, + { + "epoch": 0.56, + "learning_rate": 1.3276248966576118e-07, + "logits/chosen": -2.439927339553833, + "logits/rejected": -2.446319580078125, + "logps/chosen": -259.73175048828125, + "logps/rejected": -321.76318359375, + "loss": 0.3532, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.218989610671997, + "rewards/margins": 5.209564208984375, + "rewards/rejected": -6.428553104400635, + "step": 4849 + }, + { + "epoch": 0.56, + "learning_rate": 1.327270579898429e-07, + "logits/chosen": -2.3646745681762695, + "logits/rejected": -2.2422561645507812, + "logps/chosen": -441.7309875488281, + "logps/rejected": -337.9956359863281, + "loss": 0.1635, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6864237785339355, + "rewards/margins": 3.0727810859680176, + "rewards/rejected": -3.759204626083374, + "step": 4850 + }, + { + "epoch": 0.56, + "learning_rate": 1.3269162631392465e-07, + "logits/chosen": -2.370487928390503, + "logits/rejected": -2.206157684326172, + "logps/chosen": -179.42941284179688, + "logps/rejected": -268.83154296875, + "loss": 0.2961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6279398202896118, + "rewards/margins": 2.9814999103546143, + "rewards/rejected": -3.6094396114349365, + "step": 4851 + }, + { + "epoch": 0.56, + "learning_rate": 1.3265619463800637e-07, + "logits/chosen": -2.6876261234283447, + "logits/rejected": -2.956812858581543, + "logps/chosen": -194.4202880859375, + "logps/rejected": -210.02622985839844, + "loss": 0.48, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8661454916000366, + "rewards/margins": 2.4760854244232178, + "rewards/rejected": -3.342231035232544, + "step": 4852 + }, + { + "epoch": 0.56, + "learning_rate": 1.326207629620881e-07, + "logits/chosen": -2.470817804336548, + "logits/rejected": -2.579787254333496, + "logps/chosen": -322.1881103515625, + "logps/rejected": -264.3547668457031, + "loss": 0.6813, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9738895297050476, + "rewards/margins": 1.1918505430221558, + "rewards/rejected": -2.1657400131225586, + "step": 4853 + }, + { + "epoch": 0.56, + "learning_rate": 1.3258533128616982e-07, + "logits/chosen": -2.5028185844421387, + "logits/rejected": -2.430607557296753, + "logps/chosen": -353.4879150390625, + "logps/rejected": -386.04644775390625, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6412597894668579, + "rewards/margins": 4.909400463104248, + "rewards/rejected": -5.550660133361816, + "step": 4854 + }, + { + "epoch": 0.56, + "learning_rate": 1.3254989961025154e-07, + "logits/chosen": -1.897416353225708, + "logits/rejected": -2.0023114681243896, + "logps/chosen": -308.72210693359375, + "logps/rejected": -294.2940368652344, + "loss": 0.2903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8157027959823608, + "rewards/margins": 3.429597854614258, + "rewards/rejected": -5.245300769805908, + "step": 4855 + }, + { + "epoch": 0.56, + "learning_rate": 1.325144679343333e-07, + "logits/chosen": -1.9952653646469116, + "logits/rejected": -1.9006515741348267, + "logps/chosen": -290.20819091796875, + "logps/rejected": -328.07598876953125, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.353482723236084, + "rewards/margins": 2.819632053375244, + "rewards/rejected": -5.173114776611328, + "step": 4856 + }, + { + "epoch": 0.57, + "learning_rate": 1.32479036258415e-07, + "logits/chosen": -1.9845229387283325, + "logits/rejected": -1.805362343788147, + "logps/chosen": -318.6240234375, + "logps/rejected": -258.4371337890625, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0806046724319458, + "rewards/margins": 0.9584529399871826, + "rewards/rejected": -2.039057731628418, + "step": 4857 + }, + { + "epoch": 0.57, + "learning_rate": 1.3244360458249676e-07, + "logits/chosen": -3.006474733352661, + "logits/rejected": -3.0107474327087402, + "logps/chosen": -316.58062744140625, + "logps/rejected": -293.6958923339844, + "loss": 0.3161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7412282228469849, + "rewards/margins": 2.3549535274505615, + "rewards/rejected": -3.096181869506836, + "step": 4858 + }, + { + "epoch": 0.57, + "learning_rate": 1.3240817290657848e-07, + "logits/chosen": -2.1828246116638184, + "logits/rejected": -2.2827301025390625, + "logps/chosen": -229.5074005126953, + "logps/rejected": -239.6103515625, + "loss": 0.8888, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1257296800613403, + "rewards/margins": 1.176745891571045, + "rewards/rejected": -2.302475690841675, + "step": 4859 + }, + { + "epoch": 0.57, + "learning_rate": 1.323727412306602e-07, + "logits/chosen": -2.203692674636841, + "logits/rejected": -2.305471420288086, + "logps/chosen": -271.7756652832031, + "logps/rejected": -220.06240844726562, + "loss": 0.1923, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4497028589248657, + "rewards/margins": 3.1304235458374023, + "rewards/rejected": -3.5801262855529785, + "step": 4860 + }, + { + "epoch": 0.57, + "learning_rate": 1.3233730955474193e-07, + "logits/chosen": -2.17549467086792, + "logits/rejected": -2.468386650085449, + "logps/chosen": -189.46514892578125, + "logps/rejected": -135.9783935546875, + "loss": 0.5541, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9483795762062073, + "rewards/margins": 1.000497817993164, + "rewards/rejected": -1.9488774538040161, + "step": 4861 + }, + { + "epoch": 0.57, + "learning_rate": 1.3230187787882367e-07, + "logits/chosen": -2.4614248275756836, + "logits/rejected": -2.5643181800842285, + "logps/chosen": -193.00230407714844, + "logps/rejected": -211.10606384277344, + "loss": 0.845, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8061829209327698, + "rewards/margins": 1.0074421167373657, + "rewards/rejected": -1.8136252164840698, + "step": 4862 + }, + { + "epoch": 0.57, + "learning_rate": 1.322664462029054e-07, + "logits/chosen": -2.0437021255493164, + "logits/rejected": -1.9147361516952515, + "logps/chosen": -133.8846893310547, + "logps/rejected": -176.849609375, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0504157543182373, + "rewards/margins": 1.2902220487594604, + "rewards/rejected": -2.340637683868408, + "step": 4863 + }, + { + "epoch": 0.57, + "learning_rate": 1.3223101452698712e-07, + "logits/chosen": -2.1211376190185547, + "logits/rejected": -1.8958401679992676, + "logps/chosen": -433.8878173828125, + "logps/rejected": -361.3505859375, + "loss": 0.3309, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1925612688064575, + "rewards/margins": 2.390378475189209, + "rewards/rejected": -3.582939863204956, + "step": 4864 + }, + { + "epoch": 0.57, + "learning_rate": 1.3219558285106884e-07, + "logits/chosen": -2.0353870391845703, + "logits/rejected": -2.105717897415161, + "logps/chosen": -387.3922119140625, + "logps/rejected": -396.1689147949219, + "loss": 0.2847, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1892784833908081, + "rewards/margins": 2.9928905963897705, + "rewards/rejected": -3.182169198989868, + "step": 4865 + }, + { + "epoch": 0.57, + "learning_rate": 1.3216015117515056e-07, + "logits/chosen": -2.613842487335205, + "logits/rejected": -2.770437240600586, + "logps/chosen": -233.72418212890625, + "logps/rejected": -385.05657958984375, + "loss": 0.4216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7680480480194092, + "rewards/margins": 1.6731019020080566, + "rewards/rejected": -2.441149950027466, + "step": 4866 + }, + { + "epoch": 0.57, + "learning_rate": 1.321247194992323e-07, + "logits/chosen": -1.9144995212554932, + "logits/rejected": -2.162978172302246, + "logps/chosen": -464.622802734375, + "logps/rejected": -352.81146240234375, + "loss": 0.6391, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4449738264083862, + "rewards/margins": 1.3397817611694336, + "rewards/rejected": -2.7847557067871094, + "step": 4867 + }, + { + "epoch": 0.57, + "learning_rate": 1.3208928782331403e-07, + "logits/chosen": -1.799444317817688, + "logits/rejected": -1.8814607858657837, + "logps/chosen": -219.76414489746094, + "logps/rejected": -175.86529541015625, + "loss": 0.7493, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.753049373626709, + "rewards/margins": 0.5366894602775574, + "rewards/rejected": -1.2897388935089111, + "step": 4868 + }, + { + "epoch": 0.57, + "learning_rate": 1.3205385614739578e-07, + "logits/chosen": -2.827772378921509, + "logits/rejected": -2.8862557411193848, + "logps/chosen": -352.2066650390625, + "logps/rejected": -271.43707275390625, + "loss": 0.45, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8628345727920532, + "rewards/margins": 2.104935646057129, + "rewards/rejected": -3.967770576477051, + "step": 4869 + }, + { + "epoch": 0.57, + "learning_rate": 1.320184244714775e-07, + "logits/chosen": -2.436633825302124, + "logits/rejected": -2.4843311309814453, + "logps/chosen": -381.75897216796875, + "logps/rejected": -330.3065185546875, + "loss": 0.3081, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44621357321739197, + "rewards/margins": 1.949384093284607, + "rewards/rejected": -2.3955976963043213, + "step": 4870 + }, + { + "epoch": 0.57, + "learning_rate": 1.3198299279555923e-07, + "logits/chosen": -2.0412540435791016, + "logits/rejected": -1.8859896659851074, + "logps/chosen": -370.6697082519531, + "logps/rejected": -390.9971923828125, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0785162448883057, + "rewards/margins": 5.060270309448242, + "rewards/rejected": -6.138786315917969, + "step": 4871 + }, + { + "epoch": 0.57, + "learning_rate": 1.3194756111964095e-07, + "logits/chosen": -2.2092747688293457, + "logits/rejected": -2.2519941329956055, + "logps/chosen": -274.89813232421875, + "logps/rejected": -236.9650421142578, + "loss": 0.2435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9307136535644531, + "rewards/margins": 2.247997283935547, + "rewards/rejected": -3.178710699081421, + "step": 4872 + }, + { + "epoch": 0.57, + "learning_rate": 1.3191212944372267e-07, + "logits/chosen": -2.460153102874756, + "logits/rejected": -2.5517916679382324, + "logps/chosen": -446.050537109375, + "logps/rejected": -486.3163146972656, + "loss": 0.883, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7914328575134277, + "rewards/margins": 2.1654887199401855, + "rewards/rejected": -3.9569215774536133, + "step": 4873 + }, + { + "epoch": 0.57, + "learning_rate": 1.3187669776780442e-07, + "logits/chosen": -2.3170366287231445, + "logits/rejected": -2.4324302673339844, + "logps/chosen": -209.82150268554688, + "logps/rejected": -207.75918579101562, + "loss": 0.1772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6023393273353577, + "rewards/margins": 3.063223361968994, + "rewards/rejected": -3.665562629699707, + "step": 4874 + }, + { + "epoch": 0.57, + "learning_rate": 1.3184126609188614e-07, + "logits/chosen": -1.707032561302185, + "logits/rejected": -2.0126800537109375, + "logps/chosen": -246.61380004882812, + "logps/rejected": -240.6519012451172, + "loss": 0.5848, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7620730400085449, + "rewards/margins": 0.9961439371109009, + "rewards/rejected": -1.7582168579101562, + "step": 4875 + }, + { + "epoch": 0.57, + "learning_rate": 1.3180583441596786e-07, + "logits/chosen": -2.6139612197875977, + "logits/rejected": -2.566134452819824, + "logps/chosen": -303.848876953125, + "logps/rejected": -236.66749572753906, + "loss": 0.647, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.46212899684906, + "rewards/margins": 1.2747955322265625, + "rewards/rejected": -2.736924648284912, + "step": 4876 + }, + { + "epoch": 0.57, + "learning_rate": 1.3177040274004959e-07, + "logits/chosen": -1.9026943445205688, + "logits/rejected": -2.0661065578460693, + "logps/chosen": -299.0624084472656, + "logps/rejected": -335.7823181152344, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0012033730745315552, + "rewards/margins": 3.6685149669647217, + "rewards/rejected": -3.667311191558838, + "step": 4877 + }, + { + "epoch": 0.57, + "learning_rate": 1.3173497106413133e-07, + "logits/chosen": -2.267727851867676, + "logits/rejected": -2.2949419021606445, + "logps/chosen": -471.922607421875, + "logps/rejected": -413.017333984375, + "loss": 0.7689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7593218088150024, + "rewards/margins": 1.7323665618896484, + "rewards/rejected": -2.4916884899139404, + "step": 4878 + }, + { + "epoch": 0.57, + "learning_rate": 1.3169953938821306e-07, + "logits/chosen": -3.0514931678771973, + "logits/rejected": -2.9911398887634277, + "logps/chosen": -320.5049133300781, + "logps/rejected": -279.6960144042969, + "loss": 0.2605, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0562995672225952, + "rewards/margins": 2.326406717300415, + "rewards/rejected": -3.3827064037323, + "step": 4879 + }, + { + "epoch": 0.57, + "learning_rate": 1.3166410771229478e-07, + "logits/chosen": -1.982433557510376, + "logits/rejected": -2.322680711746216, + "logps/chosen": -263.6446533203125, + "logps/rejected": -138.51004028320312, + "loss": 1.1905, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.344170331954956, + "rewards/margins": 0.2974928617477417, + "rewards/rejected": -1.6416631937026978, + "step": 4880 + }, + { + "epoch": 0.57, + "learning_rate": 1.3162867603637653e-07, + "logits/chosen": -2.6239237785339355, + "logits/rejected": -2.734391212463379, + "logps/chosen": -423.7042236328125, + "logps/rejected": -205.45262145996094, + "loss": 0.2122, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13174742460250854, + "rewards/margins": 2.2522153854370117, + "rewards/rejected": -2.383962631225586, + "step": 4881 + }, + { + "epoch": 0.57, + "learning_rate": 1.3159324436045825e-07, + "logits/chosen": -2.698192834854126, + "logits/rejected": -2.869889497756958, + "logps/chosen": -211.55145263671875, + "logps/rejected": -242.32992553710938, + "loss": 0.3798, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1815402507781982, + "rewards/margins": 1.5053143501281738, + "rewards/rejected": -2.686854600906372, + "step": 4882 + }, + { + "epoch": 0.57, + "learning_rate": 1.3155781268453997e-07, + "logits/chosen": -2.5411503314971924, + "logits/rejected": -2.6909987926483154, + "logps/chosen": -267.0626220703125, + "logps/rejected": -189.8928680419922, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8078060150146484, + "rewards/margins": 2.268554449081421, + "rewards/rejected": -3.0763604640960693, + "step": 4883 + }, + { + "epoch": 0.57, + "learning_rate": 1.315223810086217e-07, + "logits/chosen": -1.7500556707382202, + "logits/rejected": -1.8007586002349854, + "logps/chosen": -305.53662109375, + "logps/rejected": -268.64923095703125, + "loss": 0.306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6195812225341797, + "rewards/margins": 1.7717528343200684, + "rewards/rejected": -2.391334056854248, + "step": 4884 + }, + { + "epoch": 0.57, + "learning_rate": 1.3148694933270342e-07, + "logits/chosen": -1.868788242340088, + "logits/rejected": -2.3687856197357178, + "logps/chosen": -341.5501708984375, + "logps/rejected": -237.43983459472656, + "loss": 0.2567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2975740432739258, + "rewards/margins": 2.755711793899536, + "rewards/rejected": -3.053285837173462, + "step": 4885 + }, + { + "epoch": 0.57, + "learning_rate": 1.3145151765678516e-07, + "logits/chosen": -2.1739184856414795, + "logits/rejected": -2.3760266304016113, + "logps/chosen": -368.3441162109375, + "logps/rejected": -329.6139831542969, + "loss": 0.2495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23234453797340393, + "rewards/margins": 1.9916355609893799, + "rewards/rejected": -2.223980188369751, + "step": 4886 + }, + { + "epoch": 0.57, + "learning_rate": 1.314160859808669e-07, + "logits/chosen": -2.9032163619995117, + "logits/rejected": -2.8782596588134766, + "logps/chosen": -442.82098388671875, + "logps/rejected": -311.3962097167969, + "loss": 0.1337, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0057841539382935, + "rewards/margins": 3.781269073486328, + "rewards/rejected": -4.78705358505249, + "step": 4887 + }, + { + "epoch": 0.57, + "learning_rate": 1.313806543049486e-07, + "logits/chosen": -2.726161479949951, + "logits/rejected": -2.6783595085144043, + "logps/chosen": -170.22482299804688, + "logps/rejected": -212.1488800048828, + "loss": 0.4659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49220770597457886, + "rewards/margins": 2.2177610397338867, + "rewards/rejected": -2.7099685668945312, + "step": 4888 + }, + { + "epoch": 0.57, + "learning_rate": 1.3134522262903036e-07, + "logits/chosen": -2.894850969314575, + "logits/rejected": -2.6442928314208984, + "logps/chosen": -273.3145751953125, + "logps/rejected": -368.74462890625, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1080833673477173, + "rewards/margins": 1.0282264947891235, + "rewards/rejected": -2.136309862136841, + "step": 4889 + }, + { + "epoch": 0.57, + "learning_rate": 1.3130979095311208e-07, + "logits/chosen": -2.6371946334838867, + "logits/rejected": -2.7554733753204346, + "logps/chosen": -229.92892456054688, + "logps/rejected": -203.5054473876953, + "loss": 1.3496, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0637826919555664, + "rewards/margins": 0.538162112236023, + "rewards/rejected": -2.6019446849823, + "step": 4890 + }, + { + "epoch": 0.57, + "learning_rate": 1.312743592771938e-07, + "logits/chosen": -2.473237991333008, + "logits/rejected": -2.447319746017456, + "logps/chosen": -336.63250732421875, + "logps/rejected": -258.8016357421875, + "loss": 0.3093, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7821365594863892, + "rewards/margins": 3.529296875, + "rewards/rejected": -4.3114333152771, + "step": 4891 + }, + { + "epoch": 0.57, + "learning_rate": 1.3123892760127555e-07, + "logits/chosen": -2.5205070972442627, + "logits/rejected": -2.4741930961608887, + "logps/chosen": -364.9985046386719, + "logps/rejected": -345.62335205078125, + "loss": 0.1629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6785728335380554, + "rewards/margins": 2.938798666000366, + "rewards/rejected": -3.6173715591430664, + "step": 4892 + }, + { + "epoch": 0.57, + "learning_rate": 1.3120349592535727e-07, + "logits/chosen": -1.9815500974655151, + "logits/rejected": -2.0793333053588867, + "logps/chosen": -483.670654296875, + "logps/rejected": -364.821044921875, + "loss": 0.2565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5529549717903137, + "rewards/margins": 2.605999708175659, + "rewards/rejected": -3.158954620361328, + "step": 4893 + }, + { + "epoch": 0.57, + "learning_rate": 1.31168064249439e-07, + "logits/chosen": -2.16487717628479, + "logits/rejected": -2.245985746383667, + "logps/chosen": -356.5289001464844, + "logps/rejected": -220.52694702148438, + "loss": 0.4326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8610485792160034, + "rewards/margins": 1.593369722366333, + "rewards/rejected": -2.454418182373047, + "step": 4894 + }, + { + "epoch": 0.57, + "learning_rate": 1.3113263257352072e-07, + "logits/chosen": -2.204349994659424, + "logits/rejected": -1.989993691444397, + "logps/chosen": -175.68948364257812, + "logps/rejected": -308.66546630859375, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7373576164245605, + "rewards/margins": 2.210646152496338, + "rewards/rejected": -2.9480037689208984, + "step": 4895 + }, + { + "epoch": 0.57, + "learning_rate": 1.3109720089760244e-07, + "logits/chosen": -2.0302274227142334, + "logits/rejected": -2.102696180343628, + "logps/chosen": -375.5113830566406, + "logps/rejected": -268.3872985839844, + "loss": 0.5566, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7611495852470398, + "rewards/margins": 1.7323479652404785, + "rewards/rejected": -2.493497610092163, + "step": 4896 + }, + { + "epoch": 0.57, + "learning_rate": 1.310617692216842e-07, + "logits/chosen": -2.8365378379821777, + "logits/rejected": -2.5993003845214844, + "logps/chosen": -338.25006103515625, + "logps/rejected": -276.83038330078125, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.611274003982544, + "rewards/margins": 2.212188959121704, + "rewards/rejected": -2.823462963104248, + "step": 4897 + }, + { + "epoch": 0.57, + "learning_rate": 1.310263375457659e-07, + "logits/chosen": -2.490060806274414, + "logits/rejected": -2.5275661945343018, + "logps/chosen": -254.22674560546875, + "logps/rejected": -168.57431030273438, + "loss": 0.5016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2124173939228058, + "rewards/margins": 2.4431004524230957, + "rewards/rejected": -2.655517578125, + "step": 4898 + }, + { + "epoch": 0.57, + "learning_rate": 1.3099090586984763e-07, + "logits/chosen": -2.5266757011413574, + "logits/rejected": -2.522172451019287, + "logps/chosen": -160.60398864746094, + "logps/rejected": -308.7193908691406, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6654006242752075, + "rewards/margins": 1.9512474536895752, + "rewards/rejected": -2.6166481971740723, + "step": 4899 + }, + { + "epoch": 0.57, + "learning_rate": 1.3095547419392935e-07, + "logits/chosen": -2.098836898803711, + "logits/rejected": -2.050503730773926, + "logps/chosen": -448.47149658203125, + "logps/rejected": -325.55328369140625, + "loss": 0.3791, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.776012659072876, + "rewards/margins": 2.8517580032348633, + "rewards/rejected": -4.627770900726318, + "step": 4900 + }, + { + "epoch": 0.57, + "learning_rate": 1.309200425180111e-07, + "logits/chosen": -2.446624994277954, + "logits/rejected": -2.5571067333221436, + "logps/chosen": -279.81982421875, + "logps/rejected": -250.8458251953125, + "loss": 0.3028, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9273287653923035, + "rewards/margins": 2.4733033180236816, + "rewards/rejected": -3.400631904602051, + "step": 4901 + }, + { + "epoch": 0.57, + "learning_rate": 1.3088461084209282e-07, + "logits/chosen": -2.344827175140381, + "logits/rejected": -2.4614408016204834, + "logps/chosen": -210.6289825439453, + "logps/rejected": -169.69204711914062, + "loss": 1.2876, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.4216456413269043, + "rewards/margins": 0.2642451524734497, + "rewards/rejected": -2.6858906745910645, + "step": 4902 + }, + { + "epoch": 0.57, + "learning_rate": 1.3084917916617457e-07, + "logits/chosen": -1.6561877727508545, + "logits/rejected": -1.7253623008728027, + "logps/chosen": -416.9621887207031, + "logps/rejected": -438.3257141113281, + "loss": 0.3237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.993068277835846, + "rewards/margins": 2.3853416442871094, + "rewards/rejected": -3.3784096240997314, + "step": 4903 + }, + { + "epoch": 0.57, + "learning_rate": 1.308137474902563e-07, + "logits/chosen": -2.4472110271453857, + "logits/rejected": -2.5242342948913574, + "logps/chosen": -139.63827514648438, + "logps/rejected": -107.35699462890625, + "loss": 0.3066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9662624597549438, + "rewards/margins": 1.2339637279510498, + "rewards/rejected": -2.200226306915283, + "step": 4904 + }, + { + "epoch": 0.57, + "learning_rate": 1.3077831581433802e-07, + "logits/chosen": -2.2800920009613037, + "logits/rejected": -2.4133448600769043, + "logps/chosen": -389.1122131347656, + "logps/rejected": -299.1798095703125, + "loss": 0.6629, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.003803014755249, + "rewards/margins": 0.7560297250747681, + "rewards/rejected": -1.759832739830017, + "step": 4905 + }, + { + "epoch": 0.57, + "learning_rate": 1.3074288413841974e-07, + "logits/chosen": -2.2172834873199463, + "logits/rejected": -2.3301074504852295, + "logps/chosen": -286.9349670410156, + "logps/rejected": -189.72958374023438, + "loss": 0.4249, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9784653186798096, + "rewards/margins": 1.6188820600509644, + "rewards/rejected": -2.5973474979400635, + "step": 4906 + }, + { + "epoch": 0.57, + "learning_rate": 1.3070745246250146e-07, + "logits/chosen": -2.4728925228118896, + "logits/rejected": -2.2771623134613037, + "logps/chosen": -242.75350952148438, + "logps/rejected": -359.1866455078125, + "loss": 0.3923, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3976023197174072, + "rewards/margins": 2.6350767612457275, + "rewards/rejected": -4.032679080963135, + "step": 4907 + }, + { + "epoch": 0.57, + "learning_rate": 1.3067202078658318e-07, + "logits/chosen": -1.7711801528930664, + "logits/rejected": -1.9588677883148193, + "logps/chosen": -487.91558837890625, + "logps/rejected": -500.8773498535156, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9819821715354919, + "rewards/margins": 1.5080817937850952, + "rewards/rejected": -2.4900641441345215, + "step": 4908 + }, + { + "epoch": 0.57, + "learning_rate": 1.3063658911066493e-07, + "logits/chosen": -2.076648235321045, + "logits/rejected": -2.0835564136505127, + "logps/chosen": -409.09454345703125, + "logps/rejected": -374.7704162597656, + "loss": 0.3552, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6248995065689087, + "rewards/margins": 1.6097500324249268, + "rewards/rejected": -2.234649419784546, + "step": 4909 + }, + { + "epoch": 0.57, + "learning_rate": 1.3060115743474665e-07, + "logits/chosen": -2.5695855617523193, + "logits/rejected": -2.6179418563842773, + "logps/chosen": -298.19384765625, + "logps/rejected": -229.6575927734375, + "loss": 0.281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6266980171203613, + "rewards/margins": 2.1781461238861084, + "rewards/rejected": -2.8048441410064697, + "step": 4910 + }, + { + "epoch": 0.57, + "learning_rate": 1.3056572575882838e-07, + "logits/chosen": -2.7867047786712646, + "logits/rejected": -2.6262154579162598, + "logps/chosen": -199.03916931152344, + "logps/rejected": -239.67872619628906, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1712989807128906, + "rewards/margins": 1.0142991542816162, + "rewards/rejected": -2.185598134994507, + "step": 4911 + }, + { + "epoch": 0.57, + "learning_rate": 1.3053029408291013e-07, + "logits/chosen": -1.9262363910675049, + "logits/rejected": -1.9955253601074219, + "logps/chosen": -288.3578796386719, + "logps/rejected": -347.6042175292969, + "loss": 0.3021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3174705505371094, + "rewards/margins": 3.5497567653656006, + "rewards/rejected": -3.867227554321289, + "step": 4912 + }, + { + "epoch": 0.57, + "learning_rate": 1.3049486240699185e-07, + "logits/chosen": -2.6033542156219482, + "logits/rejected": -2.6996002197265625, + "logps/chosen": -321.653564453125, + "logps/rejected": -319.6850891113281, + "loss": 0.4123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.665565013885498, + "rewards/margins": 1.6352014541625977, + "rewards/rejected": -2.3007664680480957, + "step": 4913 + }, + { + "epoch": 0.57, + "learning_rate": 1.3045943073107357e-07, + "logits/chosen": -2.157122850418091, + "logits/rejected": -2.4260237216949463, + "logps/chosen": -208.73158264160156, + "logps/rejected": -236.58409118652344, + "loss": 0.9194, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3800923824310303, + "rewards/margins": 1.466701865196228, + "rewards/rejected": -3.8467941284179688, + "step": 4914 + }, + { + "epoch": 0.57, + "learning_rate": 1.3042399905515532e-07, + "logits/chosen": -2.285346269607544, + "logits/rejected": -2.0704169273376465, + "logps/chosen": -194.35081481933594, + "logps/rejected": -261.7137756347656, + "loss": 0.5341, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1006237268447876, + "rewards/margins": 1.5816081762313843, + "rewards/rejected": -2.682232141494751, + "step": 4915 + }, + { + "epoch": 0.57, + "learning_rate": 1.3038856737923704e-07, + "logits/chosen": -2.4884984493255615, + "logits/rejected": -2.668881893157959, + "logps/chosen": -307.3558654785156, + "logps/rejected": -310.4018249511719, + "loss": 0.2357, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4041770696640015, + "rewards/margins": 3.434743881225586, + "rewards/rejected": -4.838920593261719, + "step": 4916 + }, + { + "epoch": 0.57, + "learning_rate": 1.3035313570331876e-07, + "logits/chosen": -2.060009479522705, + "logits/rejected": -1.9025694131851196, + "logps/chosen": -305.6308288574219, + "logps/rejected": -409.52362060546875, + "loss": 0.1882, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2018437683582306, + "rewards/margins": 2.9881582260131836, + "rewards/rejected": -3.190001964569092, + "step": 4917 + }, + { + "epoch": 0.57, + "learning_rate": 1.3031770402740048e-07, + "logits/chosen": -2.2658984661102295, + "logits/rejected": -2.102811098098755, + "logps/chosen": -297.4826354980469, + "logps/rejected": -248.32949829101562, + "loss": 0.2389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8426122069358826, + "rewards/margins": 2.482356309890747, + "rewards/rejected": -3.3249683380126953, + "step": 4918 + }, + { + "epoch": 0.57, + "learning_rate": 1.302822723514822e-07, + "logits/chosen": -1.955812931060791, + "logits/rejected": -1.858857274055481, + "logps/chosen": -425.9463806152344, + "logps/rejected": -325.12017822265625, + "loss": 0.3759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2555692791938782, + "rewards/margins": 1.5322794914245605, + "rewards/rejected": -1.787848711013794, + "step": 4919 + }, + { + "epoch": 0.57, + "learning_rate": 1.3024684067556393e-07, + "logits/chosen": -2.217649459838867, + "logits/rejected": -2.6436541080474854, + "logps/chosen": -200.87335205078125, + "logps/rejected": -145.0616455078125, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23772066831588745, + "rewards/margins": 1.3452272415161133, + "rewards/rejected": -1.582947850227356, + "step": 4920 + }, + { + "epoch": 0.57, + "learning_rate": 1.3021140899964568e-07, + "logits/chosen": -1.978952169418335, + "logits/rejected": -1.83791184425354, + "logps/chosen": -224.68748474121094, + "logps/rejected": -212.7786865234375, + "loss": 0.375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7447393536567688, + "rewards/margins": 1.1644201278686523, + "rewards/rejected": -1.9091594219207764, + "step": 4921 + }, + { + "epoch": 0.57, + "learning_rate": 1.301759773237274e-07, + "logits/chosen": -2.1389050483703613, + "logits/rejected": -2.4295268058776855, + "logps/chosen": -353.09808349609375, + "logps/rejected": -212.7467041015625, + "loss": 0.1531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3820021450519562, + "rewards/margins": 2.6475329399108887, + "rewards/rejected": -3.0295348167419434, + "step": 4922 + }, + { + "epoch": 0.57, + "learning_rate": 1.3014054564780915e-07, + "logits/chosen": -2.3991661071777344, + "logits/rejected": -2.320427656173706, + "logps/chosen": -275.2611083984375, + "logps/rejected": -264.94873046875, + "loss": 0.3954, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6660456657409668, + "rewards/margins": 2.668148994445801, + "rewards/rejected": -4.334194660186768, + "step": 4923 + }, + { + "epoch": 0.57, + "learning_rate": 1.3010511397189087e-07, + "logits/chosen": -2.319770574569702, + "logits/rejected": -2.308619260787964, + "logps/chosen": -346.07904052734375, + "logps/rejected": -257.2724609375, + "loss": 0.1931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17696575820446014, + "rewards/margins": 1.8882322311401367, + "rewards/rejected": -2.0651979446411133, + "step": 4924 + }, + { + "epoch": 0.57, + "learning_rate": 1.300696822959726e-07, + "logits/chosen": -2.877318859100342, + "logits/rejected": -2.7470200061798096, + "logps/chosen": -611.4945678710938, + "logps/rejected": -264.7587890625, + "loss": 0.1791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5746270418167114, + "rewards/margins": 2.7248153686523438, + "rewards/rejected": -3.2994425296783447, + "step": 4925 + }, + { + "epoch": 0.57, + "learning_rate": 1.3003425062005434e-07, + "logits/chosen": -2.028644561767578, + "logits/rejected": -2.4943199157714844, + "logps/chosen": -410.6567687988281, + "logps/rejected": -272.51849365234375, + "loss": 0.9134, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.186037302017212, + "rewards/margins": 0.3469725549221039, + "rewards/rejected": -1.5330097675323486, + "step": 4926 + }, + { + "epoch": 0.57, + "learning_rate": 1.2999881894413606e-07, + "logits/chosen": -2.436856508255005, + "logits/rejected": -2.447812557220459, + "logps/chosen": -299.63763427734375, + "logps/rejected": -242.04632568359375, + "loss": 0.2011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6836138963699341, + "rewards/margins": 2.127715587615967, + "rewards/rejected": -2.8113296031951904, + "step": 4927 + }, + { + "epoch": 0.57, + "learning_rate": 1.2996338726821779e-07, + "logits/chosen": -2.5746853351593018, + "logits/rejected": -2.750491142272949, + "logps/chosen": -257.95562744140625, + "logps/rejected": -217.0816650390625, + "loss": 0.2242, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011576220393180847, + "rewards/margins": 1.949467420578003, + "rewards/rejected": -1.9378911256790161, + "step": 4928 + }, + { + "epoch": 0.57, + "learning_rate": 1.299279555922995e-07, + "logits/chosen": -2.0329642295837402, + "logits/rejected": -2.0717101097106934, + "logps/chosen": -517.856689453125, + "logps/rejected": -297.9531555175781, + "loss": 1.1188, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7898049354553223, + "rewards/margins": 0.4849618077278137, + "rewards/rejected": -3.2747669219970703, + "step": 4929 + }, + { + "epoch": 0.57, + "learning_rate": 1.2989252391638123e-07, + "logits/chosen": -2.0974202156066895, + "logits/rejected": -2.110975503921509, + "logps/chosen": -306.4818115234375, + "logps/rejected": -287.61492919921875, + "loss": 0.4578, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3665285110473633, + "rewards/margins": 2.160068988800049, + "rewards/rejected": -3.526597738265991, + "step": 4930 + }, + { + "epoch": 0.57, + "learning_rate": 1.2985709224046295e-07, + "logits/chosen": -2.7785258293151855, + "logits/rejected": -2.3334174156188965, + "logps/chosen": -100.2549057006836, + "logps/rejected": -258.3974609375, + "loss": 0.289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3372487425804138, + "rewards/margins": 2.211582899093628, + "rewards/rejected": -2.5488317012786865, + "step": 4931 + }, + { + "epoch": 0.57, + "learning_rate": 1.298216605645447e-07, + "logits/chosen": -2.43625807762146, + "logits/rejected": -2.4659969806671143, + "logps/chosen": -253.04092407226562, + "logps/rejected": -274.9180908203125, + "loss": 0.7277, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7942510843276978, + "rewards/margins": 1.4348478317260742, + "rewards/rejected": -2.2290987968444824, + "step": 4932 + }, + { + "epoch": 0.57, + "learning_rate": 1.2978622888862642e-07, + "logits/chosen": -2.2840816974639893, + "logits/rejected": -2.0167858600616455, + "logps/chosen": -285.70452880859375, + "logps/rejected": -347.6697998046875, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.909662127494812, + "rewards/margins": 3.3156776428222656, + "rewards/rejected": -4.225339889526367, + "step": 4933 + }, + { + "epoch": 0.57, + "learning_rate": 1.2975079721270814e-07, + "logits/chosen": -1.9259463548660278, + "logits/rejected": -2.447626829147339, + "logps/chosen": -491.7336120605469, + "logps/rejected": -251.5966033935547, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6146990060806274, + "rewards/margins": 3.2180466651916504, + "rewards/rejected": -3.8327455520629883, + "step": 4934 + }, + { + "epoch": 0.57, + "learning_rate": 1.297153655367899e-07, + "logits/chosen": -2.542686939239502, + "logits/rejected": -2.3447842597961426, + "logps/chosen": -197.29840087890625, + "logps/rejected": -261.6246643066406, + "loss": 0.1429, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7434940934181213, + "rewards/margins": 3.6033902168273926, + "rewards/rejected": -4.346884250640869, + "step": 4935 + }, + { + "epoch": 0.57, + "learning_rate": 1.2967993386087162e-07, + "logits/chosen": -2.0600674152374268, + "logits/rejected": -2.3497314453125, + "logps/chosen": -593.2804565429688, + "logps/rejected": -509.93865966796875, + "loss": 0.212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5343054533004761, + "rewards/margins": 2.4115982055664062, + "rewards/rejected": -2.945903778076172, + "step": 4936 + }, + { + "epoch": 0.57, + "learning_rate": 1.2964450218495334e-07, + "logits/chosen": -1.8445041179656982, + "logits/rejected": -2.0008301734924316, + "logps/chosen": -416.8793640136719, + "logps/rejected": -287.35028076171875, + "loss": 0.6965, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1846888065338135, + "rewards/margins": 1.1824102401733398, + "rewards/rejected": -2.3670990467071533, + "step": 4937 + }, + { + "epoch": 0.57, + "learning_rate": 1.2960907050903509e-07, + "logits/chosen": -2.4627020359039307, + "logits/rejected": -2.567291259765625, + "logps/chosen": -175.21295166015625, + "logps/rejected": -330.9231262207031, + "loss": 0.2549, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6372267603874207, + "rewards/margins": 4.0871171951293945, + "rewards/rejected": -4.724343776702881, + "step": 4938 + }, + { + "epoch": 0.57, + "learning_rate": 1.295736388331168e-07, + "logits/chosen": -1.842760682106018, + "logits/rejected": -2.2526907920837402, + "logps/chosen": -184.8463897705078, + "logps/rejected": -159.6168212890625, + "loss": 0.4453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8329417705535889, + "rewards/margins": 1.5737833976745605, + "rewards/rejected": -3.4067251682281494, + "step": 4939 + }, + { + "epoch": 0.57, + "learning_rate": 1.2953820715719853e-07, + "logits/chosen": -1.8504279851913452, + "logits/rejected": -1.9069013595581055, + "logps/chosen": -457.6588134765625, + "logps/rejected": -323.8978271484375, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4876168370246887, + "rewards/margins": 2.654024600982666, + "rewards/rejected": -3.141641616821289, + "step": 4940 + }, + { + "epoch": 0.57, + "learning_rate": 1.2950277548128025e-07, + "logits/chosen": -2.4690239429473877, + "logits/rejected": -2.1209776401519775, + "logps/chosen": -197.31777954101562, + "logps/rejected": -310.25421142578125, + "loss": 0.684, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8896211981773376, + "rewards/margins": 1.9191218614578247, + "rewards/rejected": -2.8087430000305176, + "step": 4941 + }, + { + "epoch": 0.57, + "learning_rate": 1.2946734380536197e-07, + "logits/chosen": -1.7603150606155396, + "logits/rejected": -1.8750998973846436, + "logps/chosen": -444.46527099609375, + "logps/rejected": -456.8905029296875, + "loss": 0.3339, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.44185471534729, + "rewards/margins": 1.7989068031311035, + "rewards/rejected": -3.2407615184783936, + "step": 4942 + }, + { + "epoch": 0.58, + "learning_rate": 1.2943191212944372e-07, + "logits/chosen": -2.0848381519317627, + "logits/rejected": -2.1048665046691895, + "logps/chosen": -144.6287384033203, + "logps/rejected": -234.92112731933594, + "loss": 0.6171, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.722733736038208, + "rewards/margins": 1.6206884384155273, + "rewards/rejected": -3.3434219360351562, + "step": 4943 + }, + { + "epoch": 0.58, + "learning_rate": 1.2939648045352544e-07, + "logits/chosen": -2.1562881469726562, + "logits/rejected": -2.2121524810791016, + "logps/chosen": -321.2801513671875, + "logps/rejected": -332.8514709472656, + "loss": 0.2755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23874785006046295, + "rewards/margins": 1.8395302295684814, + "rewards/rejected": -2.078278064727783, + "step": 4944 + }, + { + "epoch": 0.58, + "learning_rate": 1.2936104877760717e-07, + "logits/chosen": -2.019896984100342, + "logits/rejected": -2.0468990802764893, + "logps/chosen": -408.7285461425781, + "logps/rejected": -414.41168212890625, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17786996066570282, + "rewards/margins": 2.8183822631835938, + "rewards/rejected": -2.9962522983551025, + "step": 4945 + }, + { + "epoch": 0.58, + "learning_rate": 1.2932561710168892e-07, + "logits/chosen": -2.4705793857574463, + "logits/rejected": -2.3821911811828613, + "logps/chosen": -373.4603271484375, + "logps/rejected": -298.5753173828125, + "loss": 0.4047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4362245798110962, + "rewards/margins": 2.996866226196289, + "rewards/rejected": -4.4330902099609375, + "step": 4946 + }, + { + "epoch": 0.58, + "learning_rate": 1.2929018542577064e-07, + "logits/chosen": -2.129570960998535, + "logits/rejected": -1.9283666610717773, + "logps/chosen": -232.6303253173828, + "logps/rejected": -336.94580078125, + "loss": 0.8464, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.099475622177124, + "rewards/margins": 1.0745607614517212, + "rewards/rejected": -2.1740365028381348, + "step": 4947 + }, + { + "epoch": 0.58, + "learning_rate": 1.2925475374985236e-07, + "logits/chosen": -2.154878616333008, + "logits/rejected": -2.4892845153808594, + "logps/chosen": -366.5902404785156, + "logps/rejected": -261.05780029296875, + "loss": 1.1211, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.5631541013717651, + "rewards/margins": 0.4615755081176758, + "rewards/rejected": -2.0247294902801514, + "step": 4948 + }, + { + "epoch": 0.58, + "learning_rate": 1.2921932207393408e-07, + "logits/chosen": -2.460223913192749, + "logits/rejected": -2.3769500255584717, + "logps/chosen": -182.82952880859375, + "logps/rejected": -279.4175720214844, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0462646558880806, + "rewards/margins": 4.038254261016846, + "rewards/rejected": -4.084518909454346, + "step": 4949 + }, + { + "epoch": 0.58, + "learning_rate": 1.2918389039801583e-07, + "logits/chosen": -2.2186965942382812, + "logits/rejected": -2.3140652179718018, + "logps/chosen": -318.793212890625, + "logps/rejected": -272.7854309082031, + "loss": 0.1681, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9923732280731201, + "rewards/margins": 2.9132678508758545, + "rewards/rejected": -3.9056410789489746, + "step": 4950 + }, + { + "epoch": 0.58, + "learning_rate": 1.2914845872209755e-07, + "logits/chosen": -2.2563459873199463, + "logits/rejected": -1.9658119678497314, + "logps/chosen": -136.78271484375, + "logps/rejected": -277.77044677734375, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0255968570709229, + "rewards/margins": 3.470048427581787, + "rewards/rejected": -4.495645523071289, + "step": 4951 + }, + { + "epoch": 0.58, + "learning_rate": 1.2911302704617927e-07, + "logits/chosen": -2.6663882732391357, + "logits/rejected": -2.6697287559509277, + "logps/chosen": -288.6773681640625, + "logps/rejected": -310.09735107421875, + "loss": 0.5662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3684192895889282, + "rewards/margins": 3.438345193862915, + "rewards/rejected": -4.806764602661133, + "step": 4952 + }, + { + "epoch": 0.58, + "learning_rate": 1.29077595370261e-07, + "logits/chosen": -2.1657121181488037, + "logits/rejected": -2.286780834197998, + "logps/chosen": -295.60107421875, + "logps/rejected": -234.94326782226562, + "loss": 0.3945, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.619848608970642, + "rewards/margins": 1.8318440914154053, + "rewards/rejected": -3.451692581176758, + "step": 4953 + }, + { + "epoch": 0.58, + "learning_rate": 1.2904216369434272e-07, + "logits/chosen": -2.4368579387664795, + "logits/rejected": -2.4067797660827637, + "logps/chosen": -322.55755615234375, + "logps/rejected": -322.501953125, + "loss": 0.4102, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3796987533569336, + "rewards/margins": 2.9810991287231445, + "rewards/rejected": -4.360797882080078, + "step": 4954 + }, + { + "epoch": 0.58, + "learning_rate": 1.2900673201842447e-07, + "logits/chosen": -2.537346124649048, + "logits/rejected": -2.3009536266326904, + "logps/chosen": -214.00863647460938, + "logps/rejected": -293.10614013671875, + "loss": 0.1763, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5657811164855957, + "rewards/margins": 2.8408422470092773, + "rewards/rejected": -3.406623601913452, + "step": 4955 + }, + { + "epoch": 0.58, + "learning_rate": 1.289713003425062e-07, + "logits/chosen": -1.6116416454315186, + "logits/rejected": -2.1356890201568604, + "logps/chosen": -331.9773254394531, + "logps/rejected": -177.7339324951172, + "loss": 0.6966, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3787846565246582, + "rewards/margins": 0.2444840520620346, + "rewards/rejected": -1.6232688426971436, + "step": 4956 + }, + { + "epoch": 0.58, + "learning_rate": 1.2893586866658794e-07, + "logits/chosen": -2.2807111740112305, + "logits/rejected": -2.329294204711914, + "logps/chosen": -575.661376953125, + "logps/rejected": -377.9850158691406, + "loss": 0.7078, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6326277256011963, + "rewards/margins": 0.8870232105255127, + "rewards/rejected": -2.519650936126709, + "step": 4957 + }, + { + "epoch": 0.58, + "learning_rate": 1.2890043699066966e-07, + "logits/chosen": -2.7589313983917236, + "logits/rejected": -2.727663993835449, + "logps/chosen": -179.0670166015625, + "logps/rejected": -101.14022064208984, + "loss": 0.6405, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3017739057540894, + "rewards/margins": 0.6723085045814514, + "rewards/rejected": -1.9740824699401855, + "step": 4958 + }, + { + "epoch": 0.58, + "learning_rate": 1.2886500531475138e-07, + "logits/chosen": -2.42645001411438, + "logits/rejected": -2.6815199851989746, + "logps/chosen": -459.3863525390625, + "logps/rejected": -242.99600219726562, + "loss": 0.2489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7337267994880676, + "rewards/margins": 1.9174344539642334, + "rewards/rejected": -2.6511611938476562, + "step": 4959 + }, + { + "epoch": 0.58, + "learning_rate": 1.288295736388331e-07, + "logits/chosen": -2.365729808807373, + "logits/rejected": -2.226304531097412, + "logps/chosen": -160.03855895996094, + "logps/rejected": -255.910400390625, + "loss": 0.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6791954040527344, + "rewards/margins": 2.042104721069336, + "rewards/rejected": -3.7213003635406494, + "step": 4960 + }, + { + "epoch": 0.58, + "learning_rate": 1.2879414196291483e-07, + "logits/chosen": -2.7303483486175537, + "logits/rejected": -2.609713315963745, + "logps/chosen": -220.66314697265625, + "logps/rejected": -252.3252410888672, + "loss": 0.8425, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0714027881622314, + "rewards/margins": 0.8686571717262268, + "rewards/rejected": -2.9400601387023926, + "step": 4961 + }, + { + "epoch": 0.58, + "learning_rate": 1.2875871028699658e-07, + "logits/chosen": -2.04184889793396, + "logits/rejected": -2.089648485183716, + "logps/chosen": -111.89359283447266, + "logps/rejected": -250.9869384765625, + "loss": 0.3175, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.081406593322754, + "rewards/margins": 2.8639979362487793, + "rewards/rejected": -3.945404529571533, + "step": 4962 + }, + { + "epoch": 0.58, + "learning_rate": 1.287232786110783e-07, + "logits/chosen": -2.818045139312744, + "logits/rejected": -2.88004732131958, + "logps/chosen": -187.65798950195312, + "logps/rejected": -168.46170043945312, + "loss": 0.3209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.692453145980835, + "rewards/margins": 2.398386001586914, + "rewards/rejected": -3.090839385986328, + "step": 4963 + }, + { + "epoch": 0.58, + "learning_rate": 1.2868784693516002e-07, + "logits/chosen": -2.3949756622314453, + "logits/rejected": -2.437565565109253, + "logps/chosen": -298.41552734375, + "logps/rejected": -308.6033935546875, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9097143411636353, + "rewards/margins": 2.430297374725342, + "rewards/rejected": -3.3400118350982666, + "step": 4964 + }, + { + "epoch": 0.58, + "learning_rate": 1.2865241525924174e-07, + "logits/chosen": -2.1086783409118652, + "logits/rejected": -2.0637729167938232, + "logps/chosen": -396.76043701171875, + "logps/rejected": -425.26995849609375, + "loss": 0.4722, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0683314800262451, + "rewards/margins": 2.0109074115753174, + "rewards/rejected": -3.0792388916015625, + "step": 4965 + }, + { + "epoch": 0.58, + "learning_rate": 1.286169835833235e-07, + "logits/chosen": -2.108046531677246, + "logits/rejected": -2.0814669132232666, + "logps/chosen": -291.6985168457031, + "logps/rejected": -388.8736572265625, + "loss": 0.5962, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4186750650405884, + "rewards/margins": 1.1423519849777222, + "rewards/rejected": -2.5610270500183105, + "step": 4966 + }, + { + "epoch": 0.58, + "learning_rate": 1.285815519074052e-07, + "logits/chosen": -2.1042392253875732, + "logits/rejected": -2.4184608459472656, + "logps/chosen": -267.4323425292969, + "logps/rejected": -193.12094116210938, + "loss": 0.5841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5966074466705322, + "rewards/margins": 1.0796830654144287, + "rewards/rejected": -1.6762906312942505, + "step": 4967 + }, + { + "epoch": 0.58, + "learning_rate": 1.2854612023148696e-07, + "logits/chosen": -1.851239800453186, + "logits/rejected": -2.01800537109375, + "logps/chosen": -352.7012634277344, + "logps/rejected": -262.5887756347656, + "loss": 0.2342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08510750532150269, + "rewards/margins": 1.5105820894241333, + "rewards/rejected": -1.5956895351409912, + "step": 4968 + }, + { + "epoch": 0.58, + "learning_rate": 1.2851068855556868e-07, + "logits/chosen": -2.432558536529541, + "logits/rejected": -2.633359670639038, + "logps/chosen": -182.19703674316406, + "logps/rejected": -198.9443359375, + "loss": 0.1765, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.720443069934845, + "rewards/margins": 3.0327236652374268, + "rewards/rejected": -3.753166913986206, + "step": 4969 + }, + { + "epoch": 0.58, + "learning_rate": 1.284752568796504e-07, + "logits/chosen": -2.5202078819274902, + "logits/rejected": -2.4729084968566895, + "logps/chosen": -219.0676727294922, + "logps/rejected": -187.67848205566406, + "loss": 0.2653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11130917072296143, + "rewards/margins": 1.8346264362335205, + "rewards/rejected": -1.945935606956482, + "step": 4970 + }, + { + "epoch": 0.58, + "learning_rate": 1.2843982520373213e-07, + "logits/chosen": -2.0898869037628174, + "logits/rejected": -2.237042188644409, + "logps/chosen": -540.52294921875, + "logps/rejected": -342.38055419921875, + "loss": 0.6953, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0757588148117065, + "rewards/margins": 0.9184942841529846, + "rewards/rejected": -1.994253158569336, + "step": 4971 + }, + { + "epoch": 0.58, + "learning_rate": 1.2840439352781385e-07, + "logits/chosen": -2.341604471206665, + "logits/rejected": -2.5139918327331543, + "logps/chosen": -544.1693115234375, + "logps/rejected": -362.1025695800781, + "loss": 0.6268, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33385393023490906, + "rewards/margins": 2.2152416706085205, + "rewards/rejected": -2.54909610748291, + "step": 4972 + }, + { + "epoch": 0.58, + "learning_rate": 1.283689618518956e-07, + "logits/chosen": -2.2023181915283203, + "logits/rejected": -1.9675989151000977, + "logps/chosen": -285.7481689453125, + "logps/rejected": -354.8930358886719, + "loss": 1.1541, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5335869789123535, + "rewards/margins": 1.3830063343048096, + "rewards/rejected": -1.9165929555892944, + "step": 4973 + }, + { + "epoch": 0.58, + "learning_rate": 1.2833353017597732e-07, + "logits/chosen": -2.7190661430358887, + "logits/rejected": -2.7233190536499023, + "logps/chosen": -278.17254638671875, + "logps/rejected": -219.3129119873047, + "loss": 0.314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7535488605499268, + "rewards/margins": 1.7164993286132812, + "rewards/rejected": -2.470048189163208, + "step": 4974 + }, + { + "epoch": 0.58, + "learning_rate": 1.2829809850005904e-07, + "logits/chosen": -1.973333477973938, + "logits/rejected": -2.2589316368103027, + "logps/chosen": -499.8240661621094, + "logps/rejected": -332.400634765625, + "loss": 0.2862, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5119386911392212, + "rewards/margins": 2.5388479232788086, + "rewards/rejected": -4.050786018371582, + "step": 4975 + }, + { + "epoch": 0.58, + "learning_rate": 1.2826266682414076e-07, + "logits/chosen": -2.306122303009033, + "logits/rejected": -2.555237293243408, + "logps/chosen": -371.9221496582031, + "logps/rejected": -290.66473388671875, + "loss": 0.3225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6776841282844543, + "rewards/margins": 2.9434800148010254, + "rewards/rejected": -3.621164321899414, + "step": 4976 + }, + { + "epoch": 0.58, + "learning_rate": 1.2822723514822251e-07, + "logits/chosen": -2.2051119804382324, + "logits/rejected": -2.614405632019043, + "logps/chosen": -307.7746887207031, + "logps/rejected": -169.33465576171875, + "loss": 0.1548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17593073844909668, + "rewards/margins": 2.823361873626709, + "rewards/rejected": -2.9992928504943848, + "step": 4977 + }, + { + "epoch": 0.58, + "learning_rate": 1.2819180347230424e-07, + "logits/chosen": -2.281564712524414, + "logits/rejected": -2.2349283695220947, + "logps/chosen": -245.7202606201172, + "logps/rejected": -341.51123046875, + "loss": 0.1694, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.369429588317871, + "rewards/margins": 3.81427001953125, + "rewards/rejected": -5.183699607849121, + "step": 4978 + }, + { + "epoch": 0.58, + "learning_rate": 1.2815637179638596e-07, + "logits/chosen": -2.3892860412597656, + "logits/rejected": -2.5508804321289062, + "logps/chosen": -299.2299499511719, + "logps/rejected": -252.72314453125, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7366554737091064, + "rewards/margins": 2.0160117149353027, + "rewards/rejected": -2.752667188644409, + "step": 4979 + }, + { + "epoch": 0.58, + "learning_rate": 1.281209401204677e-07, + "logits/chosen": -2.0227205753326416, + "logits/rejected": -2.236707925796509, + "logps/chosen": -281.2027587890625, + "logps/rejected": -257.28509521484375, + "loss": 0.3239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5506935119628906, + "rewards/margins": 2.8614561557769775, + "rewards/rejected": -3.412149429321289, + "step": 4980 + }, + { + "epoch": 0.58, + "learning_rate": 1.2808550844454943e-07, + "logits/chosen": -2.5172176361083984, + "logits/rejected": -2.4444332122802734, + "logps/chosen": -250.2740478515625, + "logps/rejected": -206.22427368164062, + "loss": 0.7018, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5766663551330566, + "rewards/margins": 0.5567880272865295, + "rewards/rejected": -1.1334543228149414, + "step": 4981 + }, + { + "epoch": 0.58, + "learning_rate": 1.2805007676863115e-07, + "logits/chosen": -2.141328811645508, + "logits/rejected": -2.5219016075134277, + "logps/chosen": -192.51229858398438, + "logps/rejected": -245.0391082763672, + "loss": 0.6426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.724065899848938, + "rewards/margins": 1.4952632188796997, + "rewards/rejected": -2.2193291187286377, + "step": 4982 + }, + { + "epoch": 0.58, + "learning_rate": 1.2801464509271287e-07, + "logits/chosen": -2.025484561920166, + "logits/rejected": -2.4529287815093994, + "logps/chosen": -396.4825744628906, + "logps/rejected": -275.458984375, + "loss": 0.2005, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30254507064819336, + "rewards/margins": 3.6435177326202393, + "rewards/rejected": -3.9460625648498535, + "step": 4983 + }, + { + "epoch": 0.58, + "learning_rate": 1.279792134167946e-07, + "logits/chosen": -2.470404624938965, + "logits/rejected": -2.4293644428253174, + "logps/chosen": -223.46432495117188, + "logps/rejected": -399.7251281738281, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42662739753723145, + "rewards/margins": 2.4852964878082275, + "rewards/rejected": -2.911923885345459, + "step": 4984 + }, + { + "epoch": 0.58, + "learning_rate": 1.2794378174087634e-07, + "logits/chosen": -2.445805549621582, + "logits/rejected": -2.464209794998169, + "logps/chosen": -300.95196533203125, + "logps/rejected": -306.5870056152344, + "loss": 0.4353, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34047120809555054, + "rewards/margins": 1.87259840965271, + "rewards/rejected": -2.2130696773529053, + "step": 4985 + }, + { + "epoch": 0.58, + "learning_rate": 1.2790835006495807e-07, + "logits/chosen": -2.698894500732422, + "logits/rejected": -2.7770156860351562, + "logps/chosen": -135.4595489501953, + "logps/rejected": -216.386962890625, + "loss": 0.5819, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5836890935897827, + "rewards/margins": 1.689714789390564, + "rewards/rejected": -3.2734038829803467, + "step": 4986 + }, + { + "epoch": 0.58, + "learning_rate": 1.278729183890398e-07, + "logits/chosen": -1.9910471439361572, + "logits/rejected": -1.9543753862380981, + "logps/chosen": -275.806640625, + "logps/rejected": -292.7551574707031, + "loss": 1.0247, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3051960468292236, + "rewards/margins": 1.457802414894104, + "rewards/rejected": -3.762998580932617, + "step": 4987 + }, + { + "epoch": 0.58, + "learning_rate": 1.2783748671312154e-07, + "logits/chosen": -1.911210536956787, + "logits/rejected": -2.034813642501831, + "logps/chosen": -415.4217834472656, + "logps/rejected": -308.489990234375, + "loss": 0.8522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9848672747612, + "rewards/margins": 1.3181178569793701, + "rewards/rejected": -2.302985191345215, + "step": 4988 + }, + { + "epoch": 0.58, + "learning_rate": 1.2780205503720326e-07, + "logits/chosen": -2.555696487426758, + "logits/rejected": -2.5398035049438477, + "logps/chosen": -242.5450897216797, + "logps/rejected": -246.6356201171875, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20990701019763947, + "rewards/margins": 3.8510372638702393, + "rewards/rejected": -4.060944557189941, + "step": 4989 + }, + { + "epoch": 0.58, + "learning_rate": 1.2776662336128498e-07, + "logits/chosen": -1.5355135202407837, + "logits/rejected": -1.7660880088806152, + "logps/chosen": -620.6427612304688, + "logps/rejected": -544.0104370117188, + "loss": 0.1109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01670283079147339, + "rewards/margins": 2.954514741897583, + "rewards/rejected": -2.971217393875122, + "step": 4990 + }, + { + "epoch": 0.58, + "learning_rate": 1.2773119168536673e-07, + "logits/chosen": -2.4170894622802734, + "logits/rejected": -2.593413829803467, + "logps/chosen": -456.5002136230469, + "logps/rejected": -227.36859130859375, + "loss": 0.4416, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.525996685028076, + "rewards/margins": 2.124033212661743, + "rewards/rejected": -4.650030136108398, + "step": 4991 + }, + { + "epoch": 0.58, + "learning_rate": 1.2769576000944845e-07, + "logits/chosen": -2.352752923965454, + "logits/rejected": -2.2720630168914795, + "logps/chosen": -287.6003112792969, + "logps/rejected": -338.2923583984375, + "loss": 0.7006, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1352601051330566, + "rewards/margins": 2.8906590938568115, + "rewards/rejected": -4.025918960571289, + "step": 4992 + }, + { + "epoch": 0.58, + "learning_rate": 1.2766032833353017e-07, + "logits/chosen": -2.8374853134155273, + "logits/rejected": -2.8801887035369873, + "logps/chosen": -128.32022094726562, + "logps/rejected": -193.45614624023438, + "loss": 0.1513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7128094434738159, + "rewards/margins": 2.7571961879730225, + "rewards/rejected": -3.470005750656128, + "step": 4993 + }, + { + "epoch": 0.58, + "learning_rate": 1.276248966576119e-07, + "logits/chosen": -2.778282642364502, + "logits/rejected": -2.7775795459747314, + "logps/chosen": -201.17347717285156, + "logps/rejected": -273.3324279785156, + "loss": 0.5322, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0694972276687622, + "rewards/margins": 0.9708243608474731, + "rewards/rejected": -2.0403215885162354, + "step": 4994 + }, + { + "epoch": 0.58, + "learning_rate": 1.2758946498169362e-07, + "logits/chosen": -2.6118276119232178, + "logits/rejected": -2.902498483657837, + "logps/chosen": -322.518798828125, + "logps/rejected": -275.03021240234375, + "loss": 0.2206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7189247608184814, + "rewards/margins": 2.8431403636932373, + "rewards/rejected": -3.5620648860931396, + "step": 4995 + }, + { + "epoch": 0.58, + "learning_rate": 1.2755403330577534e-07, + "logits/chosen": -2.91837215423584, + "logits/rejected": -2.8293378353118896, + "logps/chosen": -413.57830810546875, + "logps/rejected": -419.4398193359375, + "loss": 0.7061, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7350621223449707, + "rewards/margins": 2.0849318504333496, + "rewards/rejected": -3.8199942111968994, + "step": 4996 + }, + { + "epoch": 0.58, + "learning_rate": 1.275186016298571e-07, + "logits/chosen": -2.713042736053467, + "logits/rejected": -2.5138919353485107, + "logps/chosen": -273.91021728515625, + "logps/rejected": -206.5056610107422, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8296372890472412, + "rewards/margins": 1.9194782972335815, + "rewards/rejected": -2.749115467071533, + "step": 4997 + }, + { + "epoch": 0.58, + "learning_rate": 1.274831699539388e-07, + "logits/chosen": -2.042919397354126, + "logits/rejected": -2.1612935066223145, + "logps/chosen": -317.7601623535156, + "logps/rejected": -307.3943786621094, + "loss": 0.184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9101721048355103, + "rewards/margins": 2.3210501670837402, + "rewards/rejected": -3.231222152709961, + "step": 4998 + }, + { + "epoch": 0.58, + "learning_rate": 1.2744773827802053e-07, + "logits/chosen": -2.478100299835205, + "logits/rejected": -2.489065408706665, + "logps/chosen": -333.7314147949219, + "logps/rejected": -303.79534912109375, + "loss": 0.4342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8440136909484863, + "rewards/margins": 1.564299464225769, + "rewards/rejected": -2.408313035964966, + "step": 4999 + }, + { + "epoch": 0.58, + "learning_rate": 1.2741230660210228e-07, + "logits/chosen": -2.436861753463745, + "logits/rejected": -2.4165420532226562, + "logps/chosen": -125.89694213867188, + "logps/rejected": -180.00918579101562, + "loss": 0.6934, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9185258150100708, + "rewards/margins": 0.7490828037261963, + "rewards/rejected": -2.6676084995269775, + "step": 5000 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.7493003606796265, + "eval_logits/rejected": -1.749708890914917, + "eval_logps/chosen": -278.6475524902344, + "eval_logps/rejected": -278.8118591308594, + "eval_loss": 0.3704482614994049, + "eval_rewards/accuracies": 0.8491379022598267, + "eval_rewards/chosen": -0.6482082009315491, + "eval_rewards/margins": 2.1323232650756836, + "eval_rewards/rejected": -2.780531167984009, + "eval_runtime": 237.4807, + "eval_samples_per_second": 2.927, + "eval_steps_per_second": 1.465, + "step": 5000 + }, + { + "epoch": 0.58, + "learning_rate": 1.27376874926184e-07, + "logits/chosen": -2.448072671890259, + "logits/rejected": -2.5658533573150635, + "logps/chosen": -314.3951416015625, + "logps/rejected": -193.90748596191406, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1025920882821083, + "rewards/margins": 2.0626649856567383, + "rewards/rejected": -1.9600727558135986, + "step": 5001 + }, + { + "epoch": 0.58, + "learning_rate": 1.2734144325026575e-07, + "logits/chosen": -2.0485544204711914, + "logits/rejected": -2.0625832080841064, + "logps/chosen": -367.2135925292969, + "logps/rejected": -305.0137023925781, + "loss": 0.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1647567749023438, + "rewards/margins": 2.722200393676758, + "rewards/rejected": -3.8869569301605225, + "step": 5002 + }, + { + "epoch": 0.58, + "learning_rate": 1.2730601157434747e-07, + "logits/chosen": -2.3442068099975586, + "logits/rejected": -2.1199841499328613, + "logps/chosen": -235.193359375, + "logps/rejected": -238.95765686035156, + "loss": 0.3038, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5088386535644531, + "rewards/margins": 1.9518119096755981, + "rewards/rejected": -2.4606504440307617, + "step": 5003 + }, + { + "epoch": 0.58, + "learning_rate": 1.272705798984292e-07, + "logits/chosen": -2.8053717613220215, + "logits/rejected": -2.8121883869171143, + "logps/chosen": -122.35606384277344, + "logps/rejected": -124.37657165527344, + "loss": 0.5249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5447511076927185, + "rewards/margins": 2.140857219696045, + "rewards/rejected": -2.685608386993408, + "step": 5004 + }, + { + "epoch": 0.58, + "learning_rate": 1.2723514822251092e-07, + "logits/chosen": -2.2128281593322754, + "logits/rejected": -2.4840023517608643, + "logps/chosen": -337.5320129394531, + "logps/rejected": -206.02890014648438, + "loss": 0.4322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7329486012458801, + "rewards/margins": 1.5698072910308838, + "rewards/rejected": -2.3027560710906982, + "step": 5005 + }, + { + "epoch": 0.58, + "learning_rate": 1.2719971654659264e-07, + "logits/chosen": -2.4379220008850098, + "logits/rejected": -2.643517017364502, + "logps/chosen": -141.06040954589844, + "logps/rejected": -135.20481872558594, + "loss": 1.3239, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6664382219314575, + "rewards/margins": 1.3340415954589844, + "rewards/rejected": -3.0004799365997314, + "step": 5006 + }, + { + "epoch": 0.58, + "learning_rate": 1.2716428487067436e-07, + "logits/chosen": -1.9871236085891724, + "logits/rejected": -2.002516984939575, + "logps/chosen": -324.06298828125, + "logps/rejected": -282.7936706542969, + "loss": 0.2882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38419878482818604, + "rewards/margins": 2.0457749366760254, + "rewards/rejected": -2.429973840713501, + "step": 5007 + }, + { + "epoch": 0.58, + "learning_rate": 1.271288531947561e-07, + "logits/chosen": -2.1172099113464355, + "logits/rejected": -2.522083282470703, + "logps/chosen": -374.1548767089844, + "logps/rejected": -217.88128662109375, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34124788641929626, + "rewards/margins": 2.3673157691955566, + "rewards/rejected": -2.708563804626465, + "step": 5008 + }, + { + "epoch": 0.58, + "learning_rate": 1.2709342151883783e-07, + "logits/chosen": -2.397780179977417, + "logits/rejected": -2.3363850116729736, + "logps/chosen": -191.94326782226562, + "logps/rejected": -253.56504821777344, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5940826535224915, + "rewards/margins": 3.58428955078125, + "rewards/rejected": -4.178372383117676, + "step": 5009 + }, + { + "epoch": 0.58, + "learning_rate": 1.2705798984291956e-07, + "logits/chosen": -2.1975460052490234, + "logits/rejected": -2.6020331382751465, + "logps/chosen": -347.7258605957031, + "logps/rejected": -190.0863037109375, + "loss": 0.3145, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6098583936691284, + "rewards/margins": 1.2346560955047607, + "rewards/rejected": -2.8445146083831787, + "step": 5010 + }, + { + "epoch": 0.58, + "learning_rate": 1.270225581670013e-07, + "logits/chosen": -2.1591057777404785, + "logits/rejected": -1.8677372932434082, + "logps/chosen": -235.5383758544922, + "logps/rejected": -205.22998046875, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9226926565170288, + "rewards/margins": 1.9067730903625488, + "rewards/rejected": -2.829465866088867, + "step": 5011 + }, + { + "epoch": 0.58, + "learning_rate": 1.2698712649108303e-07, + "logits/chosen": -2.4727463722229004, + "logits/rejected": -2.201777458190918, + "logps/chosen": -246.04653930664062, + "logps/rejected": -364.3743896484375, + "loss": 0.2528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37532657384872437, + "rewards/margins": 2.0345726013183594, + "rewards/rejected": -2.4098987579345703, + "step": 5012 + }, + { + "epoch": 0.58, + "learning_rate": 1.2695169481516475e-07, + "logits/chosen": -2.5710182189941406, + "logits/rejected": -2.420264482498169, + "logps/chosen": -138.28521728515625, + "logps/rejected": -238.74826049804688, + "loss": 0.2853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8522183895111084, + "rewards/margins": 2.1393890380859375, + "rewards/rejected": -2.991607666015625, + "step": 5013 + }, + { + "epoch": 0.58, + "learning_rate": 1.269162631392465e-07, + "logits/chosen": -2.347460985183716, + "logits/rejected": -2.742182731628418, + "logps/chosen": -566.0645141601562, + "logps/rejected": -380.82244873046875, + "loss": 0.2336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3380976915359497, + "rewards/margins": 2.815774440765381, + "rewards/rejected": -3.153872013092041, + "step": 5014 + }, + { + "epoch": 0.58, + "learning_rate": 1.2688083146332822e-07, + "logits/chosen": -2.326587200164795, + "logits/rejected": -2.3138411045074463, + "logps/chosen": -237.60281372070312, + "logps/rejected": -310.2696533203125, + "loss": 0.5868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8882647156715393, + "rewards/margins": 1.0294536352157593, + "rewards/rejected": -1.9177184104919434, + "step": 5015 + }, + { + "epoch": 0.58, + "learning_rate": 1.2684539978740994e-07, + "logits/chosen": -2.3812763690948486, + "logits/rejected": -2.2785987854003906, + "logps/chosen": -188.69285583496094, + "logps/rejected": -301.3819885253906, + "loss": 0.696, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.787169098854065, + "rewards/margins": 1.3538234233856201, + "rewards/rejected": -3.1409926414489746, + "step": 5016 + }, + { + "epoch": 0.58, + "learning_rate": 1.2680996811149166e-07, + "logits/chosen": -2.047483205795288, + "logits/rejected": -2.25661301612854, + "logps/chosen": -443.06365966796875, + "logps/rejected": -320.4031066894531, + "loss": 0.2352, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0828160047531128, + "rewards/margins": 2.474048376083374, + "rewards/rejected": -3.5568642616271973, + "step": 5017 + }, + { + "epoch": 0.58, + "learning_rate": 1.2677453643557339e-07, + "logits/chosen": -2.493412733078003, + "logits/rejected": -2.387420654296875, + "logps/chosen": -358.2150573730469, + "logps/rejected": -341.5381774902344, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9517525434494019, + "rewards/margins": 0.8550430536270142, + "rewards/rejected": -2.806795597076416, + "step": 5018 + }, + { + "epoch": 0.58, + "learning_rate": 1.267391047596551e-07, + "logits/chosen": -2.293483257293701, + "logits/rejected": -2.324214458465576, + "logps/chosen": -151.86395263671875, + "logps/rejected": -178.8552703857422, + "loss": 0.2737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8797059059143066, + "rewards/margins": 2.2820637226104736, + "rewards/rejected": -3.161769390106201, + "step": 5019 + }, + { + "epoch": 0.58, + "learning_rate": 1.2670367308373686e-07, + "logits/chosen": -2.3212504386901855, + "logits/rejected": -2.4591965675354004, + "logps/chosen": -446.8814697265625, + "logps/rejected": -331.9065246582031, + "loss": 0.2569, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9165117740631104, + "rewards/margins": 2.397764205932617, + "rewards/rejected": -3.3142757415771484, + "step": 5020 + }, + { + "epoch": 0.58, + "learning_rate": 1.2666824140781858e-07, + "logits/chosen": -1.9619414806365967, + "logits/rejected": -2.2968997955322266, + "logps/chosen": -216.6094512939453, + "logps/rejected": -178.23741149902344, + "loss": 0.9064, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2431081533432007, + "rewards/margins": 2.0532853603363037, + "rewards/rejected": -3.296393394470215, + "step": 5021 + }, + { + "epoch": 0.58, + "learning_rate": 1.2663280973190033e-07, + "logits/chosen": -2.5364489555358887, + "logits/rejected": -2.5714333057403564, + "logps/chosen": -255.55294799804688, + "logps/rejected": -284.2391357421875, + "loss": 0.682, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2542182207107544, + "rewards/margins": 1.6134649515151978, + "rewards/rejected": -2.867683172225952, + "step": 5022 + }, + { + "epoch": 0.58, + "learning_rate": 1.2659737805598205e-07, + "logits/chosen": -2.0139377117156982, + "logits/rejected": -1.9455764293670654, + "logps/chosen": -268.730224609375, + "logps/rejected": -264.1647033691406, + "loss": 0.4835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8141624331474304, + "rewards/margins": 1.4268033504486084, + "rewards/rejected": -2.2409658432006836, + "step": 5023 + }, + { + "epoch": 0.58, + "learning_rate": 1.2656194638006377e-07, + "logits/chosen": -2.6256308555603027, + "logits/rejected": -2.6258370876312256, + "logps/chosen": -274.5530700683594, + "logps/rejected": -157.1398468017578, + "loss": 0.6867, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.627955675125122, + "rewards/margins": 0.9628015756607056, + "rewards/rejected": -2.590757369995117, + "step": 5024 + }, + { + "epoch": 0.58, + "learning_rate": 1.265265147041455e-07, + "logits/chosen": -2.555377960205078, + "logits/rejected": -2.477292776107788, + "logps/chosen": -344.78662109375, + "logps/rejected": -194.75804138183594, + "loss": 0.7569, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2015328407287598, + "rewards/margins": 1.2914994955062866, + "rewards/rejected": -2.493032455444336, + "step": 5025 + }, + { + "epoch": 0.58, + "learning_rate": 1.2649108302822724e-07, + "logits/chosen": -2.5004374980926514, + "logits/rejected": -2.6832122802734375, + "logps/chosen": -161.03521728515625, + "logps/rejected": -169.54747009277344, + "loss": 0.4776, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5409038066864014, + "rewards/margins": 1.6216109991073608, + "rewards/rejected": -2.1625146865844727, + "step": 5026 + }, + { + "epoch": 0.58, + "learning_rate": 1.2645565135230896e-07, + "logits/chosen": -2.5424160957336426, + "logits/rejected": -2.774426221847534, + "logps/chosen": -139.35586547851562, + "logps/rejected": -283.0226745605469, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5186126828193665, + "rewards/margins": 4.47479248046875, + "rewards/rejected": -4.993404388427734, + "step": 5027 + }, + { + "epoch": 0.58, + "learning_rate": 1.2642021967639069e-07, + "logits/chosen": -2.5196285247802734, + "logits/rejected": -2.599818229675293, + "logps/chosen": -298.28143310546875, + "logps/rejected": -204.89028930664062, + "loss": 0.3603, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0066965818405151, + "rewards/margins": 2.8453176021575928, + "rewards/rejected": -3.8520145416259766, + "step": 5028 + }, + { + "epoch": 0.59, + "learning_rate": 1.263847880004724e-07, + "logits/chosen": -1.6583380699157715, + "logits/rejected": -1.8511111736297607, + "logps/chosen": -310.0856628417969, + "logps/rejected": -266.33209228515625, + "loss": 0.3913, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3269226551055908, + "rewards/margins": 2.3249833583831787, + "rewards/rejected": -3.6519060134887695, + "step": 5029 + }, + { + "epoch": 0.59, + "learning_rate": 1.2634935632455413e-07, + "logits/chosen": -2.902507781982422, + "logits/rejected": -2.9840400218963623, + "logps/chosen": -345.1080322265625, + "logps/rejected": -233.337646484375, + "loss": 0.2349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1498950719833374, + "rewards/margins": 2.3578126430511475, + "rewards/rejected": -3.5077078342437744, + "step": 5030 + }, + { + "epoch": 0.59, + "learning_rate": 1.2631392464863588e-07, + "logits/chosen": -1.8298745155334473, + "logits/rejected": -1.8847832679748535, + "logps/chosen": -464.0446472167969, + "logps/rejected": -550.2960815429688, + "loss": 0.6852, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2549769878387451, + "rewards/margins": 0.6575993299484253, + "rewards/rejected": -1.9125761985778809, + "step": 5031 + }, + { + "epoch": 0.59, + "learning_rate": 1.262784929727176e-07, + "logits/chosen": -2.466765880584717, + "logits/rejected": -2.5501484870910645, + "logps/chosen": -215.99560546875, + "logps/rejected": -126.53117370605469, + "loss": 1.1182, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5513098239898682, + "rewards/margins": -0.04120063781738281, + "rewards/rejected": -1.510109305381775, + "step": 5032 + }, + { + "epoch": 0.59, + "learning_rate": 1.2624306129679932e-07, + "logits/chosen": -2.2486977577209473, + "logits/rejected": -2.3930084705352783, + "logps/chosen": -349.591796875, + "logps/rejected": -250.3293914794922, + "loss": 0.2727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8739951848983765, + "rewards/margins": 1.664348840713501, + "rewards/rejected": -2.538343906402588, + "step": 5033 + }, + { + "epoch": 0.59, + "learning_rate": 1.2620762962088107e-07, + "logits/chosen": -2.7339017391204834, + "logits/rejected": -2.8964338302612305, + "logps/chosen": -229.69888305664062, + "logps/rejected": -166.30776977539062, + "loss": 0.23, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36191827058792114, + "rewards/margins": 3.2997090816497803, + "rewards/rejected": -3.6616272926330566, + "step": 5034 + }, + { + "epoch": 0.59, + "learning_rate": 1.261721979449628e-07, + "logits/chosen": -1.7661529779434204, + "logits/rejected": -2.319608211517334, + "logps/chosen": -468.3108215332031, + "logps/rejected": -266.27642822265625, + "loss": 0.184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013813894242048264, + "rewards/margins": 2.166487216949463, + "rewards/rejected": -2.1803011894226074, + "step": 5035 + }, + { + "epoch": 0.59, + "learning_rate": 1.2613676626904452e-07, + "logits/chosen": -2.4165430068969727, + "logits/rejected": -2.3733725547790527, + "logps/chosen": -298.70361328125, + "logps/rejected": -296.5895080566406, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.04870343208313, + "rewards/margins": 2.629744052886963, + "rewards/rejected": -4.678447246551514, + "step": 5036 + }, + { + "epoch": 0.59, + "learning_rate": 1.2610133459312626e-07, + "logits/chosen": -2.704406499862671, + "logits/rejected": -2.5900254249572754, + "logps/chosen": -302.1014099121094, + "logps/rejected": -253.86624145507812, + "loss": 0.4055, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9303373098373413, + "rewards/margins": 1.9476597309112549, + "rewards/rejected": -2.8779969215393066, + "step": 5037 + }, + { + "epoch": 0.59, + "learning_rate": 1.2606590291720799e-07, + "logits/chosen": -1.9082858562469482, + "logits/rejected": -1.8875195980072021, + "logps/chosen": -182.53204345703125, + "logps/rejected": -184.48992919921875, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0560022592544556, + "rewards/margins": 2.435718059539795, + "rewards/rejected": -3.491720199584961, + "step": 5038 + }, + { + "epoch": 0.59, + "learning_rate": 1.260304712412897e-07, + "logits/chosen": -2.369805335998535, + "logits/rejected": -2.6521146297454834, + "logps/chosen": -279.2736511230469, + "logps/rejected": -175.94692993164062, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8370301723480225, + "rewards/margins": 1.264455795288086, + "rewards/rejected": -2.1014859676361084, + "step": 5039 + }, + { + "epoch": 0.59, + "learning_rate": 1.2599503956537143e-07, + "logits/chosen": -2.2333791255950928, + "logits/rejected": -2.167800188064575, + "logps/chosen": -185.82015991210938, + "logps/rejected": -203.60986328125, + "loss": 0.966, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9001622200012207, + "rewards/margins": 0.5284583568572998, + "rewards/rejected": -2.4286205768585205, + "step": 5040 + }, + { + "epoch": 0.59, + "learning_rate": 1.2595960788945315e-07, + "logits/chosen": -1.598362684249878, + "logits/rejected": -1.490558385848999, + "logps/chosen": -267.8580322265625, + "logps/rejected": -372.0643005371094, + "loss": 0.3532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8839406967163086, + "rewards/margins": 1.3490755558013916, + "rewards/rejected": -2.233016014099121, + "step": 5041 + }, + { + "epoch": 0.59, + "learning_rate": 1.259241762135349e-07, + "logits/chosen": -2.785994529724121, + "logits/rejected": -2.7436108589172363, + "logps/chosen": -126.50717163085938, + "logps/rejected": -271.67034912109375, + "loss": 0.13, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9907951354980469, + "rewards/margins": 3.335768699645996, + "rewards/rejected": -4.326563835144043, + "step": 5042 + }, + { + "epoch": 0.59, + "learning_rate": 1.2588874453761662e-07, + "logits/chosen": -2.8406522274017334, + "logits/rejected": -2.847862482070923, + "logps/chosen": -285.98150634765625, + "logps/rejected": -243.86953735351562, + "loss": 0.2045, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.040951959788799286, + "rewards/margins": 3.677973747253418, + "rewards/rejected": -3.718925714492798, + "step": 5043 + }, + { + "epoch": 0.59, + "learning_rate": 1.2585331286169835e-07, + "logits/chosen": -2.5393457412719727, + "logits/rejected": -2.2833058834075928, + "logps/chosen": -103.62020111083984, + "logps/rejected": -287.2734680175781, + "loss": 0.1982, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.639127790927887, + "rewards/margins": 3.255880117416382, + "rewards/rejected": -3.895008087158203, + "step": 5044 + }, + { + "epoch": 0.59, + "learning_rate": 1.258178811857801e-07, + "logits/chosen": -2.3160483837127686, + "logits/rejected": -2.445261001586914, + "logps/chosen": -301.4508361816406, + "logps/rejected": -235.83108520507812, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6346229910850525, + "rewards/margins": 3.6629855632781982, + "rewards/rejected": -4.297608375549316, + "step": 5045 + }, + { + "epoch": 0.59, + "learning_rate": 1.2578244950986182e-07, + "logits/chosen": -2.4213991165161133, + "logits/rejected": -2.1528828144073486, + "logps/chosen": -260.78173828125, + "logps/rejected": -286.6127014160156, + "loss": 0.2338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5604619979858398, + "rewards/margins": 1.9202547073364258, + "rewards/rejected": -2.4807167053222656, + "step": 5046 + }, + { + "epoch": 0.59, + "learning_rate": 1.2574701783394354e-07, + "logits/chosen": -2.1239657402038574, + "logits/rejected": -2.2135210037231445, + "logps/chosen": -197.57440185546875, + "logps/rejected": -247.21759033203125, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5139496922492981, + "rewards/margins": 2.620455265045166, + "rewards/rejected": -3.1344048976898193, + "step": 5047 + }, + { + "epoch": 0.59, + "learning_rate": 1.2571158615802526e-07, + "logits/chosen": -1.8035229444503784, + "logits/rejected": -2.262049674987793, + "logps/chosen": -442.8195495605469, + "logps/rejected": -245.49559020996094, + "loss": 0.4067, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9732864499092102, + "rewards/margins": 1.5596760511398315, + "rewards/rejected": -2.5329623222351074, + "step": 5048 + }, + { + "epoch": 0.59, + "learning_rate": 1.25676154482107e-07, + "logits/chosen": -1.9747023582458496, + "logits/rejected": -1.9992501735687256, + "logps/chosen": -203.44345092773438, + "logps/rejected": -287.6009521484375, + "loss": 0.2356, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6251349449157715, + "rewards/margins": 2.7157435417175293, + "rewards/rejected": -3.340878486633301, + "step": 5049 + }, + { + "epoch": 0.59, + "learning_rate": 1.2564072280618873e-07, + "logits/chosen": -2.2412095069885254, + "logits/rejected": -2.173689365386963, + "logps/chosen": -277.09002685546875, + "logps/rejected": -331.48907470703125, + "loss": 0.2863, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5114327073097229, + "rewards/margins": 2.143389940261841, + "rewards/rejected": -2.654822826385498, + "step": 5050 + }, + { + "epoch": 0.59, + "learning_rate": 1.2560529113027045e-07, + "logits/chosen": -2.911099672317505, + "logits/rejected": -2.8257720470428467, + "logps/chosen": -302.1712646484375, + "logps/rejected": -275.680908203125, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5880608558654785, + "rewards/margins": 4.239800930023193, + "rewards/rejected": -4.827861309051514, + "step": 5051 + }, + { + "epoch": 0.59, + "learning_rate": 1.2556985945435218e-07, + "logits/chosen": -2.51224422454834, + "logits/rejected": -2.4394235610961914, + "logps/chosen": -178.4909210205078, + "logps/rejected": -292.4670715332031, + "loss": 0.5162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0399235486984253, + "rewards/margins": 2.5417978763580322, + "rewards/rejected": -3.581721782684326, + "step": 5052 + }, + { + "epoch": 0.59, + "learning_rate": 1.255344277784339e-07, + "logits/chosen": -2.345547676086426, + "logits/rejected": -2.367676019668579, + "logps/chosen": -337.0796203613281, + "logps/rejected": -343.7296447753906, + "loss": 0.4611, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2996642589569092, + "rewards/margins": 2.2832274436950684, + "rewards/rejected": -3.5828917026519775, + "step": 5053 + }, + { + "epoch": 0.59, + "learning_rate": 1.2549899610251565e-07, + "logits/chosen": -2.0981104373931885, + "logits/rejected": -1.7766231298446655, + "logps/chosen": -267.132568359375, + "logps/rejected": -426.4649353027344, + "loss": 0.4391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1440658569335938, + "rewards/margins": 1.9454902410507202, + "rewards/rejected": -3.0895562171936035, + "step": 5054 + }, + { + "epoch": 0.59, + "learning_rate": 1.2546356442659737e-07, + "logits/chosen": -2.27182936668396, + "logits/rejected": -2.4739980697631836, + "logps/chosen": -197.70553588867188, + "logps/rejected": -299.1203918457031, + "loss": 0.2109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9340135455131531, + "rewards/margins": 2.7556979656219482, + "rewards/rejected": -3.689711570739746, + "step": 5055 + }, + { + "epoch": 0.59, + "learning_rate": 1.2542813275067912e-07, + "logits/chosen": -2.1973214149475098, + "logits/rejected": -1.7851725816726685, + "logps/chosen": -161.9153289794922, + "logps/rejected": -316.9391784667969, + "loss": 0.5271, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31334853172302246, + "rewards/margins": 2.3823776245117188, + "rewards/rejected": -2.695726156234741, + "step": 5056 + }, + { + "epoch": 0.59, + "learning_rate": 1.2539270107476084e-07, + "logits/chosen": -2.6415936946868896, + "logits/rejected": -2.725944995880127, + "logps/chosen": -345.1324157714844, + "logps/rejected": -281.581298828125, + "loss": 0.2281, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9726064801216125, + "rewards/margins": 3.1831581592559814, + "rewards/rejected": -4.155764579772949, + "step": 5057 + }, + { + "epoch": 0.59, + "learning_rate": 1.2535726939884256e-07, + "logits/chosen": -1.9976046085357666, + "logits/rejected": -2.064516067504883, + "logps/chosen": -300.18182373046875, + "logps/rejected": -240.24188232421875, + "loss": 0.3908, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1462929248809814, + "rewards/margins": 1.6564544439315796, + "rewards/rejected": -2.8027472496032715, + "step": 5058 + }, + { + "epoch": 0.59, + "learning_rate": 1.2532183772292428e-07, + "logits/chosen": -2.2003281116485596, + "logits/rejected": -2.2003026008605957, + "logps/chosen": -330.9859619140625, + "logps/rejected": -335.26641845703125, + "loss": 0.577, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4647020697593689, + "rewards/margins": 1.023249626159668, + "rewards/rejected": -1.487951636314392, + "step": 5059 + }, + { + "epoch": 0.59, + "learning_rate": 1.25286406047006e-07, + "logits/chosen": -1.8503432273864746, + "logits/rejected": -2.23567271232605, + "logps/chosen": -391.1458740234375, + "logps/rejected": -315.9316101074219, + "loss": 0.5458, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9872946739196777, + "rewards/margins": 1.5065724849700928, + "rewards/rejected": -2.4938669204711914, + "step": 5060 + }, + { + "epoch": 0.59, + "learning_rate": 1.2525097437108775e-07, + "logits/chosen": -2.194896697998047, + "logits/rejected": -2.2195792198181152, + "logps/chosen": -180.734619140625, + "logps/rejected": -298.43365478515625, + "loss": 0.0477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11201455444097519, + "rewards/margins": 4.649348258972168, + "rewards/rejected": -4.7613630294799805, + "step": 5061 + }, + { + "epoch": 0.59, + "learning_rate": 1.2521554269516948e-07, + "logits/chosen": -2.935894727706909, + "logits/rejected": -2.930459976196289, + "logps/chosen": -343.04119873046875, + "logps/rejected": -215.00030517578125, + "loss": 0.8591, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3056944608688354, + "rewards/margins": 1.528649926185608, + "rewards/rejected": -2.8343443870544434, + "step": 5062 + }, + { + "epoch": 0.59, + "learning_rate": 1.251801110192512e-07, + "logits/chosen": -2.2215189933776855, + "logits/rejected": -2.328315258026123, + "logps/chosen": -236.15066528320312, + "logps/rejected": -286.737060546875, + "loss": 0.421, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0598161220550537, + "rewards/margins": 2.3775668144226074, + "rewards/rejected": -3.437382936477661, + "step": 5063 + }, + { + "epoch": 0.59, + "learning_rate": 1.2514467934333292e-07, + "logits/chosen": -2.3271660804748535, + "logits/rejected": -2.3771748542785645, + "logps/chosen": -340.5474853515625, + "logps/rejected": -338.4072265625, + "loss": 0.0636, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47275155782699585, + "rewards/margins": 4.264046669006348, + "rewards/rejected": -4.736798286437988, + "step": 5064 + }, + { + "epoch": 0.59, + "learning_rate": 1.2510924766741467e-07, + "logits/chosen": -2.382052421569824, + "logits/rejected": -2.1117660999298096, + "logps/chosen": -200.75177001953125, + "logps/rejected": -258.7510070800781, + "loss": 0.4138, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4100629091262817, + "rewards/margins": 1.7193949222564697, + "rewards/rejected": -3.129457712173462, + "step": 5065 + }, + { + "epoch": 0.59, + "learning_rate": 1.250738159914964e-07, + "logits/chosen": -2.3961143493652344, + "logits/rejected": -2.742340564727783, + "logps/chosen": -419.90472412109375, + "logps/rejected": -283.8713684082031, + "loss": 0.2118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06645327806472778, + "rewards/margins": 2.4807984828948975, + "rewards/rejected": -2.5472517013549805, + "step": 5066 + }, + { + "epoch": 0.59, + "learning_rate": 1.2503838431557814e-07, + "logits/chosen": -2.512465238571167, + "logits/rejected": -2.6450307369232178, + "logps/chosen": -202.74526977539062, + "logps/rejected": -253.4952392578125, + "loss": 0.1445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9202558994293213, + "rewards/margins": 4.262088298797607, + "rewards/rejected": -5.18234395980835, + "step": 5067 + }, + { + "epoch": 0.59, + "learning_rate": 1.2500295263965986e-07, + "logits/chosen": -2.7043914794921875, + "logits/rejected": -2.47170352935791, + "logps/chosen": -239.700439453125, + "logps/rejected": -351.883056640625, + "loss": 0.1642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07963220030069351, + "rewards/margins": 3.2495436668395996, + "rewards/rejected": -3.3291759490966797, + "step": 5068 + }, + { + "epoch": 0.59, + "learning_rate": 1.2496752096374158e-07, + "logits/chosen": -2.0160913467407227, + "logits/rejected": -2.293389320373535, + "logps/chosen": -295.4750061035156, + "logps/rejected": -246.48040771484375, + "loss": 0.3468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42977944016456604, + "rewards/margins": 2.011566162109375, + "rewards/rejected": -2.441345691680908, + "step": 5069 + }, + { + "epoch": 0.59, + "learning_rate": 1.249320892878233e-07, + "logits/chosen": -2.523953676223755, + "logits/rejected": -2.2209982872009277, + "logps/chosen": -145.88943481445312, + "logps/rejected": -168.9512481689453, + "loss": 0.4348, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4279842376708984, + "rewards/margins": 1.55070161819458, + "rewards/rejected": -2.9786860942840576, + "step": 5070 + }, + { + "epoch": 0.59, + "learning_rate": 1.2489665761190503e-07, + "logits/chosen": -2.184809446334839, + "logits/rejected": -2.3422513008117676, + "logps/chosen": -519.1162109375, + "logps/rejected": -286.68035888671875, + "loss": 0.3948, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.020301342010498, + "rewards/margins": 1.8226814270019531, + "rewards/rejected": -2.842982769012451, + "step": 5071 + }, + { + "epoch": 0.59, + "learning_rate": 1.2486122593598678e-07, + "logits/chosen": -1.9471588134765625, + "logits/rejected": -2.1414012908935547, + "logps/chosen": -340.64569091796875, + "logps/rejected": -212.9463348388672, + "loss": 0.4739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4951516389846802, + "rewards/margins": 1.3302807807922363, + "rewards/rejected": -1.825432300567627, + "step": 5072 + }, + { + "epoch": 0.59, + "learning_rate": 1.248257942600685e-07, + "logits/chosen": -2.471249580383301, + "logits/rejected": -2.372087001800537, + "logps/chosen": -235.6441650390625, + "logps/rejected": -155.7246856689453, + "loss": 0.7172, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1826435327529907, + "rewards/margins": 0.6697194576263428, + "rewards/rejected": -1.8523629903793335, + "step": 5073 + }, + { + "epoch": 0.59, + "learning_rate": 1.2479036258415022e-07, + "logits/chosen": -2.237401247024536, + "logits/rejected": -2.372328758239746, + "logps/chosen": -366.558349609375, + "logps/rejected": -316.90704345703125, + "loss": 0.4739, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2678741216659546, + "rewards/margins": 2.0118048191070557, + "rewards/rejected": -2.2796788215637207, + "step": 5074 + }, + { + "epoch": 0.59, + "learning_rate": 1.2475493090823194e-07, + "logits/chosen": -2.4794363975524902, + "logits/rejected": -2.4415979385375977, + "logps/chosen": -338.049072265625, + "logps/rejected": -335.29071044921875, + "loss": 0.6196, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2521889209747314, + "rewards/margins": 2.6171348094940186, + "rewards/rejected": -3.869323492050171, + "step": 5075 + }, + { + "epoch": 0.59, + "learning_rate": 1.247194992323137e-07, + "logits/chosen": -2.2891132831573486, + "logits/rejected": -2.5519914627075195, + "logps/chosen": -269.9117431640625, + "logps/rejected": -178.40011596679688, + "loss": 0.3299, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5464590787887573, + "rewards/margins": 1.6039543151855469, + "rewards/rejected": -2.1504135131835938, + "step": 5076 + }, + { + "epoch": 0.59, + "learning_rate": 1.2468406755639541e-07, + "logits/chosen": -2.5970542430877686, + "logits/rejected": -2.608846664428711, + "logps/chosen": -197.98886108398438, + "logps/rejected": -222.43438720703125, + "loss": 0.6745, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4875085353851318, + "rewards/margins": 1.0582644939422607, + "rewards/rejected": -2.5457730293273926, + "step": 5077 + }, + { + "epoch": 0.59, + "learning_rate": 1.2464863588047714e-07, + "logits/chosen": -2.1260392665863037, + "logits/rejected": -2.1436879634857178, + "logps/chosen": -159.30738830566406, + "logps/rejected": -213.78289794921875, + "loss": 0.4453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7945365905761719, + "rewards/margins": 1.9845962524414062, + "rewards/rejected": -2.779132843017578, + "step": 5078 + }, + { + "epoch": 0.59, + "learning_rate": 1.2461320420455888e-07, + "logits/chosen": -2.6719436645507812, + "logits/rejected": -2.6745734214782715, + "logps/chosen": -174.7235870361328, + "logps/rejected": -230.06301879882812, + "loss": 0.3497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6107015609741211, + "rewards/margins": 3.4982619285583496, + "rewards/rejected": -4.108963966369629, + "step": 5079 + }, + { + "epoch": 0.59, + "learning_rate": 1.245777725286406e-07, + "logits/chosen": -2.4551637172698975, + "logits/rejected": -2.3567707538604736, + "logps/chosen": -173.47279357910156, + "logps/rejected": -266.3307189941406, + "loss": 0.2706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3563610911369324, + "rewards/margins": 2.4658963680267334, + "rewards/rejected": -2.8222575187683105, + "step": 5080 + }, + { + "epoch": 0.59, + "learning_rate": 1.2454234085272233e-07, + "logits/chosen": -2.1030921936035156, + "logits/rejected": -2.0538699626922607, + "logps/chosen": -305.3489990234375, + "logps/rejected": -312.9396057128906, + "loss": 0.4411, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7178651094436646, + "rewards/margins": 0.9211158752441406, + "rewards/rejected": -1.6389811038970947, + "step": 5081 + }, + { + "epoch": 0.59, + "learning_rate": 1.2450690917680405e-07, + "logits/chosen": -1.7618861198425293, + "logits/rejected": -1.5333662033081055, + "logps/chosen": -280.0419616699219, + "logps/rejected": -291.8614196777344, + "loss": 0.5739, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2153081893920898, + "rewards/margins": 0.7899072170257568, + "rewards/rejected": -2.0052154064178467, + "step": 5082 + }, + { + "epoch": 0.59, + "learning_rate": 1.2447147750088577e-07, + "logits/chosen": -2.7937374114990234, + "logits/rejected": -2.775571584701538, + "logps/chosen": -226.0089111328125, + "logps/rejected": -182.79132080078125, + "loss": 0.4453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8861177563667297, + "rewards/margins": 1.7546391487121582, + "rewards/rejected": -2.6407570838928223, + "step": 5083 + }, + { + "epoch": 0.59, + "learning_rate": 1.2443604582496752e-07, + "logits/chosen": -1.9664208889007568, + "logits/rejected": -1.9980876445770264, + "logps/chosen": -268.162841796875, + "logps/rejected": -444.4507751464844, + "loss": 0.8689, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7266005277633667, + "rewards/margins": 0.22701841592788696, + "rewards/rejected": -1.953619122505188, + "step": 5084 + }, + { + "epoch": 0.59, + "learning_rate": 1.2440061414904924e-07, + "logits/chosen": -1.9957945346832275, + "logits/rejected": -1.7428536415100098, + "logps/chosen": -296.41778564453125, + "logps/rejected": -272.56500244140625, + "loss": 0.2183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6645196080207825, + "rewards/margins": 2.1741867065429688, + "rewards/rejected": -2.8387064933776855, + "step": 5085 + }, + { + "epoch": 0.59, + "learning_rate": 1.2436518247313097e-07, + "logits/chosen": -2.274955987930298, + "logits/rejected": -2.2444496154785156, + "logps/chosen": -224.04209899902344, + "logps/rejected": -292.93817138671875, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7449394464492798, + "rewards/margins": 3.2602670192718506, + "rewards/rejected": -4.00520658493042, + "step": 5086 + }, + { + "epoch": 0.59, + "learning_rate": 1.243297507972127e-07, + "logits/chosen": -2.248771905899048, + "logits/rejected": -2.696075916290283, + "logps/chosen": -350.410400390625, + "logps/rejected": -282.1456298828125, + "loss": 0.2712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3212772011756897, + "rewards/margins": 2.8136792182922363, + "rewards/rejected": -3.1349563598632812, + "step": 5087 + }, + { + "epoch": 0.59, + "learning_rate": 1.2429431912129444e-07, + "logits/chosen": -2.1303837299346924, + "logits/rejected": -2.1944024562835693, + "logps/chosen": -279.9931640625, + "logps/rejected": -341.1676025390625, + "loss": 0.3759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7397469282150269, + "rewards/margins": 3.070223331451416, + "rewards/rejected": -3.8099701404571533, + "step": 5088 + }, + { + "epoch": 0.59, + "learning_rate": 1.2425888744537616e-07, + "logits/chosen": -1.941174864768982, + "logits/rejected": -2.042607069015503, + "logps/chosen": -346.42877197265625, + "logps/rejected": -241.75503540039062, + "loss": 0.4349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7154902219772339, + "rewards/margins": 1.3025219440460205, + "rewards/rejected": -2.018012046813965, + "step": 5089 + }, + { + "epoch": 0.59, + "learning_rate": 1.242234557694579e-07, + "logits/chosen": -2.345172166824341, + "logits/rejected": -2.315709114074707, + "logps/chosen": -247.92958068847656, + "logps/rejected": -323.3211364746094, + "loss": 0.6432, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3852211236953735, + "rewards/margins": 0.7937406897544861, + "rewards/rejected": -2.178961753845215, + "step": 5090 + }, + { + "epoch": 0.59, + "learning_rate": 1.2418802409353963e-07, + "logits/chosen": -2.0898611545562744, + "logits/rejected": -2.470738649368286, + "logps/chosen": -389.3899230957031, + "logps/rejected": -188.8907470703125, + "loss": 0.6823, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1931101083755493, + "rewards/margins": 0.7235841155052185, + "rewards/rejected": -1.9166940450668335, + "step": 5091 + }, + { + "epoch": 0.59, + "learning_rate": 1.2415259241762135e-07, + "logits/chosen": -2.558830738067627, + "logits/rejected": -2.4750943183898926, + "logps/chosen": -196.71957397460938, + "logps/rejected": -158.61727905273438, + "loss": 0.248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9535135626792908, + "rewards/margins": 2.2230215072631836, + "rewards/rejected": -3.176535129547119, + "step": 5092 + }, + { + "epoch": 0.59, + "learning_rate": 1.2411716074170307e-07, + "logits/chosen": -2.383772850036621, + "logits/rejected": -2.226917028427124, + "logps/chosen": -258.3792724609375, + "logps/rejected": -299.1199645996094, + "loss": 0.3076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44346946477890015, + "rewards/margins": 3.150388479232788, + "rewards/rejected": -3.593857526779175, + "step": 5093 + }, + { + "epoch": 0.59, + "learning_rate": 1.240817290657848e-07, + "logits/chosen": -2.029862642288208, + "logits/rejected": -1.8136718273162842, + "logps/chosen": -163.6396026611328, + "logps/rejected": -278.8581237792969, + "loss": 0.8742, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1036734580993652, + "rewards/margins": 0.5058338642120361, + "rewards/rejected": -2.6095073223114014, + "step": 5094 + }, + { + "epoch": 0.59, + "learning_rate": 1.2404629738986652e-07, + "logits/chosen": -2.629366397857666, + "logits/rejected": -2.671375036239624, + "logps/chosen": -194.59194946289062, + "logps/rejected": -304.2836608886719, + "loss": 0.1737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5404179096221924, + "rewards/margins": 3.2201313972473145, + "rewards/rejected": -3.7605490684509277, + "step": 5095 + }, + { + "epoch": 0.59, + "learning_rate": 1.2401086571394827e-07, + "logits/chosen": -2.552117347717285, + "logits/rejected": -2.544235944747925, + "logps/chosen": -204.56903076171875, + "logps/rejected": -282.14569091796875, + "loss": 0.7417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6886357069015503, + "rewards/margins": 2.2382373809814453, + "rewards/rejected": -2.926873207092285, + "step": 5096 + }, + { + "epoch": 0.59, + "learning_rate": 1.2397543403803e-07, + "logits/chosen": -2.887909412384033, + "logits/rejected": -2.8467471599578857, + "logps/chosen": -165.85572814941406, + "logps/rejected": -163.55233764648438, + "loss": 0.3839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7653634548187256, + "rewards/margins": 1.6618831157684326, + "rewards/rejected": -2.427246570587158, + "step": 5097 + }, + { + "epoch": 0.59, + "learning_rate": 1.239400023621117e-07, + "logits/chosen": -2.587955951690674, + "logits/rejected": -2.579162836074829, + "logps/chosen": -212.80560302734375, + "logps/rejected": -275.4340515136719, + "loss": 0.3655, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9853346943855286, + "rewards/margins": 1.573212742805481, + "rewards/rejected": -2.5585474967956543, + "step": 5098 + }, + { + "epoch": 0.59, + "learning_rate": 1.2390457068619346e-07, + "logits/chosen": -1.8810653686523438, + "logits/rejected": -2.345719337463379, + "logps/chosen": -426.538818359375, + "logps/rejected": -319.029296875, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2576731443405151, + "rewards/margins": 2.5477428436279297, + "rewards/rejected": -3.8054158687591553, + "step": 5099 + }, + { + "epoch": 0.59, + "learning_rate": 1.2386913901027518e-07, + "logits/chosen": -2.380269765853882, + "logits/rejected": -2.322206735610962, + "logps/chosen": -339.3162536621094, + "logps/rejected": -346.724609375, + "loss": 0.3273, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.554209291934967, + "rewards/margins": 2.468238592147827, + "rewards/rejected": -3.0224475860595703, + "step": 5100 + }, + { + "epoch": 0.59, + "learning_rate": 1.238337073343569e-07, + "logits/chosen": -2.410684108734131, + "logits/rejected": -2.34245228767395, + "logps/chosen": -312.950439453125, + "logps/rejected": -315.50244140625, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8407118320465088, + "rewards/margins": 3.945054531097412, + "rewards/rejected": -4.7857666015625, + "step": 5101 + }, + { + "epoch": 0.59, + "learning_rate": 1.2379827565843865e-07, + "logits/chosen": -2.69411563873291, + "logits/rejected": -2.7130560874938965, + "logps/chosen": -221.516845703125, + "logps/rejected": -260.2184753417969, + "loss": 0.36, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5093930959701538, + "rewards/margins": 1.8112454414367676, + "rewards/rejected": -2.320638656616211, + "step": 5102 + }, + { + "epoch": 0.59, + "learning_rate": 1.2376284398252037e-07, + "logits/chosen": -2.017139434814453, + "logits/rejected": -2.154606580734253, + "logps/chosen": -291.7240905761719, + "logps/rejected": -346.83135986328125, + "loss": 0.1228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3959539532661438, + "rewards/margins": 2.9272682666778564, + "rewards/rejected": -3.3232223987579346, + "step": 5103 + }, + { + "epoch": 0.59, + "learning_rate": 1.237274123066021e-07, + "logits/chosen": -2.241389751434326, + "logits/rejected": -2.1206982135772705, + "logps/chosen": -223.3714599609375, + "logps/rejected": -238.818115234375, + "loss": 0.4368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5930240750312805, + "rewards/margins": 1.6766254901885986, + "rewards/rejected": -2.2696497440338135, + "step": 5104 + }, + { + "epoch": 0.59, + "learning_rate": 1.2369198063068382e-07, + "logits/chosen": -2.1535606384277344, + "logits/rejected": -2.0892107486724854, + "logps/chosen": -302.6767883300781, + "logps/rejected": -277.99493408203125, + "loss": 0.2835, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24181517958641052, + "rewards/margins": 3.8903353214263916, + "rewards/rejected": -3.6485202312469482, + "step": 5105 + }, + { + "epoch": 0.59, + "learning_rate": 1.2365654895476554e-07, + "logits/chosen": -2.332160234451294, + "logits/rejected": -2.3231637477874756, + "logps/chosen": -197.3474578857422, + "logps/rejected": -264.1737060546875, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21212142705917358, + "rewards/margins": 3.0222043991088867, + "rewards/rejected": -3.234325647354126, + "step": 5106 + }, + { + "epoch": 0.59, + "learning_rate": 1.2362111727884726e-07, + "logits/chosen": -2.438993453979492, + "logits/rejected": -2.334308624267578, + "logps/chosen": -242.27081298828125, + "logps/rejected": -166.3047332763672, + "loss": 0.5015, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5868017673492432, + "rewards/margins": 1.2155275344848633, + "rewards/rejected": -2.8023295402526855, + "step": 5107 + }, + { + "epoch": 0.59, + "learning_rate": 1.23585685602929e-07, + "logits/chosen": -2.239373207092285, + "logits/rejected": -2.245403528213501, + "logps/chosen": -261.1809387207031, + "logps/rejected": -278.2749938964844, + "loss": 0.5542, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9882194995880127, + "rewards/margins": 0.8726952075958252, + "rewards/rejected": -2.860914707183838, + "step": 5108 + }, + { + "epoch": 0.59, + "learning_rate": 1.2355025392701073e-07, + "logits/chosen": -1.9228613376617432, + "logits/rejected": -1.977163314819336, + "logps/chosen": -350.00604248046875, + "logps/rejected": -308.2041015625, + "loss": 0.4457, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6628162860870361, + "rewards/margins": 0.7396672964096069, + "rewards/rejected": -2.4024834632873535, + "step": 5109 + }, + { + "epoch": 0.59, + "learning_rate": 1.2351482225109248e-07, + "logits/chosen": -2.5530567169189453, + "logits/rejected": -2.439129590988159, + "logps/chosen": -318.56402587890625, + "logps/rejected": -388.52862548828125, + "loss": 0.0895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9836808443069458, + "rewards/margins": 4.30989933013916, + "rewards/rejected": -5.293580055236816, + "step": 5110 + }, + { + "epoch": 0.59, + "learning_rate": 1.234793905751742e-07, + "logits/chosen": -2.426469087600708, + "logits/rejected": -2.5056188106536865, + "logps/chosen": -357.5161437988281, + "logps/rejected": -311.189697265625, + "loss": 0.5073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9797780513763428, + "rewards/margins": 1.274430274963379, + "rewards/rejected": -2.2542080879211426, + "step": 5111 + }, + { + "epoch": 0.59, + "learning_rate": 1.2344395889925593e-07, + "logits/chosen": -2.766845703125, + "logits/rejected": -2.656970739364624, + "logps/chosen": -159.90008544921875, + "logps/rejected": -202.7606964111328, + "loss": 0.5815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8860570788383484, + "rewards/margins": 2.4095001220703125, + "rewards/rejected": -3.2955570220947266, + "step": 5112 + }, + { + "epoch": 0.59, + "learning_rate": 1.2340852722333768e-07, + "logits/chosen": -1.8868474960327148, + "logits/rejected": -2.2078776359558105, + "logps/chosen": -450.057861328125, + "logps/rejected": -270.5749206542969, + "loss": 0.3439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8930110931396484, + "rewards/margins": 1.7636840343475342, + "rewards/rejected": -2.6566951274871826, + "step": 5113 + }, + { + "epoch": 0.59, + "learning_rate": 1.233730955474194e-07, + "logits/chosen": -1.9442545175552368, + "logits/rejected": -2.323054552078247, + "logps/chosen": -335.75250244140625, + "logps/rejected": -210.75537109375, + "loss": 0.1562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.488150030374527, + "rewards/margins": 3.3689723014831543, + "rewards/rejected": -3.8571221828460693, + "step": 5114 + }, + { + "epoch": 0.6, + "learning_rate": 1.2333766387150112e-07, + "logits/chosen": -2.5916597843170166, + "logits/rejected": -2.6921615600585938, + "logps/chosen": -324.7310791015625, + "logps/rejected": -280.6888122558594, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8536218404769897, + "rewards/margins": 2.0352423191070557, + "rewards/rejected": -2.888864040374756, + "step": 5115 + }, + { + "epoch": 0.6, + "learning_rate": 1.2330223219558284e-07, + "logits/chosen": -2.1391282081604004, + "logits/rejected": -2.338430166244507, + "logps/chosen": -225.7545166015625, + "logps/rejected": -315.41571044921875, + "loss": 0.5856, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2772448062896729, + "rewards/margins": 2.337618827819824, + "rewards/rejected": -3.614863395690918, + "step": 5116 + }, + { + "epoch": 0.6, + "learning_rate": 1.2326680051966456e-07, + "logits/chosen": -2.834620237350464, + "logits/rejected": -2.682323932647705, + "logps/chosen": -157.98403930664062, + "logps/rejected": -248.41929626464844, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03156479448080063, + "rewards/margins": 3.235058307647705, + "rewards/rejected": -3.203493595123291, + "step": 5117 + }, + { + "epoch": 0.6, + "learning_rate": 1.2323136884374629e-07, + "logits/chosen": -2.515840530395508, + "logits/rejected": -2.6033060550689697, + "logps/chosen": -210.25086975097656, + "logps/rejected": -170.61021423339844, + "loss": 0.4969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07243821769952774, + "rewards/margins": 1.0937703847885132, + "rewards/rejected": -1.1662086248397827, + "step": 5118 + }, + { + "epoch": 0.6, + "learning_rate": 1.2319593716782803e-07, + "logits/chosen": -2.3279213905334473, + "logits/rejected": -2.2057008743286133, + "logps/chosen": -219.17718505859375, + "logps/rejected": -266.13714599609375, + "loss": 0.3275, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9945389628410339, + "rewards/margins": 2.227848529815674, + "rewards/rejected": -3.2223877906799316, + "step": 5119 + }, + { + "epoch": 0.6, + "learning_rate": 1.2316050549190976e-07, + "logits/chosen": -2.2559127807617188, + "logits/rejected": -2.129075050354004, + "logps/chosen": -354.2698669433594, + "logps/rejected": -306.61846923828125, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2929072380065918, + "rewards/margins": 2.0850305557250977, + "rewards/rejected": -3.3779377937316895, + "step": 5120 + }, + { + "epoch": 0.6, + "learning_rate": 1.231250738159915e-07, + "logits/chosen": -2.3452606201171875, + "logits/rejected": -2.077998638153076, + "logps/chosen": -314.56597900390625, + "logps/rejected": -476.0709533691406, + "loss": 0.1039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7632959485054016, + "rewards/margins": 3.0705442428588867, + "rewards/rejected": -3.8338401317596436, + "step": 5121 + }, + { + "epoch": 0.6, + "learning_rate": 1.2308964214007323e-07, + "logits/chosen": -2.2064530849456787, + "logits/rejected": -2.591062545776367, + "logps/chosen": -342.0685119628906, + "logps/rejected": -286.3999328613281, + "loss": 0.2422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9231036901473999, + "rewards/margins": 1.549333095550537, + "rewards/rejected": -2.4724369049072266, + "step": 5122 + }, + { + "epoch": 0.6, + "learning_rate": 1.2305421046415495e-07, + "logits/chosen": -2.798582077026367, + "logits/rejected": -2.571605920791626, + "logps/chosen": -218.5767822265625, + "logps/rejected": -199.66358947753906, + "loss": 0.185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8188111186027527, + "rewards/margins": 2.7973177433013916, + "rewards/rejected": -3.616128921508789, + "step": 5123 + }, + { + "epoch": 0.6, + "learning_rate": 1.2301877878823667e-07, + "logits/chosen": -2.2091310024261475, + "logits/rejected": -2.1715455055236816, + "logps/chosen": -241.8619842529297, + "logps/rejected": -365.66448974609375, + "loss": 0.2226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9224108457565308, + "rewards/margins": 2.6204233169555664, + "rewards/rejected": -3.5428342819213867, + "step": 5124 + }, + { + "epoch": 0.6, + "learning_rate": 1.2298334711231842e-07, + "logits/chosen": -2.339036226272583, + "logits/rejected": -2.054238796234131, + "logps/chosen": -260.60540771484375, + "logps/rejected": -274.2049560546875, + "loss": 0.3287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6425458192825317, + "rewards/margins": 2.4909396171569824, + "rewards/rejected": -3.1334853172302246, + "step": 5125 + }, + { + "epoch": 0.6, + "learning_rate": 1.2294791543640014e-07, + "logits/chosen": -2.0086510181427, + "logits/rejected": -2.50942325592041, + "logps/chosen": -301.9664306640625, + "logps/rejected": -234.12249755859375, + "loss": 0.2727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33487218618392944, + "rewards/margins": 2.5505857467651367, + "rewards/rejected": -2.885457992553711, + "step": 5126 + }, + { + "epoch": 0.6, + "learning_rate": 1.2291248376048186e-07, + "logits/chosen": -2.1895875930786133, + "logits/rejected": -2.5083351135253906, + "logps/chosen": -352.14532470703125, + "logps/rejected": -148.89776611328125, + "loss": 0.3725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.339913547039032, + "rewards/margins": 1.6847015619277954, + "rewards/rejected": -2.0246152877807617, + "step": 5127 + }, + { + "epoch": 0.6, + "learning_rate": 1.2287705208456359e-07, + "logits/chosen": -1.7136218547821045, + "logits/rejected": -1.9946284294128418, + "logps/chosen": -507.454345703125, + "logps/rejected": -328.8541564941406, + "loss": 0.6494, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8290361166000366, + "rewards/margins": 2.297891139984131, + "rewards/rejected": -3.126927137374878, + "step": 5128 + }, + { + "epoch": 0.6, + "learning_rate": 1.228416204086453e-07, + "logits/chosen": -1.7071917057037354, + "logits/rejected": -2.1851866245269775, + "logps/chosen": -278.64117431640625, + "logps/rejected": -177.24632263183594, + "loss": 0.6272, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5662418603897095, + "rewards/margins": 0.5421977639198303, + "rewards/rejected": -2.1084396839141846, + "step": 5129 + }, + { + "epoch": 0.6, + "learning_rate": 1.2280618873272706e-07, + "logits/chosen": -2.3276870250701904, + "logits/rejected": -2.3523080348968506, + "logps/chosen": -220.08668518066406, + "logps/rejected": -296.5013732910156, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.418956995010376, + "rewards/margins": 1.9940530061721802, + "rewards/rejected": -2.4130098819732666, + "step": 5130 + }, + { + "epoch": 0.6, + "learning_rate": 1.2277075705680878e-07, + "logits/chosen": -2.3106372356414795, + "logits/rejected": -2.7049717903137207, + "logps/chosen": -327.0578308105469, + "logps/rejected": -191.98822021484375, + "loss": 0.4681, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9463878870010376, + "rewards/margins": 1.4810173511505127, + "rewards/rejected": -3.4274051189422607, + "step": 5131 + }, + { + "epoch": 0.6, + "learning_rate": 1.227353253808905e-07, + "logits/chosen": -2.7277674674987793, + "logits/rejected": -2.8120040893554688, + "logps/chosen": -403.78070068359375, + "logps/rejected": -338.1508483886719, + "loss": 0.2603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6966190338134766, + "rewards/margins": 1.779030442237854, + "rewards/rejected": -2.475649356842041, + "step": 5132 + }, + { + "epoch": 0.6, + "learning_rate": 1.2269989370497225e-07, + "logits/chosen": -2.506174325942993, + "logits/rejected": -2.473383903503418, + "logps/chosen": -446.2240905761719, + "logps/rejected": -369.17724609375, + "loss": 0.3913, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8596564531326294, + "rewards/margins": 1.538688063621521, + "rewards/rejected": -2.3983445167541504, + "step": 5133 + }, + { + "epoch": 0.6, + "learning_rate": 1.2266446202905397e-07, + "logits/chosen": -2.4449896812438965, + "logits/rejected": -2.3463964462280273, + "logps/chosen": -279.21490478515625, + "logps/rejected": -218.77459716796875, + "loss": 0.4912, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5418498516082764, + "rewards/margins": 1.5551363229751587, + "rewards/rejected": -3.0969860553741455, + "step": 5134 + }, + { + "epoch": 0.6, + "learning_rate": 1.226290303531357e-07, + "logits/chosen": -2.219984531402588, + "logits/rejected": -2.3935954570770264, + "logps/chosen": -318.04443359375, + "logps/rejected": -289.8975830078125, + "loss": 0.4663, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0296363830566406, + "rewards/margins": 1.558647871017456, + "rewards/rejected": -2.588284492492676, + "step": 5135 + }, + { + "epoch": 0.6, + "learning_rate": 1.2259359867721742e-07, + "logits/chosen": -2.4949724674224854, + "logits/rejected": -2.6152076721191406, + "logps/chosen": -250.11190795898438, + "logps/rejected": -212.3108367919922, + "loss": 0.5792, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8342756032943726, + "rewards/margins": 1.760995626449585, + "rewards/rejected": -2.595271587371826, + "step": 5136 + }, + { + "epoch": 0.6, + "learning_rate": 1.2255816700129916e-07, + "logits/chosen": -2.0042953491210938, + "logits/rejected": -1.8382179737091064, + "logps/chosen": -162.68508911132812, + "logps/rejected": -209.60682678222656, + "loss": 0.3356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5327480435371399, + "rewards/margins": 2.375957727432251, + "rewards/rejected": -2.908705711364746, + "step": 5137 + }, + { + "epoch": 0.6, + "learning_rate": 1.225227353253809e-07, + "logits/chosen": -2.084937572479248, + "logits/rejected": -1.8819053173065186, + "logps/chosen": -314.6112976074219, + "logps/rejected": -353.434814453125, + "loss": 0.6759, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1111476421356201, + "rewards/margins": 1.1149036884307861, + "rewards/rejected": -2.226051092147827, + "step": 5138 + }, + { + "epoch": 0.6, + "learning_rate": 1.224873036494626e-07, + "logits/chosen": -1.7486134767532349, + "logits/rejected": -2.2891416549682617, + "logps/chosen": -475.4940185546875, + "logps/rejected": -223.64321899414062, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5282847285270691, + "rewards/margins": 2.0656471252441406, + "rewards/rejected": -2.5939316749572754, + "step": 5139 + }, + { + "epoch": 0.6, + "learning_rate": 1.2245187197354433e-07, + "logits/chosen": -2.7553608417510986, + "logits/rejected": -2.8568062782287598, + "logps/chosen": -213.13465881347656, + "logps/rejected": -194.9080047607422, + "loss": 0.4253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0414444208145142, + "rewards/margins": 2.980316400527954, + "rewards/rejected": -4.021760940551758, + "step": 5140 + }, + { + "epoch": 0.6, + "learning_rate": 1.2241644029762608e-07, + "logits/chosen": -2.0805649757385254, + "logits/rejected": -2.1005215644836426, + "logps/chosen": -263.3174743652344, + "logps/rejected": -338.59869384765625, + "loss": 0.3109, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4862335920333862, + "rewards/margins": 2.7639541625976562, + "rewards/rejected": -4.250187873840332, + "step": 5141 + }, + { + "epoch": 0.6, + "learning_rate": 1.223810086217078e-07, + "logits/chosen": -2.1309316158294678, + "logits/rejected": -2.3310976028442383, + "logps/chosen": -197.66177368164062, + "logps/rejected": -265.501708984375, + "loss": 0.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6281002759933472, + "rewards/margins": 3.1413731575012207, + "rewards/rejected": -3.7694733142852783, + "step": 5142 + }, + { + "epoch": 0.6, + "learning_rate": 1.2234557694578952e-07, + "logits/chosen": -2.7278285026550293, + "logits/rejected": -2.5791237354278564, + "logps/chosen": -153.018798828125, + "logps/rejected": -188.00750732421875, + "loss": 0.1447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8224601149559021, + "rewards/margins": 2.6298556327819824, + "rewards/rejected": -3.4523158073425293, + "step": 5143 + }, + { + "epoch": 0.6, + "learning_rate": 1.2231014526987127e-07, + "logits/chosen": -2.438666343688965, + "logits/rejected": -2.2669496536254883, + "logps/chosen": -217.2213592529297, + "logps/rejected": -223.17330932617188, + "loss": 0.2411, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29590755701065063, + "rewards/margins": 1.7999703884124756, + "rewards/rejected": -1.5040628910064697, + "step": 5144 + }, + { + "epoch": 0.6, + "learning_rate": 1.22274713593953e-07, + "logits/chosen": -2.2915568351745605, + "logits/rejected": -2.361651659011841, + "logps/chosen": -323.3281555175781, + "logps/rejected": -283.2708740234375, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28033846616744995, + "rewards/margins": 2.2793521881103516, + "rewards/rejected": -2.5596907138824463, + "step": 5145 + }, + { + "epoch": 0.6, + "learning_rate": 1.2223928191803472e-07, + "logits/chosen": -2.8296799659729004, + "logits/rejected": -2.785123825073242, + "logps/chosen": -529.276611328125, + "logps/rejected": -332.7628173828125, + "loss": 0.3218, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1657543182373047, + "rewards/margins": 3.213261604309082, + "rewards/rejected": -4.379015922546387, + "step": 5146 + }, + { + "epoch": 0.6, + "learning_rate": 1.2220385024211644e-07, + "logits/chosen": -1.9739183187484741, + "logits/rejected": -1.744006633758545, + "logps/chosen": -179.52938842773438, + "logps/rejected": -307.3097839355469, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25576990842819214, + "rewards/margins": 3.3169713020324707, + "rewards/rejected": -3.5727415084838867, + "step": 5147 + }, + { + "epoch": 0.6, + "learning_rate": 1.221684185661982e-07, + "logits/chosen": -2.251734495162964, + "logits/rejected": -2.5340583324432373, + "logps/chosen": -259.9818115234375, + "logps/rejected": -270.37744140625, + "loss": 0.2404, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9523962140083313, + "rewards/margins": 2.2593045234680176, + "rewards/rejected": -3.2117011547088623, + "step": 5148 + }, + { + "epoch": 0.6, + "learning_rate": 1.221329868902799e-07, + "logits/chosen": -2.4059689044952393, + "logits/rejected": -2.4282827377319336, + "logps/chosen": -132.5740509033203, + "logps/rejected": -270.3348693847656, + "loss": 0.6176, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3161264657974243, + "rewards/margins": 2.3858261108398438, + "rewards/rejected": -3.7019524574279785, + "step": 5149 + }, + { + "epoch": 0.6, + "learning_rate": 1.2209755521436163e-07, + "logits/chosen": -2.2006330490112305, + "logits/rejected": -2.2719225883483887, + "logps/chosen": -210.8809051513672, + "logps/rejected": -195.230712890625, + "loss": 0.3965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6237286329269409, + "rewards/margins": 2.2334136962890625, + "rewards/rejected": -2.857142448425293, + "step": 5150 + }, + { + "epoch": 0.6, + "learning_rate": 1.2206212353844335e-07, + "logits/chosen": -2.324965476989746, + "logits/rejected": -2.412593126296997, + "logps/chosen": -319.0421142578125, + "logps/rejected": -341.6940002441406, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.053290314972400665, + "rewards/margins": 4.247989654541016, + "rewards/rejected": -4.194699764251709, + "step": 5151 + }, + { + "epoch": 0.6, + "learning_rate": 1.2202669186252508e-07, + "logits/chosen": -2.9868533611297607, + "logits/rejected": -2.844783306121826, + "logps/chosen": -183.92478942871094, + "logps/rejected": -183.93618774414062, + "loss": 0.4751, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1546136140823364, + "rewards/margins": 1.3894068002700806, + "rewards/rejected": -2.544020414352417, + "step": 5152 + }, + { + "epoch": 0.6, + "learning_rate": 1.2199126018660682e-07, + "logits/chosen": -2.487842082977295, + "logits/rejected": -2.0799882411956787, + "logps/chosen": -104.04010772705078, + "logps/rejected": -201.80337524414062, + "loss": 0.4485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8131652474403381, + "rewards/margins": 1.4487608671188354, + "rewards/rejected": -2.2619261741638184, + "step": 5153 + }, + { + "epoch": 0.6, + "learning_rate": 1.2195582851068855e-07, + "logits/chosen": -2.5003345012664795, + "logits/rejected": -2.3973002433776855, + "logps/chosen": -309.2982482910156, + "logps/rejected": -252.57940673828125, + "loss": 0.4983, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9559922218322754, + "rewards/margins": 1.5331485271453857, + "rewards/rejected": -3.489140510559082, + "step": 5154 + }, + { + "epoch": 0.6, + "learning_rate": 1.219203968347703e-07, + "logits/chosen": -2.2191805839538574, + "logits/rejected": -2.3995485305786133, + "logps/chosen": -262.9721984863281, + "logps/rejected": -234.10498046875, + "loss": 0.1613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.953752338886261, + "rewards/margins": 2.3778343200683594, + "rewards/rejected": -3.3315868377685547, + "step": 5155 + }, + { + "epoch": 0.6, + "learning_rate": 1.2188496515885202e-07, + "logits/chosen": -2.3358514308929443, + "logits/rejected": -2.3938379287719727, + "logps/chosen": -221.9315948486328, + "logps/rejected": -218.56625366210938, + "loss": 0.1779, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2708457708358765, + "rewards/margins": 2.123467445373535, + "rewards/rejected": -3.3943135738372803, + "step": 5156 + }, + { + "epoch": 0.6, + "learning_rate": 1.2184953348293374e-07, + "logits/chosen": -2.707411289215088, + "logits/rejected": -2.5543408393859863, + "logps/chosen": -164.876953125, + "logps/rejected": -199.887451171875, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4612798690795898, + "rewards/margins": 2.03262996673584, + "rewards/rejected": -3.4939098358154297, + "step": 5157 + }, + { + "epoch": 0.6, + "learning_rate": 1.2181410180701546e-07, + "logits/chosen": -1.7999128103256226, + "logits/rejected": -1.9087313413619995, + "logps/chosen": -407.53643798828125, + "logps/rejected": -379.4078674316406, + "loss": 1.3412, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8983010053634644, + "rewards/margins": -0.7385311126708984, + "rewards/rejected": -1.1597700119018555, + "step": 5158 + }, + { + "epoch": 0.6, + "learning_rate": 1.2177867013109718e-07, + "logits/chosen": -2.223919630050659, + "logits/rejected": -2.1439318656921387, + "logps/chosen": -355.631103515625, + "logps/rejected": -359.44580078125, + "loss": 0.1598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6269824504852295, + "rewards/margins": 3.4270691871643066, + "rewards/rejected": -4.054051876068115, + "step": 5159 + }, + { + "epoch": 0.6, + "learning_rate": 1.2174323845517893e-07, + "logits/chosen": -2.5781381130218506, + "logits/rejected": -2.576977252960205, + "logps/chosen": -70.28850555419922, + "logps/rejected": -136.47933959960938, + "loss": 0.5058, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5026009678840637, + "rewards/margins": 2.0700178146362305, + "rewards/rejected": -2.5726184844970703, + "step": 5160 + }, + { + "epoch": 0.6, + "learning_rate": 1.2170780677926065e-07, + "logits/chosen": -2.138638496398926, + "logits/rejected": -2.0703682899475098, + "logps/chosen": -300.4554748535156, + "logps/rejected": -322.74853515625, + "loss": 0.5386, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48631101846694946, + "rewards/margins": 1.663435697555542, + "rewards/rejected": -2.1497466564178467, + "step": 5161 + }, + { + "epoch": 0.6, + "learning_rate": 1.2167237510334238e-07, + "logits/chosen": -2.5677742958068848, + "logits/rejected": -2.5401296615600586, + "logps/chosen": -165.1081085205078, + "logps/rejected": -186.43771362304688, + "loss": 0.2137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8036798238754272, + "rewards/margins": 1.856041431427002, + "rewards/rejected": -2.6597213745117188, + "step": 5162 + }, + { + "epoch": 0.6, + "learning_rate": 1.216369434274241e-07, + "logits/chosen": -2.3227591514587402, + "logits/rejected": -2.3879740238189697, + "logps/chosen": -349.0043029785156, + "logps/rejected": -313.2303466796875, + "loss": 0.2557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5349323153495789, + "rewards/margins": 2.2743756771087646, + "rewards/rejected": -2.809307813644409, + "step": 5163 + }, + { + "epoch": 0.6, + "learning_rate": 1.2160151175150585e-07, + "logits/chosen": -2.1383869647979736, + "logits/rejected": -2.2058053016662598, + "logps/chosen": -156.2996063232422, + "logps/rejected": -216.99822998046875, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3586556911468506, + "rewards/margins": 3.022068500518799, + "rewards/rejected": -3.3807241916656494, + "step": 5164 + }, + { + "epoch": 0.6, + "learning_rate": 1.2156608007558757e-07, + "logits/chosen": -1.791621446609497, + "logits/rejected": -1.9342632293701172, + "logps/chosen": -347.428466796875, + "logps/rejected": -304.0482177734375, + "loss": 0.67, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0351691246032715, + "rewards/margins": 0.9751028418540955, + "rewards/rejected": -2.0102720260620117, + "step": 5165 + }, + { + "epoch": 0.6, + "learning_rate": 1.2153064839966932e-07, + "logits/chosen": -1.8173198699951172, + "logits/rejected": -2.090070962905884, + "logps/chosen": -452.77154541015625, + "logps/rejected": -263.6712341308594, + "loss": 0.5891, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.331755518913269, + "rewards/margins": 0.7883865237236023, + "rewards/rejected": -2.1201422214508057, + "step": 5166 + }, + { + "epoch": 0.6, + "learning_rate": 1.2149521672375104e-07, + "logits/chosen": -2.16621470451355, + "logits/rejected": -1.9819211959838867, + "logps/chosen": -343.8999328613281, + "logps/rejected": -258.97271728515625, + "loss": 0.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.405301809310913, + "rewards/margins": 1.1323615312576294, + "rewards/rejected": -2.537663221359253, + "step": 5167 + }, + { + "epoch": 0.6, + "learning_rate": 1.2145978504783276e-07, + "logits/chosen": -1.9440858364105225, + "logits/rejected": -2.302837610244751, + "logps/chosen": -394.0356750488281, + "logps/rejected": -283.26116943359375, + "loss": 0.6152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9935222268104553, + "rewards/margins": 1.1876939535140991, + "rewards/rejected": -2.181216239929199, + "step": 5168 + }, + { + "epoch": 0.6, + "learning_rate": 1.2142435337191448e-07, + "logits/chosen": -2.5852365493774414, + "logits/rejected": -2.6824610233306885, + "logps/chosen": -295.34490966796875, + "logps/rejected": -241.04922485351562, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1216545104980469, + "rewards/margins": 1.8454992771148682, + "rewards/rejected": -2.967153787612915, + "step": 5169 + }, + { + "epoch": 0.6, + "learning_rate": 1.213889216959962e-07, + "logits/chosen": -2.3654069900512695, + "logits/rejected": -2.5773026943206787, + "logps/chosen": -331.6851806640625, + "logps/rejected": -270.00738525390625, + "loss": 0.1689, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13827908039093018, + "rewards/margins": 2.8870933055877686, + "rewards/rejected": -3.025372266769409, + "step": 5170 + }, + { + "epoch": 0.6, + "learning_rate": 1.2135349002007793e-07, + "logits/chosen": -1.98057222366333, + "logits/rejected": -1.5026447772979736, + "logps/chosen": -305.1203918457031, + "logps/rejected": -406.29541015625, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9541860222816467, + "rewards/margins": 1.725627064704895, + "rewards/rejected": -2.6798131465911865, + "step": 5171 + }, + { + "epoch": 0.6, + "learning_rate": 1.2131805834415968e-07, + "logits/chosen": -2.2253262996673584, + "logits/rejected": -2.284430503845215, + "logps/chosen": -262.65576171875, + "logps/rejected": -226.43206787109375, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5622559785842896, + "rewards/margins": 2.163120985031128, + "rewards/rejected": -2.725377082824707, + "step": 5172 + }, + { + "epoch": 0.6, + "learning_rate": 1.212826266682414e-07, + "logits/chosen": -2.8781540393829346, + "logits/rejected": -2.610921621322632, + "logps/chosen": -379.34783935546875, + "logps/rejected": -342.66961669921875, + "loss": 0.234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2945258617401123, + "rewards/margins": 2.5202436447143555, + "rewards/rejected": -2.8147695064544678, + "step": 5173 + }, + { + "epoch": 0.6, + "learning_rate": 1.2124719499232312e-07, + "logits/chosen": -1.9700376987457275, + "logits/rejected": -2.028470039367676, + "logps/chosen": -399.7346496582031, + "logps/rejected": -259.3099365234375, + "loss": 0.4372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6947767734527588, + "rewards/margins": 1.506657600402832, + "rewards/rejected": -2.20143461227417, + "step": 5174 + }, + { + "epoch": 0.6, + "learning_rate": 1.2121176331640487e-07, + "logits/chosen": -2.315840721130371, + "logits/rejected": -2.1478283405303955, + "logps/chosen": -220.9124755859375, + "logps/rejected": -265.1065979003906, + "loss": 0.3572, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.022269368171692, + "rewards/margins": 2.4682207107543945, + "rewards/rejected": -3.490490198135376, + "step": 5175 + }, + { + "epoch": 0.6, + "learning_rate": 1.211763316404866e-07, + "logits/chosen": -2.0749850273132324, + "logits/rejected": -2.399956226348877, + "logps/chosen": -643.225830078125, + "logps/rejected": -465.65826416015625, + "loss": 0.2581, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0667566061019897, + "rewards/margins": 3.2370879650115967, + "rewards/rejected": -4.303844451904297, + "step": 5176 + }, + { + "epoch": 0.6, + "learning_rate": 1.2114089996456831e-07, + "logits/chosen": -2.569577217102051, + "logits/rejected": -2.4923605918884277, + "logps/chosen": -197.9288330078125, + "logps/rejected": -171.02896118164062, + "loss": 0.472, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3524514138698578, + "rewards/margins": 0.7974086999893188, + "rewards/rejected": -1.149860143661499, + "step": 5177 + }, + { + "epoch": 0.6, + "learning_rate": 1.2110546828865006e-07, + "logits/chosen": -2.1022799015045166, + "logits/rejected": -2.0434725284576416, + "logps/chosen": -528.9983520507812, + "logps/rejected": -302.42633056640625, + "loss": 0.1902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20870739221572876, + "rewards/margins": 3.252781629562378, + "rewards/rejected": -3.461488723754883, + "step": 5178 + }, + { + "epoch": 0.6, + "learning_rate": 1.2107003661273179e-07, + "logits/chosen": -2.251540422439575, + "logits/rejected": -2.0910701751708984, + "logps/chosen": -204.24888610839844, + "logps/rejected": -243.0397186279297, + "loss": 0.4986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9263660907745361, + "rewards/margins": 2.4848403930664062, + "rewards/rejected": -3.4112062454223633, + "step": 5179 + }, + { + "epoch": 0.6, + "learning_rate": 1.210346049368135e-07, + "logits/chosen": -2.4439024925231934, + "logits/rejected": -2.450456142425537, + "logps/chosen": -248.8252716064453, + "logps/rejected": -215.6868896484375, + "loss": 0.2755, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7359672784805298, + "rewards/margins": 2.913820266723633, + "rewards/rejected": -3.6497879028320312, + "step": 5180 + }, + { + "epoch": 0.6, + "learning_rate": 1.2099917326089523e-07, + "logits/chosen": -2.243943214416504, + "logits/rejected": -2.1298763751983643, + "logps/chosen": -183.43824768066406, + "logps/rejected": -205.67611694335938, + "loss": 0.3913, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6132287383079529, + "rewards/margins": 1.9868220090866089, + "rewards/rejected": -2.600050926208496, + "step": 5181 + }, + { + "epoch": 0.6, + "learning_rate": 1.2096374158497695e-07, + "logits/chosen": -1.9042142629623413, + "logits/rejected": -1.5784499645233154, + "logps/chosen": -272.8074645996094, + "logps/rejected": -383.36474609375, + "loss": 0.6092, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.789008617401123, + "rewards/margins": 0.6373796463012695, + "rewards/rejected": -1.4263882637023926, + "step": 5182 + }, + { + "epoch": 0.6, + "learning_rate": 1.209283099090587e-07, + "logits/chosen": -2.452303171157837, + "logits/rejected": -2.2016170024871826, + "logps/chosen": -355.0323181152344, + "logps/rejected": -339.13128662109375, + "loss": 0.1151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21731846034526825, + "rewards/margins": 2.855644702911377, + "rewards/rejected": -3.072962999343872, + "step": 5183 + }, + { + "epoch": 0.6, + "learning_rate": 1.2089287823314042e-07, + "logits/chosen": -2.503416061401367, + "logits/rejected": -2.621192455291748, + "logps/chosen": -436.8119201660156, + "logps/rejected": -177.7572479248047, + "loss": 0.493, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9297093749046326, + "rewards/margins": 0.5880588293075562, + "rewards/rejected": -1.517768144607544, + "step": 5184 + }, + { + "epoch": 0.6, + "learning_rate": 1.2085744655722214e-07, + "logits/chosen": -2.1680829524993896, + "logits/rejected": -2.223102569580078, + "logps/chosen": -310.1890869140625, + "logps/rejected": -204.76429748535156, + "loss": 0.3985, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7580963373184204, + "rewards/margins": 1.1790990829467773, + "rewards/rejected": -1.9371954202651978, + "step": 5185 + }, + { + "epoch": 0.6, + "learning_rate": 1.2082201488130387e-07, + "logits/chosen": -2.4249343872070312, + "logits/rejected": -2.531165838241577, + "logps/chosen": -264.2085266113281, + "logps/rejected": -270.48541259765625, + "loss": 0.6262, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.496390461921692, + "rewards/margins": 1.8428950309753418, + "rewards/rejected": -3.3392858505249023, + "step": 5186 + }, + { + "epoch": 0.6, + "learning_rate": 1.2078658320538562e-07, + "logits/chosen": -2.6830625534057617, + "logits/rejected": -2.6153247356414795, + "logps/chosen": -295.1492919921875, + "logps/rejected": -180.11886596679688, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0302248001098633, + "rewards/margins": 2.0213284492492676, + "rewards/rejected": -3.0515530109405518, + "step": 5187 + }, + { + "epoch": 0.6, + "learning_rate": 1.2075115152946734e-07, + "logits/chosen": -2.1325323581695557, + "logits/rejected": -2.2552058696746826, + "logps/chosen": -174.17930603027344, + "logps/rejected": -190.2611846923828, + "loss": 0.5903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2608143091201782, + "rewards/margins": 1.8889987468719482, + "rewards/rejected": -3.149813175201416, + "step": 5188 + }, + { + "epoch": 0.6, + "learning_rate": 1.2071571985354909e-07, + "logits/chosen": -2.1447079181671143, + "logits/rejected": -2.4205920696258545, + "logps/chosen": -328.7030334472656, + "logps/rejected": -189.96087646484375, + "loss": 0.441, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5200481414794922, + "rewards/margins": 1.9817452430725098, + "rewards/rejected": -2.501793384552002, + "step": 5189 + }, + { + "epoch": 0.6, + "learning_rate": 1.206802881776308e-07, + "logits/chosen": -2.840146541595459, + "logits/rejected": -2.8637099266052246, + "logps/chosen": -145.47161865234375, + "logps/rejected": -233.88980102539062, + "loss": 0.4143, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1663498878479004, + "rewards/margins": 4.163477420806885, + "rewards/rejected": -5.329827308654785, + "step": 5190 + }, + { + "epoch": 0.6, + "learning_rate": 1.2064485650171253e-07, + "logits/chosen": -2.148712635040283, + "logits/rejected": -2.5300376415252686, + "logps/chosen": -285.4311828613281, + "logps/rejected": -205.98638916015625, + "loss": 0.3611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8600549697875977, + "rewards/margins": 2.062434434890747, + "rewards/rejected": -2.9224894046783447, + "step": 5191 + }, + { + "epoch": 0.6, + "learning_rate": 1.2060942482579425e-07, + "logits/chosen": -2.138774871826172, + "logits/rejected": -2.0490779876708984, + "logps/chosen": -215.04754638671875, + "logps/rejected": -260.30718994140625, + "loss": 0.3514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3328709006309509, + "rewards/margins": 1.851085901260376, + "rewards/rejected": -2.1839566230773926, + "step": 5192 + }, + { + "epoch": 0.6, + "learning_rate": 1.2057399314987597e-07, + "logits/chosen": -2.6361491680145264, + "logits/rejected": -2.4686880111694336, + "logps/chosen": -226.85906982421875, + "logps/rejected": -348.58514404296875, + "loss": 0.5946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7174818515777588, + "rewards/margins": 1.592439889907837, + "rewards/rejected": -2.3099217414855957, + "step": 5193 + }, + { + "epoch": 0.6, + "learning_rate": 1.205385614739577e-07, + "logits/chosen": -2.2250266075134277, + "logits/rejected": -2.1893231868743896, + "logps/chosen": -188.78280639648438, + "logps/rejected": -262.77947998046875, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2067149877548218, + "rewards/margins": 1.190011978149414, + "rewards/rejected": -2.3967268466949463, + "step": 5194 + }, + { + "epoch": 0.6, + "learning_rate": 1.2050312979803945e-07, + "logits/chosen": -2.6227099895477295, + "logits/rejected": -2.554823875427246, + "logps/chosen": -171.34768676757812, + "logps/rejected": -264.05963134765625, + "loss": 0.7432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9436321258544922, + "rewards/margins": 1.6460413932800293, + "rewards/rejected": -2.5896735191345215, + "step": 5195 + }, + { + "epoch": 0.6, + "learning_rate": 1.2046769812212117e-07, + "logits/chosen": -1.9634126424789429, + "logits/rejected": -2.0565199851989746, + "logps/chosen": -320.5899658203125, + "logps/rejected": -301.64447021484375, + "loss": 0.2464, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.125593662261963, + "rewards/margins": 2.3610596656799316, + "rewards/rejected": -3.4866530895233154, + "step": 5196 + }, + { + "epoch": 0.6, + "learning_rate": 1.204322664462029e-07, + "logits/chosen": -2.60831880569458, + "logits/rejected": -2.817276954650879, + "logps/chosen": -182.84112548828125, + "logps/rejected": -207.76422119140625, + "loss": 0.4499, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9005789160728455, + "rewards/margins": 1.7930991649627686, + "rewards/rejected": -2.693678140640259, + "step": 5197 + }, + { + "epoch": 0.6, + "learning_rate": 1.2039683477028464e-07, + "logits/chosen": -2.356882095336914, + "logits/rejected": -2.6087868213653564, + "logps/chosen": -250.39501953125, + "logps/rejected": -182.9837188720703, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9269205331802368, + "rewards/margins": 2.7070443630218506, + "rewards/rejected": -3.633965015411377, + "step": 5198 + }, + { + "epoch": 0.6, + "learning_rate": 1.2036140309436636e-07, + "logits/chosen": -2.054184675216675, + "logits/rejected": -2.367687463760376, + "logps/chosen": -417.39190673828125, + "logps/rejected": -318.114990234375, + "loss": 0.6066, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9740835428237915, + "rewards/margins": 0.8706832528114319, + "rewards/rejected": -1.8447667360305786, + "step": 5199 + }, + { + "epoch": 0.6, + "learning_rate": 1.2032597141844808e-07, + "logits/chosen": -1.9945340156555176, + "logits/rejected": -1.6193487644195557, + "logps/chosen": -334.8529052734375, + "logps/rejected": -464.32940673828125, + "loss": 0.1595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13162831962108612, + "rewards/margins": 2.6609864234924316, + "rewards/rejected": -2.792614698410034, + "step": 5200 + }, + { + "epoch": 0.61, + "learning_rate": 1.2029053974252983e-07, + "logits/chosen": -2.262388229370117, + "logits/rejected": -2.6312084197998047, + "logps/chosen": -342.1814270019531, + "logps/rejected": -281.8554382324219, + "loss": 0.5054, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0466265678405762, + "rewards/margins": 3.318283796310425, + "rewards/rejected": -4.364910125732422, + "step": 5201 + }, + { + "epoch": 0.61, + "learning_rate": 1.2025510806661155e-07, + "logits/chosen": -2.728365182876587, + "logits/rejected": -2.6839518547058105, + "logps/chosen": -205.17190551757812, + "logps/rejected": -205.14405822753906, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2852359116077423, + "rewards/margins": 3.2778968811035156, + "rewards/rejected": -3.5631327629089355, + "step": 5202 + }, + { + "epoch": 0.61, + "learning_rate": 1.2021967639069328e-07, + "logits/chosen": -2.496018886566162, + "logits/rejected": -2.448404312133789, + "logps/chosen": -232.7915802001953, + "logps/rejected": -225.1036376953125, + "loss": 0.4538, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7520129680633545, + "rewards/margins": 1.928113579750061, + "rewards/rejected": -2.680126428604126, + "step": 5203 + }, + { + "epoch": 0.61, + "learning_rate": 1.20184244714775e-07, + "logits/chosen": -1.822451114654541, + "logits/rejected": -2.1021296977996826, + "logps/chosen": -327.4976806640625, + "logps/rejected": -192.91050720214844, + "loss": 0.7255, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9814808368682861, + "rewards/margins": 0.8993427753448486, + "rewards/rejected": -1.8808236122131348, + "step": 5204 + }, + { + "epoch": 0.61, + "learning_rate": 1.2014881303885672e-07, + "logits/chosen": -2.415126323699951, + "logits/rejected": -2.2385666370391846, + "logps/chosen": -423.7177734375, + "logps/rejected": -362.6585693359375, + "loss": 0.4231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7231408357620239, + "rewards/margins": 2.2093112468719482, + "rewards/rejected": -2.9324522018432617, + "step": 5205 + }, + { + "epoch": 0.61, + "learning_rate": 1.2011338136293844e-07, + "logits/chosen": -2.4968698024749756, + "logits/rejected": -2.541084051132202, + "logps/chosen": -435.7365417480469, + "logps/rejected": -429.32476806640625, + "loss": 0.4976, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.930698037147522, + "rewards/margins": 2.9863271713256836, + "rewards/rejected": -3.917025089263916, + "step": 5206 + }, + { + "epoch": 0.61, + "learning_rate": 1.200779496870202e-07, + "logits/chosen": -2.710352659225464, + "logits/rejected": -2.6802237033843994, + "logps/chosen": -287.5000305175781, + "logps/rejected": -210.6776123046875, + "loss": 0.5631, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.449428528547287, + "rewards/margins": 1.4352753162384033, + "rewards/rejected": -1.8847038745880127, + "step": 5207 + }, + { + "epoch": 0.61, + "learning_rate": 1.200425180111019e-07, + "logits/chosen": -3.0040903091430664, + "logits/rejected": -2.9331958293914795, + "logps/chosen": -182.52853393554688, + "logps/rejected": -195.39010620117188, + "loss": 0.3667, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9012411832809448, + "rewards/margins": 2.3392202854156494, + "rewards/rejected": -3.2404613494873047, + "step": 5208 + }, + { + "epoch": 0.61, + "learning_rate": 1.2000708633518366e-07, + "logits/chosen": -2.3674371242523193, + "logits/rejected": -2.2761852741241455, + "logps/chosen": -159.58273315429688, + "logps/rejected": -285.1098937988281, + "loss": 0.1553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5408338904380798, + "rewards/margins": 4.506514072418213, + "rewards/rejected": -5.0473480224609375, + "step": 5209 + }, + { + "epoch": 0.61, + "learning_rate": 1.1997165465926538e-07, + "logits/chosen": -2.3298394680023193, + "logits/rejected": -2.3637166023254395, + "logps/chosen": -214.53109741210938, + "logps/rejected": -160.7362060546875, + "loss": 0.3904, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.010250210762024, + "rewards/margins": 1.2797260284423828, + "rewards/rejected": -2.2899763584136963, + "step": 5210 + }, + { + "epoch": 0.61, + "learning_rate": 1.199362229833471e-07, + "logits/chosen": -2.3432834148406982, + "logits/rejected": -2.530444622039795, + "logps/chosen": -189.16932678222656, + "logps/rejected": -195.637451171875, + "loss": 1.2276, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7863121032714844, + "rewards/margins": 0.4323478937149048, + "rewards/rejected": -2.2186601161956787, + "step": 5211 + }, + { + "epoch": 0.61, + "learning_rate": 1.1990079130742883e-07, + "logits/chosen": -2.657667636871338, + "logits/rejected": -2.3885087966918945, + "logps/chosen": -231.29000854492188, + "logps/rejected": -212.68896484375, + "loss": 0.2373, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2781105041503906, + "rewards/margins": 2.279245138168335, + "rewards/rejected": -3.5573554039001465, + "step": 5212 + }, + { + "epoch": 0.61, + "learning_rate": 1.1986535963151058e-07, + "logits/chosen": -2.538577079772949, + "logits/rejected": -2.4338741302490234, + "logps/chosen": -370.89520263671875, + "logps/rejected": -400.186279296875, + "loss": 0.2402, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8703994750976562, + "rewards/margins": 3.467846393585205, + "rewards/rejected": -4.3382463455200195, + "step": 5213 + }, + { + "epoch": 0.61, + "learning_rate": 1.198299279555923e-07, + "logits/chosen": -2.149869680404663, + "logits/rejected": -2.3282487392425537, + "logps/chosen": -309.484375, + "logps/rejected": -193.91026306152344, + "loss": 0.7639, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2988252639770508, + "rewards/margins": 0.3297821581363678, + "rewards/rejected": -1.6286073923110962, + "step": 5214 + }, + { + "epoch": 0.61, + "learning_rate": 1.1979449627967402e-07, + "logits/chosen": -2.143472194671631, + "logits/rejected": -2.3911712169647217, + "logps/chosen": -236.49417114257812, + "logps/rejected": -196.71041870117188, + "loss": 0.3835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4488144516944885, + "rewards/margins": 2.5874552726745605, + "rewards/rejected": -3.0362696647644043, + "step": 5215 + }, + { + "epoch": 0.61, + "learning_rate": 1.1975906460375574e-07, + "logits/chosen": -2.7759060859680176, + "logits/rejected": -2.816784381866455, + "logps/chosen": -131.42872619628906, + "logps/rejected": -189.11538696289062, + "loss": 0.1664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5083937048912048, + "rewards/margins": 3.1335415840148926, + "rewards/rejected": -3.6419355869293213, + "step": 5216 + }, + { + "epoch": 0.61, + "learning_rate": 1.1972363292783746e-07, + "logits/chosen": -2.3794891834259033, + "logits/rejected": -2.2853145599365234, + "logps/chosen": -223.5705108642578, + "logps/rejected": -220.18338012695312, + "loss": 0.4644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15381933748722076, + "rewards/margins": 1.4701015949249268, + "rewards/rejected": -1.6239209175109863, + "step": 5217 + }, + { + "epoch": 0.61, + "learning_rate": 1.196882012519192e-07, + "logits/chosen": -2.3720836639404297, + "logits/rejected": -2.232619524002075, + "logps/chosen": -233.04229736328125, + "logps/rejected": -313.33953857421875, + "loss": 0.555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6587268710136414, + "rewards/margins": 1.468065857887268, + "rewards/rejected": -2.1267926692962646, + "step": 5218 + }, + { + "epoch": 0.61, + "learning_rate": 1.1965276957600093e-07, + "logits/chosen": -2.04050612449646, + "logits/rejected": -2.323744535446167, + "logps/chosen": -202.7533416748047, + "logps/rejected": -183.7977752685547, + "loss": 0.7175, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.008948802947998, + "rewards/margins": 1.5431543588638306, + "rewards/rejected": -2.552103042602539, + "step": 5219 + }, + { + "epoch": 0.61, + "learning_rate": 1.1961733790008268e-07, + "logits/chosen": -2.806553840637207, + "logits/rejected": -2.6314992904663086, + "logps/chosen": -228.44308471679688, + "logps/rejected": -172.6291046142578, + "loss": 0.1837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5104161500930786, + "rewards/margins": 2.000979423522949, + "rewards/rejected": -2.5113954544067383, + "step": 5220 + }, + { + "epoch": 0.61, + "learning_rate": 1.195819062241644e-07, + "logits/chosen": -2.5998787879943848, + "logits/rejected": -2.66147518157959, + "logps/chosen": -208.212158203125, + "logps/rejected": -166.71063232421875, + "loss": 0.217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8014048933982849, + "rewards/margins": 1.714307427406311, + "rewards/rejected": -2.5157124996185303, + "step": 5221 + }, + { + "epoch": 0.61, + "learning_rate": 1.1954647454824613e-07, + "logits/chosen": -2.4281039237976074, + "logits/rejected": -2.5322682857513428, + "logps/chosen": -108.86763763427734, + "logps/rejected": -232.72381591796875, + "loss": 0.4878, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0086349248886108, + "rewards/margins": 2.048588752746582, + "rewards/rejected": -3.0572235584259033, + "step": 5222 + }, + { + "epoch": 0.61, + "learning_rate": 1.1951104287232785e-07, + "logits/chosen": -2.0509934425354004, + "logits/rejected": -2.138908863067627, + "logps/chosen": -198.68914794921875, + "logps/rejected": -257.7431945800781, + "loss": 0.5984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.283691644668579, + "rewards/margins": 1.7401546239852905, + "rewards/rejected": -3.023846387863159, + "step": 5223 + }, + { + "epoch": 0.61, + "learning_rate": 1.194756111964096e-07, + "logits/chosen": -2.2003681659698486, + "logits/rejected": -1.9633464813232422, + "logps/chosen": -236.55325317382812, + "logps/rejected": -340.1285400390625, + "loss": 0.3148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.757728099822998, + "rewards/margins": 3.0098705291748047, + "rewards/rejected": -3.7675986289978027, + "step": 5224 + }, + { + "epoch": 0.61, + "learning_rate": 1.1944017952049132e-07, + "logits/chosen": -2.143829107284546, + "logits/rejected": -2.2712697982788086, + "logps/chosen": -323.12054443359375, + "logps/rejected": -245.2421112060547, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4978917837142944, + "rewards/margins": 1.7034579515457153, + "rewards/rejected": -3.2013497352600098, + "step": 5225 + }, + { + "epoch": 0.61, + "learning_rate": 1.1940474784457304e-07, + "logits/chosen": -1.8851680755615234, + "logits/rejected": -2.0490682125091553, + "logps/chosen": -361.4912414550781, + "logps/rejected": -319.59429931640625, + "loss": 0.2101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6845676898956299, + "rewards/margins": 3.6948163509368896, + "rewards/rejected": -4.3793840408325195, + "step": 5226 + }, + { + "epoch": 0.61, + "learning_rate": 1.1936931616865476e-07, + "logits/chosen": -1.7843022346496582, + "logits/rejected": -2.1901628971099854, + "logps/chosen": -405.79974365234375, + "logps/rejected": -239.50567626953125, + "loss": 0.415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6462787985801697, + "rewards/margins": 2.27803373336792, + "rewards/rejected": -2.9243125915527344, + "step": 5227 + }, + { + "epoch": 0.61, + "learning_rate": 1.193338844927365e-07, + "logits/chosen": -2.101116418838501, + "logits/rejected": -2.230097532272339, + "logps/chosen": -385.493896484375, + "logps/rejected": -392.6390075683594, + "loss": 0.4562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9440162777900696, + "rewards/margins": 1.1745153665542603, + "rewards/rejected": -2.1185317039489746, + "step": 5228 + }, + { + "epoch": 0.61, + "learning_rate": 1.1929845281681824e-07, + "logits/chosen": -2.578500747680664, + "logits/rejected": -2.8369140625, + "logps/chosen": -295.1000671386719, + "logps/rejected": -178.70208740234375, + "loss": 0.4898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.14497709274292, + "rewards/margins": 2.224788188934326, + "rewards/rejected": -3.369765520095825, + "step": 5229 + }, + { + "epoch": 0.61, + "learning_rate": 1.1926302114089996e-07, + "logits/chosen": -2.27799654006958, + "logits/rejected": -2.21203351020813, + "logps/chosen": -251.28021240234375, + "logps/rejected": -311.6126403808594, + "loss": 0.603, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.18759024143219, + "rewards/margins": 0.6488876342773438, + "rewards/rejected": -1.8364778757095337, + "step": 5230 + }, + { + "epoch": 0.61, + "learning_rate": 1.1922758946498168e-07, + "logits/chosen": -2.394235849380493, + "logits/rejected": -2.4152748584747314, + "logps/chosen": -255.47183227539062, + "logps/rejected": -313.38677978515625, + "loss": 0.5667, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.433862566947937, + "rewards/margins": 1.1317811012268066, + "rewards/rejected": -2.565643787384033, + "step": 5231 + }, + { + "epoch": 0.61, + "learning_rate": 1.1919215778906342e-07, + "logits/chosen": -2.4488236904144287, + "logits/rejected": -2.5489354133605957, + "logps/chosen": -115.35789489746094, + "logps/rejected": -170.82884216308594, + "loss": 0.8249, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0399065017700195, + "rewards/margins": 1.2633881568908691, + "rewards/rejected": -3.3032946586608887, + "step": 5232 + }, + { + "epoch": 0.61, + "learning_rate": 1.1915672611314515e-07, + "logits/chosen": -2.3067777156829834, + "logits/rejected": -2.19870662689209, + "logps/chosen": -204.93234252929688, + "logps/rejected": -291.32086181640625, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7462971210479736, + "rewards/margins": 2.242022752761841, + "rewards/rejected": -3.9883198738098145, + "step": 5233 + }, + { + "epoch": 0.61, + "learning_rate": 1.1912129443722687e-07, + "logits/chosen": -2.205462694168091, + "logits/rejected": -2.445732593536377, + "logps/chosen": -392.052001953125, + "logps/rejected": -245.5328826904297, + "loss": 0.2757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6796283721923828, + "rewards/margins": 1.7002649307250977, + "rewards/rejected": -2.3798933029174805, + "step": 5234 + }, + { + "epoch": 0.61, + "learning_rate": 1.190858627613086e-07, + "logits/chosen": -2.9344794750213623, + "logits/rejected": -2.6877963542938232, + "logps/chosen": -308.64727783203125, + "logps/rejected": -244.89910888671875, + "loss": 0.8133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8858540058135986, + "rewards/margins": 1.695253849029541, + "rewards/rejected": -2.5811080932617188, + "step": 5235 + }, + { + "epoch": 0.61, + "learning_rate": 1.1905043108539034e-07, + "logits/chosen": -2.739502191543579, + "logits/rejected": -2.7095513343811035, + "logps/chosen": -219.41241455078125, + "logps/rejected": -220.99903869628906, + "loss": 0.2125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6793192625045776, + "rewards/margins": 2.1359171867370605, + "rewards/rejected": -2.815236806869507, + "step": 5236 + }, + { + "epoch": 0.61, + "learning_rate": 1.1901499940947207e-07, + "logits/chosen": -2.078427791595459, + "logits/rejected": -2.3817410469055176, + "logps/chosen": -263.18310546875, + "logps/rejected": -178.794677734375, + "loss": 0.403, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7582724094390869, + "rewards/margins": 1.5949277877807617, + "rewards/rejected": -2.3532001972198486, + "step": 5237 + }, + { + "epoch": 0.61, + "learning_rate": 1.1897956773355379e-07, + "logits/chosen": -1.7767460346221924, + "logits/rejected": -1.7624926567077637, + "logps/chosen": -347.53662109375, + "logps/rejected": -288.1228332519531, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6475794911384583, + "rewards/margins": 1.3134269714355469, + "rewards/rejected": -1.96100652217865, + "step": 5238 + }, + { + "epoch": 0.61, + "learning_rate": 1.1894413605763552e-07, + "logits/chosen": -2.9477145671844482, + "logits/rejected": -2.900980234146118, + "logps/chosen": -164.41928100585938, + "logps/rejected": -148.6708526611328, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7536442875862122, + "rewards/margins": 1.8062372207641602, + "rewards/rejected": -2.5598816871643066, + "step": 5239 + }, + { + "epoch": 0.61, + "learning_rate": 1.1890870438171725e-07, + "logits/chosen": -2.2992303371429443, + "logits/rejected": -2.37612247467041, + "logps/chosen": -432.0656433105469, + "logps/rejected": -209.2398681640625, + "loss": 0.3639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9445242881774902, + "rewards/margins": 1.641132116317749, + "rewards/rejected": -2.5856566429138184, + "step": 5240 + }, + { + "epoch": 0.61, + "learning_rate": 1.1887327270579897e-07, + "logits/chosen": -2.08923077583313, + "logits/rejected": -2.3481321334838867, + "logps/chosen": -402.72210693359375, + "logps/rejected": -232.14007568359375, + "loss": 0.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6301354169845581, + "rewards/margins": 2.508328914642334, + "rewards/rejected": -3.1384642124176025, + "step": 5241 + }, + { + "epoch": 0.61, + "learning_rate": 1.1883784102988072e-07, + "logits/chosen": -2.5205092430114746, + "logits/rejected": -2.5372467041015625, + "logps/chosen": -195.1888427734375, + "logps/rejected": -166.8831787109375, + "loss": 0.4237, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0291069746017456, + "rewards/margins": 1.6552293300628662, + "rewards/rejected": -2.6843361854553223, + "step": 5242 + }, + { + "epoch": 0.61, + "learning_rate": 1.1880240935396244e-07, + "logits/chosen": -2.6876614093780518, + "logits/rejected": -2.643460750579834, + "logps/chosen": -239.96884155273438, + "logps/rejected": -219.08084106445312, + "loss": 0.4498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2831932604312897, + "rewards/margins": 2.206418991088867, + "rewards/rejected": -2.489612102508545, + "step": 5243 + }, + { + "epoch": 0.61, + "learning_rate": 1.1876697767804417e-07, + "logits/chosen": -1.6111646890640259, + "logits/rejected": -1.8130854368209839, + "logps/chosen": -486.34893798828125, + "logps/rejected": -336.9454040527344, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1003403663635254, + "rewards/margins": 0.7380590438842773, + "rewards/rejected": -1.8383995294570923, + "step": 5244 + }, + { + "epoch": 0.61, + "learning_rate": 1.187315460021259e-07, + "logits/chosen": -2.204624652862549, + "logits/rejected": -2.4227869510650635, + "logps/chosen": -212.95664978027344, + "logps/rejected": -263.3576965332031, + "loss": 0.4663, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0531284809112549, + "rewards/margins": 2.0488672256469727, + "rewards/rejected": -3.1019954681396484, + "step": 5245 + }, + { + "epoch": 0.61, + "learning_rate": 1.1869611432620762e-07, + "logits/chosen": -2.648639440536499, + "logits/rejected": -2.70845365524292, + "logps/chosen": -345.9920654296875, + "logps/rejected": -344.4086608886719, + "loss": 0.1256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9703644514083862, + "rewards/margins": 3.282701015472412, + "rewards/rejected": -4.253065586090088, + "step": 5246 + }, + { + "epoch": 0.61, + "learning_rate": 1.1866068265028934e-07, + "logits/chosen": -2.1289336681365967, + "logits/rejected": -2.2007386684417725, + "logps/chosen": -416.67645263671875, + "logps/rejected": -435.2542724609375, + "loss": 0.2219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20128487050533295, + "rewards/margins": 2.78542423248291, + "rewards/rejected": -2.9867093563079834, + "step": 5247 + }, + { + "epoch": 0.61, + "learning_rate": 1.1862525097437109e-07, + "logits/chosen": -2.453432083129883, + "logits/rejected": -2.5474154949188232, + "logps/chosen": -189.3832244873047, + "logps/rejected": -186.490478515625, + "loss": 0.7441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7731969356536865, + "rewards/margins": 1.908055305480957, + "rewards/rejected": -2.6812522411346436, + "step": 5248 + }, + { + "epoch": 0.61, + "learning_rate": 1.1858981929845281e-07, + "logits/chosen": -2.6520328521728516, + "logits/rejected": -2.3169631958007812, + "logps/chosen": -184.47994995117188, + "logps/rejected": -342.31231689453125, + "loss": 0.1786, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0655137300491333, + "rewards/margins": 3.17734956741333, + "rewards/rejected": -4.242863178253174, + "step": 5249 + }, + { + "epoch": 0.61, + "learning_rate": 1.1855438762253455e-07, + "logits/chosen": -2.10669207572937, + "logits/rejected": -1.9971015453338623, + "logps/chosen": -125.4957275390625, + "logps/rejected": -237.892333984375, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.857762336730957, + "rewards/margins": 1.807873249053955, + "rewards/rejected": -2.665635585784912, + "step": 5250 + }, + { + "epoch": 0.61, + "learning_rate": 1.1851895594661627e-07, + "logits/chosen": -2.4672963619232178, + "logits/rejected": -2.4392919540405273, + "logps/chosen": -364.3828430175781, + "logps/rejected": -357.29742431640625, + "loss": 0.2152, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07487642765045166, + "rewards/margins": 3.1160213947296143, + "rewards/rejected": -3.0411453247070312, + "step": 5251 + }, + { + "epoch": 0.61, + "learning_rate": 1.1848352427069799e-07, + "logits/chosen": -2.539534330368042, + "logits/rejected": -2.6610054969787598, + "logps/chosen": -348.3246154785156, + "logps/rejected": -225.27252197265625, + "loss": 0.3165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.714199423789978, + "rewards/margins": 2.5985474586486816, + "rewards/rejected": -3.31274676322937, + "step": 5252 + }, + { + "epoch": 0.61, + "learning_rate": 1.1844809259477974e-07, + "logits/chosen": -1.9542683362960815, + "logits/rejected": -2.215526580810547, + "logps/chosen": -202.9188995361328, + "logps/rejected": -197.16281127929688, + "loss": 0.3073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1514233648777008, + "rewards/margins": 1.2696161270141602, + "rewards/rejected": -1.4210395812988281, + "step": 5253 + }, + { + "epoch": 0.61, + "learning_rate": 1.1841266091886146e-07, + "logits/chosen": -2.6250462532043457, + "logits/rejected": -2.8109302520751953, + "logps/chosen": -354.80133056640625, + "logps/rejected": -291.2445983886719, + "loss": 0.6602, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7423521280288696, + "rewards/margins": 1.6311440467834473, + "rewards/rejected": -3.3734960556030273, + "step": 5254 + }, + { + "epoch": 0.61, + "learning_rate": 1.1837722924294318e-07, + "logits/chosen": -2.140439510345459, + "logits/rejected": -1.7785425186157227, + "logps/chosen": -396.25030517578125, + "logps/rejected": -451.5938720703125, + "loss": 0.3564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.484535276889801, + "rewards/margins": 3.134540319442749, + "rewards/rejected": -3.6190755367279053, + "step": 5255 + }, + { + "epoch": 0.61, + "learning_rate": 1.1834179756702492e-07, + "logits/chosen": -2.1213769912719727, + "logits/rejected": -1.976830005645752, + "logps/chosen": -162.17613220214844, + "logps/rejected": -218.14202880859375, + "loss": 0.3676, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1742827892303467, + "rewards/margins": 1.604225516319275, + "rewards/rejected": -2.778508186340332, + "step": 5256 + }, + { + "epoch": 0.61, + "learning_rate": 1.1830636589110664e-07, + "logits/chosen": -3.048300266265869, + "logits/rejected": -3.05981707572937, + "logps/chosen": -300.2829284667969, + "logps/rejected": -255.86085510253906, + "loss": 0.2255, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1522409915924072, + "rewards/margins": 3.170846939086914, + "rewards/rejected": -4.3230881690979, + "step": 5257 + }, + { + "epoch": 0.61, + "learning_rate": 1.1827093421518836e-07, + "logits/chosen": -2.165253162384033, + "logits/rejected": -2.09025239944458, + "logps/chosen": -425.52862548828125, + "logps/rejected": -309.7415771484375, + "loss": 0.2786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.412817120552063, + "rewards/margins": 2.428269147872925, + "rewards/rejected": -2.8410863876342773, + "step": 5258 + }, + { + "epoch": 0.61, + "learning_rate": 1.1823550253927011e-07, + "logits/chosen": -1.8609963655471802, + "logits/rejected": -1.7712948322296143, + "logps/chosen": -261.3502197265625, + "logps/rejected": -180.3934326171875, + "loss": 0.3166, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3460504412651062, + "rewards/margins": 1.8323123455047607, + "rewards/rejected": -2.1783628463745117, + "step": 5259 + }, + { + "epoch": 0.61, + "learning_rate": 1.1820007086335183e-07, + "logits/chosen": -2.299649238586426, + "logits/rejected": -2.5753514766693115, + "logps/chosen": -296.44781494140625, + "logps/rejected": -239.747314453125, + "loss": 2.5189, + "rewards/accuracies": 0.375, + "rewards/chosen": -3.460230827331543, + "rewards/margins": -0.9753597974777222, + "rewards/rejected": -2.4848709106445312, + "step": 5260 + }, + { + "epoch": 0.61, + "learning_rate": 1.1816463918743357e-07, + "logits/chosen": -2.736708164215088, + "logits/rejected": -2.3362793922424316, + "logps/chosen": -134.73666381835938, + "logps/rejected": -189.3877716064453, + "loss": 0.3995, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29064756631851196, + "rewards/margins": 1.4641447067260742, + "rewards/rejected": -1.7547922134399414, + "step": 5261 + }, + { + "epoch": 0.61, + "learning_rate": 1.1812920751151529e-07, + "logits/chosen": -1.4951770305633545, + "logits/rejected": -1.9950357675552368, + "logps/chosen": -302.83184814453125, + "logps/rejected": -196.23143005371094, + "loss": 0.3007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.618175745010376, + "rewards/margins": 1.6477444171905518, + "rewards/rejected": -2.2659201622009277, + "step": 5262 + }, + { + "epoch": 0.61, + "learning_rate": 1.1809377583559701e-07, + "logits/chosen": -2.048804759979248, + "logits/rejected": -2.1541364192962646, + "logps/chosen": -321.9862976074219, + "logps/rejected": -218.3290557861328, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5715718865394592, + "rewards/margins": 1.8715957403182983, + "rewards/rejected": -2.4431676864624023, + "step": 5263 + }, + { + "epoch": 0.61, + "learning_rate": 1.1805834415967873e-07, + "logits/chosen": -2.424882173538208, + "logits/rejected": -2.6252951622009277, + "logps/chosen": -404.2431945800781, + "logps/rejected": -268.65484619140625, + "loss": 0.1829, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5891562104225159, + "rewards/margins": 2.7115139961242676, + "rewards/rejected": -3.3006701469421387, + "step": 5264 + }, + { + "epoch": 0.61, + "learning_rate": 1.1802291248376048e-07, + "logits/chosen": -1.9521167278289795, + "logits/rejected": -2.2592697143554688, + "logps/chosen": -360.3934326171875, + "logps/rejected": -245.67391967773438, + "loss": 0.253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08789709210395813, + "rewards/margins": 1.7856574058532715, + "rewards/rejected": -1.8735544681549072, + "step": 5265 + }, + { + "epoch": 0.61, + "learning_rate": 1.179874808078422e-07, + "logits/chosen": -2.68017840385437, + "logits/rejected": -2.656266212463379, + "logps/chosen": -294.82415771484375, + "logps/rejected": -192.40499877929688, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7716482281684875, + "rewards/margins": 1.8566749095916748, + "rewards/rejected": -2.6283230781555176, + "step": 5266 + }, + { + "epoch": 0.61, + "learning_rate": 1.1795204913192394e-07, + "logits/chosen": -2.299527168273926, + "logits/rejected": -2.500115394592285, + "logps/chosen": -347.5217590332031, + "logps/rejected": -284.2477722167969, + "loss": 0.5421, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2631189227104187, + "rewards/margins": 1.4383418560028076, + "rewards/rejected": -1.701460838317871, + "step": 5267 + }, + { + "epoch": 0.61, + "learning_rate": 1.1791661745600566e-07, + "logits/chosen": -2.304253578186035, + "logits/rejected": -2.298375129699707, + "logps/chosen": -341.73443603515625, + "logps/rejected": -320.4442138671875, + "loss": 0.48, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8404834270477295, + "rewards/margins": 3.031440496444702, + "rewards/rejected": -3.8719236850738525, + "step": 5268 + }, + { + "epoch": 0.61, + "learning_rate": 1.1788118578008739e-07, + "logits/chosen": -2.5838303565979004, + "logits/rejected": -2.1793441772460938, + "logps/chosen": -158.73004150390625, + "logps/rejected": -322.43341064453125, + "loss": 0.6354, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3438777923583984, + "rewards/margins": 0.8087022304534912, + "rewards/rejected": -2.1525802612304688, + "step": 5269 + }, + { + "epoch": 0.61, + "learning_rate": 1.1784575410416912e-07, + "logits/chosen": -1.9441759586334229, + "logits/rejected": -1.7470110654830933, + "logps/chosen": -265.01055908203125, + "logps/rejected": -297.27178955078125, + "loss": 0.3428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7761585116386414, + "rewards/margins": 1.5984714031219482, + "rewards/rejected": -2.3746297359466553, + "step": 5270 + }, + { + "epoch": 0.61, + "learning_rate": 1.1781032242825086e-07, + "logits/chosen": -1.7500593662261963, + "logits/rejected": -1.811596393585205, + "logps/chosen": -409.02783203125, + "logps/rejected": -427.439697265625, + "loss": 0.3294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22791367769241333, + "rewards/margins": 2.1136672496795654, + "rewards/rejected": -2.341580867767334, + "step": 5271 + }, + { + "epoch": 0.61, + "learning_rate": 1.1777489075233258e-07, + "logits/chosen": -2.2640557289123535, + "logits/rejected": -2.283284902572632, + "logps/chosen": -259.98687744140625, + "logps/rejected": -206.06663513183594, + "loss": 0.395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5183029770851135, + "rewards/margins": 3.388443946838379, + "rewards/rejected": -3.9067466259002686, + "step": 5272 + }, + { + "epoch": 0.61, + "learning_rate": 1.1773945907641431e-07, + "logits/chosen": -2.5991086959838867, + "logits/rejected": -2.5637495517730713, + "logps/chosen": -248.1488800048828, + "logps/rejected": -237.92465209960938, + "loss": 0.4125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9665053486824036, + "rewards/margins": 1.0807572603225708, + "rewards/rejected": -2.047262668609619, + "step": 5273 + }, + { + "epoch": 0.61, + "learning_rate": 1.1770402740049604e-07, + "logits/chosen": -2.4517123699188232, + "logits/rejected": -2.3901944160461426, + "logps/chosen": -272.49908447265625, + "logps/rejected": -339.0950012207031, + "loss": 0.4553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5718115568161011, + "rewards/margins": 3.2487692832946777, + "rewards/rejected": -3.8205807209014893, + "step": 5274 + }, + { + "epoch": 0.61, + "learning_rate": 1.1766859572457776e-07, + "logits/chosen": -2.2701027393341064, + "logits/rejected": -2.2160847187042236, + "logps/chosen": -351.8022766113281, + "logps/rejected": -298.6549072265625, + "loss": 0.3985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33423224091529846, + "rewards/margins": 1.826180338859558, + "rewards/rejected": -2.160412549972534, + "step": 5275 + }, + { + "epoch": 0.61, + "learning_rate": 1.1763316404865949e-07, + "logits/chosen": -2.407958984375, + "logits/rejected": -2.7222139835357666, + "logps/chosen": -481.1859130859375, + "logps/rejected": -296.8026123046875, + "loss": 0.53, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6143051385879517, + "rewards/margins": 0.7701908349990845, + "rewards/rejected": -2.3844962120056152, + "step": 5276 + }, + { + "epoch": 0.61, + "learning_rate": 1.1759773237274123e-07, + "logits/chosen": -2.577221632003784, + "logits/rejected": -2.4440903663635254, + "logps/chosen": -337.23126220703125, + "logps/rejected": -292.03350830078125, + "loss": 0.1974, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6447274684906006, + "rewards/margins": 2.385129928588867, + "rewards/rejected": -4.029857158660889, + "step": 5277 + }, + { + "epoch": 0.61, + "learning_rate": 1.1756230069682296e-07, + "logits/chosen": -2.5873584747314453, + "logits/rejected": -2.8606417179107666, + "logps/chosen": -229.9432373046875, + "logps/rejected": -121.72816467285156, + "loss": 0.2367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17829251289367676, + "rewards/margins": 2.6714322566986084, + "rewards/rejected": -2.849724769592285, + "step": 5278 + }, + { + "epoch": 0.61, + "learning_rate": 1.1752686902090469e-07, + "logits/chosen": -2.697476863861084, + "logits/rejected": -2.474026679992676, + "logps/chosen": -229.49891662597656, + "logps/rejected": -279.8469543457031, + "loss": 0.3051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5056782960891724, + "rewards/margins": 2.6878819465637207, + "rewards/rejected": -3.1935601234436035, + "step": 5279 + }, + { + "epoch": 0.61, + "learning_rate": 1.1749143734498641e-07, + "logits/chosen": -2.7505712509155273, + "logits/rejected": -2.721304416656494, + "logps/chosen": -376.27740478515625, + "logps/rejected": -289.19873046875, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.931423008441925, + "rewards/margins": 3.0802342891693115, + "rewards/rejected": -4.011657238006592, + "step": 5280 + }, + { + "epoch": 0.61, + "learning_rate": 1.1745600566906813e-07, + "logits/chosen": -2.03131103515625, + "logits/rejected": -2.1894495487213135, + "logps/chosen": -283.6620178222656, + "logps/rejected": -214.67050170898438, + "loss": 0.5055, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1678783893585205, + "rewards/margins": 1.2625603675842285, + "rewards/rejected": -2.430438756942749, + "step": 5281 + }, + { + "epoch": 0.61, + "learning_rate": 1.1742057399314987e-07, + "logits/chosen": -2.5550785064697266, + "logits/rejected": -2.517813205718994, + "logps/chosen": -139.17137145996094, + "logps/rejected": -179.29153442382812, + "loss": 0.3278, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7388405799865723, + "rewards/margins": 2.288928508758545, + "rewards/rejected": -4.027769088745117, + "step": 5282 + }, + { + "epoch": 0.61, + "learning_rate": 1.173851423172316e-07, + "logits/chosen": -2.7774856090545654, + "logits/rejected": -2.6158766746520996, + "logps/chosen": -198.04556274414062, + "logps/rejected": -222.92657470703125, + "loss": 0.2694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6858325004577637, + "rewards/margins": 2.2583394050598145, + "rewards/rejected": -2.944171905517578, + "step": 5283 + }, + { + "epoch": 0.61, + "learning_rate": 1.1734971064131334e-07, + "logits/chosen": -2.2488937377929688, + "logits/rejected": -2.3045594692230225, + "logps/chosen": -491.4161071777344, + "logps/rejected": -429.3929138183594, + "loss": 0.3314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27050426602363586, + "rewards/margins": 1.6516382694244385, + "rewards/rejected": -1.922142505645752, + "step": 5284 + }, + { + "epoch": 0.61, + "learning_rate": 1.1731427896539506e-07, + "logits/chosen": -1.9537928104400635, + "logits/rejected": -1.8455679416656494, + "logps/chosen": -270.1189270019531, + "logps/rejected": -272.5382080078125, + "loss": 0.3381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.603519856929779, + "rewards/margins": 1.5209020376205444, + "rewards/rejected": -2.1244218349456787, + "step": 5285 + }, + { + "epoch": 0.61, + "learning_rate": 1.1727884728947678e-07, + "logits/chosen": -2.286505699157715, + "logits/rejected": -2.25192928314209, + "logps/chosen": -357.8780517578125, + "logps/rejected": -378.70257568359375, + "loss": 0.2016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5772948265075684, + "rewards/margins": 3.2156221866607666, + "rewards/rejected": -3.792917013168335, + "step": 5286 + }, + { + "epoch": 0.62, + "learning_rate": 1.1724341561355852e-07, + "logits/chosen": -2.4928858280181885, + "logits/rejected": -2.6404378414154053, + "logps/chosen": -260.2478332519531, + "logps/rejected": -251.83070373535156, + "loss": 0.2312, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005236834287643433, + "rewards/margins": 3.4741201400756836, + "rewards/rejected": -3.4688832759857178, + "step": 5287 + }, + { + "epoch": 0.62, + "learning_rate": 1.1720798393764024e-07, + "logits/chosen": -2.8182716369628906, + "logits/rejected": -2.631560802459717, + "logps/chosen": -380.1495666503906, + "logps/rejected": -362.5925598144531, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05954378843307495, + "rewards/margins": 4.238600730895996, + "rewards/rejected": -4.298144817352295, + "step": 5288 + }, + { + "epoch": 0.62, + "learning_rate": 1.1717255226172197e-07, + "logits/chosen": -2.528200626373291, + "logits/rejected": -2.6138134002685547, + "logps/chosen": -261.32989501953125, + "logps/rejected": -225.1515350341797, + "loss": 0.7444, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2387347221374512, + "rewards/margins": 2.447575092315674, + "rewards/rejected": -3.686309576034546, + "step": 5289 + }, + { + "epoch": 0.62, + "learning_rate": 1.1713712058580371e-07, + "logits/chosen": -2.6036620140075684, + "logits/rejected": -2.635847806930542, + "logps/chosen": -143.15992736816406, + "logps/rejected": -219.8861083984375, + "loss": 0.2193, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3654455840587616, + "rewards/margins": 2.343755006790161, + "rewards/rejected": -2.709200859069824, + "step": 5290 + }, + { + "epoch": 0.62, + "learning_rate": 1.1710168890988543e-07, + "logits/chosen": -1.97577965259552, + "logits/rejected": -2.262843132019043, + "logps/chosen": -334.03173828125, + "logps/rejected": -256.16815185546875, + "loss": 0.4685, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5646648406982422, + "rewards/margins": 1.7210431098937988, + "rewards/rejected": -2.28570818901062, + "step": 5291 + }, + { + "epoch": 0.62, + "learning_rate": 1.1706625723396715e-07, + "logits/chosen": -2.8297572135925293, + "logits/rejected": -2.787078619003296, + "logps/chosen": -490.55078125, + "logps/rejected": -354.1529235839844, + "loss": 0.2165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7940122485160828, + "rewards/margins": 2.617335796356201, + "rewards/rejected": -3.411348342895508, + "step": 5292 + }, + { + "epoch": 0.62, + "learning_rate": 1.1703082555804889e-07, + "logits/chosen": -1.3562649488449097, + "logits/rejected": -1.8054382801055908, + "logps/chosen": -468.336669921875, + "logps/rejected": -263.1922607421875, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7919514179229736, + "rewards/margins": 1.5936633348464966, + "rewards/rejected": -2.3856146335601807, + "step": 5293 + }, + { + "epoch": 0.62, + "learning_rate": 1.1699539388213062e-07, + "logits/chosen": -2.2575552463531494, + "logits/rejected": -2.2675092220306396, + "logps/chosen": -193.53082275390625, + "logps/rejected": -230.63601684570312, + "loss": 0.4661, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1878875494003296, + "rewards/margins": 1.904371738433838, + "rewards/rejected": -3.092259407043457, + "step": 5294 + }, + { + "epoch": 0.62, + "learning_rate": 1.1695996220621236e-07, + "logits/chosen": -1.7577286958694458, + "logits/rejected": -1.8983138799667358, + "logps/chosen": -314.48284912109375, + "logps/rejected": -276.42279052734375, + "loss": 0.8828, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8643360137939453, + "rewards/margins": 0.9514135718345642, + "rewards/rejected": -2.8157496452331543, + "step": 5295 + }, + { + "epoch": 0.62, + "learning_rate": 1.1692453053029408e-07, + "logits/chosen": -2.4131877422332764, + "logits/rejected": -2.2702126502990723, + "logps/chosen": -124.94967651367188, + "logps/rejected": -150.53201293945312, + "loss": 0.6018, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1203969344496727, + "rewards/margins": 0.7300107479095459, + "rewards/rejected": -0.6096138954162598, + "step": 5296 + }, + { + "epoch": 0.62, + "learning_rate": 1.168890988543758e-07, + "logits/chosen": -2.5262932777404785, + "logits/rejected": -2.503272771835327, + "logps/chosen": -207.5669403076172, + "logps/rejected": -263.85546875, + "loss": 0.6127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9905283451080322, + "rewards/margins": 1.005749225616455, + "rewards/rejected": -1.9962774515151978, + "step": 5297 + }, + { + "epoch": 0.62, + "learning_rate": 1.1685366717845754e-07, + "logits/chosen": -2.1826329231262207, + "logits/rejected": -2.363375186920166, + "logps/chosen": -406.6322021484375, + "logps/rejected": -311.2193908691406, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.057021260261535645, + "rewards/margins": 2.5437793731689453, + "rewards/rejected": -2.486758232116699, + "step": 5298 + }, + { + "epoch": 0.62, + "learning_rate": 1.1681823550253926e-07, + "logits/chosen": -2.3317127227783203, + "logits/rejected": -2.0828559398651123, + "logps/chosen": -120.66413879394531, + "logps/rejected": -179.9330596923828, + "loss": 0.3736, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6312776207923889, + "rewards/margins": 1.0773752927780151, + "rewards/rejected": -1.7086528539657593, + "step": 5299 + }, + { + "epoch": 0.62, + "learning_rate": 1.16782803826621e-07, + "logits/chosen": -2.1027679443359375, + "logits/rejected": -2.2560501098632812, + "logps/chosen": -353.9442138671875, + "logps/rejected": -382.62567138671875, + "loss": 0.7868, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.801364541053772, + "rewards/margins": 1.6687010526657104, + "rewards/rejected": -2.4700655937194824, + "step": 5300 + }, + { + "epoch": 0.62, + "learning_rate": 1.1674737215070273e-07, + "logits/chosen": -1.9048185348510742, + "logits/rejected": -2.0334906578063965, + "logps/chosen": -267.0411376953125, + "logps/rejected": -209.96096801757812, + "loss": 0.6011, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6247832179069519, + "rewards/margins": 1.2106143236160278, + "rewards/rejected": -1.835397481918335, + "step": 5301 + }, + { + "epoch": 0.62, + "learning_rate": 1.1671194047478445e-07, + "logits/chosen": -2.2913894653320312, + "logits/rejected": -2.5170986652374268, + "logps/chosen": -397.20947265625, + "logps/rejected": -285.99493408203125, + "loss": 0.3304, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.037906989455223083, + "rewards/margins": 1.9353054761886597, + "rewards/rejected": -1.973212480545044, + "step": 5302 + }, + { + "epoch": 0.62, + "learning_rate": 1.1667650879886618e-07, + "logits/chosen": -1.8038997650146484, + "logits/rejected": -1.9182120561599731, + "logps/chosen": -362.06390380859375, + "logps/rejected": -321.1721496582031, + "loss": 0.4378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24410377442836761, + "rewards/margins": 1.5396840572357178, + "rewards/rejected": -1.7837879657745361, + "step": 5303 + }, + { + "epoch": 0.62, + "learning_rate": 1.1664107712294791e-07, + "logits/chosen": -2.421043872833252, + "logits/rejected": -2.266037940979004, + "logps/chosen": -261.90008544921875, + "logps/rejected": -283.0332946777344, + "loss": 0.4065, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12445847690105438, + "rewards/margins": 2.1373724937438965, + "rewards/rejected": -2.261831045150757, + "step": 5304 + }, + { + "epoch": 0.62, + "learning_rate": 1.1660564544702963e-07, + "logits/chosen": -2.56738018989563, + "logits/rejected": -2.5271265506744385, + "logps/chosen": -124.43252563476562, + "logps/rejected": -237.01507568359375, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5398470163345337, + "rewards/margins": 3.5363097190856934, + "rewards/rejected": -4.0761566162109375, + "step": 5305 + }, + { + "epoch": 0.62, + "learning_rate": 1.1657021377111138e-07, + "logits/chosen": -2.570099353790283, + "logits/rejected": -2.3443877696990967, + "logps/chosen": -232.73184204101562, + "logps/rejected": -283.84576416015625, + "loss": 0.2621, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7346103191375732, + "rewards/margins": 1.7430243492126465, + "rewards/rejected": -2.4776346683502197, + "step": 5306 + }, + { + "epoch": 0.62, + "learning_rate": 1.165347820951931e-07, + "logits/chosen": -2.7462167739868164, + "logits/rejected": -2.8138694763183594, + "logps/chosen": -282.20477294921875, + "logps/rejected": -305.8728942871094, + "loss": 0.4351, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.747832179069519, + "rewards/margins": 1.5048048496246338, + "rewards/rejected": -2.2526373863220215, + "step": 5307 + }, + { + "epoch": 0.62, + "learning_rate": 1.1649935041927483e-07, + "logits/chosen": -2.152541160583496, + "logits/rejected": -2.078327178955078, + "logps/chosen": -166.49655151367188, + "logps/rejected": -204.6988067626953, + "loss": 0.7467, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.225314140319824, + "rewards/margins": 0.4870728850364685, + "rewards/rejected": -2.7123870849609375, + "step": 5308 + }, + { + "epoch": 0.62, + "learning_rate": 1.1646391874335655e-07, + "logits/chosen": -2.783792734146118, + "logits/rejected": -2.323777914047241, + "logps/chosen": -157.93177795410156, + "logps/rejected": -353.5307922363281, + "loss": 0.0901, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0466148853302002, + "rewards/margins": 5.291600227355957, + "rewards/rejected": -6.33821439743042, + "step": 5309 + }, + { + "epoch": 0.62, + "learning_rate": 1.1642848706743828e-07, + "logits/chosen": -2.063654899597168, + "logits/rejected": -1.9952524900436401, + "logps/chosen": -407.60308837890625, + "logps/rejected": -301.52935791015625, + "loss": 0.2581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4817858338356018, + "rewards/margins": 2.077347755432129, + "rewards/rejected": -2.559133529663086, + "step": 5310 + }, + { + "epoch": 0.62, + "learning_rate": 1.1639305539152e-07, + "logits/chosen": -2.5035603046417236, + "logits/rejected": -2.645695447921753, + "logps/chosen": -445.1630859375, + "logps/rejected": -417.775146484375, + "loss": 0.5402, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.149680256843567, + "rewards/margins": 1.2361507415771484, + "rewards/rejected": -2.385831117630005, + "step": 5311 + }, + { + "epoch": 0.62, + "learning_rate": 1.1635762371560175e-07, + "logits/chosen": -2.6594557762145996, + "logits/rejected": -2.549384355545044, + "logps/chosen": -248.6284637451172, + "logps/rejected": -242.2784423828125, + "loss": 0.3487, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7310099601745605, + "rewards/margins": 1.6637134552001953, + "rewards/rejected": -3.394723415374756, + "step": 5312 + }, + { + "epoch": 0.62, + "learning_rate": 1.1632219203968348e-07, + "logits/chosen": -2.1348092555999756, + "logits/rejected": -2.3476319313049316, + "logps/chosen": -274.33599853515625, + "logps/rejected": -204.35475158691406, + "loss": 0.8265, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.228765845298767, + "rewards/margins": 1.54368257522583, + "rewards/rejected": -2.7724485397338867, + "step": 5313 + }, + { + "epoch": 0.62, + "learning_rate": 1.162867603637652e-07, + "logits/chosen": -2.0720951557159424, + "logits/rejected": -2.3176915645599365, + "logps/chosen": -274.685546875, + "logps/rejected": -226.14480590820312, + "loss": 0.3857, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49419498443603516, + "rewards/margins": 1.8361411094665527, + "rewards/rejected": -2.330336093902588, + "step": 5314 + }, + { + "epoch": 0.62, + "learning_rate": 1.1625132868784693e-07, + "logits/chosen": -1.8268955945968628, + "logits/rejected": -2.127134323120117, + "logps/chosen": -307.13525390625, + "logps/rejected": -189.55226135253906, + "loss": 0.3793, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5510401725769043, + "rewards/margins": 2.268220901489258, + "rewards/rejected": -2.819261074066162, + "step": 5315 + }, + { + "epoch": 0.62, + "learning_rate": 1.1621589701192866e-07, + "logits/chosen": -2.281550645828247, + "logits/rejected": -1.7439478635787964, + "logps/chosen": -218.48455810546875, + "logps/rejected": -279.8666076660156, + "loss": 0.5154, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0256143808364868, + "rewards/margins": 1.0093443393707275, + "rewards/rejected": -2.034958839416504, + "step": 5316 + }, + { + "epoch": 0.62, + "learning_rate": 1.1618046533601038e-07, + "logits/chosen": -1.5144336223602295, + "logits/rejected": -1.6629301309585571, + "logps/chosen": -289.74267578125, + "logps/rejected": -322.62677001953125, + "loss": 0.3827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9496184587478638, + "rewards/margins": 1.3883254528045654, + "rewards/rejected": -2.3379440307617188, + "step": 5317 + }, + { + "epoch": 0.62, + "learning_rate": 1.1614503366009213e-07, + "logits/chosen": -2.073505401611328, + "logits/rejected": -1.981796383857727, + "logps/chosen": -271.1490478515625, + "logps/rejected": -284.88458251953125, + "loss": 0.2236, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7359215617179871, + "rewards/margins": 2.2834086418151855, + "rewards/rejected": -3.0193305015563965, + "step": 5318 + }, + { + "epoch": 0.62, + "learning_rate": 1.1610960198417385e-07, + "logits/chosen": -2.1382827758789062, + "logits/rejected": -2.2655844688415527, + "logps/chosen": -190.14688110351562, + "logps/rejected": -262.853515625, + "loss": 0.5129, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8454734086990356, + "rewards/margins": 1.6533787250518799, + "rewards/rejected": -2.498852252960205, + "step": 5319 + }, + { + "epoch": 0.62, + "learning_rate": 1.1607417030825557e-07, + "logits/chosen": -2.4889798164367676, + "logits/rejected": -2.230797290802002, + "logps/chosen": -135.45750427246094, + "logps/rejected": -424.9656982421875, + "loss": 0.1801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34320545196533203, + "rewards/margins": 2.677400827407837, + "rewards/rejected": -3.02060604095459, + "step": 5320 + }, + { + "epoch": 0.62, + "learning_rate": 1.160387386323373e-07, + "logits/chosen": -2.0018177032470703, + "logits/rejected": -1.8580577373504639, + "logps/chosen": -344.46063232421875, + "logps/rejected": -311.4071960449219, + "loss": 0.4515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.644399106502533, + "rewards/margins": 1.293992519378662, + "rewards/rejected": -1.9383918046951294, + "step": 5321 + }, + { + "epoch": 0.62, + "learning_rate": 1.1600330695641903e-07, + "logits/chosen": -2.153836250305176, + "logits/rejected": -2.3938827514648438, + "logps/chosen": -326.6735534667969, + "logps/rejected": -279.66485595703125, + "loss": 0.2744, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3750455379486084, + "rewards/margins": 1.6775996685028076, + "rewards/rejected": -3.052645206451416, + "step": 5322 + }, + { + "epoch": 0.62, + "learning_rate": 1.1596787528050075e-07, + "logits/chosen": -2.844278335571289, + "logits/rejected": -2.741501569747925, + "logps/chosen": -130.61639404296875, + "logps/rejected": -207.46424865722656, + "loss": 0.667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9724605083465576, + "rewards/margins": 1.2323251962661743, + "rewards/rejected": -2.2047853469848633, + "step": 5323 + }, + { + "epoch": 0.62, + "learning_rate": 1.159324436045825e-07, + "logits/chosen": -2.1201515197753906, + "logits/rejected": -1.880948781967163, + "logps/chosen": -336.17327880859375, + "logps/rejected": -437.87567138671875, + "loss": 0.7724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5723652839660645, + "rewards/margins": 1.1516891717910767, + "rewards/rejected": -1.7240545749664307, + "step": 5324 + }, + { + "epoch": 0.62, + "learning_rate": 1.1589701192866422e-07, + "logits/chosen": -2.077421188354492, + "logits/rejected": -2.178849697113037, + "logps/chosen": -310.98297119140625, + "logps/rejected": -265.6669616699219, + "loss": 0.4356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9609177112579346, + "rewards/margins": 2.256821632385254, + "rewards/rejected": -3.2177393436431885, + "step": 5325 + }, + { + "epoch": 0.62, + "learning_rate": 1.1586158025274594e-07, + "logits/chosen": -2.277005910873413, + "logits/rejected": -2.312925338745117, + "logps/chosen": -168.8188934326172, + "logps/rejected": -220.8189697265625, + "loss": 0.1608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17282895743846893, + "rewards/margins": 3.451234817504883, + "rewards/rejected": -3.6240639686584473, + "step": 5326 + }, + { + "epoch": 0.62, + "learning_rate": 1.1582614857682768e-07, + "logits/chosen": -2.8460745811462402, + "logits/rejected": -2.5967495441436768, + "logps/chosen": -400.2037353515625, + "logps/rejected": -283.1959228515625, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5235212445259094, + "rewards/margins": 1.8115437030792236, + "rewards/rejected": -2.3350648880004883, + "step": 5327 + }, + { + "epoch": 0.62, + "learning_rate": 1.157907169009094e-07, + "logits/chosen": -2.116426944732666, + "logits/rejected": -2.1622731685638428, + "logps/chosen": -385.31842041015625, + "logps/rejected": -330.441650390625, + "loss": 0.1049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21976371109485626, + "rewards/margins": 2.940495491027832, + "rewards/rejected": -3.160259246826172, + "step": 5328 + }, + { + "epoch": 0.62, + "learning_rate": 1.1575528522499115e-07, + "logits/chosen": -1.7466893196105957, + "logits/rejected": -2.0879921913146973, + "logps/chosen": -270.5606689453125, + "logps/rejected": -219.05068969726562, + "loss": 1.85, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.194166898727417, + "rewards/margins": -0.014158755540847778, + "rewards/rejected": -2.1800081729888916, + "step": 5329 + }, + { + "epoch": 0.62, + "learning_rate": 1.1571985354907287e-07, + "logits/chosen": -2.3867106437683105, + "logits/rejected": -2.0094375610351562, + "logps/chosen": -157.71710205078125, + "logps/rejected": -297.6543273925781, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6314519643783569, + "rewards/margins": 3.869785785675049, + "rewards/rejected": -4.501237869262695, + "step": 5330 + }, + { + "epoch": 0.62, + "learning_rate": 1.156844218731546e-07, + "logits/chosen": -2.599961042404175, + "logits/rejected": -2.5207571983337402, + "logps/chosen": -123.40969848632812, + "logps/rejected": -188.2261505126953, + "loss": 0.44, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3310739994049072, + "rewards/margins": 1.702453374862671, + "rewards/rejected": -3.033527374267578, + "step": 5331 + }, + { + "epoch": 0.62, + "learning_rate": 1.1564899019723633e-07, + "logits/chosen": -1.8369178771972656, + "logits/rejected": -1.98710036277771, + "logps/chosen": -536.4494018554688, + "logps/rejected": -325.2657775878906, + "loss": 0.5485, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0740693807601929, + "rewards/margins": 1.0516221523284912, + "rewards/rejected": -2.1256914138793945, + "step": 5332 + }, + { + "epoch": 0.62, + "learning_rate": 1.1561355852131805e-07, + "logits/chosen": -2.590642213821411, + "logits/rejected": -2.4282639026641846, + "logps/chosen": -515.46826171875, + "logps/rejected": -465.89794921875, + "loss": 0.2467, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1489574909210205, + "rewards/margins": 2.1513025760650635, + "rewards/rejected": -3.300260305404663, + "step": 5333 + }, + { + "epoch": 0.62, + "learning_rate": 1.1557812684539977e-07, + "logits/chosen": -2.315420389175415, + "logits/rejected": -2.304166316986084, + "logps/chosen": -266.9661560058594, + "logps/rejected": -252.50802612304688, + "loss": 0.4396, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7433305978775024, + "rewards/margins": 1.637261152267456, + "rewards/rejected": -3.380591869354248, + "step": 5334 + }, + { + "epoch": 0.62, + "learning_rate": 1.1554269516948152e-07, + "logits/chosen": -2.2451415061950684, + "logits/rejected": -2.2613637447357178, + "logps/chosen": -338.59979248046875, + "logps/rejected": -402.37005615234375, + "loss": 0.6928, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9381811618804932, + "rewards/margins": 2.0637519359588623, + "rewards/rejected": -3.0019330978393555, + "step": 5335 + }, + { + "epoch": 0.62, + "learning_rate": 1.1550726349356324e-07, + "logits/chosen": -2.4758846759796143, + "logits/rejected": -2.21002459526062, + "logps/chosen": -422.95269775390625, + "logps/rejected": -268.5511474609375, + "loss": 0.532, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1272928714752197, + "rewards/margins": 1.7336997985839844, + "rewards/rejected": -2.860992908477783, + "step": 5336 + }, + { + "epoch": 0.62, + "learning_rate": 1.1547183181764497e-07, + "logits/chosen": -2.6345410346984863, + "logits/rejected": -2.8673479557037354, + "logps/chosen": -131.6870574951172, + "logps/rejected": -170.91351318359375, + "loss": 0.228, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1256211996078491, + "rewards/margins": 2.353010654449463, + "rewards/rejected": -3.4786319732666016, + "step": 5337 + }, + { + "epoch": 0.62, + "learning_rate": 1.154364001417267e-07, + "logits/chosen": -1.7189850807189941, + "logits/rejected": -1.7581462860107422, + "logps/chosen": -442.4598083496094, + "logps/rejected": -451.1578674316406, + "loss": 1.3066, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3435595035552979, + "rewards/margins": -0.4952393174171448, + "rewards/rejected": -0.8483201265335083, + "step": 5338 + }, + { + "epoch": 0.62, + "learning_rate": 1.1540096846580842e-07, + "logits/chosen": -2.337881565093994, + "logits/rejected": -2.572523832321167, + "logps/chosen": -165.08001708984375, + "logps/rejected": -170.83181762695312, + "loss": 0.2496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2044268697500229, + "rewards/margins": 2.4736313819885254, + "rewards/rejected": -2.678058385848999, + "step": 5339 + }, + { + "epoch": 0.62, + "learning_rate": 1.1536553678989015e-07, + "logits/chosen": -2.501457929611206, + "logits/rejected": -2.423488140106201, + "logps/chosen": -285.71319580078125, + "logps/rejected": -286.56060791015625, + "loss": 0.2485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6228606700897217, + "rewards/margins": 3.6428256034851074, + "rewards/rejected": -4.26568603515625, + "step": 5340 + }, + { + "epoch": 0.62, + "learning_rate": 1.153301051139719e-07, + "logits/chosen": -2.052064895629883, + "logits/rejected": -2.06030011177063, + "logps/chosen": -326.6309814453125, + "logps/rejected": -293.8786315917969, + "loss": 0.2024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5786959528923035, + "rewards/margins": 2.1206724643707275, + "rewards/rejected": -2.699368715286255, + "step": 5341 + }, + { + "epoch": 0.62, + "learning_rate": 1.1529467343805362e-07, + "logits/chosen": -2.396091938018799, + "logits/rejected": -2.5877866744995117, + "logps/chosen": -362.53765869140625, + "logps/rejected": -386.9617004394531, + "loss": 0.3246, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3591059446334839, + "rewards/margins": 2.194791316986084, + "rewards/rejected": -2.5538971424102783, + "step": 5342 + }, + { + "epoch": 0.62, + "learning_rate": 1.1525924176213535e-07, + "logits/chosen": -2.252765655517578, + "logits/rejected": -2.008424758911133, + "logps/chosen": -246.1446075439453, + "logps/rejected": -279.3450012207031, + "loss": 0.1694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4225243031978607, + "rewards/margins": 2.2018089294433594, + "rewards/rejected": -2.624333143234253, + "step": 5343 + }, + { + "epoch": 0.62, + "learning_rate": 1.1522381008621707e-07, + "logits/chosen": -2.8630661964416504, + "logits/rejected": -2.850386142730713, + "logps/chosen": -246.35342407226562, + "logps/rejected": -225.23934936523438, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43753331899642944, + "rewards/margins": 2.728153705596924, + "rewards/rejected": -3.165687084197998, + "step": 5344 + }, + { + "epoch": 0.62, + "learning_rate": 1.151883784102988e-07, + "logits/chosen": -2.513031482696533, + "logits/rejected": -2.0251822471618652, + "logps/chosen": -69.95834350585938, + "logps/rejected": -243.9511260986328, + "loss": 0.3549, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5751447081565857, + "rewards/margins": 2.300048351287842, + "rewards/rejected": -2.8751931190490723, + "step": 5345 + }, + { + "epoch": 0.62, + "learning_rate": 1.1515294673438052e-07, + "logits/chosen": -2.6216540336608887, + "logits/rejected": -2.7086193561553955, + "logps/chosen": -268.7843017578125, + "logps/rejected": -293.68548583984375, + "loss": 0.281, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14242079854011536, + "rewards/margins": 2.886920690536499, + "rewards/rejected": -2.744499683380127, + "step": 5346 + }, + { + "epoch": 0.62, + "learning_rate": 1.1511751505846227e-07, + "logits/chosen": -2.2427892684936523, + "logits/rejected": -2.435361385345459, + "logps/chosen": -285.2143249511719, + "logps/rejected": -184.26779174804688, + "loss": 0.553, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1017125844955444, + "rewards/margins": 1.1716386079788208, + "rewards/rejected": -2.2733511924743652, + "step": 5347 + }, + { + "epoch": 0.62, + "learning_rate": 1.1508208338254399e-07, + "logits/chosen": -2.0735509395599365, + "logits/rejected": -2.0336225032806396, + "logps/chosen": -255.41664123535156, + "logps/rejected": -267.1845397949219, + "loss": 0.2048, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.734046459197998, + "rewards/margins": 2.8476364612579346, + "rewards/rejected": -3.5816829204559326, + "step": 5348 + }, + { + "epoch": 0.62, + "learning_rate": 1.1504665170662572e-07, + "logits/chosen": -1.7602059841156006, + "logits/rejected": -2.1081738471984863, + "logps/chosen": -286.80865478515625, + "logps/rejected": -250.80068969726562, + "loss": 0.4324, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.24528169631958, + "rewards/margins": 2.0437862873077393, + "rewards/rejected": -3.2890677452087402, + "step": 5349 + }, + { + "epoch": 0.62, + "learning_rate": 1.1501122003070745e-07, + "logits/chosen": -2.7501590251922607, + "logits/rejected": -2.717268228530884, + "logps/chosen": -131.63436889648438, + "logps/rejected": -259.3874206542969, + "loss": 0.4652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49505820870399475, + "rewards/margins": 1.6246461868286133, + "rewards/rejected": -2.119704484939575, + "step": 5350 + }, + { + "epoch": 0.62, + "learning_rate": 1.1497578835478917e-07, + "logits/chosen": -2.369781494140625, + "logits/rejected": -2.263546943664551, + "logps/chosen": -168.73956298828125, + "logps/rejected": -151.7049102783203, + "loss": 0.351, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6330385208129883, + "rewards/margins": 2.147521495819092, + "rewards/rejected": -2.7805604934692383, + "step": 5351 + }, + { + "epoch": 0.62, + "learning_rate": 1.149403566788709e-07, + "logits/chosen": -2.0895814895629883, + "logits/rejected": -2.2708892822265625, + "logps/chosen": -183.67526245117188, + "logps/rejected": -213.24270629882812, + "loss": 1.1146, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0220611095428467, + "rewards/margins": 1.0762057304382324, + "rewards/rejected": -3.0982666015625, + "step": 5352 + }, + { + "epoch": 0.62, + "learning_rate": 1.1490492500295264e-07, + "logits/chosen": -1.9805409908294678, + "logits/rejected": -2.4532768726348877, + "logps/chosen": -137.63040161132812, + "logps/rejected": -232.8541259765625, + "loss": 0.1846, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9267908334732056, + "rewards/margins": 3.1285951137542725, + "rewards/rejected": -4.055385589599609, + "step": 5353 + }, + { + "epoch": 0.62, + "learning_rate": 1.1486949332703436e-07, + "logits/chosen": -2.632026433944702, + "logits/rejected": -2.706714630126953, + "logps/chosen": -106.12554931640625, + "logps/rejected": -150.34532165527344, + "loss": 0.2706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8451471328735352, + "rewards/margins": 2.107205390930176, + "rewards/rejected": -2.952352285385132, + "step": 5354 + }, + { + "epoch": 0.62, + "learning_rate": 1.148340616511161e-07, + "logits/chosen": -2.433797597885132, + "logits/rejected": -2.5247974395751953, + "logps/chosen": -215.24429321289062, + "logps/rejected": -258.6317138671875, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.993907630443573, + "rewards/margins": 1.2607725858688354, + "rewards/rejected": -2.2546801567077637, + "step": 5355 + }, + { + "epoch": 0.62, + "learning_rate": 1.1479862997519782e-07, + "logits/chosen": -2.0873520374298096, + "logits/rejected": -2.054530620574951, + "logps/chosen": -311.9149475097656, + "logps/rejected": -334.8275146484375, + "loss": 1.0901, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.218367099761963, + "rewards/margins": 0.8420179486274719, + "rewards/rejected": -2.06038498878479, + "step": 5356 + }, + { + "epoch": 0.62, + "learning_rate": 1.1476319829927954e-07, + "logits/chosen": -2.120436429977417, + "logits/rejected": -2.16455078125, + "logps/chosen": -395.1813049316406, + "logps/rejected": -379.5257568359375, + "loss": 0.2807, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1493827849626541, + "rewards/margins": 2.4019041061401367, + "rewards/rejected": -2.5512866973876953, + "step": 5357 + }, + { + "epoch": 0.62, + "learning_rate": 1.1472776662336128e-07, + "logits/chosen": -2.323474884033203, + "logits/rejected": -2.125051498413086, + "logps/chosen": -395.9935607910156, + "logps/rejected": -270.6297302246094, + "loss": 0.2705, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8298876285552979, + "rewards/margins": 2.7952215671539307, + "rewards/rejected": -3.6251096725463867, + "step": 5358 + }, + { + "epoch": 0.62, + "learning_rate": 1.1469233494744301e-07, + "logits/chosen": -2.578296184539795, + "logits/rejected": -2.6547253131866455, + "logps/chosen": -379.9794921875, + "logps/rejected": -297.4342956542969, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.788794755935669, + "rewards/margins": 2.169572353363037, + "rewards/rejected": -2.958367109298706, + "step": 5359 + }, + { + "epoch": 0.62, + "learning_rate": 1.1465690327152475e-07, + "logits/chosen": -2.479835033416748, + "logits/rejected": -2.5383660793304443, + "logps/chosen": -324.4447937011719, + "logps/rejected": -273.3553771972656, + "loss": 0.1848, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2654675245285034, + "rewards/margins": 2.719896078109741, + "rewards/rejected": -2.985363483428955, + "step": 5360 + }, + { + "epoch": 0.62, + "learning_rate": 1.1462147159560647e-07, + "logits/chosen": -2.1093058586120605, + "logits/rejected": -2.2043099403381348, + "logps/chosen": -362.1557922363281, + "logps/rejected": -237.79739379882812, + "loss": 0.9247, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7323381900787354, + "rewards/margins": 1.5911972522735596, + "rewards/rejected": -3.323535442352295, + "step": 5361 + }, + { + "epoch": 0.62, + "learning_rate": 1.1458603991968819e-07, + "logits/chosen": -2.0444445610046387, + "logits/rejected": -2.354599952697754, + "logps/chosen": -287.296630859375, + "logps/rejected": -200.69656372070312, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05367937684059143, + "rewards/margins": 1.4882746934890747, + "rewards/rejected": -1.4345953464508057, + "step": 5362 + }, + { + "epoch": 0.62, + "learning_rate": 1.1455060824376991e-07, + "logits/chosen": -2.993025779724121, + "logits/rejected": -2.957380771636963, + "logps/chosen": -231.00228881835938, + "logps/rejected": -284.85540771484375, + "loss": 0.2742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.844107449054718, + "rewards/margins": 2.6686673164367676, + "rewards/rejected": -3.5127744674682617, + "step": 5363 + }, + { + "epoch": 0.62, + "learning_rate": 1.1451517656785166e-07, + "logits/chosen": -2.4452755451202393, + "logits/rejected": -2.360866069793701, + "logps/chosen": -216.28919982910156, + "logps/rejected": -228.65545654296875, + "loss": 0.2529, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7654356360435486, + "rewards/margins": 1.808443307876587, + "rewards/rejected": -2.5738790035247803, + "step": 5364 + }, + { + "epoch": 0.62, + "learning_rate": 1.1447974489193338e-07, + "logits/chosen": -2.5250790119171143, + "logits/rejected": -2.3355093002319336, + "logps/chosen": -147.6964569091797, + "logps/rejected": -205.58584594726562, + "loss": 0.5226, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.055158853530884, + "rewards/margins": 1.6498078107833862, + "rewards/rejected": -3.7049665451049805, + "step": 5365 + }, + { + "epoch": 0.62, + "learning_rate": 1.1444431321601512e-07, + "logits/chosen": -2.4814791679382324, + "logits/rejected": -2.82623028755188, + "logps/chosen": -230.5892333984375, + "logps/rejected": -118.79801940917969, + "loss": 0.4237, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5208330154418945, + "rewards/margins": 1.330108642578125, + "rewards/rejected": -1.850941777229309, + "step": 5366 + }, + { + "epoch": 0.62, + "learning_rate": 1.1440888154009684e-07, + "logits/chosen": -1.7854995727539062, + "logits/rejected": -2.1558923721313477, + "logps/chosen": -419.88360595703125, + "logps/rejected": -308.0103454589844, + "loss": 0.7217, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8216978907585144, + "rewards/margins": 0.766961932182312, + "rewards/rejected": -1.5886598825454712, + "step": 5367 + }, + { + "epoch": 0.62, + "learning_rate": 1.1437344986417856e-07, + "logits/chosen": -2.8295204639434814, + "logits/rejected": -2.5207693576812744, + "logps/chosen": -311.2629089355469, + "logps/rejected": -398.0264892578125, + "loss": 0.2207, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0010223388671875, + "rewards/margins": 2.2625441551208496, + "rewards/rejected": -3.2635669708251953, + "step": 5368 + }, + { + "epoch": 0.62, + "learning_rate": 1.143380181882603e-07, + "logits/chosen": -2.4289450645446777, + "logits/rejected": -2.4903650283813477, + "logps/chosen": -229.8724822998047, + "logps/rejected": -191.74612426757812, + "loss": 0.5757, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.333589553833008, + "rewards/margins": 1.4477872848510742, + "rewards/rejected": -3.7813773155212402, + "step": 5369 + }, + { + "epoch": 0.62, + "learning_rate": 1.1430258651234203e-07, + "logits/chosen": -2.1118698120117188, + "logits/rejected": -2.22216796875, + "logps/chosen": -283.0202331542969, + "logps/rejected": -259.7262878417969, + "loss": 0.4346, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9116078615188599, + "rewards/margins": 2.2685108184814453, + "rewards/rejected": -3.1801185607910156, + "step": 5370 + }, + { + "epoch": 0.62, + "learning_rate": 1.1426715483642376e-07, + "logits/chosen": -2.340806245803833, + "logits/rejected": -2.7342162132263184, + "logps/chosen": -266.09814453125, + "logps/rejected": -190.0380859375, + "loss": 0.4641, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6011068224906921, + "rewards/margins": 0.9216456413269043, + "rewards/rejected": -1.5227525234222412, + "step": 5371 + }, + { + "epoch": 0.62, + "learning_rate": 1.1423172316050549e-07, + "logits/chosen": -1.6209710836410522, + "logits/rejected": -1.8452750444412231, + "logps/chosen": -385.224853515625, + "logps/rejected": -331.89556884765625, + "loss": 0.4362, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.883021354675293, + "rewards/margins": 1.509251356124878, + "rewards/rejected": -2.392272472381592, + "step": 5372 + }, + { + "epoch": 0.63, + "learning_rate": 1.1419629148458721e-07, + "logits/chosen": -2.1133692264556885, + "logits/rejected": -2.2323427200317383, + "logps/chosen": -227.40985107421875, + "logps/rejected": -319.9500732421875, + "loss": 0.316, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29349297285079956, + "rewards/margins": 2.6873676776885986, + "rewards/rejected": -2.980860710144043, + "step": 5373 + }, + { + "epoch": 0.63, + "learning_rate": 1.1416085980866894e-07, + "logits/chosen": -2.0857796669006348, + "logits/rejected": -2.2653074264526367, + "logps/chosen": -398.2340393066406, + "logps/rejected": -244.95411682128906, + "loss": 0.1929, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8428530693054199, + "rewards/margins": 2.702859878540039, + "rewards/rejected": -3.545713424682617, + "step": 5374 + }, + { + "epoch": 0.63, + "learning_rate": 1.1412542813275067e-07, + "logits/chosen": -2.481411933898926, + "logits/rejected": -2.4078009128570557, + "logps/chosen": -358.3394775390625, + "logps/rejected": -267.542724609375, + "loss": 0.2931, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43920016288757324, + "rewards/margins": 1.5420889854431152, + "rewards/rejected": -1.981289267539978, + "step": 5375 + }, + { + "epoch": 0.63, + "learning_rate": 1.1408999645683241e-07, + "logits/chosen": -2.0551159381866455, + "logits/rejected": -1.7485780715942383, + "logps/chosen": -185.59776306152344, + "logps/rejected": -353.1546630859375, + "loss": 0.2641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36728566884994507, + "rewards/margins": 3.1150283813476562, + "rewards/rejected": -3.482314109802246, + "step": 5376 + }, + { + "epoch": 0.63, + "learning_rate": 1.1405456478091414e-07, + "logits/chosen": -2.3738317489624023, + "logits/rejected": -2.4083411693573, + "logps/chosen": -182.7656707763672, + "logps/rejected": -162.0426788330078, + "loss": 0.3529, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7681323885917664, + "rewards/margins": 1.5419130325317383, + "rewards/rejected": -2.3100454807281494, + "step": 5377 + }, + { + "epoch": 0.63, + "learning_rate": 1.1401913310499586e-07, + "logits/chosen": -1.8814460039138794, + "logits/rejected": -2.2363131046295166, + "logps/chosen": -481.83294677734375, + "logps/rejected": -380.61981201171875, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4333607852458954, + "rewards/margins": 2.334918260574341, + "rewards/rejected": -2.7682790756225586, + "step": 5378 + }, + { + "epoch": 0.63, + "learning_rate": 1.1398370142907759e-07, + "logits/chosen": -2.8467555046081543, + "logits/rejected": -2.5749404430389404, + "logps/chosen": -247.470703125, + "logps/rejected": -368.93798828125, + "loss": 0.7379, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8185607194900513, + "rewards/margins": 1.828495740890503, + "rewards/rejected": -3.6470565795898438, + "step": 5379 + }, + { + "epoch": 0.63, + "learning_rate": 1.1394826975315931e-07, + "logits/chosen": -2.614248514175415, + "logits/rejected": -2.2359120845794678, + "logps/chosen": -119.18404388427734, + "logps/rejected": -197.70083618164062, + "loss": 0.2712, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0320305824279785, + "rewards/margins": 1.892529845237732, + "rewards/rejected": -2.924560546875, + "step": 5380 + }, + { + "epoch": 0.63, + "learning_rate": 1.1391283807724104e-07, + "logits/chosen": -2.738949775695801, + "logits/rejected": -2.7817113399505615, + "logps/chosen": -139.37057495117188, + "logps/rejected": -153.03518676757812, + "loss": 0.1708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8323532342910767, + "rewards/margins": 2.9041097164154053, + "rewards/rejected": -3.7364625930786133, + "step": 5381 + }, + { + "epoch": 0.63, + "learning_rate": 1.1387740640132278e-07, + "logits/chosen": -2.021416187286377, + "logits/rejected": -2.3910136222839355, + "logps/chosen": -201.4613800048828, + "logps/rejected": -187.7165069580078, + "loss": 0.2727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7954983115196228, + "rewards/margins": 1.529793620109558, + "rewards/rejected": -2.3252921104431152, + "step": 5382 + }, + { + "epoch": 0.63, + "learning_rate": 1.1384197472540451e-07, + "logits/chosen": -1.8371105194091797, + "logits/rejected": -1.6156127452850342, + "logps/chosen": -361.0088806152344, + "logps/rejected": -434.0414733886719, + "loss": 0.6038, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4817534685134888, + "rewards/margins": 1.3600858449935913, + "rewards/rejected": -2.84183931350708, + "step": 5383 + }, + { + "epoch": 0.63, + "learning_rate": 1.1380654304948624e-07, + "logits/chosen": -2.0803580284118652, + "logits/rejected": -2.4836301803588867, + "logps/chosen": -219.02294921875, + "logps/rejected": -174.16793823242188, + "loss": 1.8926, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1486542224884033, + "rewards/margins": 0.286805123090744, + "rewards/rejected": -2.4354593753814697, + "step": 5384 + }, + { + "epoch": 0.63, + "learning_rate": 1.1377111137356796e-07, + "logits/chosen": -2.406153440475464, + "logits/rejected": -2.433340072631836, + "logps/chosen": -366.7221984863281, + "logps/rejected": -491.34649658203125, + "loss": 0.4119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5996214151382446, + "rewards/margins": 2.9444174766540527, + "rewards/rejected": -3.544039011001587, + "step": 5385 + }, + { + "epoch": 0.63, + "learning_rate": 1.137356796976497e-07, + "logits/chosen": -1.9369298219680786, + "logits/rejected": -2.0869154930114746, + "logps/chosen": -465.50823974609375, + "logps/rejected": -350.197509765625, + "loss": 0.2196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16899700462818146, + "rewards/margins": 1.9959421157836914, + "rewards/rejected": -2.1649391651153564, + "step": 5386 + }, + { + "epoch": 0.63, + "learning_rate": 1.1370024802173142e-07, + "logits/chosen": -2.493149757385254, + "logits/rejected": -2.368044853210449, + "logps/chosen": -350.370849609375, + "logps/rejected": -368.33697509765625, + "loss": 0.2441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27342501282691956, + "rewards/margins": 3.7940433025360107, + "rewards/rejected": -4.067468643188477, + "step": 5387 + }, + { + "epoch": 0.63, + "learning_rate": 1.1366481634581315e-07, + "logits/chosen": -2.1739561557769775, + "logits/rejected": -2.380276918411255, + "logps/chosen": -296.95556640625, + "logps/rejected": -224.952880859375, + "loss": 0.2964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.494283527135849, + "rewards/margins": 1.690805435180664, + "rewards/rejected": -2.185089111328125, + "step": 5388 + }, + { + "epoch": 0.63, + "learning_rate": 1.1362938466989489e-07, + "logits/chosen": -2.580578088760376, + "logits/rejected": -2.5316686630249023, + "logps/chosen": -344.95489501953125, + "logps/rejected": -243.54953002929688, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030601661652326584, + "rewards/margins": 2.894486427307129, + "rewards/rejected": -2.863884925842285, + "step": 5389 + }, + { + "epoch": 0.63, + "learning_rate": 1.1359395299397661e-07, + "logits/chosen": -2.0239391326904297, + "logits/rejected": -2.249756097793579, + "logps/chosen": -359.32037353515625, + "logps/rejected": -329.4820861816406, + "loss": 0.2823, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6357448101043701, + "rewards/margins": 3.2584493160247803, + "rewards/rejected": -3.8941946029663086, + "step": 5390 + }, + { + "epoch": 0.63, + "learning_rate": 1.1355852131805833e-07, + "logits/chosen": -2.79663348197937, + "logits/rejected": -2.8424994945526123, + "logps/chosen": -147.08351135253906, + "logps/rejected": -204.67236328125, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04837636649608612, + "rewards/margins": 3.8691134452819824, + "rewards/rejected": -3.8207366466522217, + "step": 5391 + }, + { + "epoch": 0.63, + "learning_rate": 1.1352308964214007e-07, + "logits/chosen": -2.0020720958709717, + "logits/rejected": -2.313211441040039, + "logps/chosen": -362.942626953125, + "logps/rejected": -179.58518981933594, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9049264788627625, + "rewards/margins": 2.499267101287842, + "rewards/rejected": -3.404193878173828, + "step": 5392 + }, + { + "epoch": 0.63, + "learning_rate": 1.1348765796622179e-07, + "logits/chosen": -2.142202615737915, + "logits/rejected": -2.2349166870117188, + "logps/chosen": -232.10662841796875, + "logps/rejected": -249.53184509277344, + "loss": 0.3677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8841705322265625, + "rewards/margins": 1.855550765991211, + "rewards/rejected": -2.7397212982177734, + "step": 5393 + }, + { + "epoch": 0.63, + "learning_rate": 1.1345222629030354e-07, + "logits/chosen": -2.0515010356903076, + "logits/rejected": -2.346853256225586, + "logps/chosen": -235.42884826660156, + "logps/rejected": -293.098876953125, + "loss": 0.2191, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.009724497795105, + "rewards/margins": 3.818309783935547, + "rewards/rejected": -4.828034400939941, + "step": 5394 + }, + { + "epoch": 0.63, + "learning_rate": 1.1341679461438526e-07, + "logits/chosen": -2.0894134044647217, + "logits/rejected": -2.0225439071655273, + "logps/chosen": -408.7860107421875, + "logps/rejected": -418.6668395996094, + "loss": 0.1276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.567647397518158, + "rewards/margins": 2.8327972888946533, + "rewards/rejected": -3.400444507598877, + "step": 5395 + }, + { + "epoch": 0.63, + "learning_rate": 1.1338136293846698e-07, + "logits/chosen": -2.6005935668945312, + "logits/rejected": -2.298593282699585, + "logps/chosen": -203.59519958496094, + "logps/rejected": -249.62623596191406, + "loss": 0.1183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8998126983642578, + "rewards/margins": 3.222750186920166, + "rewards/rejected": -4.122562885284424, + "step": 5396 + }, + { + "epoch": 0.63, + "learning_rate": 1.1334593126254872e-07, + "logits/chosen": -2.515056848526001, + "logits/rejected": -2.775061845779419, + "logps/chosen": -360.76275634765625, + "logps/rejected": -348.619384765625, + "loss": 0.3427, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2704813480377197, + "rewards/margins": 2.6100900173187256, + "rewards/rejected": -3.880571126937866, + "step": 5397 + }, + { + "epoch": 0.63, + "learning_rate": 1.1331049958663044e-07, + "logits/chosen": -1.8495633602142334, + "logits/rejected": -2.4554667472839355, + "logps/chosen": -394.687255859375, + "logps/rejected": -199.52792358398438, + "loss": 0.5357, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9220387935638428, + "rewards/margins": 1.2433502674102783, + "rewards/rejected": -2.165389060974121, + "step": 5398 + }, + { + "epoch": 0.63, + "learning_rate": 1.1327506791071217e-07, + "logits/chosen": -2.276494026184082, + "logits/rejected": -2.0302937030792236, + "logps/chosen": -332.1329650878906, + "logps/rejected": -373.7216796875, + "loss": 0.6003, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4945908784866333, + "rewards/margins": 1.605769395828247, + "rewards/rejected": -3.10036039352417, + "step": 5399 + }, + { + "epoch": 0.63, + "learning_rate": 1.1323963623479391e-07, + "logits/chosen": -2.040966272354126, + "logits/rejected": -2.1161627769470215, + "logps/chosen": -235.60594177246094, + "logps/rejected": -411.2179870605469, + "loss": 0.3609, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0819263458251953, + "rewards/margins": 4.6390862464904785, + "rewards/rejected": -5.721012115478516, + "step": 5400 + }, + { + "epoch": 0.63, + "learning_rate": 1.1320420455887563e-07, + "logits/chosen": -2.1681926250457764, + "logits/rejected": -2.3577537536621094, + "logps/chosen": -336.5526123046875, + "logps/rejected": -242.6056671142578, + "loss": 0.4116, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7304649353027344, + "rewards/margins": 1.41405189037323, + "rewards/rejected": -2.144516944885254, + "step": 5401 + }, + { + "epoch": 0.63, + "learning_rate": 1.1316877288295735e-07, + "logits/chosen": -2.0901708602905273, + "logits/rejected": -2.241230010986328, + "logps/chosen": -237.8018798828125, + "logps/rejected": -264.5483093261719, + "loss": 0.6419, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7429393529891968, + "rewards/margins": 1.4400684833526611, + "rewards/rejected": -2.1830077171325684, + "step": 5402 + }, + { + "epoch": 0.63, + "learning_rate": 1.1313334120703909e-07, + "logits/chosen": -2.0109751224517822, + "logits/rejected": -1.8507510423660278, + "logps/chosen": -588.3833618164062, + "logps/rejected": -488.09747314453125, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5851837396621704, + "rewards/margins": 2.9185261726379395, + "rewards/rejected": -3.5037097930908203, + "step": 5403 + }, + { + "epoch": 0.63, + "learning_rate": 1.1309790953112081e-07, + "logits/chosen": -1.6815078258514404, + "logits/rejected": -1.6843570470809937, + "logps/chosen": -291.677490234375, + "logps/rejected": -315.4087829589844, + "loss": 0.3521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5687878727912903, + "rewards/margins": 3.07369065284729, + "rewards/rejected": -3.6424784660339355, + "step": 5404 + }, + { + "epoch": 0.63, + "learning_rate": 1.1306247785520256e-07, + "logits/chosen": -2.3073229789733887, + "logits/rejected": -2.3163983821868896, + "logps/chosen": -257.4685363769531, + "logps/rejected": -362.3294677734375, + "loss": 0.3787, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.429797887802124, + "rewards/margins": 3.7340869903564453, + "rewards/rejected": -5.163885116577148, + "step": 5405 + }, + { + "epoch": 0.63, + "learning_rate": 1.1302704617928428e-07, + "logits/chosen": -2.373509407043457, + "logits/rejected": -2.4976487159729004, + "logps/chosen": -139.86788940429688, + "logps/rejected": -168.26275634765625, + "loss": 0.3271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5603963136672974, + "rewards/margins": 1.969430923461914, + "rewards/rejected": -2.529827117919922, + "step": 5406 + }, + { + "epoch": 0.63, + "learning_rate": 1.12991614503366e-07, + "logits/chosen": -2.0953168869018555, + "logits/rejected": -2.235417366027832, + "logps/chosen": -419.89447021484375, + "logps/rejected": -318.45135498046875, + "loss": 0.291, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5837041139602661, + "rewards/margins": 2.368928909301758, + "rewards/rejected": -3.9526329040527344, + "step": 5407 + }, + { + "epoch": 0.63, + "learning_rate": 1.1295618282744773e-07, + "logits/chosen": -1.8292272090911865, + "logits/rejected": -1.7883622646331787, + "logps/chosen": -264.42822265625, + "logps/rejected": -367.25616455078125, + "loss": 0.3194, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0295547246932983, + "rewards/margins": 2.5533621311187744, + "rewards/rejected": -3.582916736602783, + "step": 5408 + }, + { + "epoch": 0.63, + "learning_rate": 1.1292075115152946e-07, + "logits/chosen": -2.1756949424743652, + "logits/rejected": -2.1853675842285156, + "logps/chosen": -153.75531005859375, + "logps/rejected": -197.07398986816406, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16343599557876587, + "rewards/margins": 3.074493408203125, + "rewards/rejected": -3.237929344177246, + "step": 5409 + }, + { + "epoch": 0.63, + "learning_rate": 1.1288531947561118e-07, + "logits/chosen": -1.6149758100509644, + "logits/rejected": -1.8730405569076538, + "logps/chosen": -320.65069580078125, + "logps/rejected": -290.70599365234375, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4795359969139099, + "rewards/margins": 2.247673511505127, + "rewards/rejected": -2.7272095680236816, + "step": 5410 + }, + { + "epoch": 0.63, + "learning_rate": 1.1284988779969293e-07, + "logits/chosen": -2.6699986457824707, + "logits/rejected": -2.7237260341644287, + "logps/chosen": -154.31924438476562, + "logps/rejected": -239.70318603515625, + "loss": 0.6771, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8081600069999695, + "rewards/margins": 1.3225332498550415, + "rewards/rejected": -2.1306934356689453, + "step": 5411 + }, + { + "epoch": 0.63, + "learning_rate": 1.1281445612377465e-07, + "logits/chosen": -2.8297481536865234, + "logits/rejected": -2.570286273956299, + "logps/chosen": -394.79156494140625, + "logps/rejected": -231.51202392578125, + "loss": 0.3715, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9974328279495239, + "rewards/margins": 2.729815721511841, + "rewards/rejected": -3.7272486686706543, + "step": 5412 + }, + { + "epoch": 0.63, + "learning_rate": 1.1277902444785638e-07, + "logits/chosen": -2.1524434089660645, + "logits/rejected": -2.500807762145996, + "logps/chosen": -462.39031982421875, + "logps/rejected": -333.8258972167969, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07131797075271606, + "rewards/margins": 3.22157883644104, + "rewards/rejected": -3.150261163711548, + "step": 5413 + }, + { + "epoch": 0.63, + "learning_rate": 1.1274359277193811e-07, + "logits/chosen": -2.469902276992798, + "logits/rejected": -2.6426360607147217, + "logps/chosen": -257.4925842285156, + "logps/rejected": -265.8251647949219, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9830567240715027, + "rewards/margins": 3.1951873302459717, + "rewards/rejected": -4.178244113922119, + "step": 5414 + }, + { + "epoch": 0.63, + "learning_rate": 1.1270816109601983e-07, + "logits/chosen": -2.209501266479492, + "logits/rejected": -2.2544641494750977, + "logps/chosen": -356.9164733886719, + "logps/rejected": -228.82635498046875, + "loss": 0.4904, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6333261728286743, + "rewards/margins": 1.3822753429412842, + "rewards/rejected": -3.015601396560669, + "step": 5415 + }, + { + "epoch": 0.63, + "learning_rate": 1.1267272942010156e-07, + "logits/chosen": -2.226491689682007, + "logits/rejected": -2.109182596206665, + "logps/chosen": -495.8923034667969, + "logps/rejected": -380.00665283203125, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.036454200744629, + "rewards/margins": 2.5797996520996094, + "rewards/rejected": -3.6162538528442383, + "step": 5416 + }, + { + "epoch": 0.63, + "learning_rate": 1.126372977441833e-07, + "logits/chosen": -2.778865098953247, + "logits/rejected": -2.6349055767059326, + "logps/chosen": -369.8677978515625, + "logps/rejected": -348.53680419921875, + "loss": 0.2531, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1774243712425232, + "rewards/margins": 2.1286141872406006, + "rewards/rejected": -1.9511901140213013, + "step": 5417 + }, + { + "epoch": 0.63, + "learning_rate": 1.1260186606826503e-07, + "logits/chosen": -2.0492913722991943, + "logits/rejected": -2.1439034938812256, + "logps/chosen": -493.8384094238281, + "logps/rejected": -403.1107177734375, + "loss": 0.109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3148505985736847, + "rewards/margins": 3.4577407836914062, + "rewards/rejected": -3.7725913524627686, + "step": 5418 + }, + { + "epoch": 0.63, + "learning_rate": 1.1256643439234675e-07, + "logits/chosen": -2.8936779499053955, + "logits/rejected": -2.8470215797424316, + "logps/chosen": -130.54893493652344, + "logps/rejected": -114.22411346435547, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03555231913924217, + "rewards/margins": 3.0255603790283203, + "rewards/rejected": -3.061112403869629, + "step": 5419 + }, + { + "epoch": 0.63, + "learning_rate": 1.1253100271642848e-07, + "logits/chosen": -2.0061471462249756, + "logits/rejected": -2.160956859588623, + "logps/chosen": -256.5695495605469, + "logps/rejected": -172.8482666015625, + "loss": 0.7687, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8914706110954285, + "rewards/margins": 0.7966321706771851, + "rewards/rejected": -1.6881027221679688, + "step": 5420 + }, + { + "epoch": 0.63, + "learning_rate": 1.1249557104051021e-07, + "logits/chosen": -2.3844470977783203, + "logits/rejected": -2.846339225769043, + "logps/chosen": -342.1619567871094, + "logps/rejected": -225.065185546875, + "loss": 0.3459, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.359169065952301, + "rewards/margins": 2.478038787841797, + "rewards/rejected": -2.837207794189453, + "step": 5421 + }, + { + "epoch": 0.63, + "learning_rate": 1.1246013936459193e-07, + "logits/chosen": -2.83151912689209, + "logits/rejected": -2.6686532497406006, + "logps/chosen": -191.8907928466797, + "logps/rejected": -158.0035400390625, + "loss": 0.6158, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5885133147239685, + "rewards/margins": 1.7286145687103271, + "rewards/rejected": -2.3171279430389404, + "step": 5422 + }, + { + "epoch": 0.63, + "learning_rate": 1.1242470768867368e-07, + "logits/chosen": -2.2413179874420166, + "logits/rejected": -2.761350393295288, + "logps/chosen": -342.164306640625, + "logps/rejected": -247.7933349609375, + "loss": 0.6766, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2648792266845703, + "rewards/margins": 1.9464848041534424, + "rewards/rejected": -3.2113640308380127, + "step": 5423 + }, + { + "epoch": 0.63, + "learning_rate": 1.123892760127554e-07, + "logits/chosen": -1.7190141677856445, + "logits/rejected": -1.9828202724456787, + "logps/chosen": -363.981201171875, + "logps/rejected": -188.7792510986328, + "loss": 0.5835, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7304027080535889, + "rewards/margins": 0.7001680135726929, + "rewards/rejected": -2.430570602416992, + "step": 5424 + }, + { + "epoch": 0.63, + "learning_rate": 1.1235384433683712e-07, + "logits/chosen": -1.874596357345581, + "logits/rejected": -2.2107901573181152, + "logps/chosen": -298.2101745605469, + "logps/rejected": -197.6917724609375, + "loss": 0.3815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7896029353141785, + "rewards/margins": 1.0445470809936523, + "rewards/rejected": -1.8341500759124756, + "step": 5425 + }, + { + "epoch": 0.63, + "learning_rate": 1.1231841266091886e-07, + "logits/chosen": -2.7808456420898438, + "logits/rejected": -2.778216600418091, + "logps/chosen": -176.84768676757812, + "logps/rejected": -347.4643249511719, + "loss": 0.3397, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0814919471740723, + "rewards/margins": 2.9159164428710938, + "rewards/rejected": -3.997408390045166, + "step": 5426 + }, + { + "epoch": 0.63, + "learning_rate": 1.1228298098500058e-07, + "logits/chosen": -2.180626392364502, + "logits/rejected": -2.1392223834991455, + "logps/chosen": -284.5223083496094, + "logps/rejected": -383.3424987792969, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9218234419822693, + "rewards/margins": 1.339971899986267, + "rewards/rejected": -2.2617955207824707, + "step": 5427 + }, + { + "epoch": 0.63, + "learning_rate": 1.122475493090823e-07, + "logits/chosen": -2.175630569458008, + "logits/rejected": -2.191176652908325, + "logps/chosen": -627.421142578125, + "logps/rejected": -517.9371948242188, + "loss": 0.5195, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9638887643814087, + "rewards/margins": 0.7159367799758911, + "rewards/rejected": -1.6798255443572998, + "step": 5428 + }, + { + "epoch": 0.63, + "learning_rate": 1.1221211763316405e-07, + "logits/chosen": -2.5000226497650146, + "logits/rejected": -2.5190014839172363, + "logps/chosen": -336.8921813964844, + "logps/rejected": -275.81488037109375, + "loss": 0.6511, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1737866401672363, + "rewards/margins": 0.8599332571029663, + "rewards/rejected": -2.033719778060913, + "step": 5429 + }, + { + "epoch": 0.63, + "learning_rate": 1.1217668595724577e-07, + "logits/chosen": -1.9417145252227783, + "logits/rejected": -2.311255931854248, + "logps/chosen": -347.3506774902344, + "logps/rejected": -291.76861572265625, + "loss": 0.9648, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4313206672668457, + "rewards/margins": 0.6953222751617432, + "rewards/rejected": -2.126643180847168, + "step": 5430 + }, + { + "epoch": 0.63, + "learning_rate": 1.1214125428132751e-07, + "logits/chosen": -2.7482590675354004, + "logits/rejected": -2.6745455265045166, + "logps/chosen": -244.3821258544922, + "logps/rejected": -247.88552856445312, + "loss": 0.7947, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0783932209014893, + "rewards/margins": 0.59649258852005, + "rewards/rejected": -2.6748857498168945, + "step": 5431 + }, + { + "epoch": 0.63, + "learning_rate": 1.1210582260540923e-07, + "logits/chosen": -2.320410966873169, + "logits/rejected": -2.2884135246276855, + "logps/chosen": -286.5833740234375, + "logps/rejected": -294.120849609375, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4949522614479065, + "rewards/margins": 3.1444108486175537, + "rewards/rejected": -3.6393628120422363, + "step": 5432 + }, + { + "epoch": 0.63, + "learning_rate": 1.1207039092949095e-07, + "logits/chosen": -1.7404357194900513, + "logits/rejected": -2.2082605361938477, + "logps/chosen": -524.9677734375, + "logps/rejected": -310.6058349609375, + "loss": 0.3667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4941253066062927, + "rewards/margins": 1.6549936532974243, + "rewards/rejected": -2.1491191387176514, + "step": 5433 + }, + { + "epoch": 0.63, + "learning_rate": 1.1203495925357269e-07, + "logits/chosen": -2.3483726978302, + "logits/rejected": -2.4620578289031982, + "logps/chosen": -321.5094909667969, + "logps/rejected": -308.6068420410156, + "loss": 0.4426, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0267605781555176, + "rewards/margins": 1.8645309209823608, + "rewards/rejected": -2.891291379928589, + "step": 5434 + }, + { + "epoch": 0.63, + "learning_rate": 1.1199952757765442e-07, + "logits/chosen": -2.545304536819458, + "logits/rejected": -2.7126946449279785, + "logps/chosen": -222.67269897460938, + "logps/rejected": -187.9613800048828, + "loss": 0.4663, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3508564233779907, + "rewards/margins": 1.007794737815857, + "rewards/rejected": -2.3586511611938477, + "step": 5435 + }, + { + "epoch": 0.63, + "learning_rate": 1.1196409590173614e-07, + "logits/chosen": -2.379878520965576, + "logits/rejected": -2.296349048614502, + "logps/chosen": -228.67666625976562, + "logps/rejected": -329.12542724609375, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35086387395858765, + "rewards/margins": 2.5683939456939697, + "rewards/rejected": -2.919257640838623, + "step": 5436 + }, + { + "epoch": 0.63, + "learning_rate": 1.1192866422581788e-07, + "logits/chosen": -2.834955930709839, + "logits/rejected": -2.7682485580444336, + "logps/chosen": -217.00167846679688, + "logps/rejected": -240.8406219482422, + "loss": 0.2707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47074994444847107, + "rewards/margins": 2.787675380706787, + "rewards/rejected": -3.258425235748291, + "step": 5437 + }, + { + "epoch": 0.63, + "learning_rate": 1.118932325498996e-07, + "logits/chosen": -2.4867069721221924, + "logits/rejected": -2.7169923782348633, + "logps/chosen": -194.88238525390625, + "logps/rejected": -220.4710693359375, + "loss": 0.3502, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4139651358127594, + "rewards/margins": 2.512171983718872, + "rewards/rejected": -2.9261372089385986, + "step": 5438 + }, + { + "epoch": 0.63, + "learning_rate": 1.1185780087398132e-07, + "logits/chosen": -2.7698581218719482, + "logits/rejected": -2.8102030754089355, + "logps/chosen": -82.76570129394531, + "logps/rejected": -112.25534057617188, + "loss": 0.4989, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0793108940124512, + "rewards/margins": 1.9017988443374634, + "rewards/rejected": -2.981109857559204, + "step": 5439 + }, + { + "epoch": 0.63, + "learning_rate": 1.1182236919806307e-07, + "logits/chosen": -1.9212427139282227, + "logits/rejected": -2.1138436794281006, + "logps/chosen": -346.54022216796875, + "logps/rejected": -307.0097351074219, + "loss": 0.2654, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5529037714004517, + "rewards/margins": 3.094998359680176, + "rewards/rejected": -3.647902011871338, + "step": 5440 + }, + { + "epoch": 0.63, + "learning_rate": 1.117869375221448e-07, + "logits/chosen": -2.2420120239257812, + "logits/rejected": -2.269819974899292, + "logps/chosen": -265.33056640625, + "logps/rejected": -277.6573486328125, + "loss": 0.3977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8846417665481567, + "rewards/margins": 1.7760423421859741, + "rewards/rejected": -2.660684108734131, + "step": 5441 + }, + { + "epoch": 0.63, + "learning_rate": 1.1175150584622652e-07, + "logits/chosen": -2.556025743484497, + "logits/rejected": -2.637547016143799, + "logps/chosen": -253.7698516845703, + "logps/rejected": -300.7780456542969, + "loss": 0.2951, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7236975431442261, + "rewards/margins": 2.6017839908599854, + "rewards/rejected": -3.325481414794922, + "step": 5442 + }, + { + "epoch": 0.63, + "learning_rate": 1.1171607417030825e-07, + "logits/chosen": -2.074341058731079, + "logits/rejected": -2.234650135040283, + "logps/chosen": -247.71060180664062, + "logps/rejected": -355.0132751464844, + "loss": 0.6224, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0118634700775146, + "rewards/margins": 0.9492691159248352, + "rewards/rejected": -1.9611326456069946, + "step": 5443 + }, + { + "epoch": 0.63, + "learning_rate": 1.1168064249438997e-07, + "logits/chosen": -2.4312615394592285, + "logits/rejected": -2.1841607093811035, + "logps/chosen": -279.50384521484375, + "logps/rejected": -262.96697998046875, + "loss": 0.5533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.513006329536438, + "rewards/margins": 2.134632110595703, + "rewards/rejected": -3.6476383209228516, + "step": 5444 + }, + { + "epoch": 0.63, + "learning_rate": 1.116452108184717e-07, + "logits/chosen": -1.9913761615753174, + "logits/rejected": -1.974186897277832, + "logps/chosen": -248.65628051757812, + "logps/rejected": -302.9364318847656, + "loss": 0.6167, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9924798011779785, + "rewards/margins": 0.5923253893852234, + "rewards/rejected": -1.5848052501678467, + "step": 5445 + }, + { + "epoch": 0.63, + "learning_rate": 1.1160977914255345e-07, + "logits/chosen": -2.268012046813965, + "logits/rejected": -2.5050535202026367, + "logps/chosen": -206.32081604003906, + "logps/rejected": -157.01922607421875, + "loss": 2.6615, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.460832118988037, + "rewards/margins": -0.7061963081359863, + "rewards/rejected": -1.7546356916427612, + "step": 5446 + }, + { + "epoch": 0.63, + "learning_rate": 1.1157434746663517e-07, + "logits/chosen": -2.5190646648406982, + "logits/rejected": -2.529682159423828, + "logps/chosen": -272.3499450683594, + "logps/rejected": -266.38677978515625, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6423850655555725, + "rewards/margins": 3.420186996459961, + "rewards/rejected": -4.062572479248047, + "step": 5447 + }, + { + "epoch": 0.63, + "learning_rate": 1.115389157907169e-07, + "logits/chosen": -2.2268686294555664, + "logits/rejected": -2.3013153076171875, + "logps/chosen": -111.85850524902344, + "logps/rejected": -167.75538635253906, + "loss": 0.2708, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.83359295129776, + "rewards/margins": 2.4300522804260254, + "rewards/rejected": -3.2636451721191406, + "step": 5448 + }, + { + "epoch": 0.63, + "learning_rate": 1.1150348411479862e-07, + "logits/chosen": -2.4390482902526855, + "logits/rejected": -2.3662052154541016, + "logps/chosen": -425.13458251953125, + "logps/rejected": -272.8174743652344, + "loss": 0.6509, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.631458044052124, + "rewards/margins": 1.2138640880584717, + "rewards/rejected": -2.8453221321105957, + "step": 5449 + }, + { + "epoch": 0.63, + "learning_rate": 1.1146805243888035e-07, + "logits/chosen": -2.485001802444458, + "logits/rejected": -2.166154623031616, + "logps/chosen": -258.07470703125, + "logps/rejected": -368.3575744628906, + "loss": 0.4991, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0456717014312744, + "rewards/margins": 1.4731897115707397, + "rewards/rejected": -2.5188612937927246, + "step": 5450 + }, + { + "epoch": 0.63, + "learning_rate": 1.1143262076296208e-07, + "logits/chosen": -2.7795093059539795, + "logits/rejected": -2.6864068508148193, + "logps/chosen": -230.74053955078125, + "logps/rejected": -125.32540893554688, + "loss": 0.293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7749330997467041, + "rewards/margins": 1.314659595489502, + "rewards/rejected": -2.089592695236206, + "step": 5451 + }, + { + "epoch": 0.63, + "learning_rate": 1.1139718908704382e-07, + "logits/chosen": -2.4311282634735107, + "logits/rejected": -2.4431800842285156, + "logps/chosen": -249.26010131835938, + "logps/rejected": -240.43768310546875, + "loss": 0.4141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9348114132881165, + "rewards/margins": 3.031147003173828, + "rewards/rejected": -3.9659583568573, + "step": 5452 + }, + { + "epoch": 0.63, + "learning_rate": 1.1136175741112554e-07, + "logits/chosen": -1.9503041505813599, + "logits/rejected": -1.8971867561340332, + "logps/chosen": -461.7470703125, + "logps/rejected": -529.012451171875, + "loss": 0.4095, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.211175560951233, + "rewards/margins": 2.6800379753112793, + "rewards/rejected": -3.8912134170532227, + "step": 5453 + }, + { + "epoch": 0.63, + "learning_rate": 1.1132632573520728e-07, + "logits/chosen": -1.9389369487762451, + "logits/rejected": -2.486764430999756, + "logps/chosen": -748.1295166015625, + "logps/rejected": -273.95941162109375, + "loss": 0.3546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7899062633514404, + "rewards/margins": 1.5206000804901123, + "rewards/rejected": -2.3105063438415527, + "step": 5454 + }, + { + "epoch": 0.63, + "learning_rate": 1.11290894059289e-07, + "logits/chosen": -2.1409292221069336, + "logits/rejected": -2.108003616333008, + "logps/chosen": -159.531494140625, + "logps/rejected": -155.24563598632812, + "loss": 0.1981, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7198622226715088, + "rewards/margins": 2.275578737258911, + "rewards/rejected": -2.99544095993042, + "step": 5455 + }, + { + "epoch": 0.63, + "learning_rate": 1.1125546238337072e-07, + "logits/chosen": -2.3768603801727295, + "logits/rejected": -2.7208669185638428, + "logps/chosen": -353.31585693359375, + "logps/rejected": -333.7634582519531, + "loss": 0.6581, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1247022151947021, + "rewards/margins": 2.156001091003418, + "rewards/rejected": -3.280703544616699, + "step": 5456 + }, + { + "epoch": 0.63, + "learning_rate": 1.1122003070745245e-07, + "logits/chosen": -2.0702431201934814, + "logits/rejected": -1.9071733951568604, + "logps/chosen": -247.37591552734375, + "logps/rejected": -248.1339111328125, + "loss": 0.1469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3598642647266388, + "rewards/margins": 2.4316556453704834, + "rewards/rejected": -2.7915198802948, + "step": 5457 + }, + { + "epoch": 0.63, + "learning_rate": 1.1118459903153419e-07, + "logits/chosen": -2.339940071105957, + "logits/rejected": -2.6672112941741943, + "logps/chosen": -303.1612548828125, + "logps/rejected": -202.21490478515625, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3011424541473389, + "rewards/margins": 1.1551721096038818, + "rewards/rejected": -2.4563148021698, + "step": 5458 + }, + { + "epoch": 0.64, + "learning_rate": 1.1114916735561593e-07, + "logits/chosen": -2.4008448123931885, + "logits/rejected": -2.3614091873168945, + "logps/chosen": -253.9644775390625, + "logps/rejected": -332.78753662109375, + "loss": 1.6274, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.65031361579895, + "rewards/margins": 0.7148020267486572, + "rewards/rejected": -3.365115165710449, + "step": 5459 + }, + { + "epoch": 0.64, + "learning_rate": 1.1111373567969765e-07, + "logits/chosen": -2.3067145347595215, + "logits/rejected": -2.3615288734436035, + "logps/chosen": -308.28607177734375, + "logps/rejected": -289.49456787109375, + "loss": 0.8091, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5115925073623657, + "rewards/margins": 1.215430736541748, + "rewards/rejected": -1.7270233631134033, + "step": 5460 + }, + { + "epoch": 0.64, + "learning_rate": 1.1107830400377937e-07, + "logits/chosen": -2.474803924560547, + "logits/rejected": -2.5771422386169434, + "logps/chosen": -356.1029357910156, + "logps/rejected": -362.7196350097656, + "loss": 0.44, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3983619213104248, + "rewards/margins": 1.8603885173797607, + "rewards/rejected": -3.2587504386901855, + "step": 5461 + }, + { + "epoch": 0.64, + "learning_rate": 1.1104287232786109e-07, + "logits/chosen": -1.9505131244659424, + "logits/rejected": -2.1550159454345703, + "logps/chosen": -256.8675231933594, + "logps/rejected": -248.46829223632812, + "loss": 0.4941, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4862803518772125, + "rewards/margins": 1.7781143188476562, + "rewards/rejected": -2.264394760131836, + "step": 5462 + }, + { + "epoch": 0.64, + "learning_rate": 1.1100744065194283e-07, + "logits/chosen": -2.3333632946014404, + "logits/rejected": -2.093876361846924, + "logps/chosen": -229.13970947265625, + "logps/rejected": -271.5419006347656, + "loss": 0.5177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.908474862575531, + "rewards/margins": 1.9225562810897827, + "rewards/rejected": -2.831031084060669, + "step": 5463 + }, + { + "epoch": 0.64, + "learning_rate": 1.1097200897602456e-07, + "logits/chosen": -2.348353624343872, + "logits/rejected": -2.4400272369384766, + "logps/chosen": -373.0672912597656, + "logps/rejected": -272.0811462402344, + "loss": 0.213, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6249352693557739, + "rewards/margins": 2.5510406494140625, + "rewards/rejected": -3.175976037979126, + "step": 5464 + }, + { + "epoch": 0.64, + "learning_rate": 1.109365773001063e-07, + "logits/chosen": -2.973336696624756, + "logits/rejected": -2.977463483810425, + "logps/chosen": -178.27024841308594, + "logps/rejected": -267.38201904296875, + "loss": 0.2124, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5851482152938843, + "rewards/margins": 3.308651924133301, + "rewards/rejected": -3.8938000202178955, + "step": 5465 + }, + { + "epoch": 0.64, + "learning_rate": 1.1090114562418802e-07, + "logits/chosen": -2.819924831390381, + "logits/rejected": -2.825700283050537, + "logps/chosen": -264.49395751953125, + "logps/rejected": -249.4871368408203, + "loss": 0.4278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6320321559906006, + "rewards/margins": 1.8009101152420044, + "rewards/rejected": -2.4329423904418945, + "step": 5466 + }, + { + "epoch": 0.64, + "learning_rate": 1.1086571394826974e-07, + "logits/chosen": -2.5752809047698975, + "logits/rejected": -2.620699882507324, + "logps/chosen": -195.54476928710938, + "logps/rejected": -200.12403869628906, + "loss": 0.209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6223514676094055, + "rewards/margins": 2.6813302040100098, + "rewards/rejected": -3.3036813735961914, + "step": 5467 + }, + { + "epoch": 0.64, + "learning_rate": 1.1083028227235148e-07, + "logits/chosen": -2.450965642929077, + "logits/rejected": -2.538991928100586, + "logps/chosen": -354.7512512207031, + "logps/rejected": -319.60125732421875, + "loss": 0.16, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3484196066856384, + "rewards/margins": 3.0538203716278076, + "rewards/rejected": -3.4022397994995117, + "step": 5468 + }, + { + "epoch": 0.64, + "learning_rate": 1.107948505964332e-07, + "logits/chosen": -2.658229351043701, + "logits/rejected": -2.448155641555786, + "logps/chosen": -263.8769226074219, + "logps/rejected": -345.824462890625, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6639606952667236, + "rewards/margins": 2.4960074424743652, + "rewards/rejected": -3.159968376159668, + "step": 5469 + }, + { + "epoch": 0.64, + "learning_rate": 1.1075941892051494e-07, + "logits/chosen": -1.962981104850769, + "logits/rejected": -2.20139479637146, + "logps/chosen": -307.5494384765625, + "logps/rejected": -256.3520812988281, + "loss": 0.2275, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2471906840801239, + "rewards/margins": 2.6274847984313965, + "rewards/rejected": -2.8746752738952637, + "step": 5470 + }, + { + "epoch": 0.64, + "learning_rate": 1.1072398724459667e-07, + "logits/chosen": -1.9393223524093628, + "logits/rejected": -1.95176100730896, + "logps/chosen": -171.08279418945312, + "logps/rejected": -223.8411865234375, + "loss": 0.9538, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7351804971694946, + "rewards/margins": 0.5884295701980591, + "rewards/rejected": -2.323610305786133, + "step": 5471 + }, + { + "epoch": 0.64, + "learning_rate": 1.1068855556867839e-07, + "logits/chosen": -2.6580963134765625, + "logits/rejected": -2.387050151824951, + "logps/chosen": -153.36741638183594, + "logps/rejected": -328.6927490234375, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006263114511966705, + "rewards/margins": 4.561356544494629, + "rewards/rejected": -4.567620277404785, + "step": 5472 + }, + { + "epoch": 0.64, + "learning_rate": 1.1065312389276011e-07, + "logits/chosen": -2.169116497039795, + "logits/rejected": -2.5653042793273926, + "logps/chosen": -217.5720977783203, + "logps/rejected": -170.7113037109375, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6674585938453674, + "rewards/margins": 1.918187141418457, + "rewards/rejected": -2.5856456756591797, + "step": 5473 + }, + { + "epoch": 0.64, + "learning_rate": 1.1061769221684185e-07, + "logits/chosen": -2.4139952659606934, + "logits/rejected": -2.8692803382873535, + "logps/chosen": -392.4227294921875, + "logps/rejected": -234.67332458496094, + "loss": 0.3434, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9784570336341858, + "rewards/margins": 2.009451389312744, + "rewards/rejected": -2.987908363342285, + "step": 5474 + }, + { + "epoch": 0.64, + "learning_rate": 1.1058226054092359e-07, + "logits/chosen": -2.2656748294830322, + "logits/rejected": -2.707684278488159, + "logps/chosen": -326.50738525390625, + "logps/rejected": -197.89549255371094, + "loss": 0.2934, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.415968656539917, + "rewards/margins": 1.9923152923583984, + "rewards/rejected": -3.4082837104797363, + "step": 5475 + }, + { + "epoch": 0.64, + "learning_rate": 1.1054682886500532e-07, + "logits/chosen": -2.1116418838500977, + "logits/rejected": -2.0619378089904785, + "logps/chosen": -311.6429443359375, + "logps/rejected": -302.07000732421875, + "loss": 0.3852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8603106141090393, + "rewards/margins": 2.037646532058716, + "rewards/rejected": -2.8979570865631104, + "step": 5476 + }, + { + "epoch": 0.64, + "learning_rate": 1.1051139718908704e-07, + "logits/chosen": -2.168062210083008, + "logits/rejected": -2.0354886054992676, + "logps/chosen": -330.4161682128906, + "logps/rejected": -407.875, + "loss": 0.7688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16985741257667542, + "rewards/margins": 1.2124873399734497, + "rewards/rejected": -1.3823448419570923, + "step": 5477 + }, + { + "epoch": 0.64, + "learning_rate": 1.1047596551316876e-07, + "logits/chosen": -2.7135202884674072, + "logits/rejected": -2.845043659210205, + "logps/chosen": -367.636474609375, + "logps/rejected": -231.40121459960938, + "loss": 0.2022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6420729160308838, + "rewards/margins": 2.128307819366455, + "rewards/rejected": -2.770380973815918, + "step": 5478 + }, + { + "epoch": 0.64, + "learning_rate": 1.1044053383725049e-07, + "logits/chosen": -2.1597025394439697, + "logits/rejected": -2.4846441745758057, + "logps/chosen": -360.82147216796875, + "logps/rejected": -180.10342407226562, + "loss": 0.3401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4870985150337219, + "rewards/margins": 2.516366720199585, + "rewards/rejected": -3.003465175628662, + "step": 5479 + }, + { + "epoch": 0.64, + "learning_rate": 1.1040510216133222e-07, + "logits/chosen": -2.197957992553711, + "logits/rejected": -2.111999273300171, + "logps/chosen": -239.9806671142578, + "logps/rejected": -414.5565490722656, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01108504831790924, + "rewards/margins": 3.659945487976074, + "rewards/rejected": -3.6710305213928223, + "step": 5480 + }, + { + "epoch": 0.64, + "learning_rate": 1.1036967048541396e-07, + "logits/chosen": -2.006141185760498, + "logits/rejected": -2.2732014656066895, + "logps/chosen": -325.8533935546875, + "logps/rejected": -286.2337341308594, + "loss": 0.5854, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1232013702392578, + "rewards/margins": 2.184298038482666, + "rewards/rejected": -3.307499408721924, + "step": 5481 + }, + { + "epoch": 0.64, + "learning_rate": 1.1033423880949569e-07, + "logits/chosen": -1.9891605377197266, + "logits/rejected": -2.2760303020477295, + "logps/chosen": -313.83575439453125, + "logps/rejected": -288.521484375, + "loss": 0.7758, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.297821283340454, + "rewards/margins": 1.2551591396331787, + "rewards/rejected": -2.552980422973633, + "step": 5482 + }, + { + "epoch": 0.64, + "learning_rate": 1.1029880713357742e-07, + "logits/chosen": -2.361585855484009, + "logits/rejected": -2.5484514236450195, + "logps/chosen": -354.91094970703125, + "logps/rejected": -303.721435546875, + "loss": 0.1664, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5444213151931763, + "rewards/margins": 2.629948377609253, + "rewards/rejected": -4.174369812011719, + "step": 5483 + }, + { + "epoch": 0.64, + "learning_rate": 1.1026337545765914e-07, + "logits/chosen": -2.4451351165771484, + "logits/rejected": -2.2414422035217285, + "logps/chosen": -248.25531005859375, + "logps/rejected": -261.5409851074219, + "loss": 0.2349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44500476121902466, + "rewards/margins": 2.1578893661499023, + "rewards/rejected": -2.6028940677642822, + "step": 5484 + }, + { + "epoch": 0.64, + "learning_rate": 1.1022794378174087e-07, + "logits/chosen": -2.6161246299743652, + "logits/rejected": -2.595123291015625, + "logps/chosen": -253.56582641601562, + "logps/rejected": -354.81890869140625, + "loss": 0.2084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38251179456710815, + "rewards/margins": 3.4004383087158203, + "rewards/rejected": -3.782949924468994, + "step": 5485 + }, + { + "epoch": 0.64, + "learning_rate": 1.101925121058226e-07, + "logits/chosen": -2.165921449661255, + "logits/rejected": -2.4346415996551514, + "logps/chosen": -241.74301147460938, + "logps/rejected": -145.16175842285156, + "loss": 0.6967, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4942351579666138, + "rewards/margins": 1.2649438381195068, + "rewards/rejected": -2.75917911529541, + "step": 5486 + }, + { + "epoch": 0.64, + "learning_rate": 1.1015708042990433e-07, + "logits/chosen": -2.009830951690674, + "logits/rejected": -2.1575984954833984, + "logps/chosen": -333.8755187988281, + "logps/rejected": -221.09906005859375, + "loss": 0.6292, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1169664859771729, + "rewards/margins": 1.103528618812561, + "rewards/rejected": -2.2204952239990234, + "step": 5487 + }, + { + "epoch": 0.64, + "learning_rate": 1.1012164875398607e-07, + "logits/chosen": -2.66276216506958, + "logits/rejected": -2.749074697494507, + "logps/chosen": -345.73553466796875, + "logps/rejected": -308.97259521484375, + "loss": 0.2267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7621593475341797, + "rewards/margins": 3.758425235748291, + "rewards/rejected": -4.520584583282471, + "step": 5488 + }, + { + "epoch": 0.64, + "learning_rate": 1.1008621707806779e-07, + "logits/chosen": -1.7442905902862549, + "logits/rejected": -1.8624660968780518, + "logps/chosen": -242.27590942382812, + "logps/rejected": -298.80938720703125, + "loss": 0.2224, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4078303575515747, + "rewards/margins": 2.355605363845825, + "rewards/rejected": -2.7634356021881104, + "step": 5489 + }, + { + "epoch": 0.64, + "learning_rate": 1.1005078540214951e-07, + "logits/chosen": -2.168128728866577, + "logits/rejected": -1.9911104440689087, + "logps/chosen": -169.77000427246094, + "logps/rejected": -207.755615234375, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6974061131477356, + "rewards/margins": 1.2795287370681763, + "rewards/rejected": -1.976934790611267, + "step": 5490 + }, + { + "epoch": 0.64, + "learning_rate": 1.1001535372623125e-07, + "logits/chosen": -2.4600863456726074, + "logits/rejected": -2.760425329208374, + "logps/chosen": -251.51736450195312, + "logps/rejected": -187.92611694335938, + "loss": 0.4639, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.474007487297058, + "rewards/margins": 2.3428795337677, + "rewards/rejected": -3.8168869018554688, + "step": 5491 + }, + { + "epoch": 0.64, + "learning_rate": 1.0997992205031297e-07, + "logits/chosen": -2.4894893169403076, + "logits/rejected": -2.423626661300659, + "logps/chosen": -302.7921447753906, + "logps/rejected": -195.44076538085938, + "loss": 0.3864, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5127599835395813, + "rewards/margins": 1.6179898977279663, + "rewards/rejected": -2.1307497024536133, + "step": 5492 + }, + { + "epoch": 0.64, + "learning_rate": 1.0994449037439472e-07, + "logits/chosen": -2.753323793411255, + "logits/rejected": -2.7221145629882812, + "logps/chosen": -224.5933837890625, + "logps/rejected": -203.54347229003906, + "loss": 0.1975, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3556295931339264, + "rewards/margins": 2.7632105350494385, + "rewards/rejected": -3.118839740753174, + "step": 5493 + }, + { + "epoch": 0.64, + "learning_rate": 1.0990905869847644e-07, + "logits/chosen": -1.559370756149292, + "logits/rejected": -1.891071081161499, + "logps/chosen": -416.7593994140625, + "logps/rejected": -322.3216552734375, + "loss": 1.4343, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.286785125732422, + "rewards/margins": 0.3305429220199585, + "rewards/rejected": -2.61732816696167, + "step": 5494 + }, + { + "epoch": 0.64, + "learning_rate": 1.0987362702255816e-07, + "logits/chosen": -2.3185677528381348, + "logits/rejected": -2.676105499267578, + "logps/chosen": -423.58221435546875, + "logps/rejected": -236.094482421875, + "loss": 0.1265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4550972580909729, + "rewards/margins": 3.681673049926758, + "rewards/rejected": -4.136770248413086, + "step": 5495 + }, + { + "epoch": 0.64, + "learning_rate": 1.098381953466399e-07, + "logits/chosen": -1.8339810371398926, + "logits/rejected": -1.78395676612854, + "logps/chosen": -229.42747497558594, + "logps/rejected": -330.4653015136719, + "loss": 0.173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8713713884353638, + "rewards/margins": 2.4712955951690674, + "rewards/rejected": -3.3426666259765625, + "step": 5496 + }, + { + "epoch": 0.64, + "learning_rate": 1.0980276367072162e-07, + "logits/chosen": -2.520328998565674, + "logits/rejected": -2.4316155910491943, + "logps/chosen": -264.7857971191406, + "logps/rejected": -253.3126220703125, + "loss": 0.567, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3359676599502563, + "rewards/margins": 0.8190702199935913, + "rewards/rejected": -2.1550378799438477, + "step": 5497 + }, + { + "epoch": 0.64, + "learning_rate": 1.0976733199480334e-07, + "logits/chosen": -2.2747983932495117, + "logits/rejected": -2.131091594696045, + "logps/chosen": -279.14404296875, + "logps/rejected": -353.19366455078125, + "loss": 0.3465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8522354364395142, + "rewards/margins": 1.7580828666687012, + "rewards/rejected": -2.610318422317505, + "step": 5498 + }, + { + "epoch": 0.64, + "learning_rate": 1.0973190031888509e-07, + "logits/chosen": -2.477996349334717, + "logits/rejected": -2.664045810699463, + "logps/chosen": -260.69122314453125, + "logps/rejected": -285.224853515625, + "loss": 0.3029, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8915267586708069, + "rewards/margins": 1.9391167163848877, + "rewards/rejected": -2.830643653869629, + "step": 5499 + }, + { + "epoch": 0.64, + "learning_rate": 1.0969646864296681e-07, + "logits/chosen": -1.5811165571212769, + "logits/rejected": -1.627776026725769, + "logps/chosen": -369.02520751953125, + "logps/rejected": -388.3089599609375, + "loss": 0.5241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.533074676990509, + "rewards/margins": 1.9415935277938843, + "rewards/rejected": -2.474668264389038, + "step": 5500 + }, + { + "epoch": 0.64, + "learning_rate": 1.0966103696704853e-07, + "logits/chosen": -1.6619644165039062, + "logits/rejected": -1.5948665142059326, + "logps/chosen": -167.68292236328125, + "logps/rejected": -220.64105224609375, + "loss": 0.2343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6588548421859741, + "rewards/margins": 2.0013203620910645, + "rewards/rejected": -2.660175085067749, + "step": 5501 + }, + { + "epoch": 0.64, + "learning_rate": 1.0962560529113027e-07, + "logits/chosen": -2.329218864440918, + "logits/rejected": -2.5369369983673096, + "logps/chosen": -324.39654541015625, + "logps/rejected": -259.3008728027344, + "loss": 0.1806, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3681390285491943, + "rewards/margins": 2.939256191253662, + "rewards/rejected": -4.307394981384277, + "step": 5502 + }, + { + "epoch": 0.64, + "learning_rate": 1.0959017361521199e-07, + "logits/chosen": -2.079803466796875, + "logits/rejected": -2.160726308822632, + "logps/chosen": -307.9873352050781, + "logps/rejected": -310.6032409667969, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1198577880859375, + "rewards/margins": 2.0183143615722656, + "rewards/rejected": -3.138171911239624, + "step": 5503 + }, + { + "epoch": 0.64, + "learning_rate": 1.0955474193929371e-07, + "logits/chosen": -2.4428975582122803, + "logits/rejected": -1.9963269233703613, + "logps/chosen": -289.301025390625, + "logps/rejected": -459.22137451171875, + "loss": 0.8773, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7280832529067993, + "rewards/margins": 1.1040599346160889, + "rewards/rejected": -1.8321431875228882, + "step": 5504 + }, + { + "epoch": 0.64, + "learning_rate": 1.0951931026337546e-07, + "logits/chosen": -2.336620569229126, + "logits/rejected": -2.431709051132202, + "logps/chosen": -169.52076721191406, + "logps/rejected": -162.01190185546875, + "loss": 0.4808, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5360898971557617, + "rewards/margins": 2.1919946670532227, + "rewards/rejected": -3.7280845642089844, + "step": 5505 + }, + { + "epoch": 0.64, + "learning_rate": 1.0948387858745718e-07, + "logits/chosen": -2.765146255493164, + "logits/rejected": -2.6556320190429688, + "logps/chosen": -176.542724609375, + "logps/rejected": -241.50125122070312, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8933104872703552, + "rewards/margins": 1.1431083679199219, + "rewards/rejected": -2.036418914794922, + "step": 5506 + }, + { + "epoch": 0.64, + "learning_rate": 1.094484469115389e-07, + "logits/chosen": -2.508908987045288, + "logits/rejected": -2.623159885406494, + "logps/chosen": -191.87985229492188, + "logps/rejected": -210.1908416748047, + "loss": 0.2431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5041604042053223, + "rewards/margins": 2.367405891418457, + "rewards/rejected": -2.8715660572052, + "step": 5507 + }, + { + "epoch": 0.64, + "learning_rate": 1.0941301523562064e-07, + "logits/chosen": -2.685666084289551, + "logits/rejected": -2.656187057495117, + "logps/chosen": -199.7700958251953, + "logps/rejected": -311.2677001953125, + "loss": 0.4329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6700700521469116, + "rewards/margins": 1.7948205471038818, + "rewards/rejected": -2.464890718460083, + "step": 5508 + }, + { + "epoch": 0.64, + "learning_rate": 1.0937758355970236e-07, + "logits/chosen": -2.3454654216766357, + "logits/rejected": -2.5005979537963867, + "logps/chosen": -351.37628173828125, + "logps/rejected": -314.9337463378906, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1744897365570068, + "rewards/margins": 3.7373228073120117, + "rewards/rejected": -4.911812782287598, + "step": 5509 + }, + { + "epoch": 0.64, + "learning_rate": 1.0934215188378411e-07, + "logits/chosen": -2.7421669960021973, + "logits/rejected": -2.9442861080169678, + "logps/chosen": -161.73841857910156, + "logps/rejected": -175.9002227783203, + "loss": 0.4284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32011324167251587, + "rewards/margins": 2.252490282058716, + "rewards/rejected": -2.572603225708008, + "step": 5510 + }, + { + "epoch": 0.64, + "learning_rate": 1.0930672020786583e-07, + "logits/chosen": -2.6186201572418213, + "logits/rejected": -2.59755277633667, + "logps/chosen": -214.5706787109375, + "logps/rejected": -286.5604248046875, + "loss": 0.3111, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6353821754455566, + "rewards/margins": 3.4341964721679688, + "rewards/rejected": -4.069578647613525, + "step": 5511 + }, + { + "epoch": 0.64, + "learning_rate": 1.0927128853194756e-07, + "logits/chosen": -2.8597218990325928, + "logits/rejected": -2.8215253353118896, + "logps/chosen": -176.03683471679688, + "logps/rejected": -236.8227996826172, + "loss": 0.7058, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8339489698410034, + "rewards/margins": 0.6430934071540833, + "rewards/rejected": -2.4770424365997314, + "step": 5512 + }, + { + "epoch": 0.64, + "learning_rate": 1.0923585685602929e-07, + "logits/chosen": -1.7799270153045654, + "logits/rejected": -1.611576795578003, + "logps/chosen": -309.53271484375, + "logps/rejected": -343.6754150390625, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0011677742004395, + "rewards/margins": 1.8267860412597656, + "rewards/rejected": -2.827953815460205, + "step": 5513 + }, + { + "epoch": 0.64, + "learning_rate": 1.0920042518011101e-07, + "logits/chosen": -2.5283117294311523, + "logits/rejected": -2.3851239681243896, + "logps/chosen": -298.6266784667969, + "logps/rejected": -329.6625671386719, + "loss": 0.0663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7362421751022339, + "rewards/margins": 3.9701457023620605, + "rewards/rejected": -4.706387519836426, + "step": 5514 + }, + { + "epoch": 0.64, + "learning_rate": 1.0916499350419274e-07, + "logits/chosen": -2.4814083576202393, + "logits/rejected": -2.399545431137085, + "logps/chosen": -168.21652221679688, + "logps/rejected": -174.70706176757812, + "loss": 0.3698, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8276681900024414, + "rewards/margins": 2.075510025024414, + "rewards/rejected": -2.9031782150268555, + "step": 5515 + }, + { + "epoch": 0.64, + "learning_rate": 1.0912956182827448e-07, + "logits/chosen": -2.420590400695801, + "logits/rejected": -2.348771572113037, + "logps/chosen": -312.38885498046875, + "logps/rejected": -185.67625427246094, + "loss": 0.3957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6957936882972717, + "rewards/margins": 1.8521244525909424, + "rewards/rejected": -2.5479180812835693, + "step": 5516 + }, + { + "epoch": 0.64, + "learning_rate": 1.090941301523562e-07, + "logits/chosen": -2.720383644104004, + "logits/rejected": -2.557812213897705, + "logps/chosen": -128.51434326171875, + "logps/rejected": -217.8095245361328, + "loss": 0.2455, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8978371024131775, + "rewards/margins": 1.972967505455017, + "rewards/rejected": -2.87080454826355, + "step": 5517 + }, + { + "epoch": 0.64, + "learning_rate": 1.0905869847643793e-07, + "logits/chosen": -2.7188265323638916, + "logits/rejected": -2.6899116039276123, + "logps/chosen": -129.9900360107422, + "logps/rejected": -234.42803955078125, + "loss": 0.652, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47055068612098694, + "rewards/margins": 2.3679323196411133, + "rewards/rejected": -2.8384828567504883, + "step": 5518 + }, + { + "epoch": 0.64, + "learning_rate": 1.0902326680051966e-07, + "logits/chosen": -2.5070927143096924, + "logits/rejected": -2.381716251373291, + "logps/chosen": -258.263671875, + "logps/rejected": -278.0553283691406, + "loss": 0.1947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5146371722221375, + "rewards/margins": 3.9579153060913086, + "rewards/rejected": -4.47255277633667, + "step": 5519 + }, + { + "epoch": 0.64, + "learning_rate": 1.0898783512460139e-07, + "logits/chosen": -1.9693636894226074, + "logits/rejected": -1.9643385410308838, + "logps/chosen": -245.49362182617188, + "logps/rejected": -286.5799560546875, + "loss": 0.5093, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8815093040466309, + "rewards/margins": 1.5759446620941162, + "rewards/rejected": -2.457453966140747, + "step": 5520 + }, + { + "epoch": 0.64, + "learning_rate": 1.0895240344868311e-07, + "logits/chosen": -2.1263818740844727, + "logits/rejected": -2.4805684089660645, + "logps/chosen": -274.5559387207031, + "logps/rejected": -207.94520568847656, + "loss": 0.4198, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1102595329284668, + "rewards/margins": 1.5282881259918213, + "rewards/rejected": -2.638547658920288, + "step": 5521 + }, + { + "epoch": 0.64, + "learning_rate": 1.0891697177276486e-07, + "logits/chosen": -2.340707302093506, + "logits/rejected": -2.7081639766693115, + "logps/chosen": -265.44415283203125, + "logps/rejected": -231.94821166992188, + "loss": 0.501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8569229245185852, + "rewards/margins": 0.7735512852668762, + "rewards/rejected": -1.6304740905761719, + "step": 5522 + }, + { + "epoch": 0.64, + "learning_rate": 1.0888154009684658e-07, + "logits/chosen": -2.3438587188720703, + "logits/rejected": -2.70676851272583, + "logps/chosen": -479.60821533203125, + "logps/rejected": -187.53616333007812, + "loss": 1.1443, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6542689800262451, + "rewards/margins": 0.4966229796409607, + "rewards/rejected": -2.1508922576904297, + "step": 5523 + }, + { + "epoch": 0.64, + "learning_rate": 1.088461084209283e-07, + "logits/chosen": -2.5530643463134766, + "logits/rejected": -2.318917989730835, + "logps/chosen": -314.6600646972656, + "logps/rejected": -309.53070068359375, + "loss": 0.1667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13424380123615265, + "rewards/margins": 2.321531295776367, + "rewards/rejected": -2.455775022506714, + "step": 5524 + }, + { + "epoch": 0.64, + "learning_rate": 1.0881067674501004e-07, + "logits/chosen": -2.5879530906677246, + "logits/rejected": -2.6974809169769287, + "logps/chosen": -168.13638305664062, + "logps/rejected": -320.28570556640625, + "loss": 0.2282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3991885483264923, + "rewards/margins": 2.07222318649292, + "rewards/rejected": -2.47141170501709, + "step": 5525 + }, + { + "epoch": 0.64, + "learning_rate": 1.0877524506909176e-07, + "logits/chosen": -1.8821018934249878, + "logits/rejected": -2.1345057487487793, + "logps/chosen": -296.1561279296875, + "logps/rejected": -163.584716796875, + "loss": 0.2191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03734889253973961, + "rewards/margins": 2.1962034702301025, + "rewards/rejected": -2.2335524559020996, + "step": 5526 + }, + { + "epoch": 0.64, + "learning_rate": 1.0873981339317348e-07, + "logits/chosen": -2.4681999683380127, + "logits/rejected": -2.197720527648926, + "logps/chosen": -231.4757080078125, + "logps/rejected": -347.0568542480469, + "loss": 0.2371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7732428908348083, + "rewards/margins": 2.0871329307556152, + "rewards/rejected": -2.8603758811950684, + "step": 5527 + }, + { + "epoch": 0.64, + "learning_rate": 1.0870438171725523e-07, + "logits/chosen": -2.2544362545013428, + "logits/rejected": -2.145930767059326, + "logps/chosen": -275.9822082519531, + "logps/rejected": -275.5411071777344, + "loss": 0.3039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9949970841407776, + "rewards/margins": 2.696363687515259, + "rewards/rejected": -3.6913604736328125, + "step": 5528 + }, + { + "epoch": 0.64, + "learning_rate": 1.0866895004133695e-07, + "logits/chosen": -2.8618671894073486, + "logits/rejected": -2.79007625579834, + "logps/chosen": -255.21202087402344, + "logps/rejected": -317.13995361328125, + "loss": 0.2349, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9057572484016418, + "rewards/margins": 2.5286638736724854, + "rewards/rejected": -3.4344210624694824, + "step": 5529 + }, + { + "epoch": 0.64, + "learning_rate": 1.0863351836541869e-07, + "logits/chosen": -2.897818088531494, + "logits/rejected": -2.580827474594116, + "logps/chosen": -371.13116455078125, + "logps/rejected": -256.207275390625, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7634882926940918, + "rewards/margins": 2.5742335319519043, + "rewards/rejected": -4.337721824645996, + "step": 5530 + }, + { + "epoch": 0.64, + "learning_rate": 1.0859808668950041e-07, + "logits/chosen": -1.9573509693145752, + "logits/rejected": -2.30708646774292, + "logps/chosen": -410.7255554199219, + "logps/rejected": -444.55596923828125, + "loss": 0.1752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7326204776763916, + "rewards/margins": 2.6319923400878906, + "rewards/rejected": -3.364612579345703, + "step": 5531 + }, + { + "epoch": 0.64, + "learning_rate": 1.0856265501358213e-07, + "logits/chosen": -2.579740524291992, + "logits/rejected": -2.6268200874328613, + "logps/chosen": -387.17431640625, + "logps/rejected": -264.77313232421875, + "loss": 0.2798, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6288034915924072, + "rewards/margins": 1.4032468795776367, + "rewards/rejected": -2.032050371170044, + "step": 5532 + }, + { + "epoch": 0.64, + "learning_rate": 1.0852722333766387e-07, + "logits/chosen": -2.310770273208618, + "logits/rejected": -2.297313690185547, + "logps/chosen": -239.90469360351562, + "logps/rejected": -194.91561889648438, + "loss": 0.4838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8780336380004883, + "rewards/margins": 1.2692618370056152, + "rewards/rejected": -2.1472954750061035, + "step": 5533 + }, + { + "epoch": 0.64, + "learning_rate": 1.084917916617456e-07, + "logits/chosen": -2.4069600105285645, + "logits/rejected": -2.3253746032714844, + "logps/chosen": -228.01458740234375, + "logps/rejected": -237.7639923095703, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6791322231292725, + "rewards/margins": 2.6979074478149414, + "rewards/rejected": -3.377039670944214, + "step": 5534 + }, + { + "epoch": 0.64, + "learning_rate": 1.0845635998582732e-07, + "logits/chosen": -2.0040555000305176, + "logits/rejected": -2.3793389797210693, + "logps/chosen": -536.1353759765625, + "logps/rejected": -227.88751220703125, + "loss": 0.3706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7483206987380981, + "rewards/margins": 1.6440987586975098, + "rewards/rejected": -2.3924198150634766, + "step": 5535 + }, + { + "epoch": 0.64, + "learning_rate": 1.0842092830990906e-07, + "logits/chosen": -2.166121006011963, + "logits/rejected": -2.568272590637207, + "logps/chosen": -304.8279724121094, + "logps/rejected": -256.71380615234375, + "loss": 0.6612, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9218889474868774, + "rewards/margins": 3.310692310333252, + "rewards/rejected": -4.23258113861084, + "step": 5536 + }, + { + "epoch": 0.64, + "learning_rate": 1.0838549663399078e-07, + "logits/chosen": -1.910620093345642, + "logits/rejected": -1.955139398574829, + "logps/chosen": -277.6504821777344, + "logps/rejected": -232.3689727783203, + "loss": 0.9325, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4458885192871094, + "rewards/margins": 0.35922497510910034, + "rewards/rejected": -2.8051135540008545, + "step": 5537 + }, + { + "epoch": 0.64, + "learning_rate": 1.083500649580725e-07, + "logits/chosen": -2.5261237621307373, + "logits/rejected": -2.7145235538482666, + "logps/chosen": -290.4201354980469, + "logps/rejected": -190.94093322753906, + "loss": 0.3771, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.00648832321167, + "rewards/margins": 1.67669677734375, + "rewards/rejected": -2.68318510055542, + "step": 5538 + }, + { + "epoch": 0.64, + "learning_rate": 1.0831463328215424e-07, + "logits/chosen": -1.835139274597168, + "logits/rejected": -1.7962489128112793, + "logps/chosen": -272.0028076171875, + "logps/rejected": -284.15087890625, + "loss": 0.7982, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8461220264434814, + "rewards/margins": 0.16179919242858887, + "rewards/rejected": -2.0079212188720703, + "step": 5539 + }, + { + "epoch": 0.64, + "learning_rate": 1.0827920160623597e-07, + "logits/chosen": -2.3012776374816895, + "logits/rejected": -2.394357204437256, + "logps/chosen": -134.8734893798828, + "logps/rejected": -180.5397186279297, + "loss": 0.6008, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.568005084991455, + "rewards/margins": 2.0915956497192383, + "rewards/rejected": -3.6596007347106934, + "step": 5540 + }, + { + "epoch": 0.64, + "learning_rate": 1.082437699303177e-07, + "logits/chosen": -2.5282986164093018, + "logits/rejected": -2.422687292098999, + "logps/chosen": -277.6493835449219, + "logps/rejected": -269.4349365234375, + "loss": 0.4091, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3406157493591309, + "rewards/margins": 1.5547951459884644, + "rewards/rejected": -2.8954107761383057, + "step": 5541 + }, + { + "epoch": 0.64, + "learning_rate": 1.0820833825439943e-07, + "logits/chosen": -2.6439990997314453, + "logits/rejected": -2.7426793575286865, + "logps/chosen": -372.94256591796875, + "logps/rejected": -306.2283630371094, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3622992932796478, + "rewards/margins": 3.6952764987945557, + "rewards/rejected": -4.057575702667236, + "step": 5542 + }, + { + "epoch": 0.64, + "learning_rate": 1.0817290657848115e-07, + "logits/chosen": -2.5075714588165283, + "logits/rejected": -2.7141528129577637, + "logps/chosen": -239.81329345703125, + "logps/rejected": -314.10968017578125, + "loss": 0.1632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1700737476348877, + "rewards/margins": 3.3333075046539307, + "rewards/rejected": -3.163233757019043, + "step": 5543 + }, + { + "epoch": 0.64, + "learning_rate": 1.0813747490256288e-07, + "logits/chosen": -2.215956211090088, + "logits/rejected": -2.1957297325134277, + "logps/chosen": -236.16946411132812, + "logps/rejected": -267.1487731933594, + "loss": 0.3835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9504519701004028, + "rewards/margins": 2.6554319858551025, + "rewards/rejected": -3.605884075164795, + "step": 5544 + }, + { + "epoch": 0.65, + "learning_rate": 1.0810204322664462e-07, + "logits/chosen": -2.5025620460510254, + "logits/rejected": -2.482163667678833, + "logps/chosen": -315.2698669433594, + "logps/rejected": -324.51513671875, + "loss": 0.3941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48086681962013245, + "rewards/margins": 2.882641553878784, + "rewards/rejected": -3.363508462905884, + "step": 5545 + }, + { + "epoch": 0.65, + "learning_rate": 1.0806661155072635e-07, + "logits/chosen": -2.67179536819458, + "logits/rejected": -2.3595356941223145, + "logps/chosen": -123.38778686523438, + "logps/rejected": -217.46038818359375, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40293756127357483, + "rewards/margins": 2.5373082160949707, + "rewards/rejected": -2.9402456283569336, + "step": 5546 + }, + { + "epoch": 0.65, + "learning_rate": 1.0803117987480808e-07, + "logits/chosen": -2.1732001304626465, + "logits/rejected": -2.1965274810791016, + "logps/chosen": -283.5172424316406, + "logps/rejected": -219.78060913085938, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1102187633514404, + "rewards/margins": 2.165741205215454, + "rewards/rejected": -3.2759599685668945, + "step": 5547 + }, + { + "epoch": 0.65, + "learning_rate": 1.079957481988898e-07, + "logits/chosen": -2.053701639175415, + "logits/rejected": -2.184518814086914, + "logps/chosen": -302.4637756347656, + "logps/rejected": -248.36785888671875, + "loss": 0.1723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4212523102760315, + "rewards/margins": 2.355085849761963, + "rewards/rejected": -2.7763381004333496, + "step": 5548 + }, + { + "epoch": 0.65, + "learning_rate": 1.0796031652297153e-07, + "logits/chosen": -2.3230438232421875, + "logits/rejected": -2.2971608638763428, + "logps/chosen": -327.668212890625, + "logps/rejected": -267.6793518066406, + "loss": 0.4205, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.112288236618042, + "rewards/margins": 1.5340354442596436, + "rewards/rejected": -2.6463236808776855, + "step": 5549 + }, + { + "epoch": 0.65, + "learning_rate": 1.0792488484705326e-07, + "logits/chosen": -1.7293477058410645, + "logits/rejected": -1.8442070484161377, + "logps/chosen": -251.14048767089844, + "logps/rejected": -240.60621643066406, + "loss": 0.3003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6201245188713074, + "rewards/margins": 1.137850284576416, + "rewards/rejected": -1.7579747438430786, + "step": 5550 + }, + { + "epoch": 0.65, + "learning_rate": 1.07889453171135e-07, + "logits/chosen": -2.431334972381592, + "logits/rejected": -2.274172782897949, + "logps/chosen": -257.6517028808594, + "logps/rejected": -229.78927612304688, + "loss": 0.3421, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.828568696975708, + "rewards/margins": 2.026825189590454, + "rewards/rejected": -2.855393886566162, + "step": 5551 + }, + { + "epoch": 0.65, + "learning_rate": 1.0785402149521672e-07, + "logits/chosen": -2.6414785385131836, + "logits/rejected": -2.2931535243988037, + "logps/chosen": -279.06451416015625, + "logps/rejected": -305.9254150390625, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16477230191230774, + "rewards/margins": 2.779637336730957, + "rewards/rejected": -2.9444096088409424, + "step": 5552 + }, + { + "epoch": 0.65, + "learning_rate": 1.0781858981929845e-07, + "logits/chosen": -2.275362253189087, + "logits/rejected": -2.1211600303649902, + "logps/chosen": -176.54620361328125, + "logps/rejected": -334.2018737792969, + "loss": 0.1474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8799606561660767, + "rewards/margins": 3.5989603996276855, + "rewards/rejected": -4.478921413421631, + "step": 5553 + }, + { + "epoch": 0.65, + "learning_rate": 1.0778315814338018e-07, + "logits/chosen": -1.9199397563934326, + "logits/rejected": -2.176997184753418, + "logps/chosen": -308.8395080566406, + "logps/rejected": -233.494140625, + "loss": 0.1672, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05099180340766907, + "rewards/margins": 3.0103812217712402, + "rewards/rejected": -2.9593894481658936, + "step": 5554 + }, + { + "epoch": 0.65, + "learning_rate": 1.077477264674619e-07, + "logits/chosen": -2.470576047897339, + "logits/rejected": -2.451507091522217, + "logps/chosen": -290.0706481933594, + "logps/rejected": -305.48089599609375, + "loss": 0.1315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9675343036651611, + "rewards/margins": 3.3533926010131836, + "rewards/rejected": -4.320926666259766, + "step": 5555 + }, + { + "epoch": 0.65, + "learning_rate": 1.0771229479154363e-07, + "logits/chosen": -2.6342782974243164, + "logits/rejected": -2.3942174911499023, + "logps/chosen": -346.26416015625, + "logps/rejected": -386.4808654785156, + "loss": 0.3061, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6556344628334045, + "rewards/margins": 3.6348209381103516, + "rewards/rejected": -4.2904558181762695, + "step": 5556 + }, + { + "epoch": 0.65, + "learning_rate": 1.0767686311562537e-07, + "logits/chosen": -2.1717352867126465, + "logits/rejected": -2.5242092609405518, + "logps/chosen": -250.3701934814453, + "logps/rejected": -202.9127197265625, + "loss": 0.3293, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1744482517242432, + "rewards/margins": 1.5571682453155518, + "rewards/rejected": -2.731616497039795, + "step": 5557 + }, + { + "epoch": 0.65, + "learning_rate": 1.076414314397071e-07, + "logits/chosen": -2.5993642807006836, + "logits/rejected": -2.5876288414001465, + "logps/chosen": -411.9694519042969, + "logps/rejected": -349.6552429199219, + "loss": 0.2406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0939135551452637, + "rewards/margins": 2.4891908168792725, + "rewards/rejected": -3.583104133605957, + "step": 5558 + }, + { + "epoch": 0.65, + "learning_rate": 1.0760599976378883e-07, + "logits/chosen": -2.6959004402160645, + "logits/rejected": -2.577112913131714, + "logps/chosen": -457.5589599609375, + "logps/rejected": -270.0552673339844, + "loss": 0.273, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7154108285903931, + "rewards/margins": 3.0198984146118164, + "rewards/rejected": -3.735309362411499, + "step": 5559 + }, + { + "epoch": 0.65, + "learning_rate": 1.0757056808787055e-07, + "logits/chosen": -2.3742010593414307, + "logits/rejected": -2.4471824169158936, + "logps/chosen": -218.62503051757812, + "logps/rejected": -210.44580078125, + "loss": 0.4101, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8074192404747009, + "rewards/margins": 2.1079394817352295, + "rewards/rejected": -2.915358781814575, + "step": 5560 + }, + { + "epoch": 0.65, + "learning_rate": 1.0753513641195227e-07, + "logits/chosen": -2.5182600021362305, + "logits/rejected": -2.4238173961639404, + "logps/chosen": -301.5669250488281, + "logps/rejected": -195.61203002929688, + "loss": 0.6395, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1553606986999512, + "rewards/margins": 1.823959469795227, + "rewards/rejected": -2.9793200492858887, + "step": 5561 + }, + { + "epoch": 0.65, + "learning_rate": 1.07499704736034e-07, + "logits/chosen": -2.5983095169067383, + "logits/rejected": -2.600515127182007, + "logps/chosen": -211.55308532714844, + "logps/rejected": -214.12158203125, + "loss": 0.2764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6153090000152588, + "rewards/margins": 1.7521321773529053, + "rewards/rejected": -2.367441177368164, + "step": 5562 + }, + { + "epoch": 0.65, + "learning_rate": 1.0746427306011574e-07, + "logits/chosen": -1.9259390830993652, + "logits/rejected": -2.2441442012786865, + "logps/chosen": -401.49371337890625, + "logps/rejected": -383.50909423828125, + "loss": 0.689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8266350626945496, + "rewards/margins": 1.732300043106079, + "rewards/rejected": -2.5589349269866943, + "step": 5563 + }, + { + "epoch": 0.65, + "learning_rate": 1.0742884138419748e-07, + "logits/chosen": -1.953871488571167, + "logits/rejected": -2.317373275756836, + "logps/chosen": -396.308349609375, + "logps/rejected": -194.0185546875, + "loss": 0.2293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5964360237121582, + "rewards/margins": 2.008596897125244, + "rewards/rejected": -2.6050329208374023, + "step": 5564 + }, + { + "epoch": 0.65, + "learning_rate": 1.073934097082792e-07, + "logits/chosen": -2.0684657096862793, + "logits/rejected": -2.177018642425537, + "logps/chosen": -434.3354187011719, + "logps/rejected": -389.63226318359375, + "loss": 0.6438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5199898481369019, + "rewards/margins": 0.9814404249191284, + "rewards/rejected": -1.5014302730560303, + "step": 5565 + }, + { + "epoch": 0.65, + "learning_rate": 1.0735797803236092e-07, + "logits/chosen": -1.5296730995178223, + "logits/rejected": -2.0231168270111084, + "logps/chosen": -394.19659423828125, + "logps/rejected": -184.48046875, + "loss": 0.7009, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4306572675704956, + "rewards/margins": 1.125707983970642, + "rewards/rejected": -2.556365489959717, + "step": 5566 + }, + { + "epoch": 0.65, + "learning_rate": 1.0732254635644266e-07, + "logits/chosen": -2.3657724857330322, + "logits/rejected": -2.442661762237549, + "logps/chosen": -139.80911254882812, + "logps/rejected": -130.12615966796875, + "loss": 0.7822, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8504955768585205, + "rewards/margins": 0.10454201698303223, + "rewards/rejected": -1.9550375938415527, + "step": 5567 + }, + { + "epoch": 0.65, + "learning_rate": 1.0728711468052438e-07, + "logits/chosen": -2.454427480697632, + "logits/rejected": -2.3742730617523193, + "logps/chosen": -180.02435302734375, + "logps/rejected": -240.14312744140625, + "loss": 0.3357, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17115499079227448, + "rewards/margins": 2.524588108062744, + "rewards/rejected": -2.6957430839538574, + "step": 5568 + }, + { + "epoch": 0.65, + "learning_rate": 1.0725168300460611e-07, + "logits/chosen": -1.8623799085617065, + "logits/rejected": -2.0532166957855225, + "logps/chosen": -504.10614013671875, + "logps/rejected": -468.05377197265625, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09231072664260864, + "rewards/margins": 1.7501569986343384, + "rewards/rejected": -1.6578463315963745, + "step": 5569 + }, + { + "epoch": 0.65, + "learning_rate": 1.0721625132868785e-07, + "logits/chosen": -1.8625051975250244, + "logits/rejected": -1.8379905223846436, + "logps/chosen": -299.1378479003906, + "logps/rejected": -348.2354736328125, + "loss": 0.3557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7667273879051208, + "rewards/margins": 1.4400935173034668, + "rewards/rejected": -2.2068209648132324, + "step": 5570 + }, + { + "epoch": 0.65, + "learning_rate": 1.0718081965276957e-07, + "logits/chosen": -1.5901904106140137, + "logits/rejected": -1.7126057147979736, + "logps/chosen": -490.5355224609375, + "logps/rejected": -422.37628173828125, + "loss": 0.4582, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3320876359939575, + "rewards/margins": 1.642423152923584, + "rewards/rejected": -2.974510908126831, + "step": 5571 + }, + { + "epoch": 0.65, + "learning_rate": 1.0714538797685129e-07, + "logits/chosen": -2.412890672683716, + "logits/rejected": -2.2661798000335693, + "logps/chosen": -331.3213806152344, + "logps/rejected": -352.2065734863281, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0036644116044044495, + "rewards/margins": 1.591478943824768, + "rewards/rejected": -1.5951433181762695, + "step": 5572 + }, + { + "epoch": 0.65, + "learning_rate": 1.0710995630093303e-07, + "logits/chosen": -2.6523828506469727, + "logits/rejected": -2.6323821544647217, + "logps/chosen": -191.44619750976562, + "logps/rejected": -283.21112060546875, + "loss": 0.6983, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2590867280960083, + "rewards/margins": 2.5970253944396973, + "rewards/rejected": -3.856112003326416, + "step": 5573 + }, + { + "epoch": 0.65, + "learning_rate": 1.0707452462501475e-07, + "logits/chosen": -2.4959301948547363, + "logits/rejected": -2.537555694580078, + "logps/chosen": -231.56277465820312, + "logps/rejected": -269.70928955078125, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9521153569221497, + "rewards/margins": 1.214499831199646, + "rewards/rejected": -2.1666150093078613, + "step": 5574 + }, + { + "epoch": 0.65, + "learning_rate": 1.070390929490965e-07, + "logits/chosen": -2.831625461578369, + "logits/rejected": -2.7964558601379395, + "logps/chosen": -126.67218780517578, + "logps/rejected": -111.9854965209961, + "loss": 0.3724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6167360544204712, + "rewards/margins": 1.2312123775482178, + "rewards/rejected": -1.8479483127593994, + "step": 5575 + }, + { + "epoch": 0.65, + "learning_rate": 1.0700366127317822e-07, + "logits/chosen": -2.536048412322998, + "logits/rejected": -2.7987852096557617, + "logps/chosen": -471.2641906738281, + "logps/rejected": -311.6507568359375, + "loss": 0.6253, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.280011773109436, + "rewards/margins": 1.086187720298767, + "rewards/rejected": -2.366199493408203, + "step": 5576 + }, + { + "epoch": 0.65, + "learning_rate": 1.0696822959725994e-07, + "logits/chosen": -2.625699043273926, + "logits/rejected": -2.8159613609313965, + "logps/chosen": -230.58999633789062, + "logps/rejected": -162.9471435546875, + "loss": 0.287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37150681018829346, + "rewards/margins": 1.4768213033676147, + "rewards/rejected": -1.8483282327651978, + "step": 5577 + }, + { + "epoch": 0.65, + "learning_rate": 1.0693279792134167e-07, + "logits/chosen": -2.564779043197632, + "logits/rejected": -2.591029167175293, + "logps/chosen": -234.09881591796875, + "logps/rejected": -225.9639434814453, + "loss": 0.248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6889725923538208, + "rewards/margins": 2.4834468364715576, + "rewards/rejected": -3.172419548034668, + "step": 5578 + }, + { + "epoch": 0.65, + "learning_rate": 1.068973662454234e-07, + "logits/chosen": -2.2685186862945557, + "logits/rejected": -2.6765265464782715, + "logps/chosen": -451.13800048828125, + "logps/rejected": -267.298828125, + "loss": 0.262, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47293537855148315, + "rewards/margins": 1.9983187913894653, + "rewards/rejected": -2.4712541103363037, + "step": 5579 + }, + { + "epoch": 0.65, + "learning_rate": 1.0686193456950514e-07, + "logits/chosen": -2.419560432434082, + "logits/rejected": -2.7063004970550537, + "logps/chosen": -326.1977233886719, + "logps/rejected": -292.1570129394531, + "loss": 0.3343, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3205970227718353, + "rewards/margins": 1.8505513668060303, + "rewards/rejected": -2.1711483001708984, + "step": 5580 + }, + { + "epoch": 0.65, + "learning_rate": 1.0682650289358687e-07, + "logits/chosen": -2.2289986610412598, + "logits/rejected": -2.3114376068115234, + "logps/chosen": -191.40354919433594, + "logps/rejected": -264.1790466308594, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6331544518470764, + "rewards/margins": 3.2315258979797363, + "rewards/rejected": -3.864680290222168, + "step": 5581 + }, + { + "epoch": 0.65, + "learning_rate": 1.067910712176686e-07, + "logits/chosen": -2.594210624694824, + "logits/rejected": -2.479752540588379, + "logps/chosen": -238.67538452148438, + "logps/rejected": -277.4839782714844, + "loss": 0.342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9305959939956665, + "rewards/margins": 2.8977179527282715, + "rewards/rejected": -3.8283140659332275, + "step": 5582 + }, + { + "epoch": 0.65, + "learning_rate": 1.0675563954175032e-07, + "logits/chosen": -2.764878273010254, + "logits/rejected": -2.8600916862487793, + "logps/chosen": -383.59796142578125, + "logps/rejected": -231.89883422851562, + "loss": 0.2092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27205565571784973, + "rewards/margins": 3.286550998687744, + "rewards/rejected": -3.5586066246032715, + "step": 5583 + }, + { + "epoch": 0.65, + "learning_rate": 1.0672020786583205e-07, + "logits/chosen": -2.5947067737579346, + "logits/rejected": -2.7330265045166016, + "logps/chosen": -274.2263488769531, + "logps/rejected": -209.89559936523438, + "loss": 1.1811, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.778704047203064, + "rewards/margins": 0.6674702167510986, + "rewards/rejected": -2.446174144744873, + "step": 5584 + }, + { + "epoch": 0.65, + "learning_rate": 1.0668477618991377e-07, + "logits/chosen": -2.385666608810425, + "logits/rejected": -2.571998119354248, + "logps/chosen": -392.3490905761719, + "logps/rejected": -242.2446746826172, + "loss": 0.4291, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6309361457824707, + "rewards/margins": 1.5469849109649658, + "rewards/rejected": -2.1779210567474365, + "step": 5585 + }, + { + "epoch": 0.65, + "learning_rate": 1.0664934451399551e-07, + "logits/chosen": -2.7479805946350098, + "logits/rejected": -2.655322551727295, + "logps/chosen": -341.26025390625, + "logps/rejected": -318.4857177734375, + "loss": 0.6615, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2927355766296387, + "rewards/margins": 1.186115026473999, + "rewards/rejected": -2.478850841522217, + "step": 5586 + }, + { + "epoch": 0.65, + "learning_rate": 1.0661391283807724e-07, + "logits/chosen": -2.2025818824768066, + "logits/rejected": -1.968019723892212, + "logps/chosen": -354.1976318359375, + "logps/rejected": -539.114990234375, + "loss": 0.3948, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2670934200286865, + "rewards/margins": 2.383455753326416, + "rewards/rejected": -3.6505491733551025, + "step": 5587 + }, + { + "epoch": 0.65, + "learning_rate": 1.0657848116215897e-07, + "logits/chosen": -2.154465675354004, + "logits/rejected": -2.0366625785827637, + "logps/chosen": -259.64947509765625, + "logps/rejected": -261.9423522949219, + "loss": 0.3547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.503532886505127, + "rewards/margins": 1.807664155960083, + "rewards/rejected": -2.311196804046631, + "step": 5588 + }, + { + "epoch": 0.65, + "learning_rate": 1.0654304948624069e-07, + "logits/chosen": -1.9179420471191406, + "logits/rejected": -1.9974387884140015, + "logps/chosen": -195.2952117919922, + "logps/rejected": -333.1743469238281, + "loss": 0.4323, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5956999659538269, + "rewards/margins": 2.8740077018737793, + "rewards/rejected": -3.469707489013672, + "step": 5589 + }, + { + "epoch": 0.65, + "learning_rate": 1.0650761781032242e-07, + "logits/chosen": -2.402742624282837, + "logits/rejected": -2.2347519397735596, + "logps/chosen": -182.13858032226562, + "logps/rejected": -225.77944946289062, + "loss": 0.2855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5126633644104004, + "rewards/margins": 1.6866238117218018, + "rewards/rejected": -2.199287176132202, + "step": 5590 + }, + { + "epoch": 0.65, + "learning_rate": 1.0647218613440415e-07, + "logits/chosen": -2.3169713020324707, + "logits/rejected": -2.43715500831604, + "logps/chosen": -335.8240661621094, + "logps/rejected": -298.69091796875, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8508182764053345, + "rewards/margins": 3.319321632385254, + "rewards/rejected": -4.170139789581299, + "step": 5591 + }, + { + "epoch": 0.65, + "learning_rate": 1.064367544584859e-07, + "logits/chosen": -1.9033262729644775, + "logits/rejected": -1.9991397857666016, + "logps/chosen": -342.1416015625, + "logps/rejected": -290.2689514160156, + "loss": 0.3406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9085351228713989, + "rewards/margins": 1.2899129390716553, + "rewards/rejected": -2.1984481811523438, + "step": 5592 + }, + { + "epoch": 0.65, + "learning_rate": 1.0640132278256762e-07, + "logits/chosen": -2.0126028060913086, + "logits/rejected": -2.030104160308838, + "logps/chosen": -211.09231567382812, + "logps/rejected": -202.5311279296875, + "loss": 0.3608, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8773146271705627, + "rewards/margins": 1.7375333309173584, + "rewards/rejected": -2.6148478984832764, + "step": 5593 + }, + { + "epoch": 0.65, + "learning_rate": 1.0636589110664934e-07, + "logits/chosen": -2.6777124404907227, + "logits/rejected": -2.897007465362549, + "logps/chosen": -420.89666748046875, + "logps/rejected": -269.5523376464844, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6979645490646362, + "rewards/margins": 3.9179677963256836, + "rewards/rejected": -4.615932464599609, + "step": 5594 + }, + { + "epoch": 0.65, + "learning_rate": 1.0633045943073107e-07, + "logits/chosen": -2.474107265472412, + "logits/rejected": -2.3461694717407227, + "logps/chosen": -309.263671875, + "logps/rejected": -218.72491455078125, + "loss": 0.3595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6593648195266724, + "rewards/margins": 1.8954733610153198, + "rewards/rejected": -2.554838180541992, + "step": 5595 + }, + { + "epoch": 0.65, + "learning_rate": 1.062950277548128e-07, + "logits/chosen": -2.1780173778533936, + "logits/rejected": -2.4122493267059326, + "logps/chosen": -496.2576904296875, + "logps/rejected": -241.11349487304688, + "loss": 0.1731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5786592364311218, + "rewards/margins": 2.5009305477142334, + "rewards/rejected": -3.079590082168579, + "step": 5596 + }, + { + "epoch": 0.65, + "learning_rate": 1.0625959607889452e-07, + "logits/chosen": -1.707923173904419, + "logits/rejected": -1.775159239768982, + "logps/chosen": -407.81304931640625, + "logps/rejected": -318.01422119140625, + "loss": 0.6141, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.48332005739212036, + "rewards/margins": 1.3423819541931152, + "rewards/rejected": -1.82570219039917, + "step": 5597 + }, + { + "epoch": 0.65, + "learning_rate": 1.0622416440297627e-07, + "logits/chosen": -2.690600633621216, + "logits/rejected": -2.5701708793640137, + "logps/chosen": -188.03851318359375, + "logps/rejected": -390.8686218261719, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7250154614448547, + "rewards/margins": 2.055575370788574, + "rewards/rejected": -2.780590772628784, + "step": 5598 + }, + { + "epoch": 0.65, + "learning_rate": 1.0618873272705799e-07, + "logits/chosen": -2.3172032833099365, + "logits/rejected": -1.9886953830718994, + "logps/chosen": -180.4352264404297, + "logps/rejected": -338.8710021972656, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0007446594536304474, + "rewards/margins": 4.374330997467041, + "rewards/rejected": -4.375075340270996, + "step": 5599 + }, + { + "epoch": 0.65, + "learning_rate": 1.0615330105113971e-07, + "logits/chosen": -2.54811954498291, + "logits/rejected": -2.342271566390991, + "logps/chosen": -231.7083282470703, + "logps/rejected": -307.0648193359375, + "loss": 0.4313, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7863280773162842, + "rewards/margins": 2.6879913806915283, + "rewards/rejected": -4.4743194580078125, + "step": 5600 + }, + { + "epoch": 0.65, + "learning_rate": 1.0611786937522145e-07, + "logits/chosen": -2.431410312652588, + "logits/rejected": -2.5768730640411377, + "logps/chosen": -338.81793212890625, + "logps/rejected": -344.4649658203125, + "loss": 0.422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1319657415151596, + "rewards/margins": 2.1297731399536133, + "rewards/rejected": -2.2617387771606445, + "step": 5601 + }, + { + "epoch": 0.65, + "learning_rate": 1.0608243769930317e-07, + "logits/chosen": -1.9816032648086548, + "logits/rejected": -2.0000553131103516, + "logps/chosen": -209.99826049804688, + "logps/rejected": -163.0596466064453, + "loss": 0.5682, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7610423564910889, + "rewards/margins": 1.4955402612686157, + "rewards/rejected": -2.256582498550415, + "step": 5602 + }, + { + "epoch": 0.65, + "learning_rate": 1.0604700602338489e-07, + "logits/chosen": -2.6323509216308594, + "logits/rejected": -2.7565560340881348, + "logps/chosen": -241.28768920898438, + "logps/rejected": -217.859130859375, + "loss": 0.3784, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6914969682693481, + "rewards/margins": 2.390824794769287, + "rewards/rejected": -3.0823216438293457, + "step": 5603 + }, + { + "epoch": 0.65, + "learning_rate": 1.0601157434746664e-07, + "logits/chosen": -2.4694392681121826, + "logits/rejected": -2.491980791091919, + "logps/chosen": -191.0977020263672, + "logps/rejected": -231.89402770996094, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7997926473617554, + "rewards/margins": 1.3519394397735596, + "rewards/rejected": -2.1517322063446045, + "step": 5604 + }, + { + "epoch": 0.65, + "learning_rate": 1.0597614267154836e-07, + "logits/chosen": -2.2028019428253174, + "logits/rejected": -2.2042770385742188, + "logps/chosen": -189.22779846191406, + "logps/rejected": -201.02769470214844, + "loss": 0.9092, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6107237339019775, + "rewards/margins": 1.2019548416137695, + "rewards/rejected": -2.812678337097168, + "step": 5605 + }, + { + "epoch": 0.65, + "learning_rate": 1.0594071099563008e-07, + "logits/chosen": -2.0499954223632812, + "logits/rejected": -2.1228108406066895, + "logps/chosen": -295.46307373046875, + "logps/rejected": -179.30352783203125, + "loss": 0.5153, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1482831239700317, + "rewards/margins": 1.5624401569366455, + "rewards/rejected": -2.710723400115967, + "step": 5606 + }, + { + "epoch": 0.65, + "learning_rate": 1.0590527931971182e-07, + "logits/chosen": -2.259133815765381, + "logits/rejected": -2.3870928287506104, + "logps/chosen": -478.9671630859375, + "logps/rejected": -319.3724365234375, + "loss": 0.3526, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5126805305480957, + "rewards/margins": 1.7548809051513672, + "rewards/rejected": -2.267561435699463, + "step": 5607 + }, + { + "epoch": 0.65, + "learning_rate": 1.0586984764379354e-07, + "logits/chosen": -1.7448177337646484, + "logits/rejected": -2.165412425994873, + "logps/chosen": -486.16912841796875, + "logps/rejected": -395.8465270996094, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.65990149974823, + "rewards/margins": 3.792248249053955, + "rewards/rejected": -4.452149391174316, + "step": 5608 + }, + { + "epoch": 0.65, + "learning_rate": 1.0583441596787526e-07, + "logits/chosen": -2.1054742336273193, + "logits/rejected": -2.488062858581543, + "logps/chosen": -508.16192626953125, + "logps/rejected": -388.604248046875, + "loss": 0.2532, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9517425298690796, + "rewards/margins": 2.2195377349853516, + "rewards/rejected": -3.1712803840637207, + "step": 5609 + }, + { + "epoch": 0.65, + "learning_rate": 1.0579898429195701e-07, + "logits/chosen": -2.430575370788574, + "logits/rejected": -2.5163168907165527, + "logps/chosen": -143.40626525878906, + "logps/rejected": -150.57357788085938, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7942186594009399, + "rewards/margins": 1.9873102903366089, + "rewards/rejected": -2.781528949737549, + "step": 5610 + }, + { + "epoch": 0.65, + "learning_rate": 1.0576355261603873e-07, + "logits/chosen": -2.0882678031921387, + "logits/rejected": -2.592775821685791, + "logps/chosen": -430.4734802246094, + "logps/rejected": -193.49856567382812, + "loss": 0.4517, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3439328074455261, + "rewards/margins": 2.3839666843414307, + "rewards/rejected": -2.7278993129730225, + "step": 5611 + }, + { + "epoch": 0.65, + "learning_rate": 1.0572812094012047e-07, + "logits/chosen": -2.150719165802002, + "logits/rejected": -1.9896302223205566, + "logps/chosen": -163.9005126953125, + "logps/rejected": -290.6529541015625, + "loss": 0.234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23867546021938324, + "rewards/margins": 2.4266653060913086, + "rewards/rejected": -2.6653409004211426, + "step": 5612 + }, + { + "epoch": 0.65, + "learning_rate": 1.0569268926420219e-07, + "logits/chosen": -2.1129679679870605, + "logits/rejected": -2.525690793991089, + "logps/chosen": -616.1929931640625, + "logps/rejected": -230.37689208984375, + "loss": 0.4099, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8796554207801819, + "rewards/margins": 1.2876133918762207, + "rewards/rejected": -2.167268753051758, + "step": 5613 + }, + { + "epoch": 0.65, + "learning_rate": 1.0565725758828391e-07, + "logits/chosen": -2.0620594024658203, + "logits/rejected": -1.7740558385849, + "logps/chosen": -203.27467346191406, + "logps/rejected": -322.9368896484375, + "loss": 0.4968, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27982550859451294, + "rewards/margins": 1.2123559713363647, + "rewards/rejected": -1.4921815395355225, + "step": 5614 + }, + { + "epoch": 0.65, + "learning_rate": 1.0562182591236564e-07, + "logits/chosen": -2.437512159347534, + "logits/rejected": -2.418700933456421, + "logps/chosen": -265.8520812988281, + "logps/rejected": -285.62042236328125, + "loss": 0.5924, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1879808902740479, + "rewards/margins": 1.4863914251327515, + "rewards/rejected": -2.674372673034668, + "step": 5615 + }, + { + "epoch": 0.65, + "learning_rate": 1.0558639423644738e-07, + "logits/chosen": -1.913207769393921, + "logits/rejected": -2.1958112716674805, + "logps/chosen": -327.0488586425781, + "logps/rejected": -226.95755004882812, + "loss": 0.4423, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5466669797897339, + "rewards/margins": 2.5042710304260254, + "rewards/rejected": -3.0509376525878906, + "step": 5616 + }, + { + "epoch": 0.65, + "learning_rate": 1.055509625605291e-07, + "logits/chosen": -2.89825177192688, + "logits/rejected": -2.960782051086426, + "logps/chosen": -140.8758087158203, + "logps/rejected": -187.6630859375, + "loss": 0.4172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5988028645515442, + "rewards/margins": 3.142946481704712, + "rewards/rejected": -3.7417495250701904, + "step": 5617 + }, + { + "epoch": 0.65, + "learning_rate": 1.0551553088461084e-07, + "logits/chosen": -2.6265978813171387, + "logits/rejected": -2.6625616550445557, + "logps/chosen": -342.4358215332031, + "logps/rejected": -273.6228942871094, + "loss": 0.3229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9187076687812805, + "rewards/margins": 1.7407310009002686, + "rewards/rejected": -2.6594386100769043, + "step": 5618 + }, + { + "epoch": 0.65, + "learning_rate": 1.0548009920869256e-07, + "logits/chosen": -2.0639142990112305, + "logits/rejected": -2.0048723220825195, + "logps/chosen": -192.9703826904297, + "logps/rejected": -257.6492919921875, + "loss": 0.2617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.951195240020752, + "rewards/margins": 2.1815037727355957, + "rewards/rejected": -3.1326990127563477, + "step": 5619 + }, + { + "epoch": 0.65, + "learning_rate": 1.0544466753277429e-07, + "logits/chosen": -2.1003029346466064, + "logits/rejected": -2.1251637935638428, + "logps/chosen": -264.3726501464844, + "logps/rejected": -288.12408447265625, + "loss": 0.2463, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22717618942260742, + "rewards/margins": 2.2846388816833496, + "rewards/rejected": -2.511815071105957, + "step": 5620 + }, + { + "epoch": 0.65, + "learning_rate": 1.0540923585685603e-07, + "logits/chosen": -2.468763589859009, + "logits/rejected": -2.245225191116333, + "logps/chosen": -267.2696533203125, + "logps/rejected": -432.46551513671875, + "loss": 0.2394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3838314116001129, + "rewards/margins": 3.3134491443634033, + "rewards/rejected": -3.6972804069519043, + "step": 5621 + }, + { + "epoch": 0.65, + "learning_rate": 1.0537380418093776e-07, + "logits/chosen": -3.089451789855957, + "logits/rejected": -2.968903064727783, + "logps/chosen": -270.4367370605469, + "logps/rejected": -271.61016845703125, + "loss": 0.2842, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5795798301696777, + "rewards/margins": 3.185483694076538, + "rewards/rejected": -4.765063285827637, + "step": 5622 + }, + { + "epoch": 0.65, + "learning_rate": 1.0533837250501948e-07, + "logits/chosen": -2.679521322250366, + "logits/rejected": -2.5554208755493164, + "logps/chosen": -199.07595825195312, + "logps/rejected": -191.9415283203125, + "loss": 0.3335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8686326146125793, + "rewards/margins": 1.4549545049667358, + "rewards/rejected": -2.32358717918396, + "step": 5623 + }, + { + "epoch": 0.65, + "learning_rate": 1.0530294082910121e-07, + "logits/chosen": -2.399594306945801, + "logits/rejected": -2.4834470748901367, + "logps/chosen": -145.66175842285156, + "logps/rejected": -222.60934448242188, + "loss": 0.2773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17375105619430542, + "rewards/margins": 2.1814072132110596, + "rewards/rejected": -2.3551583290100098, + "step": 5624 + }, + { + "epoch": 0.65, + "learning_rate": 1.0526750915318294e-07, + "logits/chosen": -2.6118690967559814, + "logits/rejected": -2.5002565383911133, + "logps/chosen": -188.09170532226562, + "logps/rejected": -255.79156494140625, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9079570770263672, + "rewards/margins": 2.4687705039978027, + "rewards/rejected": -3.37672758102417, + "step": 5625 + }, + { + "epoch": 0.65, + "learning_rate": 1.0523207747726466e-07, + "logits/chosen": -2.489267587661743, + "logits/rejected": -2.6335031986236572, + "logps/chosen": -365.405517578125, + "logps/rejected": -300.27838134765625, + "loss": 0.2642, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8886477947235107, + "rewards/margins": 2.6124279499053955, + "rewards/rejected": -3.501075506210327, + "step": 5626 + }, + { + "epoch": 0.65, + "learning_rate": 1.0519664580134641e-07, + "logits/chosen": -2.9813945293426514, + "logits/rejected": -3.0162110328674316, + "logps/chosen": -174.27999877929688, + "logps/rejected": -214.72152709960938, + "loss": 0.4907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.664664089679718, + "rewards/margins": 1.9335359334945679, + "rewards/rejected": -2.5981998443603516, + "step": 5627 + }, + { + "epoch": 0.65, + "learning_rate": 1.0516121412542813e-07, + "logits/chosen": -1.7221534252166748, + "logits/rejected": -1.962601661682129, + "logps/chosen": -340.79742431640625, + "logps/rejected": -297.7884521484375, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9804275035858154, + "rewards/margins": 1.3806710243225098, + "rewards/rejected": -2.361098527908325, + "step": 5628 + }, + { + "epoch": 0.65, + "learning_rate": 1.0512578244950986e-07, + "logits/chosen": -1.6051430702209473, + "logits/rejected": -1.6317306756973267, + "logps/chosen": -271.6248474121094, + "logps/rejected": -261.099365234375, + "loss": 0.7323, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1928234100341797, + "rewards/margins": 1.8139066696166992, + "rewards/rejected": -4.006730556488037, + "step": 5629 + }, + { + "epoch": 0.65, + "learning_rate": 1.0509035077359159e-07, + "logits/chosen": -2.4578018188476562, + "logits/rejected": -2.5516843795776367, + "logps/chosen": -192.4944610595703, + "logps/rejected": -159.92626953125, + "loss": 0.5072, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8193400502204895, + "rewards/margins": 1.9240024089813232, + "rewards/rejected": -2.743342399597168, + "step": 5630 + }, + { + "epoch": 0.66, + "learning_rate": 1.0505491909767331e-07, + "logits/chosen": -2.8046793937683105, + "logits/rejected": -2.7845048904418945, + "logps/chosen": -327.2584228515625, + "logps/rejected": -177.72872924804688, + "loss": 0.1236, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28252071142196655, + "rewards/margins": 2.890591621398926, + "rewards/rejected": -2.6080708503723145, + "step": 5631 + }, + { + "epoch": 0.66, + "learning_rate": 1.0501948742175504e-07, + "logits/chosen": -2.2264182567596436, + "logits/rejected": -2.323803663253784, + "logps/chosen": -288.475830078125, + "logps/rejected": -424.555419921875, + "loss": 0.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25940507650375366, + "rewards/margins": 3.774869918823242, + "rewards/rejected": -4.034275054931641, + "step": 5632 + }, + { + "epoch": 0.66, + "learning_rate": 1.0498405574583678e-07, + "logits/chosen": -2.7708659172058105, + "logits/rejected": -2.708275318145752, + "logps/chosen": -217.19029235839844, + "logps/rejected": -280.57373046875, + "loss": 0.3191, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47454315423965454, + "rewards/margins": 3.2973673343658447, + "rewards/rejected": -3.7719106674194336, + "step": 5633 + }, + { + "epoch": 0.66, + "learning_rate": 1.049486240699185e-07, + "logits/chosen": -2.003082275390625, + "logits/rejected": -2.1110317707061768, + "logps/chosen": -338.17498779296875, + "logps/rejected": -311.8550720214844, + "loss": 0.3596, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0051870346069336, + "rewards/margins": 1.9960434436798096, + "rewards/rejected": -3.0012307167053223, + "step": 5634 + }, + { + "epoch": 0.66, + "learning_rate": 1.0491319239400024e-07, + "logits/chosen": -2.283311128616333, + "logits/rejected": -2.5141074657440186, + "logps/chosen": -295.2777099609375, + "logps/rejected": -213.91244506835938, + "loss": 0.1806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31349262595176697, + "rewards/margins": 1.8165096044540405, + "rewards/rejected": -2.13000226020813, + "step": 5635 + }, + { + "epoch": 0.66, + "learning_rate": 1.0487776071808196e-07, + "logits/chosen": -2.211575984954834, + "logits/rejected": -2.212348699569702, + "logps/chosen": -295.5931091308594, + "logps/rejected": -309.7364501953125, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3619875907897949, + "rewards/margins": 2.20029878616333, + "rewards/rejected": -1.8383111953735352, + "step": 5636 + }, + { + "epoch": 0.66, + "learning_rate": 1.0484232904216368e-07, + "logits/chosen": -2.047206401824951, + "logits/rejected": -2.2447280883789062, + "logps/chosen": -446.1842041015625, + "logps/rejected": -384.1418762207031, + "loss": 0.3096, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.926439642906189, + "rewards/margins": 2.4248485565185547, + "rewards/rejected": -3.351288318634033, + "step": 5637 + }, + { + "epoch": 0.66, + "learning_rate": 1.0480689736624542e-07, + "logits/chosen": -1.9269683361053467, + "logits/rejected": -2.0173122882843018, + "logps/chosen": -162.9124755859375, + "logps/rejected": -205.02239990234375, + "loss": 0.2426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.784050464630127, + "rewards/margins": 2.356318950653076, + "rewards/rejected": -3.140369176864624, + "step": 5638 + }, + { + "epoch": 0.66, + "learning_rate": 1.0477146569032715e-07, + "logits/chosen": -2.680712938308716, + "logits/rejected": -2.7194597721099854, + "logps/chosen": -208.26791381835938, + "logps/rejected": -129.04966735839844, + "loss": 0.3192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.540419340133667, + "rewards/margins": 1.8171100616455078, + "rewards/rejected": -2.357529401779175, + "step": 5639 + }, + { + "epoch": 0.66, + "learning_rate": 1.0473603401440887e-07, + "logits/chosen": -2.1089446544647217, + "logits/rejected": -2.269528865814209, + "logps/chosen": -195.4993133544922, + "logps/rejected": -254.5201416015625, + "loss": 0.4793, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7801486253738403, + "rewards/margins": 0.9757960438728333, + "rewards/rejected": -2.7559447288513184, + "step": 5640 + }, + { + "epoch": 0.66, + "learning_rate": 1.0470060233849061e-07, + "logits/chosen": -1.9523968696594238, + "logits/rejected": -1.7849524021148682, + "logps/chosen": -306.89752197265625, + "logps/rejected": -409.33428955078125, + "loss": 0.1804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7026433944702148, + "rewards/margins": 3.624249219894409, + "rewards/rejected": -4.326892852783203, + "step": 5641 + }, + { + "epoch": 0.66, + "learning_rate": 1.0466517066257233e-07, + "logits/chosen": -2.055495023727417, + "logits/rejected": -1.9896587133407593, + "logps/chosen": -207.96058654785156, + "logps/rejected": -214.98959350585938, + "loss": 0.2722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7799131870269775, + "rewards/margins": 2.2086238861083984, + "rewards/rejected": -2.988537073135376, + "step": 5642 + }, + { + "epoch": 0.66, + "learning_rate": 1.0462973898665405e-07, + "logits/chosen": -2.0959582328796387, + "logits/rejected": -2.104794979095459, + "logps/chosen": -351.9902038574219, + "logps/rejected": -312.3078918457031, + "loss": 1.2059, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.446608304977417, + "rewards/margins": 0.060374438762664795, + "rewards/rejected": -2.5069830417633057, + "step": 5643 + }, + { + "epoch": 0.66, + "learning_rate": 1.0459430731073579e-07, + "logits/chosen": -2.362825393676758, + "logits/rejected": -2.298342704772949, + "logps/chosen": -273.7911376953125, + "logps/rejected": -340.9767761230469, + "loss": 0.3405, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.240666151046753, + "rewards/margins": 1.6644459962844849, + "rewards/rejected": -2.9051120281219482, + "step": 5644 + }, + { + "epoch": 0.66, + "learning_rate": 1.0455887563481752e-07, + "logits/chosen": -1.8483754396438599, + "logits/rejected": -2.042412757873535, + "logps/chosen": -438.0550537109375, + "logps/rejected": -469.2054138183594, + "loss": 0.2888, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5498804450035095, + "rewards/margins": 3.355186700820923, + "rewards/rejected": -3.905067205429077, + "step": 5645 + }, + { + "epoch": 0.66, + "learning_rate": 1.0452344395889926e-07, + "logits/chosen": -2.627197742462158, + "logits/rejected": -2.6115307807922363, + "logps/chosen": -310.6881408691406, + "logps/rejected": -317.25439453125, + "loss": 0.2185, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7008978128433228, + "rewards/margins": 2.742345094680786, + "rewards/rejected": -3.4432430267333984, + "step": 5646 + }, + { + "epoch": 0.66, + "learning_rate": 1.0448801228298098e-07, + "logits/chosen": -2.601165294647217, + "logits/rejected": -2.4152183532714844, + "logps/chosen": -252.05630493164062, + "logps/rejected": -319.3289794921875, + "loss": 0.1687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5914105176925659, + "rewards/margins": 4.035156726837158, + "rewards/rejected": -4.6265668869018555, + "step": 5647 + }, + { + "epoch": 0.66, + "learning_rate": 1.044525806070627e-07, + "logits/chosen": -2.5110795497894287, + "logits/rejected": -2.6164026260375977, + "logps/chosen": -148.21534729003906, + "logps/rejected": -300.07098388671875, + "loss": 0.4467, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7840666770935059, + "rewards/margins": 3.925251007080078, + "rewards/rejected": -5.709317684173584, + "step": 5648 + }, + { + "epoch": 0.66, + "learning_rate": 1.0441714893114444e-07, + "logits/chosen": -1.8799796104431152, + "logits/rejected": -1.8996953964233398, + "logps/chosen": -245.6571502685547, + "logps/rejected": -262.1639404296875, + "loss": 0.5568, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7814924716949463, + "rewards/margins": 1.6691805124282837, + "rewards/rejected": -2.4506731033325195, + "step": 5649 + }, + { + "epoch": 0.66, + "learning_rate": 1.0438171725522616e-07, + "logits/chosen": -2.1249635219573975, + "logits/rejected": -2.4611122608184814, + "logps/chosen": -295.913818359375, + "logps/rejected": -288.91845703125, + "loss": 0.8687, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.436316967010498, + "rewards/margins": 1.2291241884231567, + "rewards/rejected": -2.6654410362243652, + "step": 5650 + }, + { + "epoch": 0.66, + "learning_rate": 1.043462855793079e-07, + "logits/chosen": -1.8214809894561768, + "logits/rejected": -2.228898286819458, + "logps/chosen": -271.5148620605469, + "logps/rejected": -163.38751220703125, + "loss": 0.3916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.882966160774231, + "rewards/margins": 1.46793794631958, + "rewards/rejected": -2.3509039878845215, + "step": 5651 + }, + { + "epoch": 0.66, + "learning_rate": 1.0431085390338963e-07, + "logits/chosen": -2.4616811275482178, + "logits/rejected": -2.5256142616271973, + "logps/chosen": -326.0119323730469, + "logps/rejected": -386.1529846191406, + "loss": 0.3041, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5476267337799072, + "rewards/margins": 2.62630558013916, + "rewards/rejected": -4.1739325523376465, + "step": 5652 + }, + { + "epoch": 0.66, + "learning_rate": 1.0427542222747135e-07, + "logits/chosen": -2.090790271759033, + "logits/rejected": -2.1614456176757812, + "logps/chosen": -201.30613708496094, + "logps/rejected": -179.7163848876953, + "loss": 0.8695, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0840425491333008, + "rewards/margins": 1.5900282859802246, + "rewards/rejected": -2.6740708351135254, + "step": 5653 + }, + { + "epoch": 0.66, + "learning_rate": 1.0423999055155308e-07, + "logits/chosen": -2.6787147521972656, + "logits/rejected": -2.8297760486602783, + "logps/chosen": -311.9983825683594, + "logps/rejected": -309.57916259765625, + "loss": 0.4029, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8317017555236816, + "rewards/margins": 3.0495874881744385, + "rewards/rejected": -3.881289482116699, + "step": 5654 + }, + { + "epoch": 0.66, + "learning_rate": 1.0420455887563481e-07, + "logits/chosen": -1.9957685470581055, + "logits/rejected": -2.167116165161133, + "logps/chosen": -372.3825378417969, + "logps/rejected": -334.19384765625, + "loss": 0.3331, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8929535150527954, + "rewards/margins": 2.9981539249420166, + "rewards/rejected": -3.8911073207855225, + "step": 5655 + }, + { + "epoch": 0.66, + "learning_rate": 1.0416912719971655e-07, + "logits/chosen": -1.8710715770721436, + "logits/rejected": -1.8476738929748535, + "logps/chosen": -380.03253173828125, + "logps/rejected": -436.19329833984375, + "loss": 0.2688, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3017499446868896, + "rewards/margins": 2.741666078567505, + "rewards/rejected": -4.0434160232543945, + "step": 5656 + }, + { + "epoch": 0.66, + "learning_rate": 1.0413369552379828e-07, + "logits/chosen": -2.547333002090454, + "logits/rejected": -2.612338066101074, + "logps/chosen": -148.70590209960938, + "logps/rejected": -105.24624633789062, + "loss": 0.6887, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.328561544418335, + "rewards/margins": 0.8790522813796997, + "rewards/rejected": -2.207613945007324, + "step": 5657 + }, + { + "epoch": 0.66, + "learning_rate": 1.0409826384788e-07, + "logits/chosen": -2.8819124698638916, + "logits/rejected": -2.959810733795166, + "logps/chosen": -348.1834716796875, + "logps/rejected": -229.48875427246094, + "loss": 0.9472, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7797030210494995, + "rewards/margins": 1.0191231966018677, + "rewards/rejected": -1.7988262176513672, + "step": 5658 + }, + { + "epoch": 0.66, + "learning_rate": 1.0406283217196173e-07, + "logits/chosen": -2.276402473449707, + "logits/rejected": -2.19966721534729, + "logps/chosen": -396.5199279785156, + "logps/rejected": -417.11883544921875, + "loss": 0.4427, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9029924869537354, + "rewards/margins": 2.0527846813201904, + "rewards/rejected": -3.955777168273926, + "step": 5659 + }, + { + "epoch": 0.66, + "learning_rate": 1.0402740049604345e-07, + "logits/chosen": -2.93898344039917, + "logits/rejected": -2.9107820987701416, + "logps/chosen": -296.6107177734375, + "logps/rejected": -234.84799194335938, + "loss": 0.3557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46664807200431824, + "rewards/margins": 2.44972562789917, + "rewards/rejected": -2.9163737297058105, + "step": 5660 + }, + { + "epoch": 0.66, + "learning_rate": 1.0399196882012518e-07, + "logits/chosen": -2.2126054763793945, + "logits/rejected": -2.143376350402832, + "logps/chosen": -266.48681640625, + "logps/rejected": -386.7888488769531, + "loss": 0.3071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40486711263656616, + "rewards/margins": 2.5140881538391113, + "rewards/rejected": -2.918955087661743, + "step": 5661 + }, + { + "epoch": 0.66, + "learning_rate": 1.0395653714420692e-07, + "logits/chosen": -2.3556840419769287, + "logits/rejected": -2.47299861907959, + "logps/chosen": -322.6022033691406, + "logps/rejected": -288.6607666015625, + "loss": 0.2853, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7269226312637329, + "rewards/margins": 2.170443058013916, + "rewards/rejected": -2.8973653316497803, + "step": 5662 + }, + { + "epoch": 0.66, + "learning_rate": 1.0392110546828865e-07, + "logits/chosen": -2.42887544631958, + "logits/rejected": -2.5546767711639404, + "logps/chosen": -206.69461059570312, + "logps/rejected": -384.1551818847656, + "loss": 0.3659, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2105356454849243, + "rewards/margins": 2.2735595703125, + "rewards/rejected": -3.484095335006714, + "step": 5663 + }, + { + "epoch": 0.66, + "learning_rate": 1.0388567379237038e-07, + "logits/chosen": -2.636662006378174, + "logits/rejected": -2.8372044563293457, + "logps/chosen": -156.79464721679688, + "logps/rejected": -190.98333740234375, + "loss": 0.3509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5529892444610596, + "rewards/margins": 3.260711431503296, + "rewards/rejected": -3.8137006759643555, + "step": 5664 + }, + { + "epoch": 0.66, + "learning_rate": 1.038502421164521e-07, + "logits/chosen": -2.517970323562622, + "logits/rejected": -2.7369227409362793, + "logps/chosen": -684.9908447265625, + "logps/rejected": -559.7181396484375, + "loss": 0.0727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7034969329833984, + "rewards/margins": 4.921209812164307, + "rewards/rejected": -5.624707221984863, + "step": 5665 + }, + { + "epoch": 0.66, + "learning_rate": 1.0381481044053383e-07, + "logits/chosen": -1.8254623413085938, + "logits/rejected": -2.3381237983703613, + "logps/chosen": -308.38671875, + "logps/rejected": -208.96261596679688, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.794208824634552, + "rewards/margins": 2.7997324466705322, + "rewards/rejected": -3.5939412117004395, + "step": 5666 + }, + { + "epoch": 0.66, + "learning_rate": 1.0377937876461556e-07, + "logits/chosen": -2.396946907043457, + "logits/rejected": -2.1706907749176025, + "logps/chosen": -224.25955200195312, + "logps/rejected": -310.0357971191406, + "loss": 0.6437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8369054794311523, + "rewards/margins": 1.8134957551956177, + "rewards/rejected": -2.6504011154174805, + "step": 5667 + }, + { + "epoch": 0.66, + "learning_rate": 1.0374394708869729e-07, + "logits/chosen": -2.5202484130859375, + "logits/rejected": -2.5895838737487793, + "logps/chosen": -238.8297119140625, + "logps/rejected": -225.16664123535156, + "loss": 0.3936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3433902263641357, + "rewards/margins": 3.520153522491455, + "rewards/rejected": -4.86354398727417, + "step": 5668 + }, + { + "epoch": 0.66, + "learning_rate": 1.0370851541277903e-07, + "logits/chosen": -1.8457984924316406, + "logits/rejected": -1.8705971240997314, + "logps/chosen": -461.7740173339844, + "logps/rejected": -374.1877746582031, + "loss": 0.4043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3605010509490967, + "rewards/margins": 2.196782350540161, + "rewards/rejected": -2.557283401489258, + "step": 5669 + }, + { + "epoch": 0.66, + "learning_rate": 1.0367308373686075e-07, + "logits/chosen": -2.7027058601379395, + "logits/rejected": -2.317307472229004, + "logps/chosen": -213.99588012695312, + "logps/rejected": -242.6956329345703, + "loss": 0.1211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4589482247829437, + "rewards/margins": 3.606396198272705, + "rewards/rejected": -4.065344333648682, + "step": 5670 + }, + { + "epoch": 0.66, + "learning_rate": 1.0363765206094247e-07, + "logits/chosen": -2.751953363418579, + "logits/rejected": -2.712613582611084, + "logps/chosen": -228.58518981933594, + "logps/rejected": -240.92710876464844, + "loss": 0.9828, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.344312071800232, + "rewards/margins": 0.4630223214626312, + "rewards/rejected": -1.807334303855896, + "step": 5671 + }, + { + "epoch": 0.66, + "learning_rate": 1.0360222038502421e-07, + "logits/chosen": -1.7498023509979248, + "logits/rejected": -1.9027984142303467, + "logps/chosen": -487.6247863769531, + "logps/rejected": -404.3485412597656, + "loss": 0.2184, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006755739450454712, + "rewards/margins": 1.8866910934448242, + "rewards/rejected": -1.8799352645874023, + "step": 5672 + }, + { + "epoch": 0.66, + "learning_rate": 1.0356678870910593e-07, + "logits/chosen": -2.5228075981140137, + "logits/rejected": -2.5785574913024902, + "logps/chosen": -209.76174926757812, + "logps/rejected": -208.15707397460938, + "loss": 0.4904, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2170389890670776, + "rewards/margins": 2.4760961532592773, + "rewards/rejected": -3.6931354999542236, + "step": 5673 + }, + { + "epoch": 0.66, + "learning_rate": 1.0353135703318768e-07, + "logits/chosen": -2.896099090576172, + "logits/rejected": -2.8547146320343018, + "logps/chosen": -195.39804077148438, + "logps/rejected": -324.66510009765625, + "loss": 0.2585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05571621656417847, + "rewards/margins": 2.2246785163879395, + "rewards/rejected": -2.2803945541381836, + "step": 5674 + }, + { + "epoch": 0.66, + "learning_rate": 1.034959253572694e-07, + "logits/chosen": -1.964308261871338, + "logits/rejected": -2.215885877609253, + "logps/chosen": -370.36663818359375, + "logps/rejected": -287.57861328125, + "loss": 0.1656, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24088440835475922, + "rewards/margins": 2.769105911254883, + "rewards/rejected": -2.528221368789673, + "step": 5675 + }, + { + "epoch": 0.66, + "learning_rate": 1.0346049368135112e-07, + "logits/chosen": -2.484022855758667, + "logits/rejected": -2.3083415031433105, + "logps/chosen": -402.21075439453125, + "logps/rejected": -297.1026306152344, + "loss": 0.3879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095904588699341, + "rewards/margins": 1.2451809644699097, + "rewards/rejected": -2.0547714233398438, + "step": 5676 + }, + { + "epoch": 0.66, + "learning_rate": 1.0342506200543284e-07, + "logits/chosen": -2.4121782779693604, + "logits/rejected": -2.3747212886810303, + "logps/chosen": -244.59390258789062, + "logps/rejected": -224.47940063476562, + "loss": 0.878, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6101865768432617, + "rewards/margins": 0.684607744216919, + "rewards/rejected": -2.2947943210601807, + "step": 5677 + }, + { + "epoch": 0.66, + "learning_rate": 1.0338963032951458e-07, + "logits/chosen": -2.1911585330963135, + "logits/rejected": -2.1446990966796875, + "logps/chosen": -333.9509582519531, + "logps/rejected": -365.362060546875, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9364689588546753, + "rewards/margins": 2.8088223934173584, + "rewards/rejected": -3.7452917098999023, + "step": 5678 + }, + { + "epoch": 0.66, + "learning_rate": 1.033541986535963e-07, + "logits/chosen": -2.4206061363220215, + "logits/rejected": -2.5407021045684814, + "logps/chosen": -169.20144653320312, + "logps/rejected": -144.4083709716797, + "loss": 0.2362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4208838641643524, + "rewards/margins": 2.7999186515808105, + "rewards/rejected": -3.2208023071289062, + "step": 5679 + }, + { + "epoch": 0.66, + "learning_rate": 1.0331876697767805e-07, + "logits/chosen": -1.7812180519104004, + "logits/rejected": -1.8153513669967651, + "logps/chosen": -235.05714416503906, + "logps/rejected": -280.4712829589844, + "loss": 0.5536, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0953407287597656, + "rewards/margins": 1.6475452184677124, + "rewards/rejected": -2.7428860664367676, + "step": 5680 + }, + { + "epoch": 0.66, + "learning_rate": 1.0328333530175977e-07, + "logits/chosen": -2.259068012237549, + "logits/rejected": -2.455493927001953, + "logps/chosen": -155.1253662109375, + "logps/rejected": -135.94676208496094, + "loss": 0.9605, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9609016180038452, + "rewards/margins": 0.9500251412391663, + "rewards/rejected": -2.9109268188476562, + "step": 5681 + }, + { + "epoch": 0.66, + "learning_rate": 1.032479036258415e-07, + "logits/chosen": -2.591142177581787, + "logits/rejected": -2.2102584838867188, + "logps/chosen": -309.0902404785156, + "logps/rejected": -543.540771484375, + "loss": 0.4866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9471275210380554, + "rewards/margins": 2.0250461101531982, + "rewards/rejected": -2.9721734523773193, + "step": 5682 + }, + { + "epoch": 0.66, + "learning_rate": 1.0321247194992323e-07, + "logits/chosen": -2.267996311187744, + "logits/rejected": -1.9789023399353027, + "logps/chosen": -165.4520263671875, + "logps/rejected": -317.72698974609375, + "loss": 0.0981, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0607959032058716, + "rewards/margins": 3.8675308227539062, + "rewards/rejected": -4.928326606750488, + "step": 5683 + }, + { + "epoch": 0.66, + "learning_rate": 1.0317704027400495e-07, + "logits/chosen": -1.8548684120178223, + "logits/rejected": -1.9856256246566772, + "logps/chosen": -297.42047119140625, + "logps/rejected": -306.18170166015625, + "loss": 0.3967, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.106410264968872, + "rewards/margins": 1.5973560810089111, + "rewards/rejected": -2.703766345977783, + "step": 5684 + }, + { + "epoch": 0.66, + "learning_rate": 1.0314160859808667e-07, + "logits/chosen": -1.9887953996658325, + "logits/rejected": -2.2334275245666504, + "logps/chosen": -307.009033203125, + "logps/rejected": -325.21746826171875, + "loss": 0.3754, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5989624261856079, + "rewards/margins": 1.8741008043289185, + "rewards/rejected": -2.4730634689331055, + "step": 5685 + }, + { + "epoch": 0.66, + "learning_rate": 1.0310617692216842e-07, + "logits/chosen": -2.2823076248168945, + "logits/rejected": -2.152216672897339, + "logps/chosen": -306.51507568359375, + "logps/rejected": -272.28997802734375, + "loss": 0.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.537173867225647, + "rewards/margins": 2.929534435272217, + "rewards/rejected": -3.466708183288574, + "step": 5686 + }, + { + "epoch": 0.66, + "learning_rate": 1.0307074524625014e-07, + "logits/chosen": -2.3854639530181885, + "logits/rejected": -2.104041814804077, + "logps/chosen": -310.034423828125, + "logps/rejected": -290.28680419921875, + "loss": 0.4197, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3259224891662598, + "rewards/margins": 1.8734439611434937, + "rewards/rejected": -3.1993660926818848, + "step": 5687 + }, + { + "epoch": 0.66, + "learning_rate": 1.0303531357033187e-07, + "logits/chosen": -3.0074820518493652, + "logits/rejected": -3.0343217849731445, + "logps/chosen": -171.9145050048828, + "logps/rejected": -221.28074645996094, + "loss": 0.1307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1393604725599289, + "rewards/margins": 2.58916974067688, + "rewards/rejected": -2.728530168533325, + "step": 5688 + }, + { + "epoch": 0.66, + "learning_rate": 1.029998818944136e-07, + "logits/chosen": -2.7404379844665527, + "logits/rejected": -2.3216776847839355, + "logps/chosen": -268.2242431640625, + "logps/rejected": -407.40185546875, + "loss": 0.2884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9922752976417542, + "rewards/margins": 1.7920259237289429, + "rewards/rejected": -2.784301280975342, + "step": 5689 + }, + { + "epoch": 0.66, + "learning_rate": 1.0296445021849532e-07, + "logits/chosen": -1.9953800439834595, + "logits/rejected": -2.258453845977783, + "logps/chosen": -332.80487060546875, + "logps/rejected": -301.6964111328125, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7347323894500732, + "rewards/margins": 2.3722028732299805, + "rewards/rejected": -3.1069352626800537, + "step": 5690 + }, + { + "epoch": 0.66, + "learning_rate": 1.0292901854257707e-07, + "logits/chosen": -2.3219385147094727, + "logits/rejected": -2.3723931312561035, + "logps/chosen": -460.2960205078125, + "logps/rejected": -335.94158935546875, + "loss": 0.1729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3464643061161041, + "rewards/margins": 2.774585485458374, + "rewards/rejected": -3.1210501194000244, + "step": 5691 + }, + { + "epoch": 0.66, + "learning_rate": 1.028935868666588e-07, + "logits/chosen": -2.622586250305176, + "logits/rejected": -2.823594808578491, + "logps/chosen": -307.8970947265625, + "logps/rejected": -544.295654296875, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4733589291572571, + "rewards/margins": 3.605354070663452, + "rewards/rejected": -4.0787129402160645, + "step": 5692 + }, + { + "epoch": 0.66, + "learning_rate": 1.0285815519074052e-07, + "logits/chosen": -1.9538044929504395, + "logits/rejected": -2.387094020843506, + "logps/chosen": -571.9424438476562, + "logps/rejected": -269.4619445800781, + "loss": 0.4342, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7312449216842651, + "rewards/margins": 2.3624212741851807, + "rewards/rejected": -3.0936663150787354, + "step": 5693 + }, + { + "epoch": 0.66, + "learning_rate": 1.0282272351482225e-07, + "logits/chosen": -2.7999067306518555, + "logits/rejected": -2.830901622772217, + "logps/chosen": -224.41917419433594, + "logps/rejected": -262.5125732421875, + "loss": 0.2892, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1468156576156616, + "rewards/margins": 1.7163158655166626, + "rewards/rejected": -2.863131523132324, + "step": 5694 + }, + { + "epoch": 0.66, + "learning_rate": 1.0278729183890397e-07, + "logits/chosen": -2.0202271938323975, + "logits/rejected": -2.181016206741333, + "logps/chosen": -333.349853515625, + "logps/rejected": -324.0946044921875, + "loss": 0.4828, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9875384569168091, + "rewards/margins": 1.2193856239318848, + "rewards/rejected": -2.2069242000579834, + "step": 5695 + }, + { + "epoch": 0.66, + "learning_rate": 1.027518601629857e-07, + "logits/chosen": -2.4934492111206055, + "logits/rejected": -2.474184989929199, + "logps/chosen": -348.32623291015625, + "logps/rejected": -319.3465576171875, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23315924406051636, + "rewards/margins": 3.348158359527588, + "rewards/rejected": -3.58131742477417, + "step": 5696 + }, + { + "epoch": 0.66, + "learning_rate": 1.0271642848706745e-07, + "logits/chosen": -2.2634634971618652, + "logits/rejected": -2.402512550354004, + "logps/chosen": -437.80609130859375, + "logps/rejected": -332.95428466796875, + "loss": 0.1938, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9240847826004028, + "rewards/margins": 3.324362277984619, + "rewards/rejected": -4.248447418212891, + "step": 5697 + }, + { + "epoch": 0.66, + "learning_rate": 1.0268099681114917e-07, + "logits/chosen": -2.6575422286987305, + "logits/rejected": -2.4704792499542236, + "logps/chosen": -222.37686157226562, + "logps/rejected": -295.86529541015625, + "loss": 0.306, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2494723796844482, + "rewards/margins": 2.337425947189331, + "rewards/rejected": -3.5868983268737793, + "step": 5698 + }, + { + "epoch": 0.66, + "learning_rate": 1.0264556513523089e-07, + "logits/chosen": -1.5050008296966553, + "logits/rejected": -2.2734344005584717, + "logps/chosen": -560.3615112304688, + "logps/rejected": -187.5242919921875, + "loss": 0.1381, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27771225571632385, + "rewards/margins": 3.4961953163146973, + "rewards/rejected": -3.773907423019409, + "step": 5699 + }, + { + "epoch": 0.66, + "learning_rate": 1.0261013345931263e-07, + "logits/chosen": -2.540419816970825, + "logits/rejected": -2.593881607055664, + "logps/chosen": -168.9939727783203, + "logps/rejected": -243.46568298339844, + "loss": 0.1095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19797229766845703, + "rewards/margins": 3.935375928878784, + "rewards/rejected": -3.737403392791748, + "step": 5700 + }, + { + "epoch": 0.66, + "learning_rate": 1.0257470178339435e-07, + "logits/chosen": -1.9824333190917969, + "logits/rejected": -1.903713583946228, + "logps/chosen": -235.05239868164062, + "logps/rejected": -265.193115234375, + "loss": 0.6506, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.698386549949646, + "rewards/margins": 1.94468092918396, + "rewards/rejected": -3.6430673599243164, + "step": 5701 + }, + { + "epoch": 0.66, + "learning_rate": 1.0253927010747607e-07, + "logits/chosen": -2.522278070449829, + "logits/rejected": -2.6037659645080566, + "logps/chosen": -158.93414306640625, + "logps/rejected": -193.7322998046875, + "loss": 1.6273, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.752061367034912, + "rewards/margins": 0.6176124215126038, + "rewards/rejected": -3.369673490524292, + "step": 5702 + }, + { + "epoch": 0.66, + "learning_rate": 1.0250383843155782e-07, + "logits/chosen": -2.468505859375, + "logits/rejected": -2.437094211578369, + "logps/chosen": -271.2071228027344, + "logps/rejected": -220.4228515625, + "loss": 0.7156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.198683500289917, + "rewards/margins": 3.157135248184204, + "rewards/rejected": -4.355818748474121, + "step": 5703 + }, + { + "epoch": 0.66, + "learning_rate": 1.0246840675563954e-07, + "logits/chosen": -1.9327147006988525, + "logits/rejected": -1.8885383605957031, + "logps/chosen": -220.66006469726562, + "logps/rejected": -308.0321350097656, + "loss": 0.4349, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9108067750930786, + "rewards/margins": 2.7351884841918945, + "rewards/rejected": -3.6459949016571045, + "step": 5704 + }, + { + "epoch": 0.66, + "learning_rate": 1.0243297507972126e-07, + "logits/chosen": -1.7174080610275269, + "logits/rejected": -1.9539053440093994, + "logps/chosen": -332.41943359375, + "logps/rejected": -347.6141052246094, + "loss": 0.3344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.293500304222107, + "rewards/margins": 1.8750184774398804, + "rewards/rejected": -3.168518543243408, + "step": 5705 + }, + { + "epoch": 0.66, + "learning_rate": 1.02397543403803e-07, + "logits/chosen": -2.7614777088165283, + "logits/rejected": -2.3465769290924072, + "logps/chosen": -201.98431396484375, + "logps/rejected": -291.85626220703125, + "loss": 0.2932, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49604740738868713, + "rewards/margins": 2.3633601665496826, + "rewards/rejected": -2.859407663345337, + "step": 5706 + }, + { + "epoch": 0.66, + "learning_rate": 1.0236211172788472e-07, + "logits/chosen": -2.2498862743377686, + "logits/rejected": -2.203916311264038, + "logps/chosen": -307.3746032714844, + "logps/rejected": -279.220947265625, + "loss": 0.5872, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9476866126060486, + "rewards/margins": 2.216930866241455, + "rewards/rejected": -3.1646173000335693, + "step": 5707 + }, + { + "epoch": 0.66, + "learning_rate": 1.0232668005196644e-07, + "logits/chosen": -1.9243485927581787, + "logits/rejected": -2.198871612548828, + "logps/chosen": -439.82379150390625, + "logps/rejected": -343.59521484375, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5185532569885254, + "rewards/margins": 2.700611114501953, + "rewards/rejected": -4.2191643714904785, + "step": 5708 + }, + { + "epoch": 0.66, + "learning_rate": 1.0229124837604819e-07, + "logits/chosen": -1.93864905834198, + "logits/rejected": -1.6575130224227905, + "logps/chosen": -208.60574340820312, + "logps/rejected": -286.4598388671875, + "loss": 0.3951, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2352862358093262, + "rewards/margins": 1.8691368103027344, + "rewards/rejected": -3.1044230461120605, + "step": 5709 + }, + { + "epoch": 0.66, + "learning_rate": 1.0225581670012991e-07, + "logits/chosen": -2.609442949295044, + "logits/rejected": -2.318737030029297, + "logps/chosen": -230.5745849609375, + "logps/rejected": -729.001708984375, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.271448016166687, + "rewards/margins": 4.682802200317383, + "rewards/rejected": -5.954249858856201, + "step": 5710 + }, + { + "epoch": 0.66, + "learning_rate": 1.0222038502421165e-07, + "logits/chosen": -2.660043954849243, + "logits/rejected": -2.6118857860565186, + "logps/chosen": -259.57904052734375, + "logps/rejected": -221.65069580078125, + "loss": 0.6422, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1934735774993896, + "rewards/margins": 0.8994506001472473, + "rewards/rejected": -2.092924118041992, + "step": 5711 + }, + { + "epoch": 0.66, + "learning_rate": 1.0218495334829337e-07, + "logits/chosen": -2.278362274169922, + "logits/rejected": -2.3269429206848145, + "logps/chosen": -309.8248291015625, + "logps/rejected": -310.9016418457031, + "loss": 0.1855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7453591227531433, + "rewards/margins": 2.532506227493286, + "rewards/rejected": -3.277865409851074, + "step": 5712 + }, + { + "epoch": 0.66, + "learning_rate": 1.0214952167237509e-07, + "logits/chosen": -1.9521524906158447, + "logits/rejected": -2.203430652618408, + "logps/chosen": -290.2025451660156, + "logps/rejected": -286.20867919921875, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10247907787561417, + "rewards/margins": 1.7325737476348877, + "rewards/rejected": -1.8350528478622437, + "step": 5713 + }, + { + "epoch": 0.66, + "learning_rate": 1.0211408999645681e-07, + "logits/chosen": -1.9695463180541992, + "logits/rejected": -2.184995174407959, + "logps/chosen": -187.3121337890625, + "logps/rejected": -159.37315368652344, + "loss": 0.5797, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1828334331512451, + "rewards/margins": 0.5078913569450378, + "rewards/rejected": -1.6907248497009277, + "step": 5714 + }, + { + "epoch": 0.66, + "learning_rate": 1.0207865832053856e-07, + "logits/chosen": -1.908722162246704, + "logits/rejected": -2.211146831512451, + "logps/chosen": -337.79974365234375, + "logps/rejected": -252.11666870117188, + "loss": 0.2886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5847647786140442, + "rewards/margins": 2.1655218601226807, + "rewards/rejected": -2.75028657913208, + "step": 5715 + }, + { + "epoch": 0.66, + "learning_rate": 1.0204322664462028e-07, + "logits/chosen": -1.8768537044525146, + "logits/rejected": -2.3056490421295166, + "logps/chosen": -638.035888671875, + "logps/rejected": -350.0102233886719, + "loss": 0.9439, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2478303909301758, + "rewards/margins": 0.12308266758918762, + "rewards/rejected": -1.370913028717041, + "step": 5716 + }, + { + "epoch": 0.67, + "learning_rate": 1.0200779496870202e-07, + "logits/chosen": -1.6574689149856567, + "logits/rejected": -1.5764528512954712, + "logps/chosen": -304.44708251953125, + "logps/rejected": -252.310302734375, + "loss": 0.3659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7468189001083374, + "rewards/margins": 1.9753895998001099, + "rewards/rejected": -2.7222084999084473, + "step": 5717 + }, + { + "epoch": 0.67, + "learning_rate": 1.0197236329278374e-07, + "logits/chosen": -2.071197509765625, + "logits/rejected": -2.26833176612854, + "logps/chosen": -207.89273071289062, + "logps/rejected": -231.1511688232422, + "loss": 0.3212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3369930386543274, + "rewards/margins": 1.5215131044387817, + "rewards/rejected": -1.858506202697754, + "step": 5718 + }, + { + "epoch": 0.67, + "learning_rate": 1.0193693161686546e-07, + "logits/chosen": -2.6907482147216797, + "logits/rejected": -2.679058313369751, + "logps/chosen": -139.7802276611328, + "logps/rejected": -150.3061981201172, + "loss": 0.3158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4210570156574249, + "rewards/margins": 2.383976936340332, + "rewards/rejected": -2.8050339221954346, + "step": 5719 + }, + { + "epoch": 0.67, + "learning_rate": 1.019014999409472e-07, + "logits/chosen": -2.0877857208251953, + "logits/rejected": -2.005148410797119, + "logps/chosen": -295.5303955078125, + "logps/rejected": -239.89822387695312, + "loss": 0.5591, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8500014543533325, + "rewards/margins": 1.6384848356246948, + "rewards/rejected": -2.4884862899780273, + "step": 5720 + }, + { + "epoch": 0.67, + "learning_rate": 1.0186606826502894e-07, + "logits/chosen": -2.07388973236084, + "logits/rejected": -1.7710657119750977, + "logps/chosen": -178.97964477539062, + "logps/rejected": -222.16748046875, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2213088274002075, + "rewards/margins": 1.7739155292510986, + "rewards/rejected": -2.9952239990234375, + "step": 5721 + }, + { + "epoch": 0.67, + "learning_rate": 1.0183063658911066e-07, + "logits/chosen": -2.58060884475708, + "logits/rejected": -2.418449878692627, + "logps/chosen": -396.7721862792969, + "logps/rejected": -360.379150390625, + "loss": 0.2855, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1663193702697754, + "rewards/margins": 3.6210176944732666, + "rewards/rejected": -4.787336826324463, + "step": 5722 + }, + { + "epoch": 0.67, + "learning_rate": 1.0179520491319239e-07, + "logits/chosen": -2.209815263748169, + "logits/rejected": -2.524604558944702, + "logps/chosen": -380.9732971191406, + "logps/rejected": -321.17657470703125, + "loss": 0.2435, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9208568334579468, + "rewards/margins": 2.3090951442718506, + "rewards/rejected": -3.2299516201019287, + "step": 5723 + }, + { + "epoch": 0.67, + "learning_rate": 1.0175977323727411e-07, + "logits/chosen": -1.5827622413635254, + "logits/rejected": -1.6703882217407227, + "logps/chosen": -170.38226318359375, + "logps/rejected": -278.2833251953125, + "loss": 0.7052, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.058730959892273, + "rewards/margins": 0.6451176404953003, + "rewards/rejected": -1.7038486003875732, + "step": 5724 + }, + { + "epoch": 0.67, + "learning_rate": 1.0172434156135584e-07, + "logits/chosen": -2.3528287410736084, + "logits/rejected": -2.4916741847991943, + "logps/chosen": -225.21652221679688, + "logps/rejected": -267.46240234375, + "loss": 0.6027, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6623985767364502, + "rewards/margins": 1.5840626955032349, + "rewards/rejected": -3.2464609146118164, + "step": 5725 + }, + { + "epoch": 0.67, + "learning_rate": 1.0168890988543759e-07, + "logits/chosen": -1.949461817741394, + "logits/rejected": -1.838788628578186, + "logps/chosen": -380.763671875, + "logps/rejected": -406.9295654296875, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5501372814178467, + "rewards/margins": 1.1464478969573975, + "rewards/rejected": -1.6965851783752441, + "step": 5726 + }, + { + "epoch": 0.67, + "learning_rate": 1.0165347820951931e-07, + "logits/chosen": -1.7326855659484863, + "logits/rejected": -1.9960318803787231, + "logps/chosen": -310.2761535644531, + "logps/rejected": -343.4034118652344, + "loss": 0.7927, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6247073411941528, + "rewards/margins": 1.178387999534607, + "rewards/rejected": -2.8030953407287598, + "step": 5727 + }, + { + "epoch": 0.67, + "learning_rate": 1.0161804653360104e-07, + "logits/chosen": -2.3971526622772217, + "logits/rejected": -2.3676724433898926, + "logps/chosen": -293.67315673828125, + "logps/rejected": -314.10626220703125, + "loss": 0.7996, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5329173803329468, + "rewards/margins": 2.017937660217285, + "rewards/rejected": -3.5508551597595215, + "step": 5728 + }, + { + "epoch": 0.67, + "learning_rate": 1.0158261485768277e-07, + "logits/chosen": -2.242605209350586, + "logits/rejected": -2.456092357635498, + "logps/chosen": -219.0775146484375, + "logps/rejected": -238.84564208984375, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7128984928131104, + "rewards/margins": 2.0546960830688477, + "rewards/rejected": -3.767594337463379, + "step": 5729 + }, + { + "epoch": 0.67, + "learning_rate": 1.0154718318176449e-07, + "logits/chosen": -2.6814515590667725, + "logits/rejected": -2.6922030448913574, + "logps/chosen": -206.15452575683594, + "logps/rejected": -221.15023803710938, + "loss": 0.4529, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9341166615486145, + "rewards/margins": 2.8035199642181396, + "rewards/rejected": -3.7376368045806885, + "step": 5730 + }, + { + "epoch": 0.67, + "learning_rate": 1.0151175150584621e-07, + "logits/chosen": -2.334106922149658, + "logits/rejected": -2.215198040008545, + "logps/chosen": -172.55801391601562, + "logps/rejected": -215.14901733398438, + "loss": 0.8135, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7230803966522217, + "rewards/margins": 1.0622026920318604, + "rewards/rejected": -2.785283088684082, + "step": 5731 + }, + { + "epoch": 0.67, + "learning_rate": 1.0147631982992796e-07, + "logits/chosen": -2.4482102394104004, + "logits/rejected": -2.4757003784179688, + "logps/chosen": -472.3151550292969, + "logps/rejected": -328.2801513671875, + "loss": 0.4107, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1007823944091797, + "rewards/margins": 1.7438466548919678, + "rewards/rejected": -2.8446290493011475, + "step": 5732 + }, + { + "epoch": 0.67, + "learning_rate": 1.0144088815400968e-07, + "logits/chosen": -1.73236083984375, + "logits/rejected": -2.256962299346924, + "logps/chosen": -332.712158203125, + "logps/rejected": -170.97012329101562, + "loss": 0.5033, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6615587472915649, + "rewards/margins": 0.9288749694824219, + "rewards/rejected": -1.5904337167739868, + "step": 5733 + }, + { + "epoch": 0.67, + "learning_rate": 1.0140545647809142e-07, + "logits/chosen": -2.6494405269622803, + "logits/rejected": -2.6059999465942383, + "logps/chosen": -133.99842834472656, + "logps/rejected": -172.59774780273438, + "loss": 0.4521, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.001673698425293, + "rewards/margins": 1.8003664016723633, + "rewards/rejected": -2.8020401000976562, + "step": 5734 + }, + { + "epoch": 0.67, + "learning_rate": 1.0137002480217314e-07, + "logits/chosen": -2.567019462585449, + "logits/rejected": -2.537938117980957, + "logps/chosen": -286.87249755859375, + "logps/rejected": -281.37078857421875, + "loss": 0.7006, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6186783313751221, + "rewards/margins": 1.7746076583862305, + "rewards/rejected": -2.3932859897613525, + "step": 5735 + }, + { + "epoch": 0.67, + "learning_rate": 1.0133459312625486e-07, + "logits/chosen": -1.5154378414154053, + "logits/rejected": -2.0747733116149902, + "logps/chosen": -485.62481689453125, + "logps/rejected": -338.37677001953125, + "loss": 0.6459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8394079804420471, + "rewards/margins": 1.0331939458847046, + "rewards/rejected": -1.8726019859313965, + "step": 5736 + }, + { + "epoch": 0.67, + "learning_rate": 1.012991614503366e-07, + "logits/chosen": -2.4816856384277344, + "logits/rejected": -2.587556838989258, + "logps/chosen": -359.301513671875, + "logps/rejected": -298.4873352050781, + "loss": 0.1836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29759326577186584, + "rewards/margins": 3.6978468894958496, + "rewards/rejected": -3.9954400062561035, + "step": 5737 + }, + { + "epoch": 0.67, + "learning_rate": 1.0126372977441833e-07, + "logits/chosen": -2.429328203201294, + "logits/rejected": -2.568686008453369, + "logps/chosen": -238.32521057128906, + "logps/rejected": -162.77139282226562, + "loss": 0.5633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.451534241437912, + "rewards/margins": 1.2890466451644897, + "rewards/rejected": -1.7405807971954346, + "step": 5738 + }, + { + "epoch": 0.67, + "learning_rate": 1.0122829809850005e-07, + "logits/chosen": -2.701542854309082, + "logits/rejected": -2.756730794906616, + "logps/chosen": -244.95867919921875, + "logps/rejected": -237.81527709960938, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7419660091400146, + "rewards/margins": 2.8073973655700684, + "rewards/rejected": -3.549363136291504, + "step": 5739 + }, + { + "epoch": 0.67, + "learning_rate": 1.0119286642258179e-07, + "logits/chosen": -2.395193338394165, + "logits/rejected": -2.1522490978240967, + "logps/chosen": -316.0656433105469, + "logps/rejected": -254.7091827392578, + "loss": 0.3435, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9142875671386719, + "rewards/margins": 1.70399808883667, + "rewards/rejected": -2.618285655975342, + "step": 5740 + }, + { + "epoch": 0.67, + "learning_rate": 1.0115743474666351e-07, + "logits/chosen": -2.492504596710205, + "logits/rejected": -2.5207035541534424, + "logps/chosen": -312.96014404296875, + "logps/rejected": -250.63743591308594, + "loss": 0.3079, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8065988421440125, + "rewards/margins": 1.9257049560546875, + "rewards/rejected": -2.7323038578033447, + "step": 5741 + }, + { + "epoch": 0.67, + "learning_rate": 1.0112200307074523e-07, + "logits/chosen": -2.487619400024414, + "logits/rejected": -2.586833953857422, + "logps/chosen": -230.24822998046875, + "logps/rejected": -269.104736328125, + "loss": 0.2565, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7277302742004395, + "rewards/margins": 2.7000045776367188, + "rewards/rejected": -3.427734613418579, + "step": 5742 + }, + { + "epoch": 0.67, + "learning_rate": 1.0108657139482697e-07, + "logits/chosen": -2.698556423187256, + "logits/rejected": -2.8122189044952393, + "logps/chosen": -125.97235107421875, + "logps/rejected": -179.52749633789062, + "loss": 0.5832, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9974445104598999, + "rewards/margins": 1.5923588275909424, + "rewards/rejected": -2.589803457260132, + "step": 5743 + }, + { + "epoch": 0.67, + "learning_rate": 1.010511397189087e-07, + "logits/chosen": -2.3821825981140137, + "logits/rejected": -2.4445407390594482, + "logps/chosen": -338.0587463378906, + "logps/rejected": -235.01148986816406, + "loss": 0.4868, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3402934074401855, + "rewards/margins": 1.1800202131271362, + "rewards/rejected": -2.5203137397766113, + "step": 5744 + }, + { + "epoch": 0.67, + "learning_rate": 1.0101570804299044e-07, + "logits/chosen": -2.66007399559021, + "logits/rejected": -2.729832649230957, + "logps/chosen": -365.5499267578125, + "logps/rejected": -265.78448486328125, + "loss": 0.7894, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3333251476287842, + "rewards/margins": 0.9008133411407471, + "rewards/rejected": -2.2341384887695312, + "step": 5745 + }, + { + "epoch": 0.67, + "learning_rate": 1.0098027636707216e-07, + "logits/chosen": -1.9044493436813354, + "logits/rejected": -2.294090509414673, + "logps/chosen": -373.70977783203125, + "logps/rejected": -216.33627319335938, + "loss": 0.5365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7308860421180725, + "rewards/margins": 0.5618414878845215, + "rewards/rejected": -1.2927274703979492, + "step": 5746 + }, + { + "epoch": 0.67, + "learning_rate": 1.0094484469115388e-07, + "logits/chosen": -2.390343427658081, + "logits/rejected": -2.6076478958129883, + "logps/chosen": -182.3687286376953, + "logps/rejected": -212.65625, + "loss": 0.8951, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0419673919677734, + "rewards/margins": 0.9703493118286133, + "rewards/rejected": -3.0123167037963867, + "step": 5747 + }, + { + "epoch": 0.67, + "learning_rate": 1.0090941301523562e-07, + "logits/chosen": -1.786605715751648, + "logits/rejected": -2.3089120388031006, + "logps/chosen": -510.920166015625, + "logps/rejected": -332.3559265136719, + "loss": 0.165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5459075570106506, + "rewards/margins": 2.918745517730713, + "rewards/rejected": -3.4646530151367188, + "step": 5748 + }, + { + "epoch": 0.67, + "learning_rate": 1.0087398133931734e-07, + "logits/chosen": -1.976226806640625, + "logits/rejected": -2.018033742904663, + "logps/chosen": -337.2959899902344, + "logps/rejected": -370.3382873535156, + "loss": 0.3711, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8997442722320557, + "rewards/margins": 1.231888771057129, + "rewards/rejected": -3.1316330432891846, + "step": 5749 + }, + { + "epoch": 0.67, + "learning_rate": 1.0083854966339908e-07, + "logits/chosen": -2.8591208457946777, + "logits/rejected": -2.591240167617798, + "logps/chosen": -216.6610870361328, + "logps/rejected": -292.6071472167969, + "loss": 0.2974, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9944829940795898, + "rewards/margins": 3.8813953399658203, + "rewards/rejected": -5.87587833404541, + "step": 5750 + }, + { + "epoch": 0.67, + "learning_rate": 1.0080311798748081e-07, + "logits/chosen": -1.9769082069396973, + "logits/rejected": -2.2968480587005615, + "logps/chosen": -279.29754638671875, + "logps/rejected": -143.7392578125, + "loss": 0.6944, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9152938723564148, + "rewards/margins": 0.4035572409629822, + "rewards/rejected": -1.318851113319397, + "step": 5751 + }, + { + "epoch": 0.67, + "learning_rate": 1.0076768631156253e-07, + "logits/chosen": -2.217264175415039, + "logits/rejected": -2.099442481994629, + "logps/chosen": -249.83682250976562, + "logps/rejected": -359.16790771484375, + "loss": 0.7673, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2593172788619995, + "rewards/margins": 1.5421158075332642, + "rewards/rejected": -2.8014328479766846, + "step": 5752 + }, + { + "epoch": 0.67, + "learning_rate": 1.0073225463564425e-07, + "logits/chosen": -1.978755235671997, + "logits/rejected": -2.0711700916290283, + "logps/chosen": -390.3529968261719, + "logps/rejected": -287.80615234375, + "loss": 0.6873, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.218348503112793, + "rewards/margins": 1.8363316059112549, + "rewards/rejected": -3.054680347442627, + "step": 5753 + }, + { + "epoch": 0.67, + "learning_rate": 1.0069682295972599e-07, + "logits/chosen": -2.6326143741607666, + "logits/rejected": -2.525367021560669, + "logps/chosen": -187.7223663330078, + "logps/rejected": -285.8730773925781, + "loss": 0.4134, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7321880459785461, + "rewards/margins": 2.054845094680786, + "rewards/rejected": -2.7870330810546875, + "step": 5754 + }, + { + "epoch": 0.67, + "learning_rate": 1.0066139128380771e-07, + "logits/chosen": -2.011235237121582, + "logits/rejected": -2.0516176223754883, + "logps/chosen": -293.41400146484375, + "logps/rejected": -198.67410278320312, + "loss": 0.5663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5634838342666626, + "rewards/margins": 0.6417245268821716, + "rewards/rejected": -1.2052083015441895, + "step": 5755 + }, + { + "epoch": 0.67, + "learning_rate": 1.0062595960788946e-07, + "logits/chosen": -2.6013669967651367, + "logits/rejected": -2.6785786151885986, + "logps/chosen": -449.7982482910156, + "logps/rejected": -398.6786804199219, + "loss": 0.2954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9055079221725464, + "rewards/margins": 2.065767288208008, + "rewards/rejected": -2.9712753295898438, + "step": 5756 + }, + { + "epoch": 0.67, + "learning_rate": 1.0059052793197118e-07, + "logits/chosen": -1.6158208847045898, + "logits/rejected": -1.562638521194458, + "logps/chosen": -198.82675170898438, + "logps/rejected": -345.23828125, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6444261074066162, + "rewards/margins": 2.9441826343536377, + "rewards/rejected": -3.588608741760254, + "step": 5757 + }, + { + "epoch": 0.67, + "learning_rate": 1.005550962560529e-07, + "logits/chosen": -2.209902763366699, + "logits/rejected": -2.3361053466796875, + "logps/chosen": -370.64654541015625, + "logps/rejected": -391.99383544921875, + "loss": 0.4246, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4168072938919067, + "rewards/margins": 1.4995262622833252, + "rewards/rejected": -2.9163334369659424, + "step": 5758 + }, + { + "epoch": 0.67, + "learning_rate": 1.0051966458013463e-07, + "logits/chosen": -2.3736722469329834, + "logits/rejected": -2.490983724594116, + "logps/chosen": -403.9136962890625, + "logps/rejected": -280.2528076171875, + "loss": 0.2755, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.048612385988235474, + "rewards/margins": 2.9712984561920166, + "rewards/rejected": -2.9226863384246826, + "step": 5759 + }, + { + "epoch": 0.67, + "learning_rate": 1.0048423290421636e-07, + "logits/chosen": -2.0639231204986572, + "logits/rejected": -2.1353981494903564, + "logps/chosen": -299.6881103515625, + "logps/rejected": -332.575439453125, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0410830974578857, + "rewards/margins": 3.5966620445251465, + "rewards/rejected": -4.637744903564453, + "step": 5760 + }, + { + "epoch": 0.67, + "learning_rate": 1.0044880122829808e-07, + "logits/chosen": -1.9740675687789917, + "logits/rejected": -2.116307020187378, + "logps/chosen": -459.16058349609375, + "logps/rejected": -232.26039123535156, + "loss": 0.3425, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1385475397109985, + "rewards/margins": 2.828944444656372, + "rewards/rejected": -3.967491865158081, + "step": 5761 + }, + { + "epoch": 0.67, + "learning_rate": 1.0041336955237983e-07, + "logits/chosen": -1.720078945159912, + "logits/rejected": -1.577985405921936, + "logps/chosen": -368.5813293457031, + "logps/rejected": -349.7297668457031, + "loss": 0.5375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9944068193435669, + "rewards/margins": 2.064049482345581, + "rewards/rejected": -3.0584561824798584, + "step": 5762 + }, + { + "epoch": 0.67, + "learning_rate": 1.0037793787646156e-07, + "logits/chosen": -2.807548761367798, + "logits/rejected": -2.775440216064453, + "logps/chosen": -306.3334045410156, + "logps/rejected": -274.8289489746094, + "loss": 0.5316, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5152502059936523, + "rewards/margins": 1.0386261940002441, + "rewards/rejected": -2.5538761615753174, + "step": 5763 + }, + { + "epoch": 0.67, + "learning_rate": 1.0034250620054328e-07, + "logits/chosen": -2.4948530197143555, + "logits/rejected": -2.416546106338501, + "logps/chosen": -340.4671325683594, + "logps/rejected": -261.9438171386719, + "loss": 0.3535, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6160755157470703, + "rewards/margins": 1.214086890220642, + "rewards/rejected": -2.830162286758423, + "step": 5764 + }, + { + "epoch": 0.67, + "learning_rate": 1.0030707452462501e-07, + "logits/chosen": -2.817288398742676, + "logits/rejected": -2.9314796924591064, + "logps/chosen": -472.5043640136719, + "logps/rejected": -245.44647216796875, + "loss": 0.3663, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5141590237617493, + "rewards/margins": 1.9853473901748657, + "rewards/rejected": -2.4995064735412598, + "step": 5765 + }, + { + "epoch": 0.67, + "learning_rate": 1.0027164284870674e-07, + "logits/chosen": -2.557020902633667, + "logits/rejected": -2.5298523902893066, + "logps/chosen": -169.0706787109375, + "logps/rejected": -173.798095703125, + "loss": 0.4056, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4015989899635315, + "rewards/margins": 2.3247835636138916, + "rewards/rejected": -2.7263827323913574, + "step": 5766 + }, + { + "epoch": 0.67, + "learning_rate": 1.0023621117278847e-07, + "logits/chosen": -1.7525663375854492, + "logits/rejected": -1.9515842199325562, + "logps/chosen": -408.82598876953125, + "logps/rejected": -384.0552978515625, + "loss": 0.095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2872649133205414, + "rewards/margins": 3.3191471099853516, + "rewards/rejected": -3.606411933898926, + "step": 5767 + }, + { + "epoch": 0.67, + "learning_rate": 1.002007794968702e-07, + "logits/chosen": -2.5796375274658203, + "logits/rejected": -2.450972318649292, + "logps/chosen": -204.9462127685547, + "logps/rejected": -226.3411865234375, + "loss": 0.193, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0458745956420898, + "rewards/margins": 3.495532751083374, + "rewards/rejected": -4.541407108306885, + "step": 5768 + }, + { + "epoch": 0.67, + "learning_rate": 1.0016534782095193e-07, + "logits/chosen": -1.6434109210968018, + "logits/rejected": -2.025432825088501, + "logps/chosen": -314.3719787597656, + "logps/rejected": -228.85870361328125, + "loss": 0.7679, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1968960762023926, + "rewards/margins": 0.8928781747817993, + "rewards/rejected": -2.0897743701934814, + "step": 5769 + }, + { + "epoch": 0.67, + "learning_rate": 1.0012991614503365e-07, + "logits/chosen": -2.8037776947021484, + "logits/rejected": -2.761701822280884, + "logps/chosen": -191.1158447265625, + "logps/rejected": -156.84133911132812, + "loss": 0.4617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7867107391357422, + "rewards/margins": 1.4570469856262207, + "rewards/rejected": -2.243757724761963, + "step": 5770 + }, + { + "epoch": 0.67, + "learning_rate": 1.0009448446911539e-07, + "logits/chosen": -2.1562514305114746, + "logits/rejected": -2.417405366897583, + "logps/chosen": -262.6151123046875, + "logps/rejected": -347.05633544921875, + "loss": 0.3718, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26289331912994385, + "rewards/margins": 2.7212533950805664, + "rewards/rejected": -2.9841468334198, + "step": 5771 + }, + { + "epoch": 0.67, + "learning_rate": 1.0005905279319711e-07, + "logits/chosen": -2.2447824478149414, + "logits/rejected": -2.1203253269195557, + "logps/chosen": -518.9264526367188, + "logps/rejected": -492.086181640625, + "loss": 0.8053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.989099383354187, + "rewards/margins": 1.368168830871582, + "rewards/rejected": -2.3572683334350586, + "step": 5772 + }, + { + "epoch": 0.67, + "learning_rate": 1.0002362111727886e-07, + "logits/chosen": -1.9306023120880127, + "logits/rejected": -1.8628727197647095, + "logps/chosen": -162.7547149658203, + "logps/rejected": -307.2591857910156, + "loss": 0.7782, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.550424575805664, + "rewards/margins": 1.218955636024475, + "rewards/rejected": -2.7693800926208496, + "step": 5773 + }, + { + "epoch": 0.67, + "learning_rate": 9.998818944136058e-08, + "logits/chosen": -2.6193060874938965, + "logits/rejected": -2.563824415206909, + "logps/chosen": -188.22842407226562, + "logps/rejected": -267.38323974609375, + "loss": 0.3796, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4857118129730225, + "rewards/margins": 2.1975202560424805, + "rewards/rejected": -3.683232307434082, + "step": 5774 + }, + { + "epoch": 0.67, + "learning_rate": 9.99527577654423e-08, + "logits/chosen": -1.850621223449707, + "logits/rejected": -1.6796479225158691, + "logps/chosen": -398.70037841796875, + "logps/rejected": -448.2723388671875, + "loss": 0.308, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.884171187877655, + "rewards/margins": 3.823141574859619, + "rewards/rejected": -4.707313060760498, + "step": 5775 + }, + { + "epoch": 0.67, + "learning_rate": 9.991732608952402e-08, + "logits/chosen": -2.561994791030884, + "logits/rejected": -2.3181967735290527, + "logps/chosen": -159.31642150878906, + "logps/rejected": -219.54165649414062, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8138719201087952, + "rewards/margins": 2.1652028560638428, + "rewards/rejected": -2.9790749549865723, + "step": 5776 + }, + { + "epoch": 0.67, + "learning_rate": 9.988189441360576e-08, + "logits/chosen": -2.920694351196289, + "logits/rejected": -2.889153003692627, + "logps/chosen": -316.88543701171875, + "logps/rejected": -282.6190490722656, + "loss": 0.1648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8634724617004395, + "rewards/margins": 2.3018622398376465, + "rewards/rejected": -3.165334939956665, + "step": 5777 + }, + { + "epoch": 0.67, + "learning_rate": 9.984646273768748e-08, + "logits/chosen": -2.0603859424591064, + "logits/rejected": -2.454493999481201, + "logps/chosen": -207.4029998779297, + "logps/rejected": -215.81536865234375, + "loss": 0.2656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5988070964813232, + "rewards/margins": 3.0167288780212402, + "rewards/rejected": -3.6155362129211426, + "step": 5778 + }, + { + "epoch": 0.67, + "learning_rate": 9.981103106176923e-08, + "logits/chosen": -1.7181212902069092, + "logits/rejected": -2.1415064334869385, + "logps/chosen": -410.1514892578125, + "logps/rejected": -361.6053466796875, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014646857976913452, + "rewards/margins": 2.9200191497802734, + "rewards/rejected": -2.9053726196289062, + "step": 5779 + }, + { + "epoch": 0.67, + "learning_rate": 9.977559938585095e-08, + "logits/chosen": -2.12811017036438, + "logits/rejected": -2.302107810974121, + "logps/chosen": -368.584228515625, + "logps/rejected": -360.9226379394531, + "loss": 0.6387, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7874436378479004, + "rewards/margins": 1.0331153869628906, + "rewards/rejected": -2.820558786392212, + "step": 5780 + }, + { + "epoch": 0.67, + "learning_rate": 9.974016770993267e-08, + "logits/chosen": -2.2165122032165527, + "logits/rejected": -2.307837963104248, + "logps/chosen": -227.49978637695312, + "logps/rejected": -218.623779296875, + "loss": 0.2024, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4706130027770996, + "rewards/margins": 2.2196743488311768, + "rewards/rejected": -3.6902875900268555, + "step": 5781 + }, + { + "epoch": 0.67, + "learning_rate": 9.970473603401441e-08, + "logits/chosen": -2.6064822673797607, + "logits/rejected": -2.682154893875122, + "logps/chosen": -375.7645263671875, + "logps/rejected": -335.4135437011719, + "loss": 0.3377, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.144835352897644, + "rewards/margins": 2.792149066925049, + "rewards/rejected": -3.9369847774505615, + "step": 5782 + }, + { + "epoch": 0.67, + "learning_rate": 9.966930435809613e-08, + "logits/chosen": -2.0059328079223633, + "logits/rejected": -1.9022154808044434, + "logps/chosen": -426.1322937011719, + "logps/rejected": -396.5504150390625, + "loss": 0.4509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6676013469696045, + "rewards/margins": 1.9990286827087402, + "rewards/rejected": -2.6666300296783447, + "step": 5783 + }, + { + "epoch": 0.67, + "learning_rate": 9.963387268217785e-08, + "logits/chosen": -2.419198989868164, + "logits/rejected": -2.217604637145996, + "logps/chosen": -271.5042724609375, + "logps/rejected": -337.99688720703125, + "loss": 0.4504, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7376140356063843, + "rewards/margins": 1.9669110774993896, + "rewards/rejected": -2.7045249938964844, + "step": 5784 + }, + { + "epoch": 0.67, + "learning_rate": 9.95984410062596e-08, + "logits/chosen": -2.3259098529815674, + "logits/rejected": -2.6586060523986816, + "logps/chosen": -222.43161010742188, + "logps/rejected": -177.86752319335938, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.209706425666809, + "rewards/margins": 1.0650098323822021, + "rewards/rejected": -2.2747161388397217, + "step": 5785 + }, + { + "epoch": 0.67, + "learning_rate": 9.956300933034132e-08, + "logits/chosen": -2.0341129302978516, + "logits/rejected": -2.269469738006592, + "logps/chosen": -163.59140014648438, + "logps/rejected": -140.89967346191406, + "loss": 0.6506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6979331374168396, + "rewards/margins": 1.7667651176452637, + "rewards/rejected": -2.464698314666748, + "step": 5786 + }, + { + "epoch": 0.67, + "learning_rate": 9.952757765442305e-08, + "logits/chosen": -1.7250542640686035, + "logits/rejected": -1.9845881462097168, + "logps/chosen": -447.96728515625, + "logps/rejected": -397.0347900390625, + "loss": 0.4518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6019169092178345, + "rewards/margins": 2.493518352508545, + "rewards/rejected": -3.095435380935669, + "step": 5787 + }, + { + "epoch": 0.67, + "learning_rate": 9.949214597850478e-08, + "logits/chosen": -2.4451355934143066, + "logits/rejected": -2.253032922744751, + "logps/chosen": -268.4742431640625, + "logps/rejected": -321.53265380859375, + "loss": 0.3656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9998315572738647, + "rewards/margins": 1.940568208694458, + "rewards/rejected": -2.940399646759033, + "step": 5788 + }, + { + "epoch": 0.67, + "learning_rate": 9.94567143025865e-08, + "logits/chosen": -2.178494453430176, + "logits/rejected": -2.2212491035461426, + "logps/chosen": -216.44918823242188, + "logps/rejected": -295.6053161621094, + "loss": 0.3469, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4273236989974976, + "rewards/margins": 2.5931284427642822, + "rewards/rejected": -4.02045202255249, + "step": 5789 + }, + { + "epoch": 0.67, + "learning_rate": 9.942128262666822e-08, + "logits/chosen": -2.5102224349975586, + "logits/rejected": -2.7717175483703613, + "logps/chosen": -282.730712890625, + "logps/rejected": -153.065673828125, + "loss": 0.3454, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10285449028015137, + "rewards/margins": 2.8701112270355225, + "rewards/rejected": -2.972965717315674, + "step": 5790 + }, + { + "epoch": 0.67, + "learning_rate": 9.938585095074997e-08, + "logits/chosen": -2.7636642456054688, + "logits/rejected": -2.610619068145752, + "logps/chosen": -145.7222900390625, + "logps/rejected": -296.0362854003906, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7835463881492615, + "rewards/margins": 3.0779871940612793, + "rewards/rejected": -3.8615336418151855, + "step": 5791 + }, + { + "epoch": 0.67, + "learning_rate": 9.93504192748317e-08, + "logits/chosen": -2.711123466491699, + "logits/rejected": -2.680260181427002, + "logps/chosen": -139.6595458984375, + "logps/rejected": -186.71826171875, + "loss": 0.5611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4655061960220337, + "rewards/margins": 1.5441739559173584, + "rewards/rejected": -2.0096802711486816, + "step": 5792 + }, + { + "epoch": 0.67, + "learning_rate": 9.931498759891343e-08, + "logits/chosen": -2.421576738357544, + "logits/rejected": -2.2816014289855957, + "logps/chosen": -376.7750244140625, + "logps/rejected": -383.32293701171875, + "loss": 0.5136, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2615442276000977, + "rewards/margins": 2.4751136302948, + "rewards/rejected": -3.7366580963134766, + "step": 5793 + }, + { + "epoch": 0.67, + "learning_rate": 9.927955592299515e-08, + "logits/chosen": -2.3425345420837402, + "logits/rejected": -2.477050542831421, + "logps/chosen": -213.98519897460938, + "logps/rejected": -254.2338104248047, + "loss": 0.3272, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7614902257919312, + "rewards/margins": 3.1809847354888916, + "rewards/rejected": -3.9424750804901123, + "step": 5794 + }, + { + "epoch": 0.67, + "learning_rate": 9.924412424707688e-08, + "logits/chosen": -2.056183099746704, + "logits/rejected": -2.490325689315796, + "logps/chosen": -251.58493041992188, + "logps/rejected": -152.64537048339844, + "loss": 0.6667, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2129579782485962, + "rewards/margins": 1.0743306875228882, + "rewards/rejected": -2.2872886657714844, + "step": 5795 + }, + { + "epoch": 0.67, + "learning_rate": 9.92086925711586e-08, + "logits/chosen": -1.7949477434158325, + "logits/rejected": -2.2164885997772217, + "logps/chosen": -382.417236328125, + "logps/rejected": -320.01715087890625, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8480574488639832, + "rewards/margins": 2.1306028366088867, + "rewards/rejected": -2.9786601066589355, + "step": 5796 + }, + { + "epoch": 0.67, + "learning_rate": 9.917326089524035e-08, + "logits/chosen": -2.073305368423462, + "logits/rejected": -2.0590038299560547, + "logps/chosen": -347.04840087890625, + "logps/rejected": -326.3638916015625, + "loss": 0.7199, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.2119827270507812, + "rewards/margins": 0.22591352462768555, + "rewards/rejected": -1.4378962516784668, + "step": 5797 + }, + { + "epoch": 0.67, + "learning_rate": 9.913782921932207e-08, + "logits/chosen": -2.15633487701416, + "logits/rejected": -2.24438738822937, + "logps/chosen": -304.1710510253906, + "logps/rejected": -214.03878784179688, + "loss": 0.2889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5794135332107544, + "rewards/margins": 1.646407127380371, + "rewards/rejected": -2.225820541381836, + "step": 5798 + }, + { + "epoch": 0.67, + "learning_rate": 9.91023975434038e-08, + "logits/chosen": -2.176957845687866, + "logits/rejected": -2.390069007873535, + "logps/chosen": -184.3973388671875, + "logps/rejected": -201.41078186035156, + "loss": 0.7947, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6155750751495361, + "rewards/margins": 1.525986909866333, + "rewards/rejected": -2.141561985015869, + "step": 5799 + }, + { + "epoch": 0.67, + "learning_rate": 9.906696586748553e-08, + "logits/chosen": -2.652984142303467, + "logits/rejected": -2.677906036376953, + "logps/chosen": -155.05079650878906, + "logps/rejected": -227.96412658691406, + "loss": 0.1973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18932411074638367, + "rewards/margins": 2.6940526962280273, + "rewards/rejected": -2.8833768367767334, + "step": 5800 + }, + { + "epoch": 0.67, + "learning_rate": 9.903153419156725e-08, + "logits/chosen": -2.830456018447876, + "logits/rejected": -2.7919225692749023, + "logps/chosen": -179.61117553710938, + "logps/rejected": -232.68484497070312, + "loss": 0.3483, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5737810730934143, + "rewards/margins": 2.6185367107391357, + "rewards/rejected": -3.1923179626464844, + "step": 5801 + }, + { + "epoch": 0.67, + "learning_rate": 9.8996102515649e-08, + "logits/chosen": -1.5814012289047241, + "logits/rejected": -2.0337724685668945, + "logps/chosen": -491.41424560546875, + "logps/rejected": -342.4465026855469, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.913167417049408, + "rewards/margins": 1.7688361406326294, + "rewards/rejected": -2.6820034980773926, + "step": 5802 + }, + { + "epoch": 0.68, + "learning_rate": 9.896067083973072e-08, + "logits/chosen": -2.9493532180786133, + "logits/rejected": -2.9293782711029053, + "logps/chosen": -300.3930969238281, + "logps/rejected": -366.511474609375, + "loss": 0.2287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3780864477157593, + "rewards/margins": 2.9899775981903076, + "rewards/rejected": -3.3680639266967773, + "step": 5803 + }, + { + "epoch": 0.68, + "learning_rate": 9.892523916381244e-08, + "logits/chosen": -2.2271621227264404, + "logits/rejected": -2.051503896713257, + "logps/chosen": -367.313232421875, + "logps/rejected": -414.24249267578125, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2073097229003906, + "rewards/margins": 1.679544448852539, + "rewards/rejected": -2.8868541717529297, + "step": 5804 + }, + { + "epoch": 0.68, + "learning_rate": 9.888980748789418e-08, + "logits/chosen": -2.330772876739502, + "logits/rejected": -2.329786777496338, + "logps/chosen": -168.54380798339844, + "logps/rejected": -269.3568115234375, + "loss": 0.7435, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.604803204536438, + "rewards/margins": 0.6879359483718872, + "rewards/rejected": -2.292739152908325, + "step": 5805 + }, + { + "epoch": 0.68, + "learning_rate": 9.88543758119759e-08, + "logits/chosen": -2.379081964492798, + "logits/rejected": -2.16621470451355, + "logps/chosen": -275.62774658203125, + "logps/rejected": -277.1600036621094, + "loss": 0.3908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4698517620563507, + "rewards/margins": 1.8437947034835815, + "rewards/rejected": -2.3136463165283203, + "step": 5806 + }, + { + "epoch": 0.68, + "learning_rate": 9.881894413605762e-08, + "logits/chosen": -2.249687671661377, + "logits/rejected": -2.333430290222168, + "logps/chosen": -138.70065307617188, + "logps/rejected": -202.5987548828125, + "loss": 0.595, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.474763035774231, + "rewards/margins": 1.8993209600448608, + "rewards/rejected": -3.374083995819092, + "step": 5807 + }, + { + "epoch": 0.68, + "learning_rate": 9.878351246013937e-08, + "logits/chosen": -2.2325475215911865, + "logits/rejected": -2.400157928466797, + "logps/chosen": -258.97601318359375, + "logps/rejected": -214.21539306640625, + "loss": 0.4433, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4454786777496338, + "rewards/margins": 1.8724851608276367, + "rewards/rejected": -3.3179638385772705, + "step": 5808 + }, + { + "epoch": 0.68, + "learning_rate": 9.874808078422109e-08, + "logits/chosen": -1.9814982414245605, + "logits/rejected": -2.259554624557495, + "logps/chosen": -370.0982666015625, + "logps/rejected": -289.6712951660156, + "loss": 0.3645, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7674177289009094, + "rewards/margins": 1.3704586029052734, + "rewards/rejected": -2.137876510620117, + "step": 5809 + }, + { + "epoch": 0.68, + "learning_rate": 9.871264910830283e-08, + "logits/chosen": -3.021476984024048, + "logits/rejected": -3.0162901878356934, + "logps/chosen": -148.9092559814453, + "logps/rejected": -162.7642822265625, + "loss": 0.2438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22747698426246643, + "rewards/margins": 1.9398497343063354, + "rewards/rejected": -2.1673266887664795, + "step": 5810 + }, + { + "epoch": 0.68, + "learning_rate": 9.867721743238455e-08, + "logits/chosen": -2.288134813308716, + "logits/rejected": -2.5542681217193604, + "logps/chosen": -321.8184814453125, + "logps/rejected": -230.3305206298828, + "loss": 0.3836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03234012424945831, + "rewards/margins": 2.430572509765625, + "rewards/rejected": -2.4629125595092773, + "step": 5811 + }, + { + "epoch": 0.68, + "learning_rate": 9.864178575646627e-08, + "logits/chosen": -2.059936046600342, + "logits/rejected": -2.1046040058135986, + "logps/chosen": -271.4300537109375, + "logps/rejected": -309.09722900390625, + "loss": 0.3622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1504054218530655, + "rewards/margins": 1.479260802268982, + "rewards/rejected": -1.6296662092208862, + "step": 5812 + }, + { + "epoch": 0.68, + "learning_rate": 9.860635408054799e-08, + "logits/chosen": -2.626279354095459, + "logits/rejected": -2.7447381019592285, + "logps/chosen": -355.27130126953125, + "logps/rejected": -273.3246154785156, + "loss": 0.2139, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2988759279251099, + "rewards/margins": 2.518878221511841, + "rewards/rejected": -3.8177542686462402, + "step": 5813 + }, + { + "epoch": 0.68, + "learning_rate": 9.857092240462974e-08, + "logits/chosen": -2.65175724029541, + "logits/rejected": -2.5101280212402344, + "logps/chosen": -173.26950073242188, + "logps/rejected": -215.5498046875, + "loss": 0.479, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7574704885482788, + "rewards/margins": 2.199469804763794, + "rewards/rejected": -2.956940174102783, + "step": 5814 + }, + { + "epoch": 0.68, + "learning_rate": 9.853549072871146e-08, + "logits/chosen": -2.599482536315918, + "logits/rejected": -2.360781192779541, + "logps/chosen": -189.71966552734375, + "logps/rejected": -329.5677185058594, + "loss": 0.3286, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2199496030807495, + "rewards/margins": 2.6072840690612793, + "rewards/rejected": -3.8272337913513184, + "step": 5815 + }, + { + "epoch": 0.68, + "learning_rate": 9.85000590527932e-08, + "logits/chosen": -2.5412893295288086, + "logits/rejected": -2.3722496032714844, + "logps/chosen": -333.02581787109375, + "logps/rejected": -285.10479736328125, + "loss": 0.6744, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.001763939857483, + "rewards/margins": 1.6396164894104004, + "rewards/rejected": -2.641380548477173, + "step": 5816 + }, + { + "epoch": 0.68, + "learning_rate": 9.846462737687492e-08, + "logits/chosen": -2.595106840133667, + "logits/rejected": -2.3334808349609375, + "logps/chosen": -210.63128662109375, + "logps/rejected": -287.3044128417969, + "loss": 0.1733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28211739659309387, + "rewards/margins": 5.175088405609131, + "rewards/rejected": -5.457205772399902, + "step": 5817 + }, + { + "epoch": 0.68, + "learning_rate": 9.842919570095664e-08, + "logits/chosen": -2.527214527130127, + "logits/rejected": -2.697998046875, + "logps/chosen": -262.33013916015625, + "logps/rejected": -287.5573425292969, + "loss": 0.2721, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.053454361855983734, + "rewards/margins": 3.3497228622436523, + "rewards/rejected": -3.403177261352539, + "step": 5818 + }, + { + "epoch": 0.68, + "learning_rate": 9.839376402503838e-08, + "logits/chosen": -2.8611392974853516, + "logits/rejected": -2.7315587997436523, + "logps/chosen": -440.97021484375, + "logps/rejected": -381.95416259765625, + "loss": 0.1419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7846784591674805, + "rewards/margins": 2.507371425628662, + "rewards/rejected": -3.2920498847961426, + "step": 5819 + }, + { + "epoch": 0.68, + "learning_rate": 9.835833234912011e-08, + "logits/chosen": -2.6056699752807617, + "logits/rejected": -2.5562195777893066, + "logps/chosen": -331.08892822265625, + "logps/rejected": -412.41973876953125, + "loss": 0.2504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5552920699119568, + "rewards/margins": 4.398438453674316, + "rewards/rejected": -4.95373010635376, + "step": 5820 + }, + { + "epoch": 0.68, + "learning_rate": 9.832290067320184e-08, + "logits/chosen": -1.926238775253296, + "logits/rejected": -2.1735785007476807, + "logps/chosen": -274.7484436035156, + "logps/rejected": -201.3153076171875, + "loss": 0.2772, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7782412767410278, + "rewards/margins": 1.5303986072540283, + "rewards/rejected": -2.3086400032043457, + "step": 5821 + }, + { + "epoch": 0.68, + "learning_rate": 9.828746899728357e-08, + "logits/chosen": -2.6639742851257324, + "logits/rejected": -2.636319398880005, + "logps/chosen": -130.09457397460938, + "logps/rejected": -183.01690673828125, + "loss": 0.5483, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3506728410720825, + "rewards/margins": 0.4640253186225891, + "rewards/rejected": -1.8146979808807373, + "step": 5822 + }, + { + "epoch": 0.68, + "learning_rate": 9.825203732136529e-08, + "logits/chosen": -2.547755241394043, + "logits/rejected": -2.6127548217773438, + "logps/chosen": -281.2243347167969, + "logps/rejected": -359.19256591796875, + "loss": 0.2907, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8423844575881958, + "rewards/margins": 2.339141368865967, + "rewards/rejected": -3.181525707244873, + "step": 5823 + }, + { + "epoch": 0.68, + "learning_rate": 9.821660564544702e-08, + "logits/chosen": -1.5287011861801147, + "logits/rejected": -1.8484904766082764, + "logps/chosen": -386.1397705078125, + "logps/rejected": -261.4129638671875, + "loss": 0.4082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38434702157974243, + "rewards/margins": 1.3657457828521729, + "rewards/rejected": -1.7500927448272705, + "step": 5824 + }, + { + "epoch": 0.68, + "learning_rate": 9.818117396952875e-08, + "logits/chosen": -2.249058723449707, + "logits/rejected": -2.184455633163452, + "logps/chosen": -120.81012725830078, + "logps/rejected": -257.14715576171875, + "loss": 0.3532, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1911541223526, + "rewards/margins": 2.8143506050109863, + "rewards/rejected": -4.005504131317139, + "step": 5825 + }, + { + "epoch": 0.68, + "learning_rate": 9.814574229361049e-08, + "logits/chosen": -2.513272285461426, + "logits/rejected": -2.444352149963379, + "logps/chosen": -151.2674102783203, + "logps/rejected": -227.27883911132812, + "loss": 0.2233, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2101948857307434, + "rewards/margins": 3.5450193881988525, + "rewards/rejected": -3.334824323654175, + "step": 5826 + }, + { + "epoch": 0.68, + "learning_rate": 9.811031061769222e-08, + "logits/chosen": -2.1268792152404785, + "logits/rejected": -2.2992568016052246, + "logps/chosen": -470.65948486328125, + "logps/rejected": -509.82257080078125, + "loss": 0.2224, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7836727499961853, + "rewards/margins": 2.6066439151763916, + "rewards/rejected": -3.3903164863586426, + "step": 5827 + }, + { + "epoch": 0.68, + "learning_rate": 9.807487894177394e-08, + "logits/chosen": -2.2792792320251465, + "logits/rejected": -2.6962006092071533, + "logps/chosen": -259.50274658203125, + "logps/rejected": -131.101806640625, + "loss": 0.6768, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4020997285842896, + "rewards/margins": 0.7297334671020508, + "rewards/rejected": -2.131833076477051, + "step": 5828 + }, + { + "epoch": 0.68, + "learning_rate": 9.803944726585567e-08, + "logits/chosen": -1.9343293905258179, + "logits/rejected": -2.0016207695007324, + "logps/chosen": -589.5328979492188, + "logps/rejected": -499.66583251953125, + "loss": 0.4025, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1232115030288696, + "rewards/margins": 2.042865514755249, + "rewards/rejected": -3.166076898574829, + "step": 5829 + }, + { + "epoch": 0.68, + "learning_rate": 9.800401558993739e-08, + "logits/chosen": -2.8486547470092773, + "logits/rejected": -2.7257375717163086, + "logps/chosen": -163.724853515625, + "logps/rejected": -219.4993438720703, + "loss": 0.2646, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.393648386001587, + "rewards/margins": 2.3560056686401367, + "rewards/rejected": -3.7496540546417236, + "step": 5830 + }, + { + "epoch": 0.68, + "learning_rate": 9.796858391401912e-08, + "logits/chosen": -1.8759996891021729, + "logits/rejected": -1.860798716545105, + "logps/chosen": -254.4556884765625, + "logps/rejected": -210.89381408691406, + "loss": 0.4051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1211622953414917, + "rewards/margins": 1.085024356842041, + "rewards/rejected": -2.2061867713928223, + "step": 5831 + }, + { + "epoch": 0.68, + "learning_rate": 9.793315223810086e-08, + "logits/chosen": -2.4924604892730713, + "logits/rejected": -2.5159144401550293, + "logps/chosen": -287.80181884765625, + "logps/rejected": -158.75582885742188, + "loss": 0.3922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4019641876220703, + "rewards/margins": 1.3297817707061768, + "rewards/rejected": -1.731745958328247, + "step": 5832 + }, + { + "epoch": 0.68, + "learning_rate": 9.78977205621826e-08, + "logits/chosen": -2.306197166442871, + "logits/rejected": -2.6921420097351074, + "logps/chosen": -363.8436584472656, + "logps/rejected": -369.6960754394531, + "loss": 0.3074, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1744704246520996, + "rewards/margins": 1.3661524057388306, + "rewards/rejected": -3.5406227111816406, + "step": 5833 + }, + { + "epoch": 0.68, + "learning_rate": 9.786228888626432e-08, + "logits/chosen": -2.684108257293701, + "logits/rejected": -2.8449478149414062, + "logps/chosen": -187.42185974121094, + "logps/rejected": -179.3641357421875, + "loss": 0.2168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9326674342155457, + "rewards/margins": 1.5117851495742798, + "rewards/rejected": -2.4444527626037598, + "step": 5834 + }, + { + "epoch": 0.68, + "learning_rate": 9.782685721034604e-08, + "logits/chosen": -2.4081640243530273, + "logits/rejected": -2.5438756942749023, + "logps/chosen": -221.11756896972656, + "logps/rejected": -247.10877990722656, + "loss": 0.5358, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8151350617408752, + "rewards/margins": 2.8048768043518066, + "rewards/rejected": -3.620011568069458, + "step": 5835 + }, + { + "epoch": 0.68, + "learning_rate": 9.779142553442777e-08, + "logits/chosen": -2.777893304824829, + "logits/rejected": -2.6634902954101562, + "logps/chosen": -191.167236328125, + "logps/rejected": -323.3246154785156, + "loss": 0.4273, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5514953136444092, + "rewards/margins": 2.842097043991089, + "rewards/rejected": -4.393592357635498, + "step": 5836 + }, + { + "epoch": 0.68, + "learning_rate": 9.775599385850951e-08, + "logits/chosen": -2.401395559310913, + "logits/rejected": -2.4684596061706543, + "logps/chosen": -340.7705383300781, + "logps/rejected": -289.1254577636719, + "loss": 0.5287, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0362420082092285, + "rewards/margins": 1.4623596668243408, + "rewards/rejected": -2.4986016750335693, + "step": 5837 + }, + { + "epoch": 0.68, + "learning_rate": 9.772056218259123e-08, + "logits/chosen": -2.106703758239746, + "logits/rejected": -1.8573567867279053, + "logps/chosen": -280.6707763671875, + "logps/rejected": -290.6792297363281, + "loss": 0.6123, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0737457275390625, + "rewards/margins": 2.191828966140747, + "rewards/rejected": -3.2655749320983887, + "step": 5838 + }, + { + "epoch": 0.68, + "learning_rate": 9.768513050667297e-08, + "logits/chosen": -2.8857665061950684, + "logits/rejected": -2.9276299476623535, + "logps/chosen": -145.70106506347656, + "logps/rejected": -161.30609130859375, + "loss": 0.3633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28052622079849243, + "rewards/margins": 1.4684369564056396, + "rewards/rejected": -1.7489631175994873, + "step": 5839 + }, + { + "epoch": 0.68, + "learning_rate": 9.764969883075469e-08, + "logits/chosen": -2.5134902000427246, + "logits/rejected": -2.482090950012207, + "logps/chosen": -205.07733154296875, + "logps/rejected": -253.4620819091797, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3249451518058777, + "rewards/margins": 3.211097002029419, + "rewards/rejected": -3.5360419750213623, + "step": 5840 + }, + { + "epoch": 0.68, + "learning_rate": 9.761426715483641e-08, + "logits/chosen": -2.4590957164764404, + "logits/rejected": -2.310288429260254, + "logps/chosen": -114.83355712890625, + "logps/rejected": -320.64031982421875, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1090492382645607, + "rewards/margins": 2.796499729156494, + "rewards/rejected": -2.9055488109588623, + "step": 5841 + }, + { + "epoch": 0.68, + "learning_rate": 9.757883547891815e-08, + "logits/chosen": -2.533212900161743, + "logits/rejected": -2.3988802433013916, + "logps/chosen": -417.77557373046875, + "logps/rejected": -426.0582580566406, + "loss": 0.2546, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.031105399131775, + "rewards/margins": 2.0796828269958496, + "rewards/rejected": -3.110788345336914, + "step": 5842 + }, + { + "epoch": 0.68, + "learning_rate": 9.754340380299988e-08, + "logits/chosen": -2.1905882358551025, + "logits/rejected": -2.430752754211426, + "logps/chosen": -194.02206420898438, + "logps/rejected": -177.1234893798828, + "loss": 0.3432, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6669241189956665, + "rewards/margins": 1.2506080865859985, + "rewards/rejected": -2.917532205581665, + "step": 5843 + }, + { + "epoch": 0.68, + "learning_rate": 9.750797212708162e-08, + "logits/chosen": -2.768632411956787, + "logits/rejected": -2.81009578704834, + "logps/chosen": -164.57818603515625, + "logps/rejected": -211.46417236328125, + "loss": 0.2166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9066532850265503, + "rewards/margins": 2.587496757507324, + "rewards/rejected": -3.494150161743164, + "step": 5844 + }, + { + "epoch": 0.68, + "learning_rate": 9.747254045116334e-08, + "logits/chosen": -2.7227959632873535, + "logits/rejected": -2.661351203918457, + "logps/chosen": -242.81210327148438, + "logps/rejected": -290.8025207519531, + "loss": 0.2019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4141229391098022, + "rewards/margins": 3.292353630065918, + "rewards/rejected": -4.706476211547852, + "step": 5845 + }, + { + "epoch": 0.68, + "learning_rate": 9.743710877524506e-08, + "logits/chosen": -2.929567575454712, + "logits/rejected": -2.975761890411377, + "logps/chosen": -123.76954650878906, + "logps/rejected": -234.66558837890625, + "loss": 0.1309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06425062566995621, + "rewards/margins": 3.860234260559082, + "rewards/rejected": -3.9244847297668457, + "step": 5846 + }, + { + "epoch": 0.68, + "learning_rate": 9.74016770993268e-08, + "logits/chosen": -2.9565682411193848, + "logits/rejected": -2.8933205604553223, + "logps/chosen": -232.4525146484375, + "logps/rejected": -238.96649169921875, + "loss": 0.3053, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1003684997558594, + "rewards/margins": 2.5249862670898438, + "rewards/rejected": -3.625354528427124, + "step": 5847 + }, + { + "epoch": 0.68, + "learning_rate": 9.736624542340852e-08, + "logits/chosen": -2.7018609046936035, + "logits/rejected": -2.519109010696411, + "logps/chosen": -264.9702453613281, + "logps/rejected": -498.9739990234375, + "loss": 0.1762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8539862036705017, + "rewards/margins": 4.591601848602295, + "rewards/rejected": -5.445588111877441, + "step": 5848 + }, + { + "epoch": 0.68, + "learning_rate": 9.733081374749025e-08, + "logits/chosen": -3.0254151821136475, + "logits/rejected": -3.0488219261169434, + "logps/chosen": -174.10247802734375, + "logps/rejected": -171.7904052734375, + "loss": 0.4273, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.194596767425537, + "rewards/margins": 1.5572588443756104, + "rewards/rejected": -2.7518558502197266, + "step": 5849 + }, + { + "epoch": 0.68, + "learning_rate": 9.729538207157199e-08, + "logits/chosen": -2.719857692718506, + "logits/rejected": -2.6141107082366943, + "logps/chosen": -447.00201416015625, + "logps/rejected": -362.6184387207031, + "loss": 0.1189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7481001615524292, + "rewards/margins": 2.6477272510528564, + "rewards/rejected": -3.395827531814575, + "step": 5850 + }, + { + "epoch": 0.68, + "learning_rate": 9.725995039565371e-08, + "logits/chosen": -2.04660964012146, + "logits/rejected": -1.894899845123291, + "logps/chosen": -459.4366149902344, + "logps/rejected": -459.6244812011719, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3228460550308228, + "rewards/margins": 1.7490999698638916, + "rewards/rejected": -3.071945905685425, + "step": 5851 + }, + { + "epoch": 0.68, + "learning_rate": 9.722451871973543e-08, + "logits/chosen": -2.043168544769287, + "logits/rejected": -2.140120506286621, + "logps/chosen": -442.5826110839844, + "logps/rejected": -426.6492919921875, + "loss": 0.5071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3536932170391083, + "rewards/margins": 1.730751633644104, + "rewards/rejected": -2.084444999694824, + "step": 5852 + }, + { + "epoch": 0.68, + "learning_rate": 9.718908704381717e-08, + "logits/chosen": -2.7855868339538574, + "logits/rejected": -2.6384315490722656, + "logps/chosen": -197.64749145507812, + "logps/rejected": -299.1425476074219, + "loss": 0.2763, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3158573508262634, + "rewards/margins": 3.7448999881744385, + "rewards/rejected": -4.060757160186768, + "step": 5853 + }, + { + "epoch": 0.68, + "learning_rate": 9.715365536789889e-08, + "logits/chosen": -2.4416534900665283, + "logits/rejected": -2.211107015609741, + "logps/chosen": -187.8308868408203, + "logps/rejected": -230.53701782226562, + "loss": 0.3152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6655954718589783, + "rewards/margins": 1.5582704544067383, + "rewards/rejected": -2.2238659858703613, + "step": 5854 + }, + { + "epoch": 0.68, + "learning_rate": 9.711822369198064e-08, + "logits/chosen": -2.4269604682922363, + "logits/rejected": -2.6616358757019043, + "logps/chosen": -230.49569702148438, + "logps/rejected": -145.82594299316406, + "loss": 1.7174, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.284794569015503, + "rewards/margins": 0.28378283977508545, + "rewards/rejected": -3.568577289581299, + "step": 5855 + }, + { + "epoch": 0.68, + "learning_rate": 9.708279201606236e-08, + "logits/chosen": -2.864290952682495, + "logits/rejected": -2.592101812362671, + "logps/chosen": -170.21739196777344, + "logps/rejected": -339.693603515625, + "loss": 0.5066, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3716148138046265, + "rewards/margins": 1.6630580425262451, + "rewards/rejected": -3.034672737121582, + "step": 5856 + }, + { + "epoch": 0.68, + "learning_rate": 9.704736034014408e-08, + "logits/chosen": -2.1063737869262695, + "logits/rejected": -2.0136847496032715, + "logps/chosen": -264.4392395019531, + "logps/rejected": -284.8487548828125, + "loss": 0.0924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.652680516242981, + "rewards/margins": 2.955890417098999, + "rewards/rejected": -3.6085710525512695, + "step": 5857 + }, + { + "epoch": 0.68, + "learning_rate": 9.70119286642258e-08, + "logits/chosen": -2.2321510314941406, + "logits/rejected": -2.1191959381103516, + "logps/chosen": -237.4574432373047, + "logps/rejected": -234.283447265625, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.017428398132324, + "rewards/margins": 2.4314522743225098, + "rewards/rejected": -4.448881149291992, + "step": 5858 + }, + { + "epoch": 0.68, + "learning_rate": 9.697649698830754e-08, + "logits/chosen": -2.7944183349609375, + "logits/rejected": -2.9035797119140625, + "logps/chosen": -209.94839477539062, + "logps/rejected": -290.82415771484375, + "loss": 0.2371, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4654172658920288, + "rewards/margins": 3.6061437129974365, + "rewards/rejected": -4.071560859680176, + "step": 5859 + }, + { + "epoch": 0.68, + "learning_rate": 9.694106531238926e-08, + "logits/chosen": -2.2708511352539062, + "logits/rejected": -2.408926486968994, + "logps/chosen": -391.6497497558594, + "logps/rejected": -448.88189697265625, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2796616554260254, + "rewards/margins": 4.741923809051514, + "rewards/rejected": -5.021585464477539, + "step": 5860 + }, + { + "epoch": 0.68, + "learning_rate": 9.690563363647101e-08, + "logits/chosen": -2.438347339630127, + "logits/rejected": -2.4875471591949463, + "logps/chosen": -251.97291564941406, + "logps/rejected": -221.66908264160156, + "loss": 0.3331, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5308715105056763, + "rewards/margins": 2.1991844177246094, + "rewards/rejected": -3.7300562858581543, + "step": 5861 + }, + { + "epoch": 0.68, + "learning_rate": 9.687020196055273e-08, + "logits/chosen": -1.7928955554962158, + "logits/rejected": -2.323960065841675, + "logps/chosen": -301.6091003417969, + "logps/rejected": -252.599853515625, + "loss": 0.6005, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0585882663726807, + "rewards/margins": 2.8673648834228516, + "rewards/rejected": -3.9259533882141113, + "step": 5862 + }, + { + "epoch": 0.68, + "learning_rate": 9.683477028463446e-08, + "logits/chosen": -2.190072536468506, + "logits/rejected": -2.344536304473877, + "logps/chosen": -281.3901062011719, + "logps/rejected": -325.34735107421875, + "loss": 0.2686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.79584801197052, + "rewards/margins": 1.9217419624328613, + "rewards/rejected": -2.717589855194092, + "step": 5863 + }, + { + "epoch": 0.68, + "learning_rate": 9.679933860871619e-08, + "logits/chosen": -1.975340485572815, + "logits/rejected": -2.309140682220459, + "logps/chosen": -320.14263916015625, + "logps/rejected": -200.86529541015625, + "loss": 0.3032, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07751323282718658, + "rewards/margins": 2.256333589553833, + "rewards/rejected": -2.1788203716278076, + "step": 5864 + }, + { + "epoch": 0.68, + "learning_rate": 9.676390693279791e-08, + "logits/chosen": -2.949552536010742, + "logits/rejected": -3.009413957595825, + "logps/chosen": -396.7818298339844, + "logps/rejected": -366.3778076171875, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0566915273666382, + "rewards/margins": 4.445048809051514, + "rewards/rejected": -5.501739978790283, + "step": 5865 + }, + { + "epoch": 0.68, + "learning_rate": 9.672847525687964e-08, + "logits/chosen": -2.220644474029541, + "logits/rejected": -2.4218220710754395, + "logps/chosen": -361.0992736816406, + "logps/rejected": -247.60507202148438, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7737730741500854, + "rewards/margins": 0.7254037261009216, + "rewards/rejected": -1.4991768598556519, + "step": 5866 + }, + { + "epoch": 0.68, + "learning_rate": 9.669304358096138e-08, + "logits/chosen": -2.1382663249969482, + "logits/rejected": -2.238375186920166, + "logps/chosen": -284.5999755859375, + "logps/rejected": -163.6244659423828, + "loss": 0.4937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7055432796478271, + "rewards/margins": 0.7947183847427368, + "rewards/rejected": -1.5002617835998535, + "step": 5867 + }, + { + "epoch": 0.68, + "learning_rate": 9.66576119050431e-08, + "logits/chosen": -2.100579261779785, + "logits/rejected": -1.805131435394287, + "logps/chosen": -390.3517150878906, + "logps/rejected": -337.4720764160156, + "loss": 1.2853, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5684128999710083, + "rewards/margins": 0.687651515007019, + "rewards/rejected": -2.2560644149780273, + "step": 5868 + }, + { + "epoch": 0.68, + "learning_rate": 9.662218022912483e-08, + "logits/chosen": -2.3235695362091064, + "logits/rejected": -2.326822280883789, + "logps/chosen": -340.28302001953125, + "logps/rejected": -259.8599853515625, + "loss": 0.3346, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9328219890594482, + "rewards/margins": 1.7996058464050293, + "rewards/rejected": -3.7324278354644775, + "step": 5869 + }, + { + "epoch": 0.68, + "learning_rate": 9.658674855320656e-08, + "logits/chosen": -2.0841453075408936, + "logits/rejected": -1.9630587100982666, + "logps/chosen": -426.2296447753906, + "logps/rejected": -445.25994873046875, + "loss": 0.1472, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4540475010871887, + "rewards/margins": 2.92598295211792, + "rewards/rejected": -3.380030393600464, + "step": 5870 + }, + { + "epoch": 0.68, + "learning_rate": 9.655131687728829e-08, + "logits/chosen": -2.6098687648773193, + "logits/rejected": -2.7706615924835205, + "logps/chosen": -305.1506652832031, + "logps/rejected": -256.80133056640625, + "loss": 0.1673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20819798111915588, + "rewards/margins": 2.841977119445801, + "rewards/rejected": -3.0501749515533447, + "step": 5871 + }, + { + "epoch": 0.68, + "learning_rate": 9.651588520137003e-08, + "logits/chosen": -2.7098987102508545, + "logits/rejected": -2.765056610107422, + "logps/chosen": -206.22410583496094, + "logps/rejected": -188.06324768066406, + "loss": 0.2634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9149890542030334, + "rewards/margins": 1.8388144969940186, + "rewards/rejected": -2.7538037300109863, + "step": 5872 + }, + { + "epoch": 0.68, + "learning_rate": 9.648045352545176e-08, + "logits/chosen": -2.5029172897338867, + "logits/rejected": -1.9610893726348877, + "logps/chosen": -128.87857055664062, + "logps/rejected": -374.7557373046875, + "loss": 0.2696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09530672430992126, + "rewards/margins": 1.943038821220398, + "rewards/rejected": -2.0383455753326416, + "step": 5873 + }, + { + "epoch": 0.68, + "learning_rate": 9.644502184953348e-08, + "logits/chosen": -2.2349438667297363, + "logits/rejected": -2.4086241722106934, + "logps/chosen": -264.426513671875, + "logps/rejected": -361.1946716308594, + "loss": 0.3745, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.366037368774414, + "rewards/margins": 3.4394259452819824, + "rewards/rejected": -4.8054633140563965, + "step": 5874 + }, + { + "epoch": 0.68, + "learning_rate": 9.64095901736152e-08, + "logits/chosen": -2.061450958251953, + "logits/rejected": -2.1520233154296875, + "logps/chosen": -493.27294921875, + "logps/rejected": -373.57012939453125, + "loss": 0.3093, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.632220983505249, + "rewards/margins": 1.282753825187683, + "rewards/rejected": -2.9149746894836426, + "step": 5875 + }, + { + "epoch": 0.68, + "learning_rate": 9.637415849769694e-08, + "logits/chosen": -2.633035182952881, + "logits/rejected": -2.62864089012146, + "logps/chosen": -196.40182495117188, + "logps/rejected": -186.197998046875, + "loss": 0.2644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19892184436321259, + "rewards/margins": 3.5853686332702637, + "rewards/rejected": -3.784290313720703, + "step": 5876 + }, + { + "epoch": 0.68, + "learning_rate": 9.633872682177866e-08, + "logits/chosen": -2.3477790355682373, + "logits/rejected": -2.3270368576049805, + "logps/chosen": -262.3770446777344, + "logps/rejected": -302.68658447265625, + "loss": 0.3899, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1917483806610107, + "rewards/margins": 1.4134128093719482, + "rewards/rejected": -2.605161190032959, + "step": 5877 + }, + { + "epoch": 0.68, + "learning_rate": 9.630329514586041e-08, + "logits/chosen": -2.0783581733703613, + "logits/rejected": -1.8413623571395874, + "logps/chosen": -277.5154113769531, + "logps/rejected": -466.63800048828125, + "loss": 0.265, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.014033585786819458, + "rewards/margins": 1.8185244798660278, + "rewards/rejected": -1.8044910430908203, + "step": 5878 + }, + { + "epoch": 0.68, + "learning_rate": 9.626786346994213e-08, + "logits/chosen": -2.5362746715545654, + "logits/rejected": -2.6752707958221436, + "logps/chosen": -254.61607360839844, + "logps/rejected": -238.74249267578125, + "loss": 0.2132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6979035139083862, + "rewards/margins": 1.9780018329620361, + "rewards/rejected": -2.675905227661133, + "step": 5879 + }, + { + "epoch": 0.68, + "learning_rate": 9.623243179402385e-08, + "logits/chosen": -2.684633731842041, + "logits/rejected": -2.559316635131836, + "logps/chosen": -255.76148986816406, + "logps/rejected": -320.2872314453125, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3142439126968384, + "rewards/margins": 2.8019003868103027, + "rewards/rejected": -4.116144180297852, + "step": 5880 + }, + { + "epoch": 0.68, + "learning_rate": 9.619700011810559e-08, + "logits/chosen": -2.267869710922241, + "logits/rejected": -2.0087978839874268, + "logps/chosen": -428.28167724609375, + "logps/rejected": -306.6273498535156, + "loss": 0.2731, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4562082290649414, + "rewards/margins": 3.063324451446533, + "rewards/rejected": -3.5195324420928955, + "step": 5881 + }, + { + "epoch": 0.68, + "learning_rate": 9.616156844218731e-08, + "logits/chosen": -2.565192699432373, + "logits/rejected": -2.395318031311035, + "logps/chosen": -183.47491455078125, + "logps/rejected": -210.11965942382812, + "loss": 0.132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7746830582618713, + "rewards/margins": 4.530812740325928, + "rewards/rejected": -5.305495738983154, + "step": 5882 + }, + { + "epoch": 0.68, + "learning_rate": 9.612613676626903e-08, + "logits/chosen": -2.3027725219726562, + "logits/rejected": -2.328664779663086, + "logps/chosen": -472.92279052734375, + "logps/rejected": -313.06787109375, + "loss": 0.4451, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1310348510742188, + "rewards/margins": 1.2796835899353027, + "rewards/rejected": -2.4107184410095215, + "step": 5883 + }, + { + "epoch": 0.68, + "learning_rate": 9.609070509035078e-08, + "logits/chosen": -2.7532668113708496, + "logits/rejected": -2.848609447479248, + "logps/chosen": -343.0406799316406, + "logps/rejected": -316.49688720703125, + "loss": 0.2909, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06904959678649902, + "rewards/margins": 2.565916061401367, + "rewards/rejected": -2.496866464614868, + "step": 5884 + }, + { + "epoch": 0.68, + "learning_rate": 9.60552734144325e-08, + "logits/chosen": -2.546437978744507, + "logits/rejected": -2.877047300338745, + "logps/chosen": -287.32904052734375, + "logps/rejected": -207.63685607910156, + "loss": 0.1494, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6403865218162537, + "rewards/margins": 4.052772045135498, + "rewards/rejected": -4.693158149719238, + "step": 5885 + }, + { + "epoch": 0.68, + "learning_rate": 9.601984173851422e-08, + "logits/chosen": -1.9319920539855957, + "logits/rejected": -1.8634850978851318, + "logps/chosen": -294.12646484375, + "logps/rejected": -380.89697265625, + "loss": 0.8134, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4304003715515137, + "rewards/margins": 0.44577091932296753, + "rewards/rejected": -1.876171350479126, + "step": 5886 + }, + { + "epoch": 0.68, + "learning_rate": 9.598441006259596e-08, + "logits/chosen": -2.1247053146362305, + "logits/rejected": -2.233093023300171, + "logps/chosen": -370.36456298828125, + "logps/rejected": -374.5145263671875, + "loss": 0.1664, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43138623237609863, + "rewards/margins": 2.675057888031006, + "rewards/rejected": -3.1064443588256836, + "step": 5887 + }, + { + "epoch": 0.68, + "learning_rate": 9.594897838667768e-08, + "logits/chosen": -2.5947442054748535, + "logits/rejected": -2.422447919845581, + "logps/chosen": -293.9527893066406, + "logps/rejected": -351.14892578125, + "loss": 0.2396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30337369441986084, + "rewards/margins": 2.2513818740844727, + "rewards/rejected": -2.554755687713623, + "step": 5888 + }, + { + "epoch": 0.69, + "learning_rate": 9.59135467107594e-08, + "logits/chosen": -2.32375431060791, + "logits/rejected": -2.4762425422668457, + "logps/chosen": -216.7289276123047, + "logps/rejected": -221.71517944335938, + "loss": 0.6269, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.4433209896087646, + "rewards/margins": 0.9587020874023438, + "rewards/rejected": -3.4020230770111084, + "step": 5889 + }, + { + "epoch": 0.69, + "learning_rate": 9.587811503484115e-08, + "logits/chosen": -1.8865809440612793, + "logits/rejected": -1.92498779296875, + "logps/chosen": -292.66668701171875, + "logps/rejected": -256.88104248046875, + "loss": 1.0313, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3378713130950928, + "rewards/margins": 1.1495935916900635, + "rewards/rejected": -3.4874649047851562, + "step": 5890 + }, + { + "epoch": 0.69, + "learning_rate": 9.584268335892287e-08, + "logits/chosen": -1.949636697769165, + "logits/rejected": -2.1250860691070557, + "logps/chosen": -355.9758605957031, + "logps/rejected": -281.478515625, + "loss": 0.5243, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.42768132686615, + "rewards/margins": 1.5919275283813477, + "rewards/rejected": -3.019608736038208, + "step": 5891 + }, + { + "epoch": 0.69, + "learning_rate": 9.580725168300461e-08, + "logits/chosen": -2.072981119155884, + "logits/rejected": -2.3087215423583984, + "logps/chosen": -335.58642578125, + "logps/rejected": -303.81231689453125, + "loss": 0.3614, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31743279099464417, + "rewards/margins": 1.7269761562347412, + "rewards/rejected": -2.0444087982177734, + "step": 5892 + }, + { + "epoch": 0.69, + "learning_rate": 9.577182000708633e-08, + "logits/chosen": -2.3352630138397217, + "logits/rejected": -2.3733298778533936, + "logps/chosen": -369.0168762207031, + "logps/rejected": -347.66455078125, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41420289874076843, + "rewards/margins": 1.910370111465454, + "rewards/rejected": -2.324573040008545, + "step": 5893 + }, + { + "epoch": 0.69, + "learning_rate": 9.573638833116805e-08, + "logits/chosen": -2.337742805480957, + "logits/rejected": -2.1794371604919434, + "logps/chosen": -359.23370361328125, + "logps/rejected": -289.09222412109375, + "loss": 0.3418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6389098167419434, + "rewards/margins": 2.439901351928711, + "rewards/rejected": -3.0788111686706543, + "step": 5894 + }, + { + "epoch": 0.69, + "learning_rate": 9.570095665524978e-08, + "logits/chosen": -1.9090752601623535, + "logits/rejected": -2.084043025970459, + "logps/chosen": -284.69732666015625, + "logps/rejected": -306.3131103515625, + "loss": 0.4023, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7799218893051147, + "rewards/margins": 2.544426918029785, + "rewards/rejected": -3.3243486881256104, + "step": 5895 + }, + { + "epoch": 0.69, + "learning_rate": 9.566552497933152e-08, + "logits/chosen": -1.9165470600128174, + "logits/rejected": -1.620482325553894, + "logps/chosen": -238.80709838867188, + "logps/rejected": -334.2662353515625, + "loss": 0.5079, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.262253999710083, + "rewards/margins": 1.624863862991333, + "rewards/rejected": -2.887117862701416, + "step": 5896 + }, + { + "epoch": 0.69, + "learning_rate": 9.563009330341325e-08, + "logits/chosen": -1.9619412422180176, + "logits/rejected": -2.1274194717407227, + "logps/chosen": -400.50946044921875, + "logps/rejected": -322.94012451171875, + "loss": 0.4957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.848469614982605, + "rewards/margins": 1.2204712629318237, + "rewards/rejected": -2.068941116333008, + "step": 5897 + }, + { + "epoch": 0.69, + "learning_rate": 9.559466162749498e-08, + "logits/chosen": -2.6760141849517822, + "logits/rejected": -2.6767990589141846, + "logps/chosen": -319.7278137207031, + "logps/rejected": -177.6492919921875, + "loss": 0.2742, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6762881278991699, + "rewards/margins": 2.2938010692596436, + "rewards/rejected": -2.9700894355773926, + "step": 5898 + }, + { + "epoch": 0.69, + "learning_rate": 9.55592299515767e-08, + "logits/chosen": -2.734656810760498, + "logits/rejected": -2.700079917907715, + "logps/chosen": -98.83802032470703, + "logps/rejected": -183.96836853027344, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6390364170074463, + "rewards/margins": 2.0303823947906494, + "rewards/rejected": -2.6694188117980957, + "step": 5899 + }, + { + "epoch": 0.69, + "learning_rate": 9.552379827565843e-08, + "logits/chosen": -2.077681064605713, + "logits/rejected": -1.7779815196990967, + "logps/chosen": -222.34445190429688, + "logps/rejected": -264.60589599609375, + "loss": 0.188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5411868095397949, + "rewards/margins": 2.003406047821045, + "rewards/rejected": -2.54459285736084, + "step": 5900 + }, + { + "epoch": 0.69, + "learning_rate": 9.548836659974016e-08, + "logits/chosen": -2.422165870666504, + "logits/rejected": -2.2710418701171875, + "logps/chosen": -190.32870483398438, + "logps/rejected": -237.65179443359375, + "loss": 0.3044, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5092857480049133, + "rewards/margins": 1.8819029331207275, + "rewards/rejected": -2.391188621520996, + "step": 5901 + }, + { + "epoch": 0.69, + "learning_rate": 9.54529349238219e-08, + "logits/chosen": -2.528597831726074, + "logits/rejected": -2.565235137939453, + "logps/chosen": -314.50408935546875, + "logps/rejected": -234.3599395751953, + "loss": 0.3795, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5634748339653015, + "rewards/margins": 1.376401662826538, + "rewards/rejected": -1.9398764371871948, + "step": 5902 + }, + { + "epoch": 0.69, + "learning_rate": 9.541750324790362e-08, + "logits/chosen": -2.3112144470214844, + "logits/rejected": -2.413743019104004, + "logps/chosen": -217.50584411621094, + "logps/rejected": -243.25726318359375, + "loss": 0.1812, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17611321806907654, + "rewards/margins": 2.6496267318725586, + "rewards/rejected": -2.825739622116089, + "step": 5903 + }, + { + "epoch": 0.69, + "learning_rate": 9.538207157198535e-08, + "logits/chosen": -1.55003821849823, + "logits/rejected": -1.8767004013061523, + "logps/chosen": -439.70892333984375, + "logps/rejected": -420.8730163574219, + "loss": 0.57, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4753055572509766, + "rewards/margins": 0.8566839694976807, + "rewards/rejected": -2.3319895267486572, + "step": 5904 + }, + { + "epoch": 0.69, + "learning_rate": 9.534663989606708e-08, + "logits/chosen": -1.8701086044311523, + "logits/rejected": -2.1864428520202637, + "logps/chosen": -237.4652099609375, + "logps/rejected": -232.73223876953125, + "loss": 0.5811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8477592468261719, + "rewards/margins": 2.560506820678711, + "rewards/rejected": -3.408266067504883, + "step": 5905 + }, + { + "epoch": 0.69, + "learning_rate": 9.53112082201488e-08, + "logits/chosen": -1.9162864685058594, + "logits/rejected": -2.178011417388916, + "logps/chosen": -410.37445068359375, + "logps/rejected": -260.7994079589844, + "loss": 0.7261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6745463609695435, + "rewards/margins": 1.3188854455947876, + "rewards/rejected": -1.9934319257736206, + "step": 5906 + }, + { + "epoch": 0.69, + "learning_rate": 9.527577654423055e-08, + "logits/chosen": -2.1639902591705322, + "logits/rejected": -2.3871755599975586, + "logps/chosen": -133.58465576171875, + "logps/rejected": -126.93550109863281, + "loss": 0.4765, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9120819568634033, + "rewards/margins": 1.528993844985962, + "rewards/rejected": -2.4410760402679443, + "step": 5907 + }, + { + "epoch": 0.69, + "learning_rate": 9.524034486831227e-08, + "logits/chosen": -2.173604726791382, + "logits/rejected": -1.8360134363174438, + "logps/chosen": -99.08837890625, + "logps/rejected": -266.2565002441406, + "loss": 0.4014, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5634245872497559, + "rewards/margins": 2.742495059967041, + "rewards/rejected": -4.305919170379639, + "step": 5908 + }, + { + "epoch": 0.69, + "learning_rate": 9.5204913192394e-08, + "logits/chosen": -2.8517205715179443, + "logits/rejected": -2.60160493850708, + "logps/chosen": -89.30880737304688, + "logps/rejected": -179.57623291015625, + "loss": 0.5477, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.708245038986206, + "rewards/margins": 0.9475343227386475, + "rewards/rejected": -2.6557793617248535, + "step": 5909 + }, + { + "epoch": 0.69, + "learning_rate": 9.516948151647573e-08, + "logits/chosen": -2.0299699306488037, + "logits/rejected": -2.201125383377075, + "logps/chosen": -273.859619140625, + "logps/rejected": -299.3193359375, + "loss": 0.4466, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12934082746505737, + "rewards/margins": 1.8654190301895142, + "rewards/rejected": -1.9947597980499268, + "step": 5910 + }, + { + "epoch": 0.69, + "learning_rate": 9.513404984055745e-08, + "logits/chosen": -2.2381529808044434, + "logits/rejected": -2.412238597869873, + "logps/chosen": -181.9948272705078, + "logps/rejected": -161.71131896972656, + "loss": 0.2716, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.194922685623169, + "rewards/margins": 1.8007724285125732, + "rewards/rejected": -2.995695114135742, + "step": 5911 + }, + { + "epoch": 0.69, + "learning_rate": 9.509861816463917e-08, + "logits/chosen": -2.2182302474975586, + "logits/rejected": -2.3002400398254395, + "logps/chosen": -222.30081176757812, + "logps/rejected": -221.53790283203125, + "loss": 1.4641, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.611617088317871, + "rewards/margins": 0.47977834939956665, + "rewards/rejected": -2.091395378112793, + "step": 5912 + }, + { + "epoch": 0.69, + "learning_rate": 9.506318648872092e-08, + "logits/chosen": -2.3533477783203125, + "logits/rejected": -2.264291763305664, + "logps/chosen": -256.43707275390625, + "logps/rejected": -245.60302734375, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4704910218715668, + "rewards/margins": 2.043818712234497, + "rewards/rejected": -2.514309883117676, + "step": 5913 + }, + { + "epoch": 0.69, + "learning_rate": 9.502775481280264e-08, + "logits/chosen": -2.201035976409912, + "logits/rejected": -2.378178596496582, + "logps/chosen": -314.2226257324219, + "logps/rejected": -242.24322509765625, + "loss": 0.6851, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4087984561920166, + "rewards/margins": 1.5760602951049805, + "rewards/rejected": -2.984858751296997, + "step": 5914 + }, + { + "epoch": 0.69, + "learning_rate": 9.499232313688438e-08, + "logits/chosen": -2.6358203887939453, + "logits/rejected": -2.5250744819641113, + "logps/chosen": -255.57733154296875, + "logps/rejected": -356.587158203125, + "loss": 0.5078, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2061302661895752, + "rewards/margins": 1.6427199840545654, + "rewards/rejected": -2.8488500118255615, + "step": 5915 + }, + { + "epoch": 0.69, + "learning_rate": 9.49568914609661e-08, + "logits/chosen": -1.6475694179534912, + "logits/rejected": -1.6061614751815796, + "logps/chosen": -402.5577087402344, + "logps/rejected": -407.996337890625, + "loss": 0.9326, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.743146538734436, + "rewards/margins": 0.970516562461853, + "rewards/rejected": -2.713663101196289, + "step": 5916 + }, + { + "epoch": 0.69, + "learning_rate": 9.492145978504782e-08, + "logits/chosen": -2.179629325866699, + "logits/rejected": -2.039102554321289, + "logps/chosen": -301.76422119140625, + "logps/rejected": -389.2252197265625, + "loss": 0.271, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1813288927078247, + "rewards/margins": 2.287900447845459, + "rewards/rejected": -2.4692294597625732, + "step": 5917 + }, + { + "epoch": 0.69, + "learning_rate": 9.488602810912956e-08, + "logits/chosen": -2.7107768058776855, + "logits/rejected": -2.750114679336548, + "logps/chosen": -294.894287109375, + "logps/rejected": -280.7994689941406, + "loss": 0.4793, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.198390245437622, + "rewards/margins": 3.3883914947509766, + "rewards/rejected": -4.5867815017700195, + "step": 5918 + }, + { + "epoch": 0.69, + "learning_rate": 9.485059643321129e-08, + "logits/chosen": -2.4870729446411133, + "logits/rejected": -2.305002212524414, + "logps/chosen": -389.4294128417969, + "logps/rejected": -433.03057861328125, + "loss": 0.2701, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.527874767780304, + "rewards/margins": 2.127601385116577, + "rewards/rejected": -2.6554760932922363, + "step": 5919 + }, + { + "epoch": 0.69, + "learning_rate": 9.481516475729301e-08, + "logits/chosen": -2.9721598625183105, + "logits/rejected": -3.0318779945373535, + "logps/chosen": -175.3630828857422, + "logps/rejected": -223.1979522705078, + "loss": 0.2474, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6491037607192993, + "rewards/margins": 2.826045274734497, + "rewards/rejected": -3.475148916244507, + "step": 5920 + }, + { + "epoch": 0.69, + "learning_rate": 9.477973308137475e-08, + "logits/chosen": -1.7583205699920654, + "logits/rejected": -1.9558253288269043, + "logps/chosen": -586.5169677734375, + "logps/rejected": -455.55072021484375, + "loss": 0.3244, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0194284915924072, + "rewards/margins": 2.181220531463623, + "rewards/rejected": -3.200648784637451, + "step": 5921 + }, + { + "epoch": 0.69, + "learning_rate": 9.474430140545647e-08, + "logits/chosen": -2.657351493835449, + "logits/rejected": -2.9368834495544434, + "logps/chosen": -863.09912109375, + "logps/rejected": -168.06103515625, + "loss": 0.5833, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3771238327026367, + "rewards/margins": 1.4492801427841187, + "rewards/rejected": -2.826404094696045, + "step": 5922 + }, + { + "epoch": 0.69, + "learning_rate": 9.47088697295382e-08, + "logits/chosen": -2.249117851257324, + "logits/rejected": -2.3774282932281494, + "logps/chosen": -428.8744812011719, + "logps/rejected": -438.18658447265625, + "loss": 0.647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8838959336280823, + "rewards/margins": 1.1071912050247192, + "rewards/rejected": -1.9910871982574463, + "step": 5923 + }, + { + "epoch": 0.69, + "learning_rate": 9.467343805361993e-08, + "logits/chosen": -2.333578586578369, + "logits/rejected": -2.4373106956481934, + "logps/chosen": -289.1213073730469, + "logps/rejected": -202.0303497314453, + "loss": 0.3034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43871447443962097, + "rewards/margins": 2.020746946334839, + "rewards/rejected": -2.4594614505767822, + "step": 5924 + }, + { + "epoch": 0.69, + "learning_rate": 9.463800637770166e-08, + "logits/chosen": -2.077951192855835, + "logits/rejected": -1.8953227996826172, + "logps/chosen": -217.91490173339844, + "logps/rejected": -287.54241943359375, + "loss": 0.2732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4839100241661072, + "rewards/margins": 2.426374912261963, + "rewards/rejected": -2.9102847576141357, + "step": 5925 + }, + { + "epoch": 0.69, + "learning_rate": 9.46025747017834e-08, + "logits/chosen": -1.8402951955795288, + "logits/rejected": -1.91623854637146, + "logps/chosen": -505.0852355957031, + "logps/rejected": -452.1322326660156, + "loss": 0.4046, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7689120173454285, + "rewards/margins": 1.7076504230499268, + "rewards/rejected": -2.4765625, + "step": 5926 + }, + { + "epoch": 0.69, + "learning_rate": 9.456714302586512e-08, + "logits/chosen": -1.6595100164413452, + "logits/rejected": -1.982163667678833, + "logps/chosen": -293.1182861328125, + "logps/rejected": -265.4901123046875, + "loss": 0.2295, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2739243507385254, + "rewards/margins": 2.196908473968506, + "rewards/rejected": -3.4708328247070312, + "step": 5927 + }, + { + "epoch": 0.69, + "learning_rate": 9.453171134994684e-08, + "logits/chosen": -2.3765406608581543, + "logits/rejected": -2.3280892372131348, + "logps/chosen": -230.54444885253906, + "logps/rejected": -198.10858154296875, + "loss": 0.3386, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16764776408672333, + "rewards/margins": 1.4049782752990723, + "rewards/rejected": -1.5726261138916016, + "step": 5928 + }, + { + "epoch": 0.69, + "learning_rate": 9.449627967402857e-08, + "logits/chosen": -2.28157901763916, + "logits/rejected": -2.579763650894165, + "logps/chosen": -138.90377807617188, + "logps/rejected": -216.56285095214844, + "loss": 0.2334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.683760404586792, + "rewards/margins": 3.10937762260437, + "rewards/rejected": -3.793138027191162, + "step": 5929 + }, + { + "epoch": 0.69, + "learning_rate": 9.44608479981103e-08, + "logits/chosen": -2.830928087234497, + "logits/rejected": -2.681988000869751, + "logps/chosen": -670.5145874023438, + "logps/rejected": -298.57373046875, + "loss": 0.3143, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9294040203094482, + "rewards/margins": 1.8117976188659668, + "rewards/rejected": -3.741201639175415, + "step": 5930 + }, + { + "epoch": 0.69, + "learning_rate": 9.442541632219204e-08, + "logits/chosen": -2.1685876846313477, + "logits/rejected": -2.0510852336883545, + "logps/chosen": -136.60537719726562, + "logps/rejected": -192.06222534179688, + "loss": 0.2329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8772600293159485, + "rewards/margins": 2.285426139831543, + "rewards/rejected": -3.1626861095428467, + "step": 5931 + }, + { + "epoch": 0.69, + "learning_rate": 9.438998464627377e-08, + "logits/chosen": -2.0891244411468506, + "logits/rejected": -2.65095853805542, + "logps/chosen": -462.6180419921875, + "logps/rejected": -302.8314208984375, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9913123846054077, + "rewards/margins": 2.433444023132324, + "rewards/rejected": -3.4247565269470215, + "step": 5932 + }, + { + "epoch": 0.69, + "learning_rate": 9.43545529703555e-08, + "logits/chosen": -1.6498281955718994, + "logits/rejected": -1.9068920612335205, + "logps/chosen": -588.326904296875, + "logps/rejected": -409.0146179199219, + "loss": 1.189, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.684700846672058, + "rewards/margins": 0.0992247462272644, + "rewards/rejected": -1.7839255332946777, + "step": 5933 + }, + { + "epoch": 0.69, + "learning_rate": 9.431912129443722e-08, + "logits/chosen": -2.5157723426818848, + "logits/rejected": -2.7526533603668213, + "logps/chosen": -173.66766357421875, + "logps/rejected": -119.85406494140625, + "loss": 0.3312, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6468674540519714, + "rewards/margins": 1.2184251546859741, + "rewards/rejected": -1.8652925491333008, + "step": 5934 + }, + { + "epoch": 0.69, + "learning_rate": 9.428368961851895e-08, + "logits/chosen": -2.8032970428466797, + "logits/rejected": -2.7056663036346436, + "logps/chosen": -264.3237609863281, + "logps/rejected": -271.6268310546875, + "loss": 0.2401, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9757487177848816, + "rewards/margins": 2.0539379119873047, + "rewards/rejected": -3.029686689376831, + "step": 5935 + }, + { + "epoch": 0.69, + "learning_rate": 9.424825794260067e-08, + "logits/chosen": -1.95194411277771, + "logits/rejected": -2.1157774925231934, + "logps/chosen": -157.1025390625, + "logps/rejected": -210.97959899902344, + "loss": 1.1956, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0827066898345947, + "rewards/margins": 1.262181043624878, + "rewards/rejected": -3.3448877334594727, + "step": 5936 + }, + { + "epoch": 0.69, + "learning_rate": 9.421282626668241e-08, + "logits/chosen": -2.865729808807373, + "logits/rejected": -2.614572048187256, + "logps/chosen": -99.10787963867188, + "logps/rejected": -151.16262817382812, + "loss": 0.268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41255316138267517, + "rewards/margins": 1.4948984384536743, + "rewards/rejected": -1.9074515104293823, + "step": 5937 + }, + { + "epoch": 0.69, + "learning_rate": 9.417739459076414e-08, + "logits/chosen": -2.282487392425537, + "logits/rejected": -2.684597969055176, + "logps/chosen": -341.38702392578125, + "logps/rejected": -275.035400390625, + "loss": 0.2266, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.064897060394287, + "rewards/margins": 3.2950215339660645, + "rewards/rejected": -4.359918594360352, + "step": 5938 + }, + { + "epoch": 0.69, + "learning_rate": 9.414196291484587e-08, + "logits/chosen": -2.266268730163574, + "logits/rejected": -2.3414816856384277, + "logps/chosen": -443.3470458984375, + "logps/rejected": -358.99951171875, + "loss": 0.3121, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2478749752044678, + "rewards/margins": 1.345739483833313, + "rewards/rejected": -2.5936145782470703, + "step": 5939 + }, + { + "epoch": 0.69, + "learning_rate": 9.410653123892759e-08, + "logits/chosen": -2.692594289779663, + "logits/rejected": -2.5957717895507812, + "logps/chosen": -113.17381286621094, + "logps/rejected": -198.43589782714844, + "loss": 0.3994, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35381466150283813, + "rewards/margins": 2.577329158782959, + "rewards/rejected": -2.9311442375183105, + "step": 5940 + }, + { + "epoch": 0.69, + "learning_rate": 9.407109956300932e-08, + "logits/chosen": -2.719303846359253, + "logits/rejected": -2.3532893657684326, + "logps/chosen": -215.58489990234375, + "logps/rejected": -243.45327758789062, + "loss": 0.2119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0949797630310059, + "rewards/margins": 2.5842397212982178, + "rewards/rejected": -3.6792197227478027, + "step": 5941 + }, + { + "epoch": 0.69, + "learning_rate": 9.403566788709105e-08, + "logits/chosen": -2.4591453075408936, + "logits/rejected": -2.3354687690734863, + "logps/chosen": -227.62564086914062, + "logps/rejected": -327.6260681152344, + "loss": 0.1955, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2488445043563843, + "rewards/margins": 2.28767728805542, + "rewards/rejected": -3.5365219116210938, + "step": 5942 + }, + { + "epoch": 0.69, + "learning_rate": 9.40002362111728e-08, + "logits/chosen": -2.248875379562378, + "logits/rejected": -2.5448269844055176, + "logps/chosen": -231.5139617919922, + "logps/rejected": -216.9073486328125, + "loss": 0.5503, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5718295574188232, + "rewards/margins": 1.9952366352081299, + "rewards/rejected": -2.567065954208374, + "step": 5943 + }, + { + "epoch": 0.69, + "learning_rate": 9.396480453525452e-08, + "logits/chosen": -2.1362948417663574, + "logits/rejected": -2.390921115875244, + "logps/chosen": -313.90802001953125, + "logps/rejected": -359.3048095703125, + "loss": 0.5642, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40796053409576416, + "rewards/margins": 2.4602465629577637, + "rewards/rejected": -2.8682069778442383, + "step": 5944 + }, + { + "epoch": 0.69, + "learning_rate": 9.392937285933624e-08, + "logits/chosen": -2.5148916244506836, + "logits/rejected": -2.684272289276123, + "logps/chosen": -334.59991455078125, + "logps/rejected": -188.01336669921875, + "loss": 0.5003, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.575730323791504, + "rewards/margins": 1.7875216007232666, + "rewards/rejected": -3.3632519245147705, + "step": 5945 + }, + { + "epoch": 0.69, + "learning_rate": 9.389394118341797e-08, + "logits/chosen": -2.01924729347229, + "logits/rejected": -2.026073932647705, + "logps/chosen": -311.9098205566406, + "logps/rejected": -313.757568359375, + "loss": 0.2504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9026853442192078, + "rewards/margins": 2.4526865482330322, + "rewards/rejected": -3.3553714752197266, + "step": 5946 + }, + { + "epoch": 0.69, + "learning_rate": 9.38585095074997e-08, + "logits/chosen": -2.741791248321533, + "logits/rejected": -2.587472915649414, + "logps/chosen": -362.07330322265625, + "logps/rejected": -329.809814453125, + "loss": 0.4363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04341205954551697, + "rewards/margins": 2.5221872329711914, + "rewards/rejected": -2.565599203109741, + "step": 5947 + }, + { + "epoch": 0.69, + "learning_rate": 9.382307783158143e-08, + "logits/chosen": -2.2487430572509766, + "logits/rejected": -2.1011388301849365, + "logps/chosen": -328.9737548828125, + "logps/rejected": -481.4882507324219, + "loss": 0.5682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7789228558540344, + "rewards/margins": 1.8237905502319336, + "rewards/rejected": -2.6027135848999023, + "step": 5948 + }, + { + "epoch": 0.69, + "learning_rate": 9.378764615566317e-08, + "logits/chosen": -2.1852614879608154, + "logits/rejected": -2.221254825592041, + "logps/chosen": -403.380126953125, + "logps/rejected": -393.04730224609375, + "loss": 0.4, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6940209269523621, + "rewards/margins": 2.767766237258911, + "rewards/rejected": -3.461787223815918, + "step": 5949 + }, + { + "epoch": 0.69, + "learning_rate": 9.375221447974489e-08, + "logits/chosen": -1.5681110620498657, + "logits/rejected": -1.8948683738708496, + "logps/chosen": -428.328857421875, + "logps/rejected": -341.12884521484375, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.324679970741272, + "rewards/margins": 1.9649741649627686, + "rewards/rejected": -2.28965425491333, + "step": 5950 + }, + { + "epoch": 0.69, + "learning_rate": 9.371678280382661e-08, + "logits/chosen": -2.9862968921661377, + "logits/rejected": -3.0504016876220703, + "logps/chosen": -140.73727416992188, + "logps/rejected": -198.33126831054688, + "loss": 0.39, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8614821434020996, + "rewards/margins": 2.6209652423858643, + "rewards/rejected": -3.482447624206543, + "step": 5951 + }, + { + "epoch": 0.69, + "learning_rate": 9.368135112790835e-08, + "logits/chosen": -1.6383451223373413, + "logits/rejected": -1.7701480388641357, + "logps/chosen": -593.3883056640625, + "logps/rejected": -452.3918762207031, + "loss": 1.2717, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.3469135761260986, + "rewards/margins": -0.1748369336128235, + "rewards/rejected": -1.1720765829086304, + "step": 5952 + }, + { + "epoch": 0.69, + "learning_rate": 9.364591945199007e-08, + "logits/chosen": -2.013572931289673, + "logits/rejected": -1.9564919471740723, + "logps/chosen": -215.31222534179688, + "logps/rejected": -214.2820587158203, + "loss": 0.8383, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1268084049224854, + "rewards/margins": 0.5475807189941406, + "rewards/rejected": -2.674389123916626, + "step": 5953 + }, + { + "epoch": 0.69, + "learning_rate": 9.361048777607182e-08, + "logits/chosen": -1.7631113529205322, + "logits/rejected": -1.8286938667297363, + "logps/chosen": -398.39862060546875, + "logps/rejected": -356.8348693847656, + "loss": 0.2042, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5517328381538391, + "rewards/margins": 2.7781646251678467, + "rewards/rejected": -3.329897403717041, + "step": 5954 + }, + { + "epoch": 0.69, + "learning_rate": 9.357505610015354e-08, + "logits/chosen": -2.437823534011841, + "logits/rejected": -2.403437376022339, + "logps/chosen": -167.44659423828125, + "logps/rejected": -213.57162475585938, + "loss": 0.5455, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5479899644851685, + "rewards/margins": 1.2420594692230225, + "rewards/rejected": -2.7900495529174805, + "step": 5955 + }, + { + "epoch": 0.69, + "learning_rate": 9.353962442423526e-08, + "logits/chosen": -2.013030529022217, + "logits/rejected": -2.059264898300171, + "logps/chosen": -263.68426513671875, + "logps/rejected": -253.4541015625, + "loss": 0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4490455389022827, + "rewards/margins": 2.705362319946289, + "rewards/rejected": -3.1544079780578613, + "step": 5956 + }, + { + "epoch": 0.69, + "learning_rate": 9.350419274831698e-08, + "logits/chosen": -2.433872699737549, + "logits/rejected": -2.173637628555298, + "logps/chosen": -168.36306762695312, + "logps/rejected": -218.60040283203125, + "loss": 0.4971, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2994093894958496, + "rewards/margins": 1.480369210243225, + "rewards/rejected": -2.779778480529785, + "step": 5957 + }, + { + "epoch": 0.69, + "learning_rate": 9.346876107239872e-08, + "logits/chosen": -2.1994478702545166, + "logits/rejected": -2.126127004623413, + "logps/chosen": -261.6136779785156, + "logps/rejected": -419.21502685546875, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7895863056182861, + "rewards/margins": 4.078079700469971, + "rewards/rejected": -4.867666244506836, + "step": 5958 + }, + { + "epoch": 0.69, + "learning_rate": 9.343332939648044e-08, + "logits/chosen": -1.7941107749938965, + "logits/rejected": -1.8441176414489746, + "logps/chosen": -307.96466064453125, + "logps/rejected": -352.8614501953125, + "loss": 0.7728, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5004740953445435, + "rewards/margins": 1.545682668685913, + "rewards/rejected": -3.046156883239746, + "step": 5959 + }, + { + "epoch": 0.69, + "learning_rate": 9.339789772056219e-08, + "logits/chosen": -2.652026653289795, + "logits/rejected": -2.6799988746643066, + "logps/chosen": -278.9390869140625, + "logps/rejected": -332.3306579589844, + "loss": 0.518, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1404473781585693, + "rewards/margins": 2.0784149169921875, + "rewards/rejected": -3.218862295150757, + "step": 5960 + }, + { + "epoch": 0.69, + "learning_rate": 9.336246604464391e-08, + "logits/chosen": -2.4938302040100098, + "logits/rejected": -2.380173683166504, + "logps/chosen": -304.5406494140625, + "logps/rejected": -317.55615234375, + "loss": 0.2866, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6309347152709961, + "rewards/margins": 1.4928696155548096, + "rewards/rejected": -2.1238043308258057, + "step": 5961 + }, + { + "epoch": 0.69, + "learning_rate": 9.332703436872563e-08, + "logits/chosen": -2.2930479049682617, + "logits/rejected": -2.491283416748047, + "logps/chosen": -245.7716827392578, + "logps/rejected": -301.66522216796875, + "loss": 0.1723, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.13448166847229, + "rewards/margins": 3.241150140762329, + "rewards/rejected": -4.375631809234619, + "step": 5962 + }, + { + "epoch": 0.69, + "learning_rate": 9.329160269280737e-08, + "logits/chosen": -2.1727917194366455, + "logits/rejected": -2.5466363430023193, + "logps/chosen": -453.0307922363281, + "logps/rejected": -359.4427185058594, + "loss": 0.2969, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.494231104850769, + "rewards/margins": 2.26005220413208, + "rewards/rejected": -3.7542834281921387, + "step": 5963 + }, + { + "epoch": 0.69, + "learning_rate": 9.325617101688909e-08, + "logits/chosen": -2.8971292972564697, + "logits/rejected": -2.8799123764038086, + "logps/chosen": -292.66351318359375, + "logps/rejected": -248.83889770507812, + "loss": 0.1071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.54169499874115, + "rewards/margins": 3.625748634338379, + "rewards/rejected": -5.16744327545166, + "step": 5964 + }, + { + "epoch": 0.69, + "learning_rate": 9.322073934097081e-08, + "logits/chosen": -2.1410813331604004, + "logits/rejected": -1.830519437789917, + "logps/chosen": -168.0423126220703, + "logps/rejected": -266.092041015625, + "loss": 0.4915, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8001751899719238, + "rewards/margins": 2.052030563354492, + "rewards/rejected": -3.852205753326416, + "step": 5965 + }, + { + "epoch": 0.69, + "learning_rate": 9.318530766505256e-08, + "logits/chosen": -2.5409159660339355, + "logits/rejected": -2.723055362701416, + "logps/chosen": -330.30291748046875, + "logps/rejected": -324.2977294921875, + "loss": 0.2499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3429834246635437, + "rewards/margins": 3.1278910636901855, + "rewards/rejected": -3.470874309539795, + "step": 5966 + }, + { + "epoch": 0.69, + "learning_rate": 9.314987598913429e-08, + "logits/chosen": -2.169447898864746, + "logits/rejected": -1.918920874595642, + "logps/chosen": -296.6964416503906, + "logps/rejected": -553.6142578125, + "loss": 0.349, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3497463464736938, + "rewards/margins": 1.7751201391220093, + "rewards/rejected": -3.124866485595703, + "step": 5967 + }, + { + "epoch": 0.69, + "learning_rate": 9.311444431321601e-08, + "logits/chosen": -2.9531173706054688, + "logits/rejected": -2.958043336868286, + "logps/chosen": -68.79165649414062, + "logps/rejected": -177.22085571289062, + "loss": 0.4329, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6737321019172668, + "rewards/margins": 3.1215827465057373, + "rewards/rejected": -3.7953150272369385, + "step": 5968 + }, + { + "epoch": 0.69, + "learning_rate": 9.307901263729774e-08, + "logits/chosen": -2.692622184753418, + "logits/rejected": -2.624939441680908, + "logps/chosen": -223.71083068847656, + "logps/rejected": -190.51426696777344, + "loss": 0.7559, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3284344673156738, + "rewards/margins": 0.8538775444030762, + "rewards/rejected": -2.18231201171875, + "step": 5969 + }, + { + "epoch": 0.69, + "learning_rate": 9.304358096137946e-08, + "logits/chosen": -2.783207893371582, + "logits/rejected": -2.6350581645965576, + "logps/chosen": -189.0893096923828, + "logps/rejected": -171.85400390625, + "loss": 0.214, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1891270875930786, + "rewards/margins": 3.6914901733398438, + "rewards/rejected": -4.880617618560791, + "step": 5970 + }, + { + "epoch": 0.69, + "learning_rate": 9.300814928546119e-08, + "logits/chosen": -2.4099061489105225, + "logits/rejected": -2.021446466445923, + "logps/chosen": -198.8851776123047, + "logps/rejected": -319.7222900390625, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7842992544174194, + "rewards/margins": 2.9663493633270264, + "rewards/rejected": -3.7506484985351562, + "step": 5971 + }, + { + "epoch": 0.69, + "learning_rate": 9.297271760954294e-08, + "logits/chosen": -2.1567671298980713, + "logits/rejected": -2.4220125675201416, + "logps/chosen": -390.8017272949219, + "logps/rejected": -307.2712097167969, + "loss": 0.3252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9562134742736816, + "rewards/margins": 2.350801944732666, + "rewards/rejected": -3.3070154190063477, + "step": 5972 + }, + { + "epoch": 0.69, + "learning_rate": 9.293728593362466e-08, + "logits/chosen": -1.9482084512710571, + "logits/rejected": -1.7152926921844482, + "logps/chosen": -129.21717834472656, + "logps/rejected": -179.1575469970703, + "loss": 0.4035, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48120176792144775, + "rewards/margins": 1.815718412399292, + "rewards/rejected": -2.2969202995300293, + "step": 5973 + }, + { + "epoch": 0.69, + "learning_rate": 9.290185425770638e-08, + "logits/chosen": -2.2032294273376465, + "logits/rejected": -2.5073676109313965, + "logps/chosen": -404.4399719238281, + "logps/rejected": -237.85464477539062, + "loss": 0.3043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.022510483860969543, + "rewards/margins": 2.283503293991089, + "rewards/rejected": -2.306013822555542, + "step": 5974 + }, + { + "epoch": 0.7, + "learning_rate": 9.286642258178811e-08, + "logits/chosen": -2.5684890747070312, + "logits/rejected": -2.4349396228790283, + "logps/chosen": -218.24407958984375, + "logps/rejected": -287.7623596191406, + "loss": 0.4725, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.193178653717041, + "rewards/margins": 1.247343897819519, + "rewards/rejected": -2.4405226707458496, + "step": 5975 + }, + { + "epoch": 0.7, + "learning_rate": 9.283099090586984e-08, + "logits/chosen": -2.634812116622925, + "logits/rejected": -2.8056836128234863, + "logps/chosen": -242.88885498046875, + "logps/rejected": -231.482666015625, + "loss": 0.4609, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.384013295173645, + "rewards/margins": 1.9812326431274414, + "rewards/rejected": -3.365246057510376, + "step": 5976 + }, + { + "epoch": 0.7, + "learning_rate": 9.279555922995156e-08, + "logits/chosen": -2.0458903312683105, + "logits/rejected": -2.479682207107544, + "logps/chosen": -576.851806640625, + "logps/rejected": -334.5779113769531, + "loss": 0.2845, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0324729681015015, + "rewards/margins": 2.067734479904175, + "rewards/rejected": -3.1002073287963867, + "step": 5977 + }, + { + "epoch": 0.7, + "learning_rate": 9.276012755403331e-08, + "logits/chosen": -2.3929603099823, + "logits/rejected": -2.2969861030578613, + "logps/chosen": -108.85712432861328, + "logps/rejected": -186.12872314453125, + "loss": 0.7329, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1857686042785645, + "rewards/margins": 1.537548303604126, + "rewards/rejected": -2.7233171463012695, + "step": 5978 + }, + { + "epoch": 0.7, + "learning_rate": 9.272469587811503e-08, + "logits/chosen": -1.8463034629821777, + "logits/rejected": -1.9940028190612793, + "logps/chosen": -336.3285827636719, + "logps/rejected": -306.70233154296875, + "loss": 0.6161, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8039653301239014, + "rewards/margins": 0.31288450956344604, + "rewards/rejected": -1.1168498992919922, + "step": 5979 + }, + { + "epoch": 0.7, + "learning_rate": 9.268926420219677e-08, + "logits/chosen": -2.8811416625976562, + "logits/rejected": -2.775979518890381, + "logps/chosen": -254.50343322753906, + "logps/rejected": -246.05128479003906, + "loss": 0.2994, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9869256019592285, + "rewards/margins": 1.5724821090698242, + "rewards/rejected": -2.5594077110290527, + "step": 5980 + }, + { + "epoch": 0.7, + "learning_rate": 9.265383252627849e-08, + "logits/chosen": -1.7466903924942017, + "logits/rejected": -1.8183021545410156, + "logps/chosen": -331.76348876953125, + "logps/rejected": -286.8199462890625, + "loss": 0.2365, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07527542859315872, + "rewards/margins": 3.1024773120880127, + "rewards/rejected": -3.0272016525268555, + "step": 5981 + }, + { + "epoch": 0.7, + "learning_rate": 9.261840085036021e-08, + "logits/chosen": -3.00887393951416, + "logits/rejected": -2.956235885620117, + "logps/chosen": -231.02664184570312, + "logps/rejected": -208.3655242919922, + "loss": 0.3936, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.820006251335144, + "rewards/margins": 1.6724148988723755, + "rewards/rejected": -2.4924211502075195, + "step": 5982 + }, + { + "epoch": 0.7, + "learning_rate": 9.258296917444196e-08, + "logits/chosen": -2.571348190307617, + "logits/rejected": -2.55328106880188, + "logps/chosen": -185.45504760742188, + "logps/rejected": -300.9732666015625, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1201887130737305, + "rewards/margins": 2.3118059635162354, + "rewards/rejected": -3.431994676589966, + "step": 5983 + }, + { + "epoch": 0.7, + "learning_rate": 9.254753749852368e-08, + "logits/chosen": -2.0843253135681152, + "logits/rejected": -1.8757576942443848, + "logps/chosen": -263.79571533203125, + "logps/rejected": -417.78656005859375, + "loss": 0.1634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22406062483787537, + "rewards/margins": 3.275120973587036, + "rewards/rejected": -3.051060438156128, + "step": 5984 + }, + { + "epoch": 0.7, + "learning_rate": 9.25121058226054e-08, + "logits/chosen": -2.97792911529541, + "logits/rejected": -2.8449926376342773, + "logps/chosen": -194.62420654296875, + "logps/rejected": -265.5486145019531, + "loss": 0.1545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.428298681974411, + "rewards/margins": 4.042585849761963, + "rewards/rejected": -4.470884323120117, + "step": 5985 + }, + { + "epoch": 0.7, + "learning_rate": 9.247667414668714e-08, + "logits/chosen": -2.135791063308716, + "logits/rejected": -2.1938891410827637, + "logps/chosen": -253.03616333007812, + "logps/rejected": -191.9647979736328, + "loss": 0.7574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7978093028068542, + "rewards/margins": 0.5191839933395386, + "rewards/rejected": -1.316993236541748, + "step": 5986 + }, + { + "epoch": 0.7, + "learning_rate": 9.244124247076886e-08, + "logits/chosen": -2.6979825496673584, + "logits/rejected": -2.681398391723633, + "logps/chosen": -264.9971008300781, + "logps/rejected": -319.8002014160156, + "loss": 0.3114, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3239859342575073, + "rewards/margins": 1.5056757926940918, + "rewards/rejected": -2.8296616077423096, + "step": 5987 + }, + { + "epoch": 0.7, + "learning_rate": 9.240581079485058e-08, + "logits/chosen": -2.4685072898864746, + "logits/rejected": -2.56435489654541, + "logps/chosen": -183.2127227783203, + "logps/rejected": -210.27597045898438, + "loss": 0.3203, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9452823996543884, + "rewards/margins": 2.252321243286133, + "rewards/rejected": -3.197603702545166, + "step": 5988 + }, + { + "epoch": 0.7, + "learning_rate": 9.237037911893233e-08, + "logits/chosen": -1.550560712814331, + "logits/rejected": -1.153158187866211, + "logps/chosen": -280.5323486328125, + "logps/rejected": -421.38482666015625, + "loss": 1.1279, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3674099445343018, + "rewards/margins": 1.1257030963897705, + "rewards/rejected": -3.4931132793426514, + "step": 5989 + }, + { + "epoch": 0.7, + "learning_rate": 9.233494744301405e-08, + "logits/chosen": -2.6287765502929688, + "logits/rejected": -2.4114041328430176, + "logps/chosen": -181.55838012695312, + "logps/rejected": -227.42025756835938, + "loss": 0.5254, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9875730276107788, + "rewards/margins": 2.732818365097046, + "rewards/rejected": -3.720391273498535, + "step": 5990 + }, + { + "epoch": 0.7, + "learning_rate": 9.229951576709577e-08, + "logits/chosen": -2.578174114227295, + "logits/rejected": -2.420846462249756, + "logps/chosen": -425.3371276855469, + "logps/rejected": -371.03643798828125, + "loss": 0.3028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9448429346084595, + "rewards/margins": 2.1427884101867676, + "rewards/rejected": -3.0876314640045166, + "step": 5991 + }, + { + "epoch": 0.7, + "learning_rate": 9.226408409117751e-08, + "logits/chosen": -2.466123104095459, + "logits/rejected": -2.5625219345092773, + "logps/chosen": -289.1417236328125, + "logps/rejected": -276.5509948730469, + "loss": 0.1494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6761846542358398, + "rewards/margins": 3.657160758972168, + "rewards/rejected": -4.333345413208008, + "step": 5992 + }, + { + "epoch": 0.7, + "learning_rate": 9.222865241525923e-08, + "logits/chosen": -2.333064556121826, + "logits/rejected": -2.7006757259368896, + "logps/chosen": -232.5068359375, + "logps/rejected": -265.67822265625, + "loss": 0.8478, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1403058767318726, + "rewards/margins": 2.566277265548706, + "rewards/rejected": -3.706583023071289, + "step": 5993 + }, + { + "epoch": 0.7, + "learning_rate": 9.219322073934095e-08, + "logits/chosen": -1.8091753721237183, + "logits/rejected": -1.693538784980774, + "logps/chosen": -264.4583740234375, + "logps/rejected": -322.2759704589844, + "loss": 0.2388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17475838959217072, + "rewards/margins": 2.284785270690918, + "rewards/rejected": -2.4595437049865723, + "step": 5994 + }, + { + "epoch": 0.7, + "learning_rate": 9.21577890634227e-08, + "logits/chosen": -2.579787492752075, + "logits/rejected": -2.596958875656128, + "logps/chosen": -223.7586212158203, + "logps/rejected": -224.19454956054688, + "loss": 0.5459, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.28402841091156, + "rewards/margins": 1.1929841041564941, + "rewards/rejected": -2.4770123958587646, + "step": 5995 + }, + { + "epoch": 0.7, + "learning_rate": 9.212235738750443e-08, + "logits/chosen": -2.2675952911376953, + "logits/rejected": -2.266467809677124, + "logps/chosen": -250.20394897460938, + "logps/rejected": -327.83441162109375, + "loss": 0.2396, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4304215908050537, + "rewards/margins": 2.713576316833496, + "rewards/rejected": -4.143998146057129, + "step": 5996 + }, + { + "epoch": 0.7, + "learning_rate": 9.208692571158616e-08, + "logits/chosen": -2.6624648571014404, + "logits/rejected": -2.6537322998046875, + "logps/chosen": -358.8682861328125, + "logps/rejected": -236.94256591796875, + "loss": 0.2816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36517655849456787, + "rewards/margins": 1.998573660850525, + "rewards/rejected": -2.3637502193450928, + "step": 5997 + }, + { + "epoch": 0.7, + "learning_rate": 9.205149403566788e-08, + "logits/chosen": -1.9768404960632324, + "logits/rejected": -2.039572238922119, + "logps/chosen": -341.776123046875, + "logps/rejected": -325.33099365234375, + "loss": 0.4145, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3710134029388428, + "rewards/margins": 2.7524940967559814, + "rewards/rejected": -4.123507499694824, + "step": 5998 + }, + { + "epoch": 0.7, + "learning_rate": 9.20160623597496e-08, + "logits/chosen": -2.439814567565918, + "logits/rejected": -2.283468723297119, + "logps/chosen": -327.0221252441406, + "logps/rejected": -299.84490966796875, + "loss": 0.3501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35666030645370483, + "rewards/margins": 1.7146227359771729, + "rewards/rejected": -2.0712828636169434, + "step": 5999 + }, + { + "epoch": 0.7, + "learning_rate": 9.198063068383134e-08, + "logits/chosen": -2.2308316230773926, + "logits/rejected": -2.031500816345215, + "logps/chosen": -299.0888671875, + "logps/rejected": -379.53070068359375, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6453994512557983, + "rewards/margins": 3.115950107574463, + "rewards/rejected": -3.7613492012023926, + "step": 6000 + }, + { + "epoch": 0.7, + "eval_logits/chosen": -1.7527562379837036, + "eval_logits/rejected": -1.7537353038787842, + "eval_logps/chosen": -278.6492614746094, + "eval_logps/rejected": -279.0399475097656, + "eval_loss": 0.36728817224502563, + "eval_rewards/accuracies": 0.8548850417137146, + "eval_rewards/chosen": -0.648383617401123, + "eval_rewards/margins": 2.1549606323242188, + "eval_rewards/rejected": -2.803344249725342, + "eval_runtime": 238.5977, + "eval_samples_per_second": 2.913, + "eval_steps_per_second": 1.459, + "step": 6000 + }, + { + "epoch": 0.7, + "learning_rate": 9.194519900791308e-08, + "logits/chosen": -2.1462483406066895, + "logits/rejected": -2.2073240280151367, + "logps/chosen": -324.7988586425781, + "logps/rejected": -418.3434753417969, + "loss": 0.7451, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6907975673675537, + "rewards/margins": 0.7781219482421875, + "rewards/rejected": -1.4689193964004517, + "step": 6001 + }, + { + "epoch": 0.7, + "learning_rate": 9.19097673319948e-08, + "logits/chosen": -2.4827237129211426, + "logits/rejected": -2.5902347564697266, + "logps/chosen": -213.63230895996094, + "logps/rejected": -145.9033660888672, + "loss": 1.4522, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1601834297180176, + "rewards/margins": 1.175827145576477, + "rewards/rejected": -3.3360109329223633, + "step": 6002 + }, + { + "epoch": 0.7, + "learning_rate": 9.187433565607653e-08, + "logits/chosen": -1.9801803827285767, + "logits/rejected": -2.0193517208099365, + "logps/chosen": -399.0745849609375, + "logps/rejected": -300.7531433105469, + "loss": 0.5171, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1907086372375488, + "rewards/margins": 1.2503740787506104, + "rewards/rejected": -2.441082715988159, + "step": 6003 + }, + { + "epoch": 0.7, + "learning_rate": 9.183890398015826e-08, + "logits/chosen": -2.383993148803711, + "logits/rejected": -2.341671943664551, + "logps/chosen": -277.90240478515625, + "logps/rejected": -268.15765380859375, + "loss": 0.2204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7308348417282104, + "rewards/margins": 3.4386465549468994, + "rewards/rejected": -4.16948127746582, + "step": 6004 + }, + { + "epoch": 0.7, + "learning_rate": 9.180347230423998e-08, + "logits/chosen": -2.455585479736328, + "logits/rejected": -2.4666290283203125, + "logps/chosen": -284.3740539550781, + "logps/rejected": -356.8467102050781, + "loss": 0.0939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08365730196237564, + "rewards/margins": 3.768669843673706, + "rewards/rejected": -3.8523268699645996, + "step": 6005 + }, + { + "epoch": 0.7, + "learning_rate": 9.176804062832171e-08, + "logits/chosen": -2.163043260574341, + "logits/rejected": -2.4181694984436035, + "logps/chosen": -347.5899658203125, + "logps/rejected": -377.31134033203125, + "loss": 0.5187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9709057807922363, + "rewards/margins": 2.2633957862854004, + "rewards/rejected": -3.2343015670776367, + "step": 6006 + }, + { + "epoch": 0.7, + "learning_rate": 9.173260895240345e-08, + "logits/chosen": -2.540018081665039, + "logits/rejected": -2.403459310531616, + "logps/chosen": -498.76702880859375, + "logps/rejected": -323.305908203125, + "loss": 0.3377, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5676903128623962, + "rewards/margins": 2.13607120513916, + "rewards/rejected": -2.703761339187622, + "step": 6007 + }, + { + "epoch": 0.7, + "learning_rate": 9.169717727648518e-08, + "logits/chosen": -2.448683738708496, + "logits/rejected": -2.412686824798584, + "logps/chosen": -238.2685546875, + "logps/rejected": -173.99188232421875, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8431081771850586, + "rewards/margins": 2.941160202026367, + "rewards/rejected": -3.784268379211426, + "step": 6008 + }, + { + "epoch": 0.7, + "learning_rate": 9.16617456005669e-08, + "logits/chosen": -2.6664700508117676, + "logits/rejected": -2.6652727127075195, + "logps/chosen": -205.84657287597656, + "logps/rejected": -233.36599731445312, + "loss": 0.183, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9583196043968201, + "rewards/margins": 4.038956642150879, + "rewards/rejected": -4.997276782989502, + "step": 6009 + }, + { + "epoch": 0.7, + "learning_rate": 9.162631392464863e-08, + "logits/chosen": -2.2818922996520996, + "logits/rejected": -2.285827159881592, + "logps/chosen": -332.89154052734375, + "logps/rejected": -366.5816345214844, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040971674025058746, + "rewards/margins": 2.3611698150634766, + "rewards/rejected": -2.3201980590820312, + "step": 6010 + }, + { + "epoch": 0.7, + "learning_rate": 9.159088224873035e-08, + "logits/chosen": -2.167664051055908, + "logits/rejected": -2.3829503059387207, + "logps/chosen": -369.7292785644531, + "logps/rejected": -319.5907897949219, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20869721472263336, + "rewards/margins": 1.6884161233901978, + "rewards/rejected": -1.89711332321167, + "step": 6011 + }, + { + "epoch": 0.7, + "learning_rate": 9.155545057281208e-08, + "logits/chosen": -2.605367422103882, + "logits/rejected": -2.6658053398132324, + "logps/chosen": -186.47979736328125, + "logps/rejected": -181.69515991210938, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8381533622741699, + "rewards/margins": 2.5030322074890137, + "rewards/rejected": -3.341186046600342, + "step": 6012 + }, + { + "epoch": 0.7, + "learning_rate": 9.152001889689382e-08, + "logits/chosen": -2.70961332321167, + "logits/rejected": -2.7942330837249756, + "logps/chosen": -174.91329956054688, + "logps/rejected": -221.78334045410156, + "loss": 0.2489, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3838317394256592, + "rewards/margins": 2.8249096870422363, + "rewards/rejected": -4.208741664886475, + "step": 6013 + }, + { + "epoch": 0.7, + "learning_rate": 9.148458722097556e-08, + "logits/chosen": -2.072396755218506, + "logits/rejected": -2.619096517562866, + "logps/chosen": -287.3940124511719, + "logps/rejected": -149.68157958984375, + "loss": 0.3905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5017650723457336, + "rewards/margins": 1.3530287742614746, + "rewards/rejected": -1.8547937870025635, + "step": 6014 + }, + { + "epoch": 0.7, + "learning_rate": 9.144915554505728e-08, + "logits/chosen": -2.825576066970825, + "logits/rejected": -2.6692593097686768, + "logps/chosen": -268.5045166015625, + "logps/rejected": -292.56048583984375, + "loss": 0.3052, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9862189292907715, + "rewards/margins": 2.3398356437683105, + "rewards/rejected": -4.326054573059082, + "step": 6015 + }, + { + "epoch": 0.7, + "learning_rate": 9.1413723869139e-08, + "logits/chosen": -2.2917284965515137, + "logits/rejected": -2.1904425621032715, + "logps/chosen": -319.18792724609375, + "logps/rejected": -441.0121154785156, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0515291690826416, + "rewards/margins": 3.179701805114746, + "rewards/rejected": -4.231231212615967, + "step": 6016 + }, + { + "epoch": 0.7, + "learning_rate": 9.137829219322074e-08, + "logits/chosen": -1.898000955581665, + "logits/rejected": -2.12349271774292, + "logps/chosen": -449.4615173339844, + "logps/rejected": -338.26629638671875, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37480705976486206, + "rewards/margins": 2.8934261798858643, + "rewards/rejected": -2.5186190605163574, + "step": 6017 + }, + { + "epoch": 0.7, + "learning_rate": 9.134286051730247e-08, + "logits/chosen": -2.3604321479797363, + "logits/rejected": -2.4165232181549072, + "logps/chosen": -374.125732421875, + "logps/rejected": -355.0762023925781, + "loss": 0.3898, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7255774140357971, + "rewards/margins": 1.6422920227050781, + "rewards/rejected": -2.3678693771362305, + "step": 6018 + }, + { + "epoch": 0.7, + "learning_rate": 9.130742884138419e-08, + "logits/chosen": -2.3641490936279297, + "logits/rejected": -2.2291553020477295, + "logps/chosen": -352.9361267089844, + "logps/rejected": -300.0913391113281, + "loss": 0.6935, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7091320753097534, + "rewards/margins": 0.45051148533821106, + "rewards/rejected": -2.1596436500549316, + "step": 6019 + }, + { + "epoch": 0.7, + "learning_rate": 9.127199716546593e-08, + "logits/chosen": -2.709162473678589, + "logits/rejected": -2.6785128116607666, + "logps/chosen": -249.75723266601562, + "logps/rejected": -233.406982421875, + "loss": 0.3172, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3355178833007812, + "rewards/margins": 1.3936246633529663, + "rewards/rejected": -2.729142427444458, + "step": 6020 + }, + { + "epoch": 0.7, + "learning_rate": 9.123656548954765e-08, + "logits/chosen": -2.5697898864746094, + "logits/rejected": -2.4685654640197754, + "logps/chosen": -329.4343566894531, + "logps/rejected": -222.78826904296875, + "loss": 0.348, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1228890419006348, + "rewards/margins": 2.361448049545288, + "rewards/rejected": -3.4843368530273438, + "step": 6021 + }, + { + "epoch": 0.7, + "learning_rate": 9.120113381362937e-08, + "logits/chosen": -1.3579021692276, + "logits/rejected": -1.5742639303207397, + "logps/chosen": -807.8760375976562, + "logps/rejected": -562.3926391601562, + "loss": 0.858, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9015644192695618, + "rewards/margins": 0.15051400661468506, + "rewards/rejected": -1.0520784854888916, + "step": 6022 + }, + { + "epoch": 0.7, + "learning_rate": 9.116570213771111e-08, + "logits/chosen": -2.6098814010620117, + "logits/rejected": -2.5496938228607178, + "logps/chosen": -203.02767944335938, + "logps/rejected": -192.68238830566406, + "loss": 1.0241, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.018750786781311, + "rewards/margins": 1.342000961303711, + "rewards/rejected": -2.3607516288757324, + "step": 6023 + }, + { + "epoch": 0.7, + "learning_rate": 9.113027046179284e-08, + "logits/chosen": -2.1932501792907715, + "logits/rejected": -2.2903051376342773, + "logps/chosen": -259.55120849609375, + "logps/rejected": -271.6265869140625, + "loss": 0.3543, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08828525245189667, + "rewards/margins": 1.9427448511123657, + "rewards/rejected": -2.0310299396514893, + "step": 6024 + }, + { + "epoch": 0.7, + "learning_rate": 9.109483878587458e-08, + "logits/chosen": -2.216691255569458, + "logits/rejected": -2.299765110015869, + "logps/chosen": -249.56524658203125, + "logps/rejected": -267.18670654296875, + "loss": 0.2552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7095420360565186, + "rewards/margins": 3.237900733947754, + "rewards/rejected": -3.9474427700042725, + "step": 6025 + }, + { + "epoch": 0.7, + "learning_rate": 9.10594071099563e-08, + "logits/chosen": -2.3363699913024902, + "logits/rejected": -2.805180549621582, + "logps/chosen": -354.978759765625, + "logps/rejected": -277.37188720703125, + "loss": 0.2015, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.059920310974121, + "rewards/margins": 2.2413792610168457, + "rewards/rejected": -3.301299571990967, + "step": 6026 + }, + { + "epoch": 0.7, + "learning_rate": 9.102397543403802e-08, + "logits/chosen": -2.4577157497406006, + "logits/rejected": -2.6041719913482666, + "logps/chosen": -468.6709899902344, + "logps/rejected": -258.8568115234375, + "loss": 0.1509, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.142607569694519, + "rewards/margins": 2.4675354957580566, + "rewards/rejected": -3.6101431846618652, + "step": 6027 + }, + { + "epoch": 0.7, + "learning_rate": 9.098854375811974e-08, + "logits/chosen": -2.583595037460327, + "logits/rejected": -2.362802743911743, + "logps/chosen": -142.79420471191406, + "logps/rejected": -245.39947509765625, + "loss": 0.3251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6801767945289612, + "rewards/margins": 3.1240344047546387, + "rewards/rejected": -3.804211378097534, + "step": 6028 + }, + { + "epoch": 0.7, + "learning_rate": 9.095311208220148e-08, + "logits/chosen": -2.2925822734832764, + "logits/rejected": -2.1861793994903564, + "logps/chosen": -162.0034637451172, + "logps/rejected": -181.0818328857422, + "loss": 0.1665, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36120232939720154, + "rewards/margins": 2.606855869293213, + "rewards/rejected": -2.9680581092834473, + "step": 6029 + }, + { + "epoch": 0.7, + "learning_rate": 9.091768040628322e-08, + "logits/chosen": -1.6989262104034424, + "logits/rejected": -1.9809398651123047, + "logps/chosen": -328.41827392578125, + "logps/rejected": -242.74935913085938, + "loss": 0.4249, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.034381628036499, + "rewards/margins": 1.1511019468307495, + "rewards/rejected": -3.185483455657959, + "step": 6030 + }, + { + "epoch": 0.7, + "learning_rate": 9.088224873036495e-08, + "logits/chosen": -2.7312920093536377, + "logits/rejected": -2.3301899433135986, + "logps/chosen": -133.37928771972656, + "logps/rejected": -434.45391845703125, + "loss": 0.2294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7052136659622192, + "rewards/margins": 2.406933307647705, + "rewards/rejected": -3.1121466159820557, + "step": 6031 + }, + { + "epoch": 0.7, + "learning_rate": 9.084681705444667e-08, + "logits/chosen": -2.7333908081054688, + "logits/rejected": -2.348658561706543, + "logps/chosen": -242.15560913085938, + "logps/rejected": -284.3896789550781, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1187427043914795, + "rewards/margins": 2.5835251808166504, + "rewards/rejected": -3.70226788520813, + "step": 6032 + }, + { + "epoch": 0.7, + "learning_rate": 9.08113853785284e-08, + "logits/chosen": -2.4055323600769043, + "logits/rejected": -1.8480944633483887, + "logps/chosen": -193.98248291015625, + "logps/rejected": -300.6365051269531, + "loss": 0.7382, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4279305934906006, + "rewards/margins": 1.651532769203186, + "rewards/rejected": -3.079463481903076, + "step": 6033 + }, + { + "epoch": 0.7, + "learning_rate": 9.077595370261013e-08, + "logits/chosen": -2.743155002593994, + "logits/rejected": -2.7312674522399902, + "logps/chosen": -280.6197509765625, + "logps/rejected": -294.8072509765625, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0820462703704834, + "rewards/margins": 3.43280029296875, + "rewards/rejected": -4.5148468017578125, + "step": 6034 + }, + { + "epoch": 0.7, + "learning_rate": 9.074052202669185e-08, + "logits/chosen": -2.3577675819396973, + "logits/rejected": -2.151329517364502, + "logps/chosen": -285.47540283203125, + "logps/rejected": -348.01495361328125, + "loss": 0.4573, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9262704849243164, + "rewards/margins": 2.042452096939087, + "rewards/rejected": -3.9687228202819824, + "step": 6035 + }, + { + "epoch": 0.7, + "learning_rate": 9.070509035077359e-08, + "logits/chosen": -2.242931842803955, + "logits/rejected": -2.2619190216064453, + "logps/chosen": -265.83502197265625, + "logps/rejected": -249.09239196777344, + "loss": 0.4337, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5185831785202026, + "rewards/margins": 1.6929024457931519, + "rewards/rejected": -2.2114856243133545, + "step": 6036 + }, + { + "epoch": 0.7, + "learning_rate": 9.066965867485532e-08, + "logits/chosen": -2.492661476135254, + "logits/rejected": -2.518251657485962, + "logps/chosen": -266.2502136230469, + "logps/rejected": -205.04013061523438, + "loss": 0.287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9849428534507751, + "rewards/margins": 2.030200242996216, + "rewards/rejected": -3.0151431560516357, + "step": 6037 + }, + { + "epoch": 0.7, + "learning_rate": 9.063422699893705e-08, + "logits/chosen": -2.359525680541992, + "logits/rejected": -2.4171929359436035, + "logps/chosen": -308.9859619140625, + "logps/rejected": -237.25267028808594, + "loss": 0.4134, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.586553931236267, + "rewards/margins": 1.5468459129333496, + "rewards/rejected": -3.1333999633789062, + "step": 6038 + }, + { + "epoch": 0.7, + "learning_rate": 9.059879532301877e-08, + "logits/chosen": -2.2778449058532715, + "logits/rejected": -2.2827181816101074, + "logps/chosen": -235.49188232421875, + "logps/rejected": -284.8341369628906, + "loss": 0.6327, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.257972478866577, + "rewards/margins": 1.9846856594085693, + "rewards/rejected": -4.242657661437988, + "step": 6039 + }, + { + "epoch": 0.7, + "learning_rate": 9.05633636471005e-08, + "logits/chosen": -2.323399543762207, + "logits/rejected": -2.4326274394989014, + "logps/chosen": -234.27146911621094, + "logps/rejected": -192.68829345703125, + "loss": 0.8768, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3022602796554565, + "rewards/margins": 1.129872441291809, + "rewards/rejected": -2.4321327209472656, + "step": 6040 + }, + { + "epoch": 0.7, + "learning_rate": 9.052793197118223e-08, + "logits/chosen": -3.095167875289917, + "logits/rejected": -2.994746208190918, + "logps/chosen": -405.2083740234375, + "logps/rejected": -247.9383544921875, + "loss": 0.1841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9737396240234375, + "rewards/margins": 3.586784601211548, + "rewards/rejected": -4.560523986816406, + "step": 6041 + }, + { + "epoch": 0.7, + "learning_rate": 9.049250029526397e-08, + "logits/chosen": -2.5182836055755615, + "logits/rejected": -2.4292612075805664, + "logps/chosen": -236.18609619140625, + "logps/rejected": -267.0619201660156, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2799082398414612, + "rewards/margins": 2.2520065307617188, + "rewards/rejected": -2.5319149494171143, + "step": 6042 + }, + { + "epoch": 0.7, + "learning_rate": 9.04570686193457e-08, + "logits/chosen": -1.8934950828552246, + "logits/rejected": -1.8614875078201294, + "logps/chosen": -219.97900390625, + "logps/rejected": -295.099609375, + "loss": 0.719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0394965410232544, + "rewards/margins": 1.2311452627182007, + "rewards/rejected": -2.270641803741455, + "step": 6043 + }, + { + "epoch": 0.7, + "learning_rate": 9.042163694342742e-08, + "logits/chosen": -2.4254908561706543, + "logits/rejected": -2.548405647277832, + "logps/chosen": -268.73187255859375, + "logps/rejected": -293.1374206542969, + "loss": 0.1939, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6550068855285645, + "rewards/margins": 2.934358596801758, + "rewards/rejected": -3.589365243911743, + "step": 6044 + }, + { + "epoch": 0.7, + "learning_rate": 9.038620526750915e-08, + "logits/chosen": -2.0549476146698, + "logits/rejected": -2.3383572101593018, + "logps/chosen": -463.4259033203125, + "logps/rejected": -281.25592041015625, + "loss": 0.5208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7854830622673035, + "rewards/margins": 1.306495189666748, + "rewards/rejected": -2.091978073120117, + "step": 6045 + }, + { + "epoch": 0.7, + "learning_rate": 9.035077359159088e-08, + "logits/chosen": -2.6422457695007324, + "logits/rejected": -2.4961774349212646, + "logps/chosen": -213.76400756835938, + "logps/rejected": -189.88604736328125, + "loss": 0.4351, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7342209815979004, + "rewards/margins": 1.4535390138626099, + "rewards/rejected": -2.1877598762512207, + "step": 6046 + }, + { + "epoch": 0.7, + "learning_rate": 9.03153419156726e-08, + "logits/chosen": -2.4081578254699707, + "logits/rejected": -2.3567299842834473, + "logps/chosen": -247.90664672851562, + "logps/rejected": -234.61669921875, + "loss": 0.6025, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9996240139007568, + "rewards/margins": 0.7125396728515625, + "rewards/rejected": -1.7121635675430298, + "step": 6047 + }, + { + "epoch": 0.7, + "learning_rate": 9.027991023975435e-08, + "logits/chosen": -2.8147592544555664, + "logits/rejected": -2.668168544769287, + "logps/chosen": -381.6708984375, + "logps/rejected": -256.8616027832031, + "loss": 0.2588, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.338503360748291, + "rewards/margins": 2.7136435508728027, + "rewards/rejected": -4.052146911621094, + "step": 6048 + }, + { + "epoch": 0.7, + "learning_rate": 9.024447856383607e-08, + "logits/chosen": -2.0083932876586914, + "logits/rejected": -1.8788243532180786, + "logps/chosen": -272.7646484375, + "logps/rejected": -366.0218505859375, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8544009923934937, + "rewards/margins": 1.3425747156143188, + "rewards/rejected": -2.1969757080078125, + "step": 6049 + }, + { + "epoch": 0.7, + "learning_rate": 9.020904688791779e-08, + "logits/chosen": -2.2220420837402344, + "logits/rejected": -1.9433717727661133, + "logps/chosen": -286.26007080078125, + "logps/rejected": -354.78302001953125, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41965430974960327, + "rewards/margins": 2.7266602516174316, + "rewards/rejected": -3.1463146209716797, + "step": 6050 + }, + { + "epoch": 0.7, + "learning_rate": 9.017361521199953e-08, + "logits/chosen": -2.3732101917266846, + "logits/rejected": -2.5975537300109863, + "logps/chosen": -494.0625, + "logps/rejected": -298.8734130859375, + "loss": 0.5308, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5878989100456238, + "rewards/margins": 1.2254228591918945, + "rewards/rejected": -1.813321828842163, + "step": 6051 + }, + { + "epoch": 0.7, + "learning_rate": 9.013818353608125e-08, + "logits/chosen": -1.9585765600204468, + "logits/rejected": -1.9640977382659912, + "logps/chosen": -345.22027587890625, + "logps/rejected": -305.5877380371094, + "loss": 0.6802, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2118251323699951, + "rewards/margins": 1.8873087167739868, + "rewards/rejected": -3.0991339683532715, + "step": 6052 + }, + { + "epoch": 0.7, + "learning_rate": 9.0102751860163e-08, + "logits/chosen": -2.2497875690460205, + "logits/rejected": -2.3619680404663086, + "logps/chosen": -168.23890686035156, + "logps/rejected": -155.3815460205078, + "loss": 0.2327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8057801723480225, + "rewards/margins": 2.0978238582611084, + "rewards/rejected": -2.903604030609131, + "step": 6053 + }, + { + "epoch": 0.7, + "learning_rate": 9.006732018424472e-08, + "logits/chosen": -2.493454933166504, + "logits/rejected": -2.463866710662842, + "logps/chosen": -223.10906982421875, + "logps/rejected": -248.75152587890625, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5285241603851318, + "rewards/margins": 3.50827693939209, + "rewards/rejected": -4.036801338195801, + "step": 6054 + }, + { + "epoch": 0.7, + "learning_rate": 9.003188850832644e-08, + "logits/chosen": -2.026017427444458, + "logits/rejected": -1.9918798208236694, + "logps/chosen": -332.57305908203125, + "logps/rejected": -336.7238464355469, + "loss": 0.2898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4716756343841553, + "rewards/margins": 1.7782105207443237, + "rewards/rejected": -2.2498860359191895, + "step": 6055 + }, + { + "epoch": 0.7, + "learning_rate": 8.999645683240816e-08, + "logits/chosen": -2.1670002937316895, + "logits/rejected": -1.8779200315475464, + "logps/chosen": -315.0589599609375, + "logps/rejected": -293.11614990234375, + "loss": 0.588, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3251738548278809, + "rewards/margins": 1.0696611404418945, + "rewards/rejected": -2.3948349952697754, + "step": 6056 + }, + { + "epoch": 0.7, + "learning_rate": 8.99610251564899e-08, + "logits/chosen": -2.207033634185791, + "logits/rejected": -2.105794668197632, + "logps/chosen": -316.33135986328125, + "logps/rejected": -320.90728759765625, + "loss": 0.3469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7274892330169678, + "rewards/margins": 2.840513229370117, + "rewards/rejected": -3.568002223968506, + "step": 6057 + }, + { + "epoch": 0.7, + "learning_rate": 8.992559348057162e-08, + "logits/chosen": -2.7574825286865234, + "logits/rejected": -2.841709613800049, + "logps/chosen": -285.93109130859375, + "logps/rejected": -340.4156799316406, + "loss": 0.1552, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3708571195602417, + "rewards/margins": 3.010004997253418, + "rewards/rejected": -4.380862236022949, + "step": 6058 + }, + { + "epoch": 0.7, + "learning_rate": 8.989016180465337e-08, + "logits/chosen": -2.195568084716797, + "logits/rejected": -2.2587990760803223, + "logps/chosen": -172.8002471923828, + "logps/rejected": -236.90951538085938, + "loss": 0.347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8750612735748291, + "rewards/margins": 2.4218783378601074, + "rewards/rejected": -3.2969393730163574, + "step": 6059 + }, + { + "epoch": 0.7, + "learning_rate": 8.985473012873509e-08, + "logits/chosen": -2.0671024322509766, + "logits/rejected": -1.7654187679290771, + "logps/chosen": -143.0915069580078, + "logps/rejected": -245.78111267089844, + "loss": 0.2083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46524545550346375, + "rewards/margins": 2.8809163570404053, + "rewards/rejected": -3.3461618423461914, + "step": 6060 + }, + { + "epoch": 0.71, + "learning_rate": 8.981929845281681e-08, + "logits/chosen": -2.921506404876709, + "logits/rejected": -3.0079240798950195, + "logps/chosen": -129.94105529785156, + "logps/rejected": -191.42147827148438, + "loss": 0.5496, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5242642760276794, + "rewards/margins": 3.1556496620178223, + "rewards/rejected": -3.6799139976501465, + "step": 6061 + }, + { + "epoch": 0.71, + "learning_rate": 8.978386677689855e-08, + "logits/chosen": -2.34425950050354, + "logits/rejected": -2.3116650581359863, + "logps/chosen": -266.7066345214844, + "logps/rejected": -307.5665283203125, + "loss": 0.3734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1719905585050583, + "rewards/margins": 2.0604403018951416, + "rewards/rejected": -2.232430934906006, + "step": 6062 + }, + { + "epoch": 0.71, + "learning_rate": 8.974843510098027e-08, + "logits/chosen": -2.739182949066162, + "logits/rejected": -2.8692543506622314, + "logps/chosen": -217.7708282470703, + "logps/rejected": -260.9698791503906, + "loss": 0.716, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.037595272064209, + "rewards/margins": 1.46248197555542, + "rewards/rejected": -2.50007700920105, + "step": 6063 + }, + { + "epoch": 0.71, + "learning_rate": 8.971300342506199e-08, + "logits/chosen": -2.8602612018585205, + "logits/rejected": -2.8754830360412598, + "logps/chosen": -140.65780639648438, + "logps/rejected": -189.33399963378906, + "loss": 1.1151, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8129007816314697, + "rewards/margins": 0.14040732383728027, + "rewards/rejected": -1.95330810546875, + "step": 6064 + }, + { + "epoch": 0.71, + "learning_rate": 8.967757174914374e-08, + "logits/chosen": -2.6244564056396484, + "logits/rejected": -2.771458864212036, + "logps/chosen": -249.37396240234375, + "logps/rejected": -274.462890625, + "loss": 0.466, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3309949040412903, + "rewards/margins": 1.898422122001648, + "rewards/rejected": -2.229417085647583, + "step": 6065 + }, + { + "epoch": 0.71, + "learning_rate": 8.964214007322546e-08, + "logits/chosen": -2.621119499206543, + "logits/rejected": -2.5922396183013916, + "logps/chosen": -244.64468383789062, + "logps/rejected": -235.27439880371094, + "loss": 0.3047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13181746006011963, + "rewards/margins": 1.8906285762786865, + "rewards/rejected": -2.0224459171295166, + "step": 6066 + }, + { + "epoch": 0.71, + "learning_rate": 8.960670839730719e-08, + "logits/chosen": -2.2560882568359375, + "logits/rejected": -2.493377447128296, + "logps/chosen": -428.2352600097656, + "logps/rejected": -288.28436279296875, + "loss": 0.3439, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4931015968322754, + "rewards/margins": 1.546626091003418, + "rewards/rejected": -3.0397276878356934, + "step": 6067 + }, + { + "epoch": 0.71, + "learning_rate": 8.957127672138892e-08, + "logits/chosen": -2.8651716709136963, + "logits/rejected": -2.9749045372009277, + "logps/chosen": -208.01931762695312, + "logps/rejected": -183.84371948242188, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3931627571582794, + "rewards/margins": 2.5370540618896484, + "rewards/rejected": -2.9302167892456055, + "step": 6068 + }, + { + "epoch": 0.71, + "learning_rate": 8.953584504547064e-08, + "logits/chosen": -2.1059317588806152, + "logits/rejected": -2.3672032356262207, + "logps/chosen": -459.32586669921875, + "logps/rejected": -273.21856689453125, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12873800098896027, + "rewards/margins": 2.3498189449310303, + "rewards/rejected": -2.221081018447876, + "step": 6069 + }, + { + "epoch": 0.71, + "learning_rate": 8.950041336955237e-08, + "logits/chosen": -1.9180374145507812, + "logits/rejected": -2.4997997283935547, + "logps/chosen": -546.3544921875, + "logps/rejected": -312.7955627441406, + "loss": 0.3954, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31952184438705444, + "rewards/margins": 1.688346028327942, + "rewards/rejected": -2.0078678131103516, + "step": 6070 + }, + { + "epoch": 0.71, + "learning_rate": 8.946498169363411e-08, + "logits/chosen": -2.54764723777771, + "logits/rejected": -2.595222234725952, + "logps/chosen": -252.62982177734375, + "logps/rejected": -275.7747802734375, + "loss": 0.0552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20762206614017487, + "rewards/margins": 3.547131061553955, + "rewards/rejected": -3.7547531127929688, + "step": 6071 + }, + { + "epoch": 0.71, + "learning_rate": 8.942955001771584e-08, + "logits/chosen": -2.8214502334594727, + "logits/rejected": -2.5950121879577637, + "logps/chosen": -194.98770141601562, + "logps/rejected": -223.40989685058594, + "loss": 0.5833, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4574030935764313, + "rewards/margins": 2.3068861961364746, + "rewards/rejected": -2.764289140701294, + "step": 6072 + }, + { + "epoch": 0.71, + "learning_rate": 8.939411834179756e-08, + "logits/chosen": -2.3900208473205566, + "logits/rejected": -2.465254068374634, + "logps/chosen": -245.37562561035156, + "logps/rejected": -240.39381408691406, + "loss": 0.7263, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.182644844055176, + "rewards/margins": 1.126953125, + "rewards/rejected": -3.3095977306365967, + "step": 6073 + }, + { + "epoch": 0.71, + "learning_rate": 8.93586866658793e-08, + "logits/chosen": -1.9208862781524658, + "logits/rejected": -1.8747988939285278, + "logps/chosen": -388.1885986328125, + "logps/rejected": -359.90594482421875, + "loss": 0.2965, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0844615697860718, + "rewards/margins": 2.5975072383880615, + "rewards/rejected": -3.6819686889648438, + "step": 6074 + }, + { + "epoch": 0.71, + "learning_rate": 8.932325498996102e-08, + "logits/chosen": -2.1524553298950195, + "logits/rejected": -2.2502312660217285, + "logps/chosen": -177.98049926757812, + "logps/rejected": -188.83380126953125, + "loss": 0.5204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8347357511520386, + "rewards/margins": 0.762645423412323, + "rewards/rejected": -1.5973812341690063, + "step": 6075 + }, + { + "epoch": 0.71, + "learning_rate": 8.928782331404274e-08, + "logits/chosen": -2.355353355407715, + "logits/rejected": -2.4701359272003174, + "logps/chosen": -241.41578674316406, + "logps/rejected": -182.42572021484375, + "loss": 0.5684, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6510765552520752, + "rewards/margins": 1.980826735496521, + "rewards/rejected": -3.6319031715393066, + "step": 6076 + }, + { + "epoch": 0.71, + "learning_rate": 8.925239163812449e-08, + "logits/chosen": -2.297243356704712, + "logits/rejected": -2.124403715133667, + "logps/chosen": -240.0218048095703, + "logps/rejected": -263.4620666503906, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4449262022972107, + "rewards/margins": 1.9022812843322754, + "rewards/rejected": -2.347207546234131, + "step": 6077 + }, + { + "epoch": 0.71, + "learning_rate": 8.921695996220621e-08, + "logits/chosen": -2.2461559772491455, + "logits/rejected": -2.086864709854126, + "logps/chosen": -291.0412292480469, + "logps/rejected": -261.37286376953125, + "loss": 0.314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41498541831970215, + "rewards/margins": 1.7279516458511353, + "rewards/rejected": -2.142937183380127, + "step": 6078 + }, + { + "epoch": 0.71, + "learning_rate": 8.918152828628794e-08, + "logits/chosen": -2.1897013187408447, + "logits/rejected": -2.363321542739868, + "logps/chosen": -246.17330932617188, + "logps/rejected": -174.49530029296875, + "loss": 0.5452, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2312779426574707, + "rewards/margins": 0.9113813638687134, + "rewards/rejected": -2.1426591873168945, + "step": 6079 + }, + { + "epoch": 0.71, + "learning_rate": 8.914609661036967e-08, + "logits/chosen": -2.7428009510040283, + "logits/rejected": -2.639326572418213, + "logps/chosen": -428.898681640625, + "logps/rejected": -215.15017700195312, + "loss": 1.107, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7672739028930664, + "rewards/margins": 1.100378394126892, + "rewards/rejected": -2.867652416229248, + "step": 6080 + }, + { + "epoch": 0.71, + "learning_rate": 8.911066493445139e-08, + "logits/chosen": -2.4604074954986572, + "logits/rejected": -2.3406734466552734, + "logps/chosen": -507.4819030761719, + "logps/rejected": -454.30072021484375, + "loss": 0.3133, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2158894538879395, + "rewards/margins": 1.718186378479004, + "rewards/rejected": -2.9340758323669434, + "step": 6081 + }, + { + "epoch": 0.71, + "learning_rate": 8.907523325853312e-08, + "logits/chosen": -2.285922050476074, + "logits/rejected": -2.1631877422332764, + "logps/chosen": -136.53541564941406, + "logps/rejected": -285.9084777832031, + "loss": 0.5063, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0933934450149536, + "rewards/margins": 2.5071191787719727, + "rewards/rejected": -3.6005125045776367, + "step": 6082 + }, + { + "epoch": 0.71, + "learning_rate": 8.903980158261486e-08, + "logits/chosen": -2.2997090816497803, + "logits/rejected": -2.5307321548461914, + "logps/chosen": -303.2523193359375, + "logps/rejected": -312.78546142578125, + "loss": 0.1238, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0345184803009033, + "rewards/margins": 4.063953399658203, + "rewards/rejected": -5.0984721183776855, + "step": 6083 + }, + { + "epoch": 0.71, + "learning_rate": 8.900436990669658e-08, + "logits/chosen": -2.289158344268799, + "logits/rejected": -2.4126124382019043, + "logps/chosen": -256.8719787597656, + "logps/rejected": -156.02981567382812, + "loss": 0.856, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2161816358566284, + "rewards/margins": 0.17590780556201935, + "rewards/rejected": -1.3920893669128418, + "step": 6084 + }, + { + "epoch": 0.71, + "learning_rate": 8.896893823077832e-08, + "logits/chosen": -2.1791434288024902, + "logits/rejected": -2.1278724670410156, + "logps/chosen": -164.05587768554688, + "logps/rejected": -167.4168701171875, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.035451889038086, + "rewards/margins": 0.6230434775352478, + "rewards/rejected": -2.6584951877593994, + "step": 6085 + }, + { + "epoch": 0.71, + "learning_rate": 8.893350655486004e-08, + "logits/chosen": -1.7802207469940186, + "logits/rejected": -1.886575698852539, + "logps/chosen": -358.449951171875, + "logps/rejected": -364.9169921875, + "loss": 0.2627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9199423789978027, + "rewards/margins": 3.3337976932525635, + "rewards/rejected": -4.253739833831787, + "step": 6086 + }, + { + "epoch": 0.71, + "learning_rate": 8.889807487894176e-08, + "logits/chosen": -2.404851198196411, + "logits/rejected": -2.5447511672973633, + "logps/chosen": -175.25811767578125, + "logps/rejected": -151.1827392578125, + "loss": 1.7476, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.184103488922119, + "rewards/margins": -1.1567902565002441, + "rewards/rejected": -1.027313232421875, + "step": 6087 + }, + { + "epoch": 0.71, + "learning_rate": 8.88626432030235e-08, + "logits/chosen": -2.3675529956817627, + "logits/rejected": -2.3178658485412598, + "logps/chosen": -378.8748779296875, + "logps/rejected": -228.85000610351562, + "loss": 0.3234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03923603892326355, + "rewards/margins": 2.106400489807129, + "rewards/rejected": -2.145636558532715, + "step": 6088 + }, + { + "epoch": 0.71, + "learning_rate": 8.882721152710523e-08, + "logits/chosen": -2.497647762298584, + "logits/rejected": -2.734200954437256, + "logps/chosen": -308.90020751953125, + "logps/rejected": -314.0935974121094, + "loss": 0.5421, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5567786693572998, + "rewards/margins": 1.806865930557251, + "rewards/rejected": -3.363644599914551, + "step": 6089 + }, + { + "epoch": 0.71, + "learning_rate": 8.879177985118695e-08, + "logits/chosen": -1.565824270248413, + "logits/rejected": -1.8275678157806396, + "logps/chosen": -294.8878173828125, + "logps/rejected": -282.58154296875, + "loss": 0.2518, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15170909464359283, + "rewards/margins": 2.397779941558838, + "rewards/rejected": -2.5494887828826904, + "step": 6090 + }, + { + "epoch": 0.71, + "learning_rate": 8.875634817526869e-08, + "logits/chosen": -2.2223644256591797, + "logits/rejected": -2.143782615661621, + "logps/chosen": -609.1446533203125, + "logps/rejected": -457.02886962890625, + "loss": 0.6349, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.375617265701294, + "rewards/margins": 0.9190353155136108, + "rewards/rejected": -2.2946527004241943, + "step": 6091 + }, + { + "epoch": 0.71, + "learning_rate": 8.872091649935041e-08, + "logits/chosen": -2.5329525470733643, + "logits/rejected": -2.7066824436187744, + "logps/chosen": -303.14617919921875, + "logps/rejected": -273.23626708984375, + "loss": 0.5213, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7791649699211121, + "rewards/margins": 0.875169038772583, + "rewards/rejected": -1.6543340682983398, + "step": 6092 + }, + { + "epoch": 0.71, + "learning_rate": 8.868548482343213e-08, + "logits/chosen": -2.4131641387939453, + "logits/rejected": -2.2675180435180664, + "logps/chosen": -162.85696411132812, + "logps/rejected": -225.6729278564453, + "loss": 1.0922, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2169970273971558, + "rewards/margins": 0.748391330242157, + "rewards/rejected": -1.965388298034668, + "step": 6093 + }, + { + "epoch": 0.71, + "learning_rate": 8.865005314751388e-08, + "logits/chosen": -1.9691004753112793, + "logits/rejected": -1.7309893369674683, + "logps/chosen": -291.86651611328125, + "logps/rejected": -343.6380615234375, + "loss": 0.3573, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3300131559371948, + "rewards/margins": 3.140977144241333, + "rewards/rejected": -4.470990180969238, + "step": 6094 + }, + { + "epoch": 0.71, + "learning_rate": 8.86146214715956e-08, + "logits/chosen": -2.269162893295288, + "logits/rejected": -2.318301200866699, + "logps/chosen": -454.9854736328125, + "logps/rejected": -291.75335693359375, + "loss": 0.3068, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36650753021240234, + "rewards/margins": 1.7758901119232178, + "rewards/rejected": -2.14239764213562, + "step": 6095 + }, + { + "epoch": 0.71, + "learning_rate": 8.857918979567734e-08, + "logits/chosen": -2.0823662281036377, + "logits/rejected": -2.214926242828369, + "logps/chosen": -306.0577697753906, + "logps/rejected": -264.1078186035156, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8658745884895325, + "rewards/margins": 3.4998414516448975, + "rewards/rejected": -4.365715980529785, + "step": 6096 + }, + { + "epoch": 0.71, + "learning_rate": 8.854375811975906e-08, + "logits/chosen": -2.298790454864502, + "logits/rejected": -2.11314058303833, + "logps/chosen": -272.09844970703125, + "logps/rejected": -336.40478515625, + "loss": 0.3971, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5103957653045654, + "rewards/margins": 1.598239779472351, + "rewards/rejected": -2.108635425567627, + "step": 6097 + }, + { + "epoch": 0.71, + "learning_rate": 8.850832644384078e-08, + "logits/chosen": -2.495558500289917, + "logits/rejected": -2.3835060596466064, + "logps/chosen": -250.13194274902344, + "logps/rejected": -206.45150756835938, + "loss": 0.2864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6320149898529053, + "rewards/margins": 1.4325981140136719, + "rewards/rejected": -2.064613103866577, + "step": 6098 + }, + { + "epoch": 0.71, + "learning_rate": 8.847289476792252e-08, + "logits/chosen": -2.202238082885742, + "logits/rejected": -2.077327013015747, + "logps/chosen": -175.7583770751953, + "logps/rejected": -369.3607482910156, + "loss": 0.8092, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.522727608680725, + "rewards/margins": 2.3033037185668945, + "rewards/rejected": -3.82603120803833, + "step": 6099 + }, + { + "epoch": 0.71, + "learning_rate": 8.843746309200425e-08, + "logits/chosen": -1.6138476133346558, + "logits/rejected": -2.2819066047668457, + "logps/chosen": -464.3603820800781, + "logps/rejected": -219.4442138671875, + "loss": 0.5702, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0938005447387695, + "rewards/margins": 0.9703003168106079, + "rewards/rejected": -2.064100742340088, + "step": 6100 + }, + { + "epoch": 0.71, + "learning_rate": 8.840203141608598e-08, + "logits/chosen": -1.9255167245864868, + "logits/rejected": -1.979909062385559, + "logps/chosen": -299.1183166503906, + "logps/rejected": -334.2889404296875, + "loss": 0.465, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1435785293579102, + "rewards/margins": 1.842218041419983, + "rewards/rejected": -2.9857966899871826, + "step": 6101 + }, + { + "epoch": 0.71, + "learning_rate": 8.836659974016771e-08, + "logits/chosen": -2.456321954727173, + "logits/rejected": -2.594014883041382, + "logps/chosen": -136.5105438232422, + "logps/rejected": -176.92172241210938, + "loss": 0.3602, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2972975969314575, + "rewards/margins": 1.7850086688995361, + "rewards/rejected": -3.082306385040283, + "step": 6102 + }, + { + "epoch": 0.71, + "learning_rate": 8.833116806424943e-08, + "logits/chosen": -2.4109106063842773, + "logits/rejected": -2.203990936279297, + "logps/chosen": -233.26119995117188, + "logps/rejected": -299.64349365234375, + "loss": 0.1744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.616451621055603, + "rewards/margins": 4.101806640625, + "rewards/rejected": -4.718257904052734, + "step": 6103 + }, + { + "epoch": 0.71, + "learning_rate": 8.829573638833116e-08, + "logits/chosen": -2.1415700912475586, + "logits/rejected": -1.9274588823318481, + "logps/chosen": -219.3123779296875, + "logps/rejected": -301.9386901855469, + "loss": 0.459, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9475097060203552, + "rewards/margins": 1.4611196517944336, + "rewards/rejected": -2.4086294174194336, + "step": 6104 + }, + { + "epoch": 0.71, + "learning_rate": 8.826030471241289e-08, + "logits/chosen": -2.3756103515625, + "logits/rejected": -2.4115521907806396, + "logps/chosen": -369.58740234375, + "logps/rejected": -276.358154296875, + "loss": 0.2302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22028805315494537, + "rewards/margins": 3.006739616394043, + "rewards/rejected": -3.227027416229248, + "step": 6105 + }, + { + "epoch": 0.71, + "learning_rate": 8.822487303649463e-08, + "logits/chosen": -2.215615749359131, + "logits/rejected": -2.3276073932647705, + "logps/chosen": -413.2349853515625, + "logps/rejected": -346.08721923828125, + "loss": 0.4898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3091329336166382, + "rewards/margins": 0.9969435930252075, + "rewards/rejected": -2.3060765266418457, + "step": 6106 + }, + { + "epoch": 0.71, + "learning_rate": 8.818944136057636e-08, + "logits/chosen": -2.226386547088623, + "logits/rejected": -1.9804866313934326, + "logps/chosen": -264.3817138671875, + "logps/rejected": -317.307373046875, + "loss": 0.1141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4915817677974701, + "rewards/margins": 2.6018714904785156, + "rewards/rejected": -3.0934531688690186, + "step": 6107 + }, + { + "epoch": 0.71, + "learning_rate": 8.815400968465808e-08, + "logits/chosen": -1.9151116609573364, + "logits/rejected": -2.1489970684051514, + "logps/chosen": -434.1298828125, + "logps/rejected": -403.305908203125, + "loss": 0.6361, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5194711685180664, + "rewards/margins": 1.9919497966766357, + "rewards/rejected": -3.511420726776123, + "step": 6108 + }, + { + "epoch": 0.71, + "learning_rate": 8.81185780087398e-08, + "logits/chosen": -2.2899513244628906, + "logits/rejected": -2.1781821250915527, + "logps/chosen": -242.45130920410156, + "logps/rejected": -330.3238525390625, + "loss": 0.5673, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7320735454559326, + "rewards/margins": 0.6779049038887024, + "rewards/rejected": -1.4099783897399902, + "step": 6109 + }, + { + "epoch": 0.71, + "learning_rate": 8.808314633282153e-08, + "logits/chosen": -2.13594913482666, + "logits/rejected": -1.8555288314819336, + "logps/chosen": -419.04302978515625, + "logps/rejected": -587.0065307617188, + "loss": 0.2918, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7920877933502197, + "rewards/margins": 2.6420600414276123, + "rewards/rejected": -3.434147834777832, + "step": 6110 + }, + { + "epoch": 0.71, + "learning_rate": 8.804771465690326e-08, + "logits/chosen": -2.480276584625244, + "logits/rejected": -2.44240140914917, + "logps/chosen": -206.36062622070312, + "logps/rejected": -237.22869873046875, + "loss": 0.4797, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2510621249675751, + "rewards/margins": 1.4230225086212158, + "rewards/rejected": -1.6740846633911133, + "step": 6111 + }, + { + "epoch": 0.71, + "learning_rate": 8.8012282980985e-08, + "logits/chosen": -2.06315279006958, + "logits/rejected": -2.0393190383911133, + "logps/chosen": -303.54876708984375, + "logps/rejected": -260.7725830078125, + "loss": 0.3088, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8212411403656006, + "rewards/margins": 2.6151843070983887, + "rewards/rejected": -3.4364254474639893, + "step": 6112 + }, + { + "epoch": 0.71, + "learning_rate": 8.797685130506673e-08, + "logits/chosen": -2.980529546737671, + "logits/rejected": -2.961334466934204, + "logps/chosen": -182.6428985595703, + "logps/rejected": -158.72781372070312, + "loss": 0.2067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9052941799163818, + "rewards/margins": 2.855271816253662, + "rewards/rejected": -3.760566234588623, + "step": 6113 + }, + { + "epoch": 0.71, + "learning_rate": 8.794141962914846e-08, + "logits/chosen": -1.7858860492706299, + "logits/rejected": -1.8515753746032715, + "logps/chosen": -334.43182373046875, + "logps/rejected": -290.6058044433594, + "loss": 0.403, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3740646839141846, + "rewards/margins": 2.0690348148345947, + "rewards/rejected": -3.4430994987487793, + "step": 6114 + }, + { + "epoch": 0.71, + "learning_rate": 8.790598795323018e-08, + "logits/chosen": -1.912555456161499, + "logits/rejected": -1.9130315780639648, + "logps/chosen": -521.101806640625, + "logps/rejected": -463.4203796386719, + "loss": 0.1793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06353634595870972, + "rewards/margins": 2.244542360305786, + "rewards/rejected": -2.3080787658691406, + "step": 6115 + }, + { + "epoch": 0.71, + "learning_rate": 8.787055627731191e-08, + "logits/chosen": -2.262333631515503, + "logits/rejected": -2.6874678134918213, + "logps/chosen": -241.91064453125, + "logps/rejected": -279.5497131347656, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24283675849437714, + "rewards/margins": 4.939653396606445, + "rewards/rejected": -5.182490348815918, + "step": 6116 + }, + { + "epoch": 0.71, + "learning_rate": 8.783512460139364e-08, + "logits/chosen": -2.3150930404663086, + "logits/rejected": -2.2221925258636475, + "logps/chosen": -264.23016357421875, + "logps/rejected": -352.434814453125, + "loss": 0.791, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4289357662200928, + "rewards/margins": 1.3579154014587402, + "rewards/rejected": -2.786851406097412, + "step": 6117 + }, + { + "epoch": 0.71, + "learning_rate": 8.779969292547537e-08, + "logits/chosen": -1.5268027782440186, + "logits/rejected": -1.658892273902893, + "logps/chosen": -391.9781188964844, + "logps/rejected": -341.7236633300781, + "loss": 0.2365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5500835180282593, + "rewards/margins": 2.1006102561950684, + "rewards/rejected": -2.650693893432617, + "step": 6118 + }, + { + "epoch": 0.71, + "learning_rate": 8.776426124955711e-08, + "logits/chosen": -1.764309048652649, + "logits/rejected": -2.007469415664673, + "logps/chosen": -236.9412384033203, + "logps/rejected": -249.05166625976562, + "loss": 0.2891, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8151686787605286, + "rewards/margins": 2.8563342094421387, + "rewards/rejected": -3.6715028285980225, + "step": 6119 + }, + { + "epoch": 0.71, + "learning_rate": 8.772882957363883e-08, + "logits/chosen": -2.3307769298553467, + "logits/rejected": -2.0163726806640625, + "logps/chosen": -178.000732421875, + "logps/rejected": -250.12823486328125, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.14437997341156, + "rewards/margins": 2.359159231185913, + "rewards/rejected": -3.5035390853881836, + "step": 6120 + }, + { + "epoch": 0.71, + "learning_rate": 8.769339789772055e-08, + "logits/chosen": -2.597825765609741, + "logits/rejected": -2.7223739624023438, + "logps/chosen": -299.5548400878906, + "logps/rejected": -303.1390380859375, + "loss": 0.4602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7089821100234985, + "rewards/margins": 2.198246955871582, + "rewards/rejected": -2.907228946685791, + "step": 6121 + }, + { + "epoch": 0.71, + "learning_rate": 8.765796622180229e-08, + "logits/chosen": -2.1313023567199707, + "logits/rejected": -2.283750534057617, + "logps/chosen": -220.73583984375, + "logps/rejected": -199.9409637451172, + "loss": 0.5779, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8378703594207764, + "rewards/margins": 0.8049442768096924, + "rewards/rejected": -1.6428146362304688, + "step": 6122 + }, + { + "epoch": 0.71, + "learning_rate": 8.762253454588401e-08, + "logits/chosen": -1.481194019317627, + "logits/rejected": -1.7667092084884644, + "logps/chosen": -417.4812316894531, + "logps/rejected": -199.42239379882812, + "loss": 0.5664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.974650502204895, + "rewards/margins": 0.913427472114563, + "rewards/rejected": -1.888077974319458, + "step": 6123 + }, + { + "epoch": 0.71, + "learning_rate": 8.758710286996576e-08, + "logits/chosen": -2.097006320953369, + "logits/rejected": -2.336116075515747, + "logps/chosen": -216.31507873535156, + "logps/rejected": -119.80402374267578, + "loss": 0.3131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40683791041374207, + "rewards/margins": 1.505118489265442, + "rewards/rejected": -1.9119564294815063, + "step": 6124 + }, + { + "epoch": 0.71, + "learning_rate": 8.755167119404748e-08, + "logits/chosen": -2.0113472938537598, + "logits/rejected": -2.361189126968384, + "logps/chosen": -334.7735595703125, + "logps/rejected": -173.824462890625, + "loss": 0.4211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21142104268074036, + "rewards/margins": 2.062908887863159, + "rewards/rejected": -2.2743301391601562, + "step": 6125 + }, + { + "epoch": 0.71, + "learning_rate": 8.75162395181292e-08, + "logits/chosen": -2.684218406677246, + "logits/rejected": -2.8137998580932617, + "logps/chosen": -281.62213134765625, + "logps/rejected": -178.46836853027344, + "loss": 0.3558, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2136485576629639, + "rewards/margins": 1.5049245357513428, + "rewards/rejected": -2.7185730934143066, + "step": 6126 + }, + { + "epoch": 0.71, + "learning_rate": 8.748080784221092e-08, + "logits/chosen": -2.444291353225708, + "logits/rejected": -2.55389666557312, + "logps/chosen": -321.7677307128906, + "logps/rejected": -271.75885009765625, + "loss": 0.205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2117134928703308, + "rewards/margins": 2.6430160999298096, + "rewards/rejected": -2.854729413986206, + "step": 6127 + }, + { + "epoch": 0.71, + "learning_rate": 8.744537616629266e-08, + "logits/chosen": -2.079340934753418, + "logits/rejected": -2.063159942626953, + "logps/chosen": -347.285888671875, + "logps/rejected": -255.70108032226562, + "loss": 0.1793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6500391960144043, + "rewards/margins": 2.943251609802246, + "rewards/rejected": -3.5932908058166504, + "step": 6128 + }, + { + "epoch": 0.71, + "learning_rate": 8.74099444903744e-08, + "logits/chosen": -2.451814651489258, + "logits/rejected": -2.6078076362609863, + "logps/chosen": -245.636962890625, + "logps/rejected": -179.5076904296875, + "loss": 0.1986, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008210569620132446, + "rewards/margins": 2.986668109893799, + "rewards/rejected": -2.978457450866699, + "step": 6129 + }, + { + "epoch": 0.71, + "learning_rate": 8.737451281445613e-08, + "logits/chosen": -2.0049984455108643, + "logits/rejected": -2.03981876373291, + "logps/chosen": -384.27972412109375, + "logps/rejected": -280.62908935546875, + "loss": 0.721, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4910625219345093, + "rewards/margins": 0.545496940612793, + "rewards/rejected": -2.036559581756592, + "step": 6130 + }, + { + "epoch": 0.71, + "learning_rate": 8.733908113853785e-08, + "logits/chosen": -2.844108819961548, + "logits/rejected": -2.79852032661438, + "logps/chosen": -186.81813049316406, + "logps/rejected": -305.7603454589844, + "loss": 0.2305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20040485262870789, + "rewards/margins": 3.1491432189941406, + "rewards/rejected": -3.34954833984375, + "step": 6131 + }, + { + "epoch": 0.71, + "learning_rate": 8.730364946261957e-08, + "logits/chosen": -1.9015370607376099, + "logits/rejected": -2.1039295196533203, + "logps/chosen": -389.6858215332031, + "logps/rejected": -312.4541320800781, + "loss": 0.2114, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6738407015800476, + "rewards/margins": 2.252793788909912, + "rewards/rejected": -2.9266345500946045, + "step": 6132 + }, + { + "epoch": 0.71, + "learning_rate": 8.726821778670131e-08, + "logits/chosen": -1.9939970970153809, + "logits/rejected": -2.312680959701538, + "logps/chosen": -500.7137145996094, + "logps/rejected": -274.1947937011719, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9713073968887329, + "rewards/margins": 2.330764055252075, + "rewards/rejected": -3.3020713329315186, + "step": 6133 + }, + { + "epoch": 0.71, + "learning_rate": 8.723278611078303e-08, + "logits/chosen": -2.509303092956543, + "logits/rejected": -2.608140707015991, + "logps/chosen": -245.470703125, + "logps/rejected": -260.2325439453125, + "loss": 0.3626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5932873487472534, + "rewards/margins": 1.338752269744873, + "rewards/rejected": -1.932039499282837, + "step": 6134 + }, + { + "epoch": 0.71, + "learning_rate": 8.719735443486477e-08, + "logits/chosen": -2.457812786102295, + "logits/rejected": -2.4139039516448975, + "logps/chosen": -273.72528076171875, + "logps/rejected": -284.458251953125, + "loss": 0.3993, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.376622200012207, + "rewards/margins": 1.8005832433700562, + "rewards/rejected": -3.1772053241729736, + "step": 6135 + }, + { + "epoch": 0.71, + "learning_rate": 8.71619227589465e-08, + "logits/chosen": -2.0673482418060303, + "logits/rejected": -2.491957187652588, + "logps/chosen": -388.52923583984375, + "logps/rejected": -151.33969116210938, + "loss": 0.6432, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8891506791114807, + "rewards/margins": 1.5820744037628174, + "rewards/rejected": -2.471224784851074, + "step": 6136 + }, + { + "epoch": 0.71, + "learning_rate": 8.712649108302822e-08, + "logits/chosen": -2.2548575401306152, + "logits/rejected": -2.6401479244232178, + "logps/chosen": -373.9761962890625, + "logps/rejected": -270.98590087890625, + "loss": 0.4378, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0048713684082031, + "rewards/margins": 2.7983908653259277, + "rewards/rejected": -3.803262233734131, + "step": 6137 + }, + { + "epoch": 0.71, + "learning_rate": 8.709105940710995e-08, + "logits/chosen": -1.9743678569793701, + "logits/rejected": -2.2567379474639893, + "logps/chosen": -231.7701416015625, + "logps/rejected": -271.6626281738281, + "loss": 0.3696, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6343276500701904, + "rewards/margins": 2.8061866760253906, + "rewards/rejected": -4.440514087677002, + "step": 6138 + }, + { + "epoch": 0.71, + "learning_rate": 8.705562773119168e-08, + "logits/chosen": -2.522420883178711, + "logits/rejected": -2.7375705242156982, + "logps/chosen": -468.5842590332031, + "logps/rejected": -243.8650665283203, + "loss": 0.3982, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8197323679924011, + "rewards/margins": 1.949836254119873, + "rewards/rejected": -2.769568920135498, + "step": 6139 + }, + { + "epoch": 0.71, + "learning_rate": 8.70201960552734e-08, + "logits/chosen": -2.249361038208008, + "logits/rejected": -2.1344447135925293, + "logps/chosen": -377.2974548339844, + "logps/rejected": -437.4736633300781, + "loss": 0.3119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.973794162273407, + "rewards/margins": 2.7540817260742188, + "rewards/rejected": -3.7278757095336914, + "step": 6140 + }, + { + "epoch": 0.71, + "learning_rate": 8.698476437935515e-08, + "logits/chosen": -2.182097911834717, + "logits/rejected": -2.3200480937957764, + "logps/chosen": -462.1526794433594, + "logps/rejected": -291.4093017578125, + "loss": 0.1433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3261638581752777, + "rewards/margins": 3.1167874336242676, + "rewards/rejected": -3.4429514408111572, + "step": 6141 + }, + { + "epoch": 0.71, + "learning_rate": 8.694933270343687e-08, + "logits/chosen": -1.771376371383667, + "logits/rejected": -2.426353931427002, + "logps/chosen": -405.4763488769531, + "logps/rejected": -176.96792602539062, + "loss": 0.2778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40019354224205017, + "rewards/margins": 1.604334831237793, + "rewards/rejected": -2.004528284072876, + "step": 6142 + }, + { + "epoch": 0.71, + "learning_rate": 8.69139010275186e-08, + "logits/chosen": -2.0422451496124268, + "logits/rejected": -1.9427660703659058, + "logps/chosen": -290.16351318359375, + "logps/rejected": -245.6251220703125, + "loss": 0.5169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4202842116355896, + "rewards/margins": 1.5211806297302246, + "rewards/rejected": -1.9414650201797485, + "step": 6143 + }, + { + "epoch": 0.71, + "learning_rate": 8.687846935160033e-08, + "logits/chosen": -2.257930278778076, + "logits/rejected": -2.6794776916503906, + "logps/chosen": -336.50933837890625, + "logps/rejected": -168.97991943359375, + "loss": 0.7564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7911392450332642, + "rewards/margins": 0.9203197360038757, + "rewards/rejected": -1.7114589214324951, + "step": 6144 + }, + { + "epoch": 0.71, + "learning_rate": 8.684303767568205e-08, + "logits/chosen": -2.2868313789367676, + "logits/rejected": -2.3585524559020996, + "logps/chosen": -234.69717407226562, + "logps/rejected": -232.47018432617188, + "loss": 0.4208, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.042853832244873, + "rewards/margins": 1.8289209604263306, + "rewards/rejected": -2.871774673461914, + "step": 6145 + }, + { + "epoch": 0.71, + "learning_rate": 8.680760599976378e-08, + "logits/chosen": -2.2574613094329834, + "logits/rejected": -2.174962043762207, + "logps/chosen": -321.06182861328125, + "logps/rejected": -364.7124938964844, + "loss": 0.1577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22402261197566986, + "rewards/margins": 3.276345729827881, + "rewards/rejected": -3.500368595123291, + "step": 6146 + }, + { + "epoch": 0.72, + "learning_rate": 8.677217432384552e-08, + "logits/chosen": -2.818155527114868, + "logits/rejected": -2.77358341217041, + "logps/chosen": -238.77276611328125, + "logps/rejected": -230.65713500976562, + "loss": 0.7218, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4851832389831543, + "rewards/margins": 1.5832417011260986, + "rewards/rejected": -3.068425178527832, + "step": 6147 + }, + { + "epoch": 0.72, + "learning_rate": 8.673674264792725e-08, + "logits/chosen": -2.423346519470215, + "logits/rejected": -2.671715259552002, + "logps/chosen": -216.0159912109375, + "logps/rejected": -138.0213623046875, + "loss": 0.4761, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8891265392303467, + "rewards/margins": 1.8276431560516357, + "rewards/rejected": -2.7167694568634033, + "step": 6148 + }, + { + "epoch": 0.72, + "learning_rate": 8.670131097200897e-08, + "logits/chosen": -1.7372336387634277, + "logits/rejected": -1.9758015871047974, + "logps/chosen": -453.142822265625, + "logps/rejected": -322.1582946777344, + "loss": 0.2735, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36735546588897705, + "rewards/margins": 2.688380241394043, + "rewards/rejected": -3.0557358264923096, + "step": 6149 + }, + { + "epoch": 0.72, + "learning_rate": 8.66658792960907e-08, + "logits/chosen": -2.889728546142578, + "logits/rejected": -2.765704393386841, + "logps/chosen": -180.3844451904297, + "logps/rejected": -193.21685791015625, + "loss": 0.2256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2347992658615112, + "rewards/margins": 2.1784989833831787, + "rewards/rejected": -3.4132981300354004, + "step": 6150 + }, + { + "epoch": 0.72, + "learning_rate": 8.663044762017243e-08, + "logits/chosen": -2.4635672569274902, + "logits/rejected": -2.6356940269470215, + "logps/chosen": -291.2437438964844, + "logps/rejected": -273.0096130371094, + "loss": 0.5601, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.043256163597107, + "rewards/margins": 2.6376941204071045, + "rewards/rejected": -3.680950164794922, + "step": 6151 + }, + { + "epoch": 0.72, + "learning_rate": 8.659501594425415e-08, + "logits/chosen": -2.1826043128967285, + "logits/rejected": -2.427936553955078, + "logps/chosen": -492.7127380371094, + "logps/rejected": -349.9996643066406, + "loss": 0.5329, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2952625751495361, + "rewards/margins": 0.6753624677658081, + "rewards/rejected": -1.9706249237060547, + "step": 6152 + }, + { + "epoch": 0.72, + "learning_rate": 8.65595842683359e-08, + "logits/chosen": -2.2325987815856934, + "logits/rejected": -2.2353317737579346, + "logps/chosen": -362.5477600097656, + "logps/rejected": -251.04669189453125, + "loss": 0.3173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.376921147108078, + "rewards/margins": 1.9942996501922607, + "rewards/rejected": -2.371220827102661, + "step": 6153 + }, + { + "epoch": 0.72, + "learning_rate": 8.652415259241762e-08, + "logits/chosen": -1.9101965427398682, + "logits/rejected": -1.9166339635849, + "logps/chosen": -241.71304321289062, + "logps/rejected": -242.64959716796875, + "loss": 0.408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8892908096313477, + "rewards/margins": 1.5895254611968994, + "rewards/rejected": -2.478816032409668, + "step": 6154 + }, + { + "epoch": 0.72, + "learning_rate": 8.648872091649934e-08, + "logits/chosen": -2.627551317214966, + "logits/rejected": -2.467507839202881, + "logps/chosen": -136.03323364257812, + "logps/rejected": -269.40283203125, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.202125906944275, + "rewards/margins": 1.8675696849822998, + "rewards/rejected": -3.069695472717285, + "step": 6155 + }, + { + "epoch": 0.72, + "learning_rate": 8.645328924058108e-08, + "logits/chosen": -1.7970623970031738, + "logits/rejected": -1.8942314386367798, + "logps/chosen": -164.4700164794922, + "logps/rejected": -242.071044921875, + "loss": 0.3473, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36236006021499634, + "rewards/margins": 2.44041109085083, + "rewards/rejected": -2.8027710914611816, + "step": 6156 + }, + { + "epoch": 0.72, + "learning_rate": 8.64178575646628e-08, + "logits/chosen": -2.385831594467163, + "logits/rejected": -2.25411057472229, + "logps/chosen": -559.1810913085938, + "logps/rejected": -252.39291381835938, + "loss": 0.2477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24882155656814575, + "rewards/margins": 2.4359004497528076, + "rewards/rejected": -2.684722423553467, + "step": 6157 + }, + { + "epoch": 0.72, + "learning_rate": 8.638242588874452e-08, + "logits/chosen": -2.1215083599090576, + "logits/rejected": -2.135525941848755, + "logps/chosen": -288.5606994628906, + "logps/rejected": -256.0438537597656, + "loss": 0.1791, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.034604549407959, + "rewards/margins": 3.9834158420562744, + "rewards/rejected": -5.018019676208496, + "step": 6158 + }, + { + "epoch": 0.72, + "learning_rate": 8.634699421282627e-08, + "logits/chosen": -2.3887667655944824, + "logits/rejected": -2.6001358032226562, + "logps/chosen": -312.2806396484375, + "logps/rejected": -182.0302734375, + "loss": 0.4372, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4884106516838074, + "rewards/margins": 1.521395206451416, + "rewards/rejected": -2.009805917739868, + "step": 6159 + }, + { + "epoch": 0.72, + "learning_rate": 8.631156253690799e-08, + "logits/chosen": -1.9172754287719727, + "logits/rejected": -1.948781967163086, + "logps/chosen": -370.1256408691406, + "logps/rejected": -278.2213439941406, + "loss": 0.5927, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2394564151763916, + "rewards/margins": 1.520858645439148, + "rewards/rejected": -2.76031494140625, + "step": 6160 + }, + { + "epoch": 0.72, + "learning_rate": 8.627613086098973e-08, + "logits/chosen": -2.6870713233947754, + "logits/rejected": -2.6699600219726562, + "logps/chosen": -211.618896484375, + "logps/rejected": -233.21446228027344, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7012476325035095, + "rewards/margins": 4.089520454406738, + "rewards/rejected": -4.790768146514893, + "step": 6161 + }, + { + "epoch": 0.72, + "learning_rate": 8.624069918507145e-08, + "logits/chosen": -1.9824742078781128, + "logits/rejected": -1.862900733947754, + "logps/chosen": -237.03225708007812, + "logps/rejected": -218.35902404785156, + "loss": 0.3693, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4816663265228271, + "rewards/margins": 2.5816895961761475, + "rewards/rejected": -4.063355922698975, + "step": 6162 + }, + { + "epoch": 0.72, + "learning_rate": 8.620526750915317e-08, + "logits/chosen": -2.8389625549316406, + "logits/rejected": -2.64193058013916, + "logps/chosen": -322.6404724121094, + "logps/rejected": -311.8883056640625, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7945222854614258, + "rewards/margins": 2.6259117126464844, + "rewards/rejected": -3.42043399810791, + "step": 6163 + }, + { + "epoch": 0.72, + "learning_rate": 8.616983583323492e-08, + "logits/chosen": -2.212230920791626, + "logits/rejected": -1.8447842597961426, + "logps/chosen": -185.98309326171875, + "logps/rejected": -354.83331298828125, + "loss": 0.3154, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6632559299468994, + "rewards/margins": 1.492000937461853, + "rewards/rejected": -2.155256986618042, + "step": 6164 + }, + { + "epoch": 0.72, + "learning_rate": 8.613440415731664e-08, + "logits/chosen": -2.2033283710479736, + "logits/rejected": -2.3496227264404297, + "logps/chosen": -177.17449951171875, + "logps/rejected": -166.38232421875, + "loss": 0.4474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25591638684272766, + "rewards/margins": 0.778154730796814, + "rewards/rejected": -1.0340712070465088, + "step": 6165 + }, + { + "epoch": 0.72, + "learning_rate": 8.609897248139836e-08, + "logits/chosen": -2.310760974884033, + "logits/rejected": -2.3604140281677246, + "logps/chosen": -173.6068115234375, + "logps/rejected": -308.4166564941406, + "loss": 0.2243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.963188648223877, + "rewards/margins": 4.7303643226623535, + "rewards/rejected": -5.693553447723389, + "step": 6166 + }, + { + "epoch": 0.72, + "learning_rate": 8.60635408054801e-08, + "logits/chosen": -2.4454691410064697, + "logits/rejected": -2.4416043758392334, + "logps/chosen": -242.67405700683594, + "logps/rejected": -205.78683471679688, + "loss": 0.2653, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6365424394607544, + "rewards/margins": 2.5353024005889893, + "rewards/rejected": -3.171844959259033, + "step": 6167 + }, + { + "epoch": 0.72, + "learning_rate": 8.602810912956182e-08, + "logits/chosen": -2.693699836730957, + "logits/rejected": -2.5056145191192627, + "logps/chosen": -213.38446044921875, + "logps/rejected": -264.4823913574219, + "loss": 0.4081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6370081305503845, + "rewards/margins": 1.6609513759613037, + "rewards/rejected": -2.297959566116333, + "step": 6168 + }, + { + "epoch": 0.72, + "learning_rate": 8.599267745364354e-08, + "logits/chosen": -2.4321999549865723, + "logits/rejected": -2.5228521823883057, + "logps/chosen": -240.80467224121094, + "logps/rejected": -300.208984375, + "loss": 0.3714, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.053305745124817, + "rewards/margins": 1.3856127262115479, + "rewards/rejected": -2.4389185905456543, + "step": 6169 + }, + { + "epoch": 0.72, + "learning_rate": 8.595724577772529e-08, + "logits/chosen": -2.459749221801758, + "logits/rejected": -2.7593846321105957, + "logps/chosen": -281.6546630859375, + "logps/rejected": -257.7071533203125, + "loss": 0.4225, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7256710529327393, + "rewards/margins": 1.9130609035491943, + "rewards/rejected": -2.6387319564819336, + "step": 6170 + }, + { + "epoch": 0.72, + "learning_rate": 8.592181410180701e-08, + "logits/chosen": -2.3389015197753906, + "logits/rejected": -2.3837218284606934, + "logps/chosen": -243.61997985839844, + "logps/rejected": -380.61376953125, + "loss": 0.3972, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3441395163536072, + "rewards/margins": 2.917905330657959, + "rewards/rejected": -3.262044906616211, + "step": 6171 + }, + { + "epoch": 0.72, + "learning_rate": 8.588638242588874e-08, + "logits/chosen": -2.6372053623199463, + "logits/rejected": -2.3704934120178223, + "logps/chosen": -254.6194610595703, + "logps/rejected": -302.18560791015625, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9449309706687927, + "rewards/margins": 4.170022964477539, + "rewards/rejected": -5.114953994750977, + "step": 6172 + }, + { + "epoch": 0.72, + "learning_rate": 8.585095074997047e-08, + "logits/chosen": -2.195780038833618, + "logits/rejected": -2.344179391860962, + "logps/chosen": -363.1256103515625, + "logps/rejected": -236.9369354248047, + "loss": 0.309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4474216103553772, + "rewards/margins": 2.278592348098755, + "rewards/rejected": -2.7260138988494873, + "step": 6173 + }, + { + "epoch": 0.72, + "learning_rate": 8.58155190740522e-08, + "logits/chosen": -2.0392343997955322, + "logits/rejected": -2.508249282836914, + "logps/chosen": -424.61749267578125, + "logps/rejected": -268.83563232421875, + "loss": 0.3973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5108384490013123, + "rewards/margins": 2.0427064895629883, + "rewards/rejected": -2.5535449981689453, + "step": 6174 + }, + { + "epoch": 0.72, + "learning_rate": 8.578008739813392e-08, + "logits/chosen": -2.2129507064819336, + "logits/rejected": -2.526017665863037, + "logps/chosen": -180.93739318847656, + "logps/rejected": -169.88795471191406, + "loss": 0.543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.903856635093689, + "rewards/margins": 1.367254376411438, + "rewards/rejected": -2.271111249923706, + "step": 6175 + }, + { + "epoch": 0.72, + "learning_rate": 8.574465572221566e-08, + "logits/chosen": -2.4413280487060547, + "logits/rejected": -2.4938228130340576, + "logps/chosen": -236.22341918945312, + "logps/rejected": -188.71583557128906, + "loss": 0.3126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6718831658363342, + "rewards/margins": 2.6141607761383057, + "rewards/rejected": -3.286044120788574, + "step": 6176 + }, + { + "epoch": 0.72, + "learning_rate": 8.570922404629739e-08, + "logits/chosen": -2.324399709701538, + "logits/rejected": -2.035539388656616, + "logps/chosen": -122.91212463378906, + "logps/rejected": -291.3343811035156, + "loss": 0.3625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47544822096824646, + "rewards/margins": 2.7233357429504395, + "rewards/rejected": -3.198784112930298, + "step": 6177 + }, + { + "epoch": 0.72, + "learning_rate": 8.567379237037912e-08, + "logits/chosen": -2.3361992835998535, + "logits/rejected": -2.360306978225708, + "logps/chosen": -124.30534362792969, + "logps/rejected": -183.5393829345703, + "loss": 0.546, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6026584506034851, + "rewards/margins": 1.652522087097168, + "rewards/rejected": -2.2551803588867188, + "step": 6178 + }, + { + "epoch": 0.72, + "learning_rate": 8.563836069446084e-08, + "logits/chosen": -2.132028102874756, + "logits/rejected": -1.9834133386611938, + "logps/chosen": -334.4992370605469, + "logps/rejected": -241.6773681640625, + "loss": 0.9026, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6995103359222412, + "rewards/margins": 0.7607282400131226, + "rewards/rejected": -2.460238456726074, + "step": 6179 + }, + { + "epoch": 0.72, + "learning_rate": 8.560292901854257e-08, + "logits/chosen": -2.6839652061462402, + "logits/rejected": -2.980827808380127, + "logps/chosen": -193.55174255371094, + "logps/rejected": -251.4596405029297, + "loss": 0.3439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.809939444065094, + "rewards/margins": 2.896678924560547, + "rewards/rejected": -3.706618309020996, + "step": 6180 + }, + { + "epoch": 0.72, + "learning_rate": 8.556749734262429e-08, + "logits/chosen": -2.463465452194214, + "logits/rejected": -2.464388132095337, + "logps/chosen": -333.2839660644531, + "logps/rejected": -241.25946044921875, + "loss": 0.4707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7414499521255493, + "rewards/margins": 1.9656450748443604, + "rewards/rejected": -2.707094669342041, + "step": 6181 + }, + { + "epoch": 0.72, + "learning_rate": 8.553206566670604e-08, + "logits/chosen": -2.0876364707946777, + "logits/rejected": -2.232290267944336, + "logps/chosen": -189.64222717285156, + "logps/rejected": -165.8405303955078, + "loss": 0.5108, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3754830360412598, + "rewards/margins": 1.6044104099273682, + "rewards/rejected": -2.979893684387207, + "step": 6182 + }, + { + "epoch": 0.72, + "learning_rate": 8.549663399078776e-08, + "logits/chosen": -2.80676007270813, + "logits/rejected": -2.7026617527008057, + "logps/chosen": -191.81985473632812, + "logps/rejected": -202.89183044433594, + "loss": 0.2427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.497165709733963, + "rewards/margins": 3.0778274536132812, + "rewards/rejected": -3.574993133544922, + "step": 6183 + }, + { + "epoch": 0.72, + "learning_rate": 8.54612023148695e-08, + "logits/chosen": -2.5514931678771973, + "logits/rejected": -2.708113193511963, + "logps/chosen": -227.96493530273438, + "logps/rejected": -285.8348693847656, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4490073025226593, + "rewards/margins": 3.7989754676818848, + "rewards/rejected": -4.247982978820801, + "step": 6184 + }, + { + "epoch": 0.72, + "learning_rate": 8.542577063895122e-08, + "logits/chosen": -2.4027223587036133, + "logits/rejected": -2.1411232948303223, + "logps/chosen": -353.6353759765625, + "logps/rejected": -471.2122802734375, + "loss": 0.2714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43692782521247864, + "rewards/margins": 1.8715507984161377, + "rewards/rejected": -2.308478593826294, + "step": 6185 + }, + { + "epoch": 0.72, + "learning_rate": 8.539033896303294e-08, + "logits/chosen": -2.835728168487549, + "logits/rejected": -2.8514132499694824, + "logps/chosen": -64.1361083984375, + "logps/rejected": -276.1983642578125, + "loss": 0.0964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3603079915046692, + "rewards/margins": 4.898098945617676, + "rewards/rejected": -5.2584075927734375, + "step": 6186 + }, + { + "epoch": 0.72, + "learning_rate": 8.535490728711467e-08, + "logits/chosen": -2.5773115158081055, + "logits/rejected": -2.3103737831115723, + "logps/chosen": -194.20513916015625, + "logps/rejected": -396.41119384765625, + "loss": 0.2381, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0636407136917114, + "rewards/margins": 2.9007060527801514, + "rewards/rejected": -3.9643468856811523, + "step": 6187 + }, + { + "epoch": 0.72, + "learning_rate": 8.531947561119641e-08, + "logits/chosen": -2.0861454010009766, + "logits/rejected": -2.1122045516967773, + "logps/chosen": -198.61102294921875, + "logps/rejected": -252.63970947265625, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6557865142822266, + "rewards/margins": 3.877220392227173, + "rewards/rejected": -4.53300666809082, + "step": 6188 + }, + { + "epoch": 0.72, + "learning_rate": 8.528404393527813e-08, + "logits/chosen": -2.6571435928344727, + "logits/rejected": -2.651176929473877, + "logps/chosen": -163.44204711914062, + "logps/rejected": -267.6033630371094, + "loss": 0.1642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5860204696655273, + "rewards/margins": 2.4400436878204346, + "rewards/rejected": -3.026064395904541, + "step": 6189 + }, + { + "epoch": 0.72, + "learning_rate": 8.524861225935987e-08, + "logits/chosen": -2.0719962120056152, + "logits/rejected": -2.2410714626312256, + "logps/chosen": -405.42620849609375, + "logps/rejected": -254.29638671875, + "loss": 0.555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.026648998260498, + "rewards/margins": 1.6360039710998535, + "rewards/rejected": -2.6626529693603516, + "step": 6190 + }, + { + "epoch": 0.72, + "learning_rate": 8.521318058344159e-08, + "logits/chosen": -2.004477024078369, + "logits/rejected": -1.934449553489685, + "logps/chosen": -261.0602722167969, + "logps/rejected": -281.9028625488281, + "loss": 0.3347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20662790536880493, + "rewards/margins": 1.9031565189361572, + "rewards/rejected": -2.1097846031188965, + "step": 6191 + }, + { + "epoch": 0.72, + "learning_rate": 8.517774890752331e-08, + "logits/chosen": -2.9406423568725586, + "logits/rejected": -2.7553324699401855, + "logps/chosen": -264.8376159667969, + "logps/rejected": -267.23419189453125, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4856548309326172, + "rewards/margins": 2.628995418548584, + "rewards/rejected": -4.114650249481201, + "step": 6192 + }, + { + "epoch": 0.72, + "learning_rate": 8.514231723160505e-08, + "logits/chosen": -1.9721488952636719, + "logits/rejected": -2.162766933441162, + "logps/chosen": -454.4625549316406, + "logps/rejected": -344.04058837890625, + "loss": 0.3449, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3118884563446045, + "rewards/margins": 1.383805751800537, + "rewards/rejected": -2.6956942081451416, + "step": 6193 + }, + { + "epoch": 0.72, + "learning_rate": 8.510688555568678e-08, + "logits/chosen": -2.573375701904297, + "logits/rejected": -2.389963150024414, + "logps/chosen": -236.73129272460938, + "logps/rejected": -338.0162658691406, + "loss": 0.1066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2645387649536133, + "rewards/margins": 3.9866623878479004, + "rewards/rejected": -5.2512006759643555, + "step": 6194 + }, + { + "epoch": 0.72, + "learning_rate": 8.507145387976852e-08, + "logits/chosen": -2.3015005588531494, + "logits/rejected": -2.458374500274658, + "logps/chosen": -264.60552978515625, + "logps/rejected": -301.0356140136719, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2469845414161682, + "rewards/margins": 2.1689815521240234, + "rewards/rejected": -2.415966033935547, + "step": 6195 + }, + { + "epoch": 0.72, + "learning_rate": 8.503602220385024e-08, + "logits/chosen": -2.38104510307312, + "logits/rejected": -2.7435553073883057, + "logps/chosen": -269.9562072753906, + "logps/rejected": -224.93417358398438, + "loss": 0.1072, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18478454649448395, + "rewards/margins": 3.2650461196899414, + "rewards/rejected": -3.4498305320739746, + "step": 6196 + }, + { + "epoch": 0.72, + "learning_rate": 8.500059052793196e-08, + "logits/chosen": -2.415226936340332, + "logits/rejected": -2.386878252029419, + "logps/chosen": -172.85328674316406, + "logps/rejected": -237.433349609375, + "loss": 0.2787, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4820743799209595, + "rewards/margins": 2.8432388305664062, + "rewards/rejected": -3.3253135681152344, + "step": 6197 + }, + { + "epoch": 0.72, + "learning_rate": 8.49651588520137e-08, + "logits/chosen": -1.7650117874145508, + "logits/rejected": -1.920945644378662, + "logps/chosen": -207.30711364746094, + "logps/rejected": -169.3771209716797, + "loss": 0.7708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5649362802505493, + "rewards/margins": 0.559837818145752, + "rewards/rejected": -1.1247740983963013, + "step": 6198 + }, + { + "epoch": 0.72, + "learning_rate": 8.492972717609543e-08, + "logits/chosen": -2.414623737335205, + "logits/rejected": -2.37878680229187, + "logps/chosen": -193.81712341308594, + "logps/rejected": -129.8374481201172, + "loss": 0.3786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14342671632766724, + "rewards/margins": 1.7765001058578491, + "rewards/rejected": -1.9199268817901611, + "step": 6199 + }, + { + "epoch": 0.72, + "learning_rate": 8.489429550017715e-08, + "logits/chosen": -2.040703535079956, + "logits/rejected": -2.2989096641540527, + "logps/chosen": -352.1123046875, + "logps/rejected": -280.35595703125, + "loss": 0.287, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8461208343505859, + "rewards/margins": 2.6924686431884766, + "rewards/rejected": -3.5385897159576416, + "step": 6200 + }, + { + "epoch": 0.72, + "learning_rate": 8.485886382425889e-08, + "logits/chosen": -2.19221830368042, + "logits/rejected": -2.2637648582458496, + "logps/chosen": -507.0249328613281, + "logps/rejected": -407.64886474609375, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9982216358184814, + "rewards/margins": 2.1928834915161133, + "rewards/rejected": -3.191105365753174, + "step": 6201 + }, + { + "epoch": 0.72, + "learning_rate": 8.482343214834061e-08, + "logits/chosen": -1.9008351564407349, + "logits/rejected": -1.8096072673797607, + "logps/chosen": -224.05242919921875, + "logps/rejected": -299.04852294921875, + "loss": 0.2414, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5479227304458618, + "rewards/margins": 3.027313232421875, + "rewards/rejected": -4.5752363204956055, + "step": 6202 + }, + { + "epoch": 0.72, + "learning_rate": 8.478800047242233e-08, + "logits/chosen": -2.5383598804473877, + "logits/rejected": -2.5531089305877686, + "logps/chosen": -468.4386291503906, + "logps/rejected": -336.1687316894531, + "loss": 0.0866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21443511545658112, + "rewards/margins": 2.9762144088745117, + "rewards/rejected": -3.1906492710113525, + "step": 6203 + }, + { + "epoch": 0.72, + "learning_rate": 8.475256879650407e-08, + "logits/chosen": -2.44520902633667, + "logits/rejected": -2.201347589492798, + "logps/chosen": -261.1496887207031, + "logps/rejected": -446.5155944824219, + "loss": 0.5358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2741830348968506, + "rewards/margins": 0.8823078274726868, + "rewards/rejected": -1.1564908027648926, + "step": 6204 + }, + { + "epoch": 0.72, + "learning_rate": 8.47171371205858e-08, + "logits/chosen": -2.526021957397461, + "logits/rejected": -2.67010498046875, + "logps/chosen": -203.3842315673828, + "logps/rejected": -164.10516357421875, + "loss": 0.5899, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4620704650878906, + "rewards/margins": 0.48228365182876587, + "rewards/rejected": -1.9443540573120117, + "step": 6205 + }, + { + "epoch": 0.72, + "learning_rate": 8.468170544466754e-08, + "logits/chosen": -2.526762008666992, + "logits/rejected": -2.3130388259887695, + "logps/chosen": -234.00985717773438, + "logps/rejected": -237.1852264404297, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.259589672088623, + "rewards/margins": 1.7542468309402466, + "rewards/rejected": -3.013836622238159, + "step": 6206 + }, + { + "epoch": 0.72, + "learning_rate": 8.464627376874926e-08, + "logits/chosen": -2.032775640487671, + "logits/rejected": -2.3475842475891113, + "logps/chosen": -333.972900390625, + "logps/rejected": -221.43685913085938, + "loss": 0.9185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8068813681602478, + "rewards/margins": 2.1693239212036133, + "rewards/rejected": -2.9762051105499268, + "step": 6207 + }, + { + "epoch": 0.72, + "learning_rate": 8.461084209283098e-08, + "logits/chosen": -2.5879197120666504, + "logits/rejected": -2.6478116512298584, + "logps/chosen": -233.4046173095703, + "logps/rejected": -133.99044799804688, + "loss": 0.4864, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40472841262817383, + "rewards/margins": 1.106684684753418, + "rewards/rejected": -1.5114130973815918, + "step": 6208 + }, + { + "epoch": 0.72, + "learning_rate": 8.45754104169127e-08, + "logits/chosen": -2.888587236404419, + "logits/rejected": -2.8596105575561523, + "logps/chosen": -227.11813354492188, + "logps/rejected": -233.48587036132812, + "loss": 0.3597, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4482610523700714, + "rewards/margins": 2.368706703186035, + "rewards/rejected": -2.8169679641723633, + "step": 6209 + }, + { + "epoch": 0.72, + "learning_rate": 8.453997874099444e-08, + "logits/chosen": -2.0012283325195312, + "logits/rejected": -2.011317729949951, + "logps/chosen": -271.8927307128906, + "logps/rejected": -204.44851684570312, + "loss": 1.1301, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8087866306304932, + "rewards/margins": 0.46790850162506104, + "rewards/rejected": -2.2766952514648438, + "step": 6210 + }, + { + "epoch": 0.72, + "learning_rate": 8.450454706507618e-08, + "logits/chosen": -2.0724267959594727, + "logits/rejected": -2.040477752685547, + "logps/chosen": -328.2983093261719, + "logps/rejected": -352.61566162109375, + "loss": 0.266, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07237634062767029, + "rewards/margins": 2.3381383419036865, + "rewards/rejected": -2.2657618522644043, + "step": 6211 + }, + { + "epoch": 0.72, + "learning_rate": 8.446911538915791e-08, + "logits/chosen": -2.006709575653076, + "logits/rejected": -1.9986475706100464, + "logps/chosen": -373.9992980957031, + "logps/rejected": -456.7115478515625, + "loss": 0.3627, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1275596171617508, + "rewards/margins": 2.6488778591156006, + "rewards/rejected": -2.776437282562256, + "step": 6212 + }, + { + "epoch": 0.72, + "learning_rate": 8.443368371323963e-08, + "logits/chosen": -2.3259706497192383, + "logits/rejected": -2.128541946411133, + "logps/chosen": -161.970703125, + "logps/rejected": -213.26870727539062, + "loss": 0.1886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5905094742774963, + "rewards/margins": 2.942847490310669, + "rewards/rejected": -3.5333569049835205, + "step": 6213 + }, + { + "epoch": 0.72, + "learning_rate": 8.439825203732136e-08, + "logits/chosen": -1.8114686012268066, + "logits/rejected": -2.1918437480926514, + "logps/chosen": -260.6184387207031, + "logps/rejected": -229.102783203125, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6389410495758057, + "rewards/margins": 1.9950064420700073, + "rewards/rejected": -2.6339476108551025, + "step": 6214 + }, + { + "epoch": 0.72, + "learning_rate": 8.436282036140309e-08, + "logits/chosen": -2.5514657497406006, + "logits/rejected": -2.207322359085083, + "logps/chosen": -304.41595458984375, + "logps/rejected": -361.548095703125, + "loss": 0.1953, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1112205982208252, + "rewards/margins": 4.914114952087402, + "rewards/rejected": -6.025335311889648, + "step": 6215 + }, + { + "epoch": 0.72, + "learning_rate": 8.432738868548481e-08, + "logits/chosen": -2.446741819381714, + "logits/rejected": -2.650381565093994, + "logps/chosen": -379.11114501953125, + "logps/rejected": -306.677978515625, + "loss": 0.6567, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9503588676452637, + "rewards/margins": 1.070216417312622, + "rewards/rejected": -3.0205752849578857, + "step": 6216 + }, + { + "epoch": 0.72, + "learning_rate": 8.429195700956655e-08, + "logits/chosen": -2.6395018100738525, + "logits/rejected": -2.451396942138672, + "logps/chosen": -56.987632751464844, + "logps/rejected": -223.51284790039062, + "loss": 0.2781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5905852913856506, + "rewards/margins": 3.0262582302093506, + "rewards/rejected": -3.6168434619903564, + "step": 6217 + }, + { + "epoch": 0.72, + "learning_rate": 8.425652533364829e-08, + "logits/chosen": -2.2031772136688232, + "logits/rejected": -2.522761821746826, + "logps/chosen": -279.779541015625, + "logps/rejected": -201.77590942382812, + "loss": 0.6028, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8779282569885254, + "rewards/margins": 1.0375052690505981, + "rewards/rejected": -1.9154332876205444, + "step": 6218 + }, + { + "epoch": 0.72, + "learning_rate": 8.422109365773001e-08, + "logits/chosen": -1.8250367641448975, + "logits/rejected": -1.8257873058319092, + "logps/chosen": -225.02142333984375, + "logps/rejected": -212.8092803955078, + "loss": 0.447, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5590106844902039, + "rewards/margins": 1.4504485130310059, + "rewards/rejected": -2.0094590187072754, + "step": 6219 + }, + { + "epoch": 0.72, + "learning_rate": 8.418566198181173e-08, + "logits/chosen": -1.9522621631622314, + "logits/rejected": -2.305051326751709, + "logps/chosen": -309.0178527832031, + "logps/rejected": -214.41786193847656, + "loss": 0.2315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5981003046035767, + "rewards/margins": 1.6721316576004028, + "rewards/rejected": -2.2702319622039795, + "step": 6220 + }, + { + "epoch": 0.72, + "learning_rate": 8.415023030589346e-08, + "logits/chosen": -1.8973274230957031, + "logits/rejected": -2.44476056098938, + "logps/chosen": -425.1492614746094, + "logps/rejected": -194.3166961669922, + "loss": 0.5543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6609974503517151, + "rewards/margins": 0.44252222776412964, + "rewards/rejected": -1.1035196781158447, + "step": 6221 + }, + { + "epoch": 0.72, + "learning_rate": 8.411479862997519e-08, + "logits/chosen": -2.379875898361206, + "logits/rejected": -2.364088535308838, + "logps/chosen": -273.5645446777344, + "logps/rejected": -248.3965301513672, + "loss": 0.2128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5945512056350708, + "rewards/margins": 2.2761082649230957, + "rewards/rejected": -2.870659351348877, + "step": 6222 + }, + { + "epoch": 0.72, + "learning_rate": 8.407936695405694e-08, + "logits/chosen": -2.43648099899292, + "logits/rejected": -2.3652286529541016, + "logps/chosen": -269.4208984375, + "logps/rejected": -221.37271118164062, + "loss": 0.7281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2312710285186768, + "rewards/margins": 1.2549548149108887, + "rewards/rejected": -2.4862256050109863, + "step": 6223 + }, + { + "epoch": 0.72, + "learning_rate": 8.404393527813866e-08, + "logits/chosen": -2.913649082183838, + "logits/rejected": -2.9290053844451904, + "logps/chosen": -585.7796020507812, + "logps/rejected": -271.4493408203125, + "loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.985693097114563, + "rewards/margins": 1.9941980838775635, + "rewards/rejected": -2.979891061782837, + "step": 6224 + }, + { + "epoch": 0.72, + "learning_rate": 8.400850360222038e-08, + "logits/chosen": -2.622750997543335, + "logits/rejected": -2.7329254150390625, + "logps/chosen": -221.3226318359375, + "logps/rejected": -232.8501434326172, + "loss": 0.1827, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47570037841796875, + "rewards/margins": 2.9860920906066895, + "rewards/rejected": -3.461792469024658, + "step": 6225 + }, + { + "epoch": 0.72, + "learning_rate": 8.39730719263021e-08, + "logits/chosen": -2.4836740493774414, + "logits/rejected": -2.277036666870117, + "logps/chosen": -216.7913360595703, + "logps/rejected": -347.1855163574219, + "loss": 0.239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.022188007831573486, + "rewards/margins": 2.173504590988159, + "rewards/rejected": -2.195692777633667, + "step": 6226 + }, + { + "epoch": 0.72, + "learning_rate": 8.393764025038384e-08, + "logits/chosen": -2.93351411819458, + "logits/rejected": -2.915302276611328, + "logps/chosen": -279.9453125, + "logps/rejected": -232.0118408203125, + "loss": 0.2597, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0144299268722534, + "rewards/margins": 1.9939179420471191, + "rewards/rejected": -3.008347988128662, + "step": 6227 + }, + { + "epoch": 0.72, + "learning_rate": 8.390220857446556e-08, + "logits/chosen": -2.5109243392944336, + "logits/rejected": -2.519717216491699, + "logps/chosen": -307.77008056640625, + "logps/rejected": -269.854736328125, + "loss": 0.4955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6725038886070251, + "rewards/margins": 1.921865701675415, + "rewards/rejected": -2.594369411468506, + "step": 6228 + }, + { + "epoch": 0.72, + "learning_rate": 8.386677689854731e-08, + "logits/chosen": -2.0238726139068604, + "logits/rejected": -2.059744358062744, + "logps/chosen": -171.95297241210938, + "logps/rejected": -258.15911865234375, + "loss": 0.3455, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2111319303512573, + "rewards/margins": 2.3702280521392822, + "rewards/rejected": -3.58135986328125, + "step": 6229 + }, + { + "epoch": 0.72, + "learning_rate": 8.383134522262903e-08, + "logits/chosen": -1.8444111347198486, + "logits/rejected": -2.151204824447632, + "logps/chosen": -518.5436401367188, + "logps/rejected": -318.41326904296875, + "loss": 0.2865, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6902800798416138, + "rewards/margins": 1.8736826181411743, + "rewards/rejected": -3.563962697982788, + "step": 6230 + }, + { + "epoch": 0.72, + "learning_rate": 8.379591354671075e-08, + "logits/chosen": -2.206238269805908, + "logits/rejected": -2.2179579734802246, + "logps/chosen": -339.2690124511719, + "logps/rejected": -312.040283203125, + "loss": 0.1921, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7065054774284363, + "rewards/margins": 3.4422428607940674, + "rewards/rejected": -4.148748397827148, + "step": 6231 + }, + { + "epoch": 0.72, + "learning_rate": 8.376048187079249e-08, + "logits/chosen": -2.38478946685791, + "logits/rejected": -2.216792345046997, + "logps/chosen": -258.33447265625, + "logps/rejected": -276.9412841796875, + "loss": 0.3868, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9541309475898743, + "rewards/margins": 1.157449722290039, + "rewards/rejected": -2.1115808486938477, + "step": 6232 + }, + { + "epoch": 0.73, + "learning_rate": 8.372505019487421e-08, + "logits/chosen": -2.0269575119018555, + "logits/rejected": -1.791346788406372, + "logps/chosen": -203.01148986816406, + "logps/rejected": -294.86224365234375, + "loss": 0.1588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6437211632728577, + "rewards/margins": 3.725830554962158, + "rewards/rejected": -4.369551658630371, + "step": 6233 + }, + { + "epoch": 0.73, + "learning_rate": 8.368961851895595e-08, + "logits/chosen": -2.8475847244262695, + "logits/rejected": -2.7001724243164062, + "logps/chosen": -196.25662231445312, + "logps/rejected": -149.90695190429688, + "loss": 0.1622, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2663803696632385, + "rewards/margins": 2.360764503479004, + "rewards/rejected": -2.6271448135375977, + "step": 6234 + }, + { + "epoch": 0.73, + "learning_rate": 8.365418684303768e-08, + "logits/chosen": -2.278712272644043, + "logits/rejected": -2.6379964351654053, + "logps/chosen": -382.8663330078125, + "logps/rejected": -247.62770080566406, + "loss": 0.3212, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.319392442703247, + "rewards/margins": 1.6596360206604004, + "rewards/rejected": -2.9790284633636475, + "step": 6235 + }, + { + "epoch": 0.73, + "learning_rate": 8.36187551671194e-08, + "logits/chosen": -2.6201014518737793, + "logits/rejected": -2.6098618507385254, + "logps/chosen": -304.2347412109375, + "logps/rejected": -365.8212890625, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4426062107086182, + "rewards/margins": 3.0042455196380615, + "rewards/rejected": -4.44685173034668, + "step": 6236 + }, + { + "epoch": 0.73, + "learning_rate": 8.358332349120112e-08, + "logits/chosen": -2.079807758331299, + "logits/rejected": -1.887887716293335, + "logps/chosen": -139.32276916503906, + "logps/rejected": -202.21681213378906, + "loss": 0.3087, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1958633661270142, + "rewards/margins": 1.9506926536560059, + "rewards/rejected": -3.1465561389923096, + "step": 6237 + }, + { + "epoch": 0.73, + "learning_rate": 8.354789181528286e-08, + "logits/chosen": -1.8903127908706665, + "logits/rejected": -2.2764787673950195, + "logps/chosen": -347.53826904296875, + "logps/rejected": -219.7482452392578, + "loss": 0.3216, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5685063004493713, + "rewards/margins": 2.0581140518188477, + "rewards/rejected": -2.6266207695007324, + "step": 6238 + }, + { + "epoch": 0.73, + "learning_rate": 8.351246013936458e-08, + "logits/chosen": -2.054142475128174, + "logits/rejected": -2.029484272003174, + "logps/chosen": -263.55535888671875, + "logps/rejected": -320.9850769042969, + "loss": 0.2719, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6472530364990234, + "rewards/margins": 3.1822195053100586, + "rewards/rejected": -4.829472541809082, + "step": 6239 + }, + { + "epoch": 0.73, + "learning_rate": 8.347702846344633e-08, + "logits/chosen": -1.9950740337371826, + "logits/rejected": -2.1135499477386475, + "logps/chosen": -308.297607421875, + "logps/rejected": -232.7818145751953, + "loss": 0.5222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5112283229827881, + "rewards/margins": 1.7710139751434326, + "rewards/rejected": -2.2822420597076416, + "step": 6240 + }, + { + "epoch": 0.73, + "learning_rate": 8.344159678752805e-08, + "logits/chosen": -2.441589117050171, + "logits/rejected": -2.3939151763916016, + "logps/chosen": -240.2314453125, + "logps/rejected": -345.9123840332031, + "loss": 0.2578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8331272602081299, + "rewards/margins": 1.7683361768722534, + "rewards/rejected": -2.6014633178710938, + "step": 6241 + }, + { + "epoch": 0.73, + "learning_rate": 8.340616511160977e-08, + "logits/chosen": -2.821415424346924, + "logits/rejected": -2.531061887741089, + "logps/chosen": -101.25201416015625, + "logps/rejected": -171.59292602539062, + "loss": 0.246, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3780596256256104, + "rewards/margins": 2.818971633911133, + "rewards/rejected": -4.197031021118164, + "step": 6242 + }, + { + "epoch": 0.73, + "learning_rate": 8.337073343569151e-08, + "logits/chosen": -2.843327522277832, + "logits/rejected": -2.7762293815612793, + "logps/chosen": -219.25865173339844, + "logps/rejected": -268.79473876953125, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8540209531784058, + "rewards/margins": 3.6852879524230957, + "rewards/rejected": -4.539308547973633, + "step": 6243 + }, + { + "epoch": 0.73, + "learning_rate": 8.333530175977323e-08, + "logits/chosen": -2.2756004333496094, + "logits/rejected": -2.0726325511932373, + "logps/chosen": -187.93846130371094, + "logps/rejected": -347.4775390625, + "loss": 0.4215, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37550094723701477, + "rewards/margins": 4.324132919311523, + "rewards/rejected": -4.699634552001953, + "step": 6244 + }, + { + "epoch": 0.73, + "learning_rate": 8.329987008385495e-08, + "logits/chosen": -2.638855457305908, + "logits/rejected": -2.652555465698242, + "logps/chosen": -268.8465881347656, + "logps/rejected": -285.71441650390625, + "loss": 0.3067, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5621377229690552, + "rewards/margins": 2.5597269535064697, + "rewards/rejected": -3.1218647956848145, + "step": 6245 + }, + { + "epoch": 0.73, + "learning_rate": 8.32644384079367e-08, + "logits/chosen": -2.724247932434082, + "logits/rejected": -2.566775321960449, + "logps/chosen": -124.0853500366211, + "logps/rejected": -242.38612365722656, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2946052551269531, + "rewards/margins": 1.3789558410644531, + "rewards/rejected": -1.6735610961914062, + "step": 6246 + }, + { + "epoch": 0.73, + "learning_rate": 8.322900673201843e-08, + "logits/chosen": -2.37001633644104, + "logits/rejected": -2.5617761611938477, + "logps/chosen": -380.3348693847656, + "logps/rejected": -287.5806884765625, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4337380528450012, + "rewards/margins": 3.596730947494507, + "rewards/rejected": -4.030468940734863, + "step": 6247 + }, + { + "epoch": 0.73, + "learning_rate": 8.319357505610015e-08, + "logits/chosen": -2.4132156372070312, + "logits/rejected": -2.8407673835754395, + "logps/chosen": -441.3544616699219, + "logps/rejected": -207.17843627929688, + "loss": 0.5126, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9827101230621338, + "rewards/margins": 0.8773732781410217, + "rewards/rejected": -1.8600834608078003, + "step": 6248 + }, + { + "epoch": 0.73, + "learning_rate": 8.315814338018188e-08, + "logits/chosen": -2.1347768306732178, + "logits/rejected": -2.492623805999756, + "logps/chosen": -332.7708435058594, + "logps/rejected": -406.6116638183594, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13036808371543884, + "rewards/margins": 2.984086036682129, + "rewards/rejected": -3.1144542694091797, + "step": 6249 + }, + { + "epoch": 0.73, + "learning_rate": 8.31227117042636e-08, + "logits/chosen": -1.7121227979660034, + "logits/rejected": -1.5564534664154053, + "logps/chosen": -450.9198303222656, + "logps/rejected": -467.85968017578125, + "loss": 0.5187, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4625214338302612, + "rewards/margins": 1.4492309093475342, + "rewards/rejected": -2.911752223968506, + "step": 6250 + }, + { + "epoch": 0.73, + "learning_rate": 8.308728002834533e-08, + "logits/chosen": -2.6764185428619385, + "logits/rejected": -2.6374990940093994, + "logps/chosen": -161.54757690429688, + "logps/rejected": -190.63674926757812, + "loss": 0.4942, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5977938175201416, + "rewards/margins": 1.2090122699737549, + "rewards/rejected": -1.8068060874938965, + "step": 6251 + }, + { + "epoch": 0.73, + "learning_rate": 8.305184835242708e-08, + "logits/chosen": -2.0300803184509277, + "logits/rejected": -2.270563840866089, + "logps/chosen": -278.9227294921875, + "logps/rejected": -181.25901794433594, + "loss": 0.6181, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1741971969604492, + "rewards/margins": 0.5925653576850891, + "rewards/rejected": -1.7667627334594727, + "step": 6252 + }, + { + "epoch": 0.73, + "learning_rate": 8.30164166765088e-08, + "logits/chosen": -2.5598561763763428, + "logits/rejected": -2.661360025405884, + "logps/chosen": -161.10269165039062, + "logps/rejected": -217.74209594726562, + "loss": 0.5598, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7750529646873474, + "rewards/margins": 1.4126702547073364, + "rewards/rejected": -2.187723159790039, + "step": 6253 + }, + { + "epoch": 0.73, + "learning_rate": 8.298098500059052e-08, + "logits/chosen": -2.4177656173706055, + "logits/rejected": -2.515322208404541, + "logps/chosen": -291.6696472167969, + "logps/rejected": -263.458984375, + "loss": 0.481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5502617359161377, + "rewards/margins": 2.1220645904541016, + "rewards/rejected": -2.67232608795166, + "step": 6254 + }, + { + "epoch": 0.73, + "learning_rate": 8.294555332467226e-08, + "logits/chosen": -2.6189582347869873, + "logits/rejected": -2.394188642501831, + "logps/chosen": -251.34561157226562, + "logps/rejected": -287.58441162109375, + "loss": 0.3661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8734403252601624, + "rewards/margins": 2.77573299407959, + "rewards/rejected": -3.6491732597351074, + "step": 6255 + }, + { + "epoch": 0.73, + "learning_rate": 8.291012164875398e-08, + "logits/chosen": -1.9054627418518066, + "logits/rejected": -2.2309091091156006, + "logps/chosen": -531.7499389648438, + "logps/rejected": -303.321044921875, + "loss": 0.1626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4840483069419861, + "rewards/margins": 3.39410400390625, + "rewards/rejected": -3.8781521320343018, + "step": 6256 + }, + { + "epoch": 0.73, + "learning_rate": 8.28746899728357e-08, + "logits/chosen": -2.6862878799438477, + "logits/rejected": -2.6782002449035645, + "logps/chosen": -270.00982666015625, + "logps/rejected": -272.99383544921875, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0199124813079834, + "rewards/margins": 1.1365556716918945, + "rewards/rejected": -2.156468391418457, + "step": 6257 + }, + { + "epoch": 0.73, + "learning_rate": 8.283925829691745e-08, + "logits/chosen": -2.495436906814575, + "logits/rejected": -2.288783311843872, + "logps/chosen": -95.98995971679688, + "logps/rejected": -97.82820892333984, + "loss": 0.8575, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8888918161392212, + "rewards/margins": 1.1619963645935059, + "rewards/rejected": -3.0508880615234375, + "step": 6258 + }, + { + "epoch": 0.73, + "learning_rate": 8.280382662099917e-08, + "logits/chosen": -2.4147067070007324, + "logits/rejected": -2.499863624572754, + "logps/chosen": -407.50335693359375, + "logps/rejected": -226.576416015625, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8907994031906128, + "rewards/margins": 1.901977777481079, + "rewards/rejected": -2.7927770614624023, + "step": 6259 + }, + { + "epoch": 0.73, + "learning_rate": 8.27683949450809e-08, + "logits/chosen": -2.364745616912842, + "logits/rejected": -2.2371985912323, + "logps/chosen": -294.4615783691406, + "logps/rejected": -319.8312072753906, + "loss": 0.6786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8878707885742188, + "rewards/margins": 0.9721347093582153, + "rewards/rejected": -1.8600056171417236, + "step": 6260 + }, + { + "epoch": 0.73, + "learning_rate": 8.273296326916263e-08, + "logits/chosen": -2.6869781017303467, + "logits/rejected": -2.4125561714172363, + "logps/chosen": -265.02716064453125, + "logps/rejected": -315.237060546875, + "loss": 0.3384, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9781250357627869, + "rewards/margins": 2.018409013748169, + "rewards/rejected": -2.9965338706970215, + "step": 6261 + }, + { + "epoch": 0.73, + "learning_rate": 8.269753159324435e-08, + "logits/chosen": -2.641075849533081, + "logits/rejected": -2.4539215564727783, + "logps/chosen": -237.93077087402344, + "logps/rejected": -291.9406433105469, + "loss": 0.6089, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7139095664024353, + "rewards/margins": 0.743086040019989, + "rewards/rejected": -1.4569956064224243, + "step": 6262 + }, + { + "epoch": 0.73, + "learning_rate": 8.266209991732607e-08, + "logits/chosen": -2.105607271194458, + "logits/rejected": -1.717482566833496, + "logps/chosen": -279.1279296875, + "logps/rejected": -476.36773681640625, + "loss": 0.573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9925141334533691, + "rewards/margins": 1.4825098514556885, + "rewards/rejected": -2.4750242233276367, + "step": 6263 + }, + { + "epoch": 0.73, + "learning_rate": 8.262666824140782e-08, + "logits/chosen": -2.358150005340576, + "logits/rejected": -2.7737839221954346, + "logps/chosen": -498.6929931640625, + "logps/rejected": -285.8047790527344, + "loss": 0.3831, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7536145448684692, + "rewards/margins": 2.4077465534210205, + "rewards/rejected": -3.1613612174987793, + "step": 6264 + }, + { + "epoch": 0.73, + "learning_rate": 8.259123656548954e-08, + "logits/chosen": -2.2057714462280273, + "logits/rejected": -2.3441431522369385, + "logps/chosen": -378.1039733886719, + "logps/rejected": -363.8343200683594, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5303623676300049, + "rewards/margins": 2.359379291534424, + "rewards/rejected": -2.889741897583008, + "step": 6265 + }, + { + "epoch": 0.73, + "learning_rate": 8.255580488957128e-08, + "logits/chosen": -2.3165674209594727, + "logits/rejected": -2.2454793453216553, + "logps/chosen": -190.69009399414062, + "logps/rejected": -256.96429443359375, + "loss": 0.4102, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39243611693382263, + "rewards/margins": 1.5246665477752686, + "rewards/rejected": -1.9171026945114136, + "step": 6266 + }, + { + "epoch": 0.73, + "learning_rate": 8.2520373213653e-08, + "logits/chosen": -2.4928946495056152, + "logits/rejected": -2.5909063816070557, + "logps/chosen": -229.4381103515625, + "logps/rejected": -280.5776672363281, + "loss": 0.4203, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8401175737380981, + "rewards/margins": 4.340473175048828, + "rewards/rejected": -5.1805901527404785, + "step": 6267 + }, + { + "epoch": 0.73, + "learning_rate": 8.248494153773472e-08, + "logits/chosen": -2.0853829383850098, + "logits/rejected": -2.16218900680542, + "logps/chosen": -269.3026428222656, + "logps/rejected": -305.31097412109375, + "loss": 0.4087, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5687272548675537, + "rewards/margins": 1.372157096862793, + "rewards/rejected": -1.9408843517303467, + "step": 6268 + }, + { + "epoch": 0.73, + "learning_rate": 8.244950986181646e-08, + "logits/chosen": -2.5696840286254883, + "logits/rejected": -2.461580276489258, + "logps/chosen": -363.15875244140625, + "logps/rejected": -310.2806396484375, + "loss": 0.3471, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7559940814971924, + "rewards/margins": 2.71581768989563, + "rewards/rejected": -3.4718117713928223, + "step": 6269 + }, + { + "epoch": 0.73, + "learning_rate": 8.241407818589819e-08, + "logits/chosen": -2.0937113761901855, + "logits/rejected": -2.0045359134674072, + "logps/chosen": -305.2699890136719, + "logps/rejected": -422.78912353515625, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2489248514175415, + "rewards/margins": 2.0927305221557617, + "rewards/rejected": -2.3416552543640137, + "step": 6270 + }, + { + "epoch": 0.73, + "learning_rate": 8.237864650997992e-08, + "logits/chosen": -1.9130191802978516, + "logits/rejected": -2.0114526748657227, + "logps/chosen": -231.45687866210938, + "logps/rejected": -228.45582580566406, + "loss": 0.5287, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2295161485671997, + "rewards/margins": 1.1234570741653442, + "rewards/rejected": -2.352973222732544, + "step": 6271 + }, + { + "epoch": 0.73, + "learning_rate": 8.234321483406165e-08, + "logits/chosen": -2.383329153060913, + "logits/rejected": -2.758892059326172, + "logps/chosen": -184.5867156982422, + "logps/rejected": -113.81681823730469, + "loss": 0.5987, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4687278270721436, + "rewards/margins": 0.6785489320755005, + "rewards/rejected": -2.1472766399383545, + "step": 6272 + }, + { + "epoch": 0.73, + "learning_rate": 8.230778315814337e-08, + "logits/chosen": -2.4360463619232178, + "logits/rejected": -2.364473342895508, + "logps/chosen": -296.12445068359375, + "logps/rejected": -214.16531372070312, + "loss": 0.5327, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.569430410861969, + "rewards/margins": 1.1649894714355469, + "rewards/rejected": -1.7344199419021606, + "step": 6273 + }, + { + "epoch": 0.73, + "learning_rate": 8.22723514822251e-08, + "logits/chosen": -2.0668206214904785, + "logits/rejected": -2.3308053016662598, + "logps/chosen": -495.6319580078125, + "logps/rejected": -549.9015502929688, + "loss": 0.3124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46989697217941284, + "rewards/margins": 2.1919171810150146, + "rewards/rejected": -2.6618142127990723, + "step": 6274 + }, + { + "epoch": 0.73, + "learning_rate": 8.223691980630684e-08, + "logits/chosen": -2.408827304840088, + "logits/rejected": -2.4145760536193848, + "logps/chosen": -310.6664123535156, + "logps/rejected": -269.8677062988281, + "loss": 0.2538, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7586761713027954, + "rewards/margins": 3.156709671020508, + "rewards/rejected": -3.9153857231140137, + "step": 6275 + }, + { + "epoch": 0.73, + "learning_rate": 8.220148813038857e-08, + "logits/chosen": -2.770163059234619, + "logits/rejected": -2.610686779022217, + "logps/chosen": -219.5615692138672, + "logps/rejected": -252.30465698242188, + "loss": 0.2636, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.708857536315918, + "rewards/margins": 2.321722984313965, + "rewards/rejected": -4.030580520629883, + "step": 6276 + }, + { + "epoch": 0.73, + "learning_rate": 8.21660564544703e-08, + "logits/chosen": -2.821021318435669, + "logits/rejected": -2.638423204421997, + "logps/chosen": -341.5736999511719, + "logps/rejected": -299.1343994140625, + "loss": 0.2259, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.069793701171875, + "rewards/margins": 2.6551129817962646, + "rewards/rejected": -3.7249066829681396, + "step": 6277 + }, + { + "epoch": 0.73, + "learning_rate": 8.213062477855202e-08, + "logits/chosen": -2.489882707595825, + "logits/rejected": -2.766602039337158, + "logps/chosen": -194.26040649414062, + "logps/rejected": -273.5350341796875, + "loss": 0.2305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7636180520057678, + "rewards/margins": 2.5584418773651123, + "rewards/rejected": -3.3220598697662354, + "step": 6278 + }, + { + "epoch": 0.73, + "learning_rate": 8.209519310263374e-08, + "logits/chosen": -2.25260591506958, + "logits/rejected": -2.643918514251709, + "logps/chosen": -368.50579833984375, + "logps/rejected": -212.8402099609375, + "loss": 0.398, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9171187877655029, + "rewards/margins": 2.661553382873535, + "rewards/rejected": -3.578672170639038, + "step": 6279 + }, + { + "epoch": 0.73, + "learning_rate": 8.205976142671547e-08, + "logits/chosen": -3.0564124584198, + "logits/rejected": -2.7946348190307617, + "logps/chosen": -326.1153564453125, + "logps/rejected": -253.46864318847656, + "loss": 0.209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6989347338676453, + "rewards/margins": 3.6117019653320312, + "rewards/rejected": -4.310636520385742, + "step": 6280 + }, + { + "epoch": 0.73, + "learning_rate": 8.202432975079722e-08, + "logits/chosen": -2.4073610305786133, + "logits/rejected": -2.527120351791382, + "logps/chosen": -432.93524169921875, + "logps/rejected": -252.2908935546875, + "loss": 0.1477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6735633611679077, + "rewards/margins": 3.4888954162597656, + "rewards/rejected": -4.162459373474121, + "step": 6281 + }, + { + "epoch": 0.73, + "learning_rate": 8.198889807487894e-08, + "logits/chosen": -2.2064120769500732, + "logits/rejected": -1.6713894605636597, + "logps/chosen": -277.0528564453125, + "logps/rejected": -403.48675537109375, + "loss": 0.2324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4311901926994324, + "rewards/margins": 2.4416494369506836, + "rewards/rejected": -2.87283992767334, + "step": 6282 + }, + { + "epoch": 0.73, + "learning_rate": 8.195346639896067e-08, + "logits/chosen": -2.1578598022460938, + "logits/rejected": -2.318854808807373, + "logps/chosen": -216.53350830078125, + "logps/rejected": -261.013916015625, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17750658094882965, + "rewards/margins": 2.9850986003875732, + "rewards/rejected": -3.162605047225952, + "step": 6283 + }, + { + "epoch": 0.73, + "learning_rate": 8.19180347230424e-08, + "logits/chosen": -2.759251117706299, + "logits/rejected": -2.5611636638641357, + "logps/chosen": -415.4079895019531, + "logps/rejected": -343.9595031738281, + "loss": 0.2184, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2044880390167236, + "rewards/margins": 2.903313398361206, + "rewards/rejected": -4.10780143737793, + "step": 6284 + }, + { + "epoch": 0.73, + "learning_rate": 8.188260304712412e-08, + "logits/chosen": -2.2869277000427246, + "logits/rejected": -2.510128974914551, + "logps/chosen": -328.0937194824219, + "logps/rejected": -233.30435180664062, + "loss": 0.1304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4671802818775177, + "rewards/margins": 2.3819899559020996, + "rewards/rejected": -2.849170207977295, + "step": 6285 + }, + { + "epoch": 0.73, + "learning_rate": 8.184717137120585e-08, + "logits/chosen": -1.878566026687622, + "logits/rejected": -1.915647268295288, + "logps/chosen": -200.0583953857422, + "logps/rejected": -341.998046875, + "loss": 0.4241, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2539724111557007, + "rewards/margins": 1.5252900123596191, + "rewards/rejected": -1.7792624235153198, + "step": 6286 + }, + { + "epoch": 0.73, + "learning_rate": 8.181173969528759e-08, + "logits/chosen": -2.385800361633301, + "logits/rejected": -2.122696876525879, + "logps/chosen": -332.53753662109375, + "logps/rejected": -255.65121459960938, + "loss": 0.3567, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7248864769935608, + "rewards/margins": 2.4618723392486572, + "rewards/rejected": -3.1867589950561523, + "step": 6287 + }, + { + "epoch": 0.73, + "learning_rate": 8.177630801936931e-08, + "logits/chosen": -2.590958833694458, + "logits/rejected": -2.192592144012451, + "logps/chosen": -294.2811584472656, + "logps/rejected": -340.4561767578125, + "loss": 0.1585, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1945384740829468, + "rewards/margins": 2.7351019382476807, + "rewards/rejected": -3.929640293121338, + "step": 6288 + }, + { + "epoch": 0.73, + "learning_rate": 8.174087634345105e-08, + "logits/chosen": -2.491452932357788, + "logits/rejected": -2.4303038120269775, + "logps/chosen": -202.56320190429688, + "logps/rejected": -319.87652587890625, + "loss": 0.3796, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6704690456390381, + "rewards/margins": 2.1825695037841797, + "rewards/rejected": -2.853038787841797, + "step": 6289 + }, + { + "epoch": 0.73, + "learning_rate": 8.170544466753277e-08, + "logits/chosen": -2.351283073425293, + "logits/rejected": -2.263904094696045, + "logps/chosen": -284.0327453613281, + "logps/rejected": -338.5082092285156, + "loss": 0.2193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02547439932823181, + "rewards/margins": 2.9597878456115723, + "rewards/rejected": -2.9343132972717285, + "step": 6290 + }, + { + "epoch": 0.73, + "learning_rate": 8.167001299161449e-08, + "logits/chosen": -2.4824001789093018, + "logits/rejected": -2.4733469486236572, + "logps/chosen": -352.1817932128906, + "logps/rejected": -359.93328857421875, + "loss": 0.4636, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.848662257194519, + "rewards/margins": 2.6582250595092773, + "rewards/rejected": -3.506887197494507, + "step": 6291 + }, + { + "epoch": 0.73, + "learning_rate": 8.163458131569623e-08, + "logits/chosen": -2.775198459625244, + "logits/rejected": -2.603262424468994, + "logps/chosen": -226.30523681640625, + "logps/rejected": -265.21368408203125, + "loss": 0.3126, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7944421768188477, + "rewards/margins": 2.5048975944519043, + "rewards/rejected": -4.299339294433594, + "step": 6292 + }, + { + "epoch": 0.73, + "learning_rate": 8.159914963977796e-08, + "logits/chosen": -2.079866886138916, + "logits/rejected": -2.16642165184021, + "logps/chosen": -223.07025146484375, + "logps/rejected": -155.1702117919922, + "loss": 0.7305, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.244734287261963, + "rewards/margins": 1.2067642211914062, + "rewards/rejected": -3.451498508453369, + "step": 6293 + }, + { + "epoch": 0.73, + "learning_rate": 8.15637179638597e-08, + "logits/chosen": -2.429098606109619, + "logits/rejected": -2.356956958770752, + "logps/chosen": -315.6029357910156, + "logps/rejected": -404.0758056640625, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9159033894538879, + "rewards/margins": 2.6245365142822266, + "rewards/rejected": -3.5404398441314697, + "step": 6294 + }, + { + "epoch": 0.73, + "learning_rate": 8.152828628794142e-08, + "logits/chosen": -2.1566147804260254, + "logits/rejected": -1.9597539901733398, + "logps/chosen": -196.0928955078125, + "logps/rejected": -274.13623046875, + "loss": 0.2938, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8582504391670227, + "rewards/margins": 2.6490767002105713, + "rewards/rejected": -3.5073273181915283, + "step": 6295 + }, + { + "epoch": 0.73, + "learning_rate": 8.149285461202314e-08, + "logits/chosen": -2.1523935794830322, + "logits/rejected": -2.1770784854888916, + "logps/chosen": -215.87460327148438, + "logps/rejected": -276.5356750488281, + "loss": 0.2839, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1355527639389038, + "rewards/margins": 2.1412055492401123, + "rewards/rejected": -3.2767584323883057, + "step": 6296 + }, + { + "epoch": 0.73, + "learning_rate": 8.145742293610488e-08, + "logits/chosen": -2.5140857696533203, + "logits/rejected": -2.706479072570801, + "logps/chosen": -302.07098388671875, + "logps/rejected": -285.73345947265625, + "loss": 0.5767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9069710969924927, + "rewards/margins": 1.6546944379806519, + "rewards/rejected": -2.5616655349731445, + "step": 6297 + }, + { + "epoch": 0.73, + "learning_rate": 8.14219912601866e-08, + "logits/chosen": -2.2257556915283203, + "logits/rejected": -2.2137725353240967, + "logps/chosen": -298.0929260253906, + "logps/rejected": -326.4964904785156, + "loss": 0.278, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.395040899515152, + "rewards/margins": 2.1004443168640137, + "rewards/rejected": -2.495485305786133, + "step": 6298 + }, + { + "epoch": 0.73, + "learning_rate": 8.138655958426833e-08, + "logits/chosen": -2.2454395294189453, + "logits/rejected": -2.3735191822052, + "logps/chosen": -299.9709777832031, + "logps/rejected": -176.82261657714844, + "loss": 0.8927, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1321961879730225, + "rewards/margins": 1.0521554946899414, + "rewards/rejected": -3.1843514442443848, + "step": 6299 + }, + { + "epoch": 0.73, + "learning_rate": 8.135112790835007e-08, + "logits/chosen": -2.4328107833862305, + "logits/rejected": -2.433586835861206, + "logps/chosen": -165.39280700683594, + "logps/rejected": -202.50747680664062, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3298513889312744, + "rewards/margins": 1.9624146223068237, + "rewards/rejected": -2.2922658920288086, + "step": 6300 + }, + { + "epoch": 0.73, + "learning_rate": 8.131569623243179e-08, + "logits/chosen": -2.513254404067993, + "logits/rejected": -2.0525245666503906, + "logps/chosen": -215.44793701171875, + "logps/rejected": -248.286865234375, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3027451038360596, + "rewards/margins": 2.6571762561798096, + "rewards/rejected": -3.959921360015869, + "step": 6301 + }, + { + "epoch": 0.73, + "learning_rate": 8.128026455651351e-08, + "logits/chosen": -2.374569892883301, + "logits/rejected": -2.7276487350463867, + "logps/chosen": -282.89776611328125, + "logps/rejected": -297.6399841308594, + "loss": 0.1268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.443732738494873, + "rewards/margins": 4.862957000732422, + "rewards/rejected": -6.306689739227295, + "step": 6302 + }, + { + "epoch": 0.73, + "learning_rate": 8.124483288059525e-08, + "logits/chosen": -2.08272647857666, + "logits/rejected": -2.200378894805908, + "logps/chosen": -290.1570129394531, + "logps/rejected": -341.6326904296875, + "loss": 0.5478, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5187958478927612, + "rewards/margins": 3.3704991340637207, + "rewards/rejected": -3.8892951011657715, + "step": 6303 + }, + { + "epoch": 0.73, + "learning_rate": 8.120940120467697e-08, + "logits/chosen": -2.678109884262085, + "logits/rejected": -2.8483505249023438, + "logps/chosen": -203.88536071777344, + "logps/rejected": -117.86561584472656, + "loss": 0.7845, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0104577541351318, + "rewards/margins": 0.3936546742916107, + "rewards/rejected": -1.4041123390197754, + "step": 6304 + }, + { + "epoch": 0.73, + "learning_rate": 8.117396952875872e-08, + "logits/chosen": -2.691967725753784, + "logits/rejected": -2.803523063659668, + "logps/chosen": -326.55841064453125, + "logps/rejected": -189.71987915039062, + "loss": 0.5483, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6118083000183105, + "rewards/margins": 2.255246162414551, + "rewards/rejected": -3.8670544624328613, + "step": 6305 + }, + { + "epoch": 0.73, + "learning_rate": 8.113853785284044e-08, + "logits/chosen": -2.2065279483795166, + "logits/rejected": -2.2410717010498047, + "logps/chosen": -378.35919189453125, + "logps/rejected": -357.23541259765625, + "loss": 0.3575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37178128957748413, + "rewards/margins": 1.0559897422790527, + "rewards/rejected": -1.427770972251892, + "step": 6306 + }, + { + "epoch": 0.73, + "learning_rate": 8.110310617692216e-08, + "logits/chosen": -2.418013095855713, + "logits/rejected": -2.314380168914795, + "logps/chosen": -351.4008483886719, + "logps/rejected": -337.0474548339844, + "loss": 0.1427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14531224966049194, + "rewards/margins": 3.0513761043548584, + "rewards/rejected": -3.196688652038574, + "step": 6307 + }, + { + "epoch": 0.73, + "learning_rate": 8.106767450100389e-08, + "logits/chosen": -2.3339617252349854, + "logits/rejected": -2.5668692588806152, + "logps/chosen": -268.4945068359375, + "logps/rejected": -172.73361206054688, + "loss": 0.2585, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30025404691696167, + "rewards/margins": 2.546266555786133, + "rewards/rejected": -2.8465209007263184, + "step": 6308 + }, + { + "epoch": 0.73, + "learning_rate": 8.103224282508562e-08, + "logits/chosen": -1.7337167263031006, + "logits/rejected": -2.0814599990844727, + "logps/chosen": -427.5549011230469, + "logps/rejected": -330.0394592285156, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9564264416694641, + "rewards/margins": 2.7459521293640137, + "rewards/rejected": -3.702378749847412, + "step": 6309 + }, + { + "epoch": 0.73, + "learning_rate": 8.099681114916736e-08, + "logits/chosen": -2.6628193855285645, + "logits/rejected": -2.5879805088043213, + "logps/chosen": -352.0947265625, + "logps/rejected": -261.4306640625, + "loss": 0.3394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12677332758903503, + "rewards/margins": 1.805931806564331, + "rewards/rejected": -1.9327051639556885, + "step": 6310 + }, + { + "epoch": 0.73, + "learning_rate": 8.096137947324909e-08, + "logits/chosen": -2.735067129135132, + "logits/rejected": -2.661919593811035, + "logps/chosen": -161.14744567871094, + "logps/rejected": -249.68600463867188, + "loss": 0.4429, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30944788455963135, + "rewards/margins": 1.8128485679626465, + "rewards/rejected": -2.1222963333129883, + "step": 6311 + }, + { + "epoch": 0.73, + "learning_rate": 8.092594779733081e-08, + "logits/chosen": -2.192164897918701, + "logits/rejected": -2.000788688659668, + "logps/chosen": -309.6528625488281, + "logps/rejected": -286.0964050292969, + "loss": 0.4859, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6537206172943115, + "rewards/margins": 0.7854449152946472, + "rewards/rejected": -1.4391655921936035, + "step": 6312 + }, + { + "epoch": 0.73, + "learning_rate": 8.089051612141254e-08, + "logits/chosen": -1.9527461528778076, + "logits/rejected": -2.0570244789123535, + "logps/chosen": -253.2622528076172, + "logps/rejected": -283.11993408203125, + "loss": 0.3253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5407022833824158, + "rewards/margins": 1.521715521812439, + "rewards/rejected": -2.06241774559021, + "step": 6313 + }, + { + "epoch": 0.73, + "learning_rate": 8.085508444549427e-08, + "logits/chosen": -2.4430432319641113, + "logits/rejected": -2.022214412689209, + "logps/chosen": -316.5509948730469, + "logps/rejected": -304.9562072753906, + "loss": 0.4155, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8587452173233032, + "rewards/margins": 1.6502361297607422, + "rewards/rejected": -2.508981466293335, + "step": 6314 + }, + { + "epoch": 0.73, + "learning_rate": 8.081965276957599e-08, + "logits/chosen": -2.449873924255371, + "logits/rejected": -2.2064504623413086, + "logps/chosen": -324.83978271484375, + "logps/rejected": -384.2618713378906, + "loss": 0.3873, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.130963921546936, + "rewards/margins": 2.674929618835449, + "rewards/rejected": -3.8058934211730957, + "step": 6315 + }, + { + "epoch": 0.73, + "learning_rate": 8.078422109365773e-08, + "logits/chosen": -2.715024471282959, + "logits/rejected": -2.7743289470672607, + "logps/chosen": -221.88232421875, + "logps/rejected": -198.40609741210938, + "loss": 0.3921, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1148772239685059, + "rewards/margins": 2.199631690979004, + "rewards/rejected": -3.314509153366089, + "step": 6316 + }, + { + "epoch": 0.73, + "learning_rate": 8.074878941773946e-08, + "logits/chosen": -2.6985230445861816, + "logits/rejected": -2.373257875442505, + "logps/chosen": -263.46173095703125, + "logps/rejected": -275.8941650390625, + "loss": 0.4419, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8131861686706543, + "rewards/margins": 1.470260500907898, + "rewards/rejected": -3.283446788787842, + "step": 6317 + }, + { + "epoch": 0.73, + "learning_rate": 8.071335774182119e-08, + "logits/chosen": -2.4910757541656494, + "logits/rejected": -2.2706148624420166, + "logps/chosen": -151.76080322265625, + "logps/rejected": -248.01141357421875, + "loss": 0.345, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20789504051208496, + "rewards/margins": 2.3533577919006348, + "rewards/rejected": -2.5612525939941406, + "step": 6318 + }, + { + "epoch": 0.74, + "learning_rate": 8.067792606590291e-08, + "logits/chosen": -2.1222779750823975, + "logits/rejected": -2.506214141845703, + "logps/chosen": -286.5250549316406, + "logps/rejected": -222.7381591796875, + "loss": 0.7344, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3550012111663818, + "rewards/margins": 2.3467609882354736, + "rewards/rejected": -3.7017621994018555, + "step": 6319 + }, + { + "epoch": 0.74, + "learning_rate": 8.064249438998464e-08, + "logits/chosen": -2.5851402282714844, + "logits/rejected": -2.7743523120880127, + "logps/chosen": -203.3822021484375, + "logps/rejected": -194.300537109375, + "loss": 0.3158, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0593708753585815, + "rewards/margins": 2.071169137954712, + "rewards/rejected": -3.130540132522583, + "step": 6320 + }, + { + "epoch": 0.74, + "learning_rate": 8.060706271406637e-08, + "logits/chosen": -2.2360267639160156, + "logits/rejected": -2.263235092163086, + "logps/chosen": -202.82937622070312, + "logps/rejected": -197.75900268554688, + "loss": 0.5966, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8050970435142517, + "rewards/margins": 1.4324630498886108, + "rewards/rejected": -2.2375600337982178, + "step": 6321 + }, + { + "epoch": 0.74, + "learning_rate": 8.057163103814811e-08, + "logits/chosen": -2.3795969486236572, + "logits/rejected": -2.697396993637085, + "logps/chosen": -295.0947265625, + "logps/rejected": -194.46299743652344, + "loss": 0.5222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6688815355300903, + "rewards/margins": 1.3016772270202637, + "rewards/rejected": -1.9705586433410645, + "step": 6322 + }, + { + "epoch": 0.74, + "learning_rate": 8.053619936222984e-08, + "logits/chosen": -2.4393835067749023, + "logits/rejected": -2.610004186630249, + "logps/chosen": -330.8255615234375, + "logps/rejected": -266.91778564453125, + "loss": 0.6459, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15006303787231445, + "rewards/margins": 1.5149664878845215, + "rewards/rejected": -1.364903450012207, + "step": 6323 + }, + { + "epoch": 0.74, + "learning_rate": 8.050076768631156e-08, + "logits/chosen": -2.707409620285034, + "logits/rejected": -2.5879998207092285, + "logps/chosen": -197.8020782470703, + "logps/rejected": -313.51165771484375, + "loss": 0.3101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4572105407714844, + "rewards/margins": 2.926699638366699, + "rewards/rejected": -4.383910179138184, + "step": 6324 + }, + { + "epoch": 0.74, + "learning_rate": 8.046533601039328e-08, + "logits/chosen": -2.4200048446655273, + "logits/rejected": -2.338900089263916, + "logps/chosen": -236.69314575195312, + "logps/rejected": -289.446533203125, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1741054356098175, + "rewards/margins": 2.9379196166992188, + "rewards/rejected": -3.112025022506714, + "step": 6325 + }, + { + "epoch": 0.74, + "learning_rate": 8.042990433447502e-08, + "logits/chosen": -1.9167144298553467, + "logits/rejected": -1.765062928199768, + "logps/chosen": -253.27584838867188, + "logps/rejected": -298.7745361328125, + "loss": 0.3556, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23549270629882812, + "rewards/margins": 1.9545906782150269, + "rewards/rejected": -2.1900835037231445, + "step": 6326 + }, + { + "epoch": 0.74, + "learning_rate": 8.039447265855674e-08, + "logits/chosen": -2.336045265197754, + "logits/rejected": -2.367928981781006, + "logps/chosen": -258.0199279785156, + "logps/rejected": -226.34568786621094, + "loss": 0.7137, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9433164596557617, + "rewards/margins": 1.2530632019042969, + "rewards/rejected": -2.1963796615600586, + "step": 6327 + }, + { + "epoch": 0.74, + "learning_rate": 8.035904098263849e-08, + "logits/chosen": -2.4406237602233887, + "logits/rejected": -2.4752016067504883, + "logps/chosen": -280.4081726074219, + "logps/rejected": -212.3579864501953, + "loss": 0.4545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8370428681373596, + "rewards/margins": 1.292995572090149, + "rewards/rejected": -2.130038261413574, + "step": 6328 + }, + { + "epoch": 0.74, + "learning_rate": 8.032360930672021e-08, + "logits/chosen": -1.8495343923568726, + "logits/rejected": -2.1059248447418213, + "logps/chosen": -359.9935302734375, + "logps/rejected": -332.1397705078125, + "loss": 0.5057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7448663115501404, + "rewards/margins": 1.676568865776062, + "rewards/rejected": -2.4214353561401367, + "step": 6329 + }, + { + "epoch": 0.74, + "learning_rate": 8.028817763080193e-08, + "logits/chosen": -2.373020887374878, + "logits/rejected": -2.0751891136169434, + "logps/chosen": -111.7528076171875, + "logps/rejected": -212.71249389648438, + "loss": 0.5459, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4305014610290527, + "rewards/margins": 2.1131765842437744, + "rewards/rejected": -3.543678045272827, + "step": 6330 + }, + { + "epoch": 0.74, + "learning_rate": 8.025274595488367e-08, + "logits/chosen": -1.7870392799377441, + "logits/rejected": -2.156320810317993, + "logps/chosen": -266.6884765625, + "logps/rejected": -238.04588317871094, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6065239906311035, + "rewards/margins": 1.5989813804626465, + "rewards/rejected": -2.205505132675171, + "step": 6331 + }, + { + "epoch": 0.74, + "learning_rate": 8.021731427896539e-08, + "logits/chosen": -2.100820541381836, + "logits/rejected": -2.3984410762786865, + "logps/chosen": -122.62279510498047, + "logps/rejected": -154.38226318359375, + "loss": 0.5349, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7856298685073853, + "rewards/margins": 1.3981035947799683, + "rewards/rejected": -3.1837334632873535, + "step": 6332 + }, + { + "epoch": 0.74, + "learning_rate": 8.018188260304711e-08, + "logits/chosen": -2.411797046661377, + "logits/rejected": -2.3634426593780518, + "logps/chosen": -506.2190856933594, + "logps/rejected": -496.09600830078125, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3148081302642822, + "rewards/margins": 4.1490936279296875, + "rewards/rejected": -5.463901519775391, + "step": 6333 + }, + { + "epoch": 0.74, + "learning_rate": 8.014645092712886e-08, + "logits/chosen": -2.5489275455474854, + "logits/rejected": -2.595855712890625, + "logps/chosen": -376.095947265625, + "logps/rejected": -257.3561096191406, + "loss": 0.2723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8370881080627441, + "rewards/margins": 3.264387607574463, + "rewards/rejected": -4.101475715637207, + "step": 6334 + }, + { + "epoch": 0.74, + "learning_rate": 8.011101925121058e-08, + "logits/chosen": -2.2640929222106934, + "logits/rejected": -2.519909143447876, + "logps/chosen": -184.02383422851562, + "logps/rejected": -116.49836730957031, + "loss": 1.4869, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8177053928375244, + "rewards/margins": 0.5954796671867371, + "rewards/rejected": -2.4131851196289062, + "step": 6335 + }, + { + "epoch": 0.74, + "learning_rate": 8.00755875752923e-08, + "logits/chosen": -2.040315866470337, + "logits/rejected": -2.250474452972412, + "logps/chosen": -660.4305419921875, + "logps/rejected": -282.0590515136719, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6525173783302307, + "rewards/margins": 3.889765739440918, + "rewards/rejected": -4.542283058166504, + "step": 6336 + }, + { + "epoch": 0.74, + "learning_rate": 8.004015589937404e-08, + "logits/chosen": -2.224618911743164, + "logits/rejected": -2.5247464179992676, + "logps/chosen": -356.90277099609375, + "logps/rejected": -268.60858154296875, + "loss": 0.4535, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0749468803405762, + "rewards/margins": 2.922450542449951, + "rewards/rejected": -3.9973976612091064, + "step": 6337 + }, + { + "epoch": 0.74, + "learning_rate": 8.000472422345576e-08, + "logits/chosen": -2.746990203857422, + "logits/rejected": -2.6114230155944824, + "logps/chosen": -95.02008056640625, + "logps/rejected": -188.71951293945312, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4209843575954437, + "rewards/margins": 2.2544894218444824, + "rewards/rejected": -2.675473690032959, + "step": 6338 + }, + { + "epoch": 0.74, + "learning_rate": 7.996929254753748e-08, + "logits/chosen": -2.2374794483184814, + "logits/rejected": -2.4559807777404785, + "logps/chosen": -135.2891387939453, + "logps/rejected": -140.6848602294922, + "loss": 0.5603, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7226144075393677, + "rewards/margins": 2.0534355640411377, + "rewards/rejected": -2.776050090789795, + "step": 6339 + }, + { + "epoch": 0.74, + "learning_rate": 7.993386087161923e-08, + "logits/chosen": -2.538193702697754, + "logits/rejected": -2.479562759399414, + "logps/chosen": -256.07586669921875, + "logps/rejected": -255.6300811767578, + "loss": 0.2871, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.062509685754776, + "rewards/margins": 2.0434389114379883, + "rewards/rejected": -2.1059484481811523, + "step": 6340 + }, + { + "epoch": 0.74, + "learning_rate": 7.989842919570095e-08, + "logits/chosen": -1.6480209827423096, + "logits/rejected": -1.8877830505371094, + "logps/chosen": -478.7276611328125, + "logps/rejected": -391.46966552734375, + "loss": 0.4397, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0250537395477295, + "rewards/margins": 2.3963165283203125, + "rewards/rejected": -3.421370506286621, + "step": 6341 + }, + { + "epoch": 0.74, + "learning_rate": 7.986299751978269e-08, + "logits/chosen": -1.9675129652023315, + "logits/rejected": -2.048269748687744, + "logps/chosen": -373.30487060546875, + "logps/rejected": -269.5622253417969, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6289035081863403, + "rewards/margins": 1.624576210975647, + "rewards/rejected": -3.2534799575805664, + "step": 6342 + }, + { + "epoch": 0.74, + "learning_rate": 7.982756584386441e-08, + "logits/chosen": -2.332080125808716, + "logits/rejected": -2.4183220863342285, + "logps/chosen": -173.9196319580078, + "logps/rejected": -198.49227905273438, + "loss": 0.6057, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2623704671859741, + "rewards/margins": 1.0499732494354248, + "rewards/rejected": -2.3123435974121094, + "step": 6343 + }, + { + "epoch": 0.74, + "learning_rate": 7.979213416794613e-08, + "logits/chosen": -2.3395373821258545, + "logits/rejected": -2.4529404640197754, + "logps/chosen": -219.03012084960938, + "logps/rejected": -266.9730529785156, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.509674072265625, + "rewards/margins": 3.7301769256591797, + "rewards/rejected": -4.239850997924805, + "step": 6344 + }, + { + "epoch": 0.74, + "learning_rate": 7.975670249202788e-08, + "logits/chosen": -2.44329833984375, + "logits/rejected": -2.727839946746826, + "logps/chosen": -286.5792541503906, + "logps/rejected": -263.8098449707031, + "loss": 0.204, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1153305768966675, + "rewards/margins": 4.686122894287109, + "rewards/rejected": -5.801453590393066, + "step": 6345 + }, + { + "epoch": 0.74, + "learning_rate": 7.97212708161096e-08, + "logits/chosen": -1.8430575132369995, + "logits/rejected": -1.8269721269607544, + "logps/chosen": -346.16162109375, + "logps/rejected": -362.7665100097656, + "loss": 0.2449, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25482305884361267, + "rewards/margins": 2.1301023960113525, + "rewards/rejected": -2.384925365447998, + "step": 6346 + }, + { + "epoch": 0.74, + "learning_rate": 7.968583914019133e-08, + "logits/chosen": -2.687967538833618, + "logits/rejected": -2.6635665893554688, + "logps/chosen": -275.445556640625, + "logps/rejected": -279.7044372558594, + "loss": 0.7284, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1380425691604614, + "rewards/margins": 1.1375553607940674, + "rewards/rejected": -2.2755980491638184, + "step": 6347 + }, + { + "epoch": 0.74, + "learning_rate": 7.965040746427306e-08, + "logits/chosen": -2.5320448875427246, + "logits/rejected": -2.525076150894165, + "logps/chosen": -271.2569274902344, + "logps/rejected": -231.52011108398438, + "loss": 0.4563, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.400573253631592, + "rewards/margins": 1.5105398893356323, + "rewards/rejected": -4.9111127853393555, + "step": 6348 + }, + { + "epoch": 0.74, + "learning_rate": 7.961497578835478e-08, + "logits/chosen": -2.2348616123199463, + "logits/rejected": -2.2682247161865234, + "logps/chosen": -208.01763916015625, + "logps/rejected": -255.90309143066406, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05114486813545227, + "rewards/margins": 3.586090564727783, + "rewards/rejected": -3.5349459648132324, + "step": 6349 + }, + { + "epoch": 0.74, + "learning_rate": 7.95795441124365e-08, + "logits/chosen": -2.188889265060425, + "logits/rejected": -2.0249288082122803, + "logps/chosen": -217.57273864746094, + "logps/rejected": -377.77996826171875, + "loss": 0.2571, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4829117953777313, + "rewards/margins": 3.6875457763671875, + "rewards/rejected": -4.17045783996582, + "step": 6350 + }, + { + "epoch": 0.74, + "learning_rate": 7.954411243651825e-08, + "logits/chosen": -1.9877650737762451, + "logits/rejected": -2.043609142303467, + "logps/chosen": -327.7762451171875, + "logps/rejected": -294.2757263183594, + "loss": 0.9263, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.207418441772461, + "rewards/margins": 1.025368332862854, + "rewards/rejected": -3.2327868938446045, + "step": 6351 + }, + { + "epoch": 0.74, + "learning_rate": 7.950868076059998e-08, + "logits/chosen": -2.4101905822753906, + "logits/rejected": -2.2523396015167236, + "logps/chosen": -243.02162170410156, + "logps/rejected": -278.38818359375, + "loss": 1.1713, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4679781198501587, + "rewards/margins": 0.5629440546035767, + "rewards/rejected": -2.0309221744537354, + "step": 6352 + }, + { + "epoch": 0.74, + "learning_rate": 7.94732490846817e-08, + "logits/chosen": -2.762709617614746, + "logits/rejected": -2.8701274394989014, + "logps/chosen": -220.67984008789062, + "logps/rejected": -229.9732666015625, + "loss": 0.3112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22425761818885803, + "rewards/margins": 1.9971094131469727, + "rewards/rejected": -2.221367120742798, + "step": 6353 + }, + { + "epoch": 0.74, + "learning_rate": 7.943781740876343e-08, + "logits/chosen": -2.9179625511169434, + "logits/rejected": -2.9291281700134277, + "logps/chosen": -152.64508056640625, + "logps/rejected": -186.2161407470703, + "loss": 0.2898, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6997923851013184, + "rewards/margins": 2.489960193634033, + "rewards/rejected": -4.189752578735352, + "step": 6354 + }, + { + "epoch": 0.74, + "learning_rate": 7.940238573284516e-08, + "logits/chosen": -2.2195541858673096, + "logits/rejected": -2.4074959754943848, + "logps/chosen": -431.2127685546875, + "logps/rejected": -303.155029296875, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.063283085823059, + "rewards/margins": 0.7332756519317627, + "rewards/rejected": -1.7965586185455322, + "step": 6355 + }, + { + "epoch": 0.74, + "learning_rate": 7.936695405692688e-08, + "logits/chosen": -1.6377177238464355, + "logits/rejected": -1.9323945045471191, + "logps/chosen": -238.52816772460938, + "logps/rejected": -188.13729858398438, + "loss": 0.3598, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1040544509887695, + "rewards/margins": 1.2057509422302246, + "rewards/rejected": -2.309805393218994, + "step": 6356 + }, + { + "epoch": 0.74, + "learning_rate": 7.933152238100863e-08, + "logits/chosen": -2.194798707962036, + "logits/rejected": -2.193270206451416, + "logps/chosen": -286.700439453125, + "logps/rejected": -308.5291748046875, + "loss": 0.4358, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0350940227508545, + "rewards/margins": 1.666778564453125, + "rewards/rejected": -2.7018728256225586, + "step": 6357 + }, + { + "epoch": 0.74, + "learning_rate": 7.929609070509035e-08, + "logits/chosen": -2.2756619453430176, + "logits/rejected": -2.51761531829834, + "logps/chosen": -300.4959411621094, + "logps/rejected": -251.83456420898438, + "loss": 0.4894, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6292780637741089, + "rewards/margins": 2.141449213027954, + "rewards/rejected": -2.7707276344299316, + "step": 6358 + }, + { + "epoch": 0.74, + "learning_rate": 7.926065902917208e-08, + "logits/chosen": -2.2793943881988525, + "logits/rejected": -1.944014310836792, + "logps/chosen": -212.53379821777344, + "logps/rejected": -355.3455810546875, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9987373352050781, + "rewards/margins": 2.181215524673462, + "rewards/rejected": -3.179952621459961, + "step": 6359 + }, + { + "epoch": 0.74, + "learning_rate": 7.92252273532538e-08, + "logits/chosen": -2.1999456882476807, + "logits/rejected": -2.5204761028289795, + "logps/chosen": -231.4736328125, + "logps/rejected": -197.885986328125, + "loss": 0.5163, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5233587622642517, + "rewards/margins": 1.302664875984192, + "rewards/rejected": -1.8260236978530884, + "step": 6360 + }, + { + "epoch": 0.74, + "learning_rate": 7.918979567733553e-08, + "logits/chosen": -2.8117880821228027, + "logits/rejected": -2.6549758911132812, + "logps/chosen": -460.23028564453125, + "logps/rejected": -301.2898864746094, + "loss": 0.4498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7425796985626221, + "rewards/margins": 1.5982327461242676, + "rewards/rejected": -2.3408122062683105, + "step": 6361 + }, + { + "epoch": 0.74, + "learning_rate": 7.915436400141725e-08, + "logits/chosen": -2.353105306625366, + "logits/rejected": -2.3922836780548096, + "logps/chosen": -447.5340881347656, + "logps/rejected": -291.4940490722656, + "loss": 0.5881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9012231826782227, + "rewards/margins": 1.522189736366272, + "rewards/rejected": -2.423412799835205, + "step": 6362 + }, + { + "epoch": 0.74, + "learning_rate": 7.9118932325499e-08, + "logits/chosen": -1.9020167589187622, + "logits/rejected": -1.9705555438995361, + "logps/chosen": -261.39434814453125, + "logps/rejected": -272.8210144042969, + "loss": 0.6319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5101104378700256, + "rewards/margins": 2.4853148460388184, + "rewards/rejected": -2.9954254627227783, + "step": 6363 + }, + { + "epoch": 0.74, + "learning_rate": 7.908350064958072e-08, + "logits/chosen": -2.0674240589141846, + "logits/rejected": -2.142570972442627, + "logps/chosen": -378.7806701660156, + "logps/rejected": -370.92462158203125, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0304274559020996, + "rewards/margins": 1.9749919176101685, + "rewards/rejected": -3.0054192543029785, + "step": 6364 + }, + { + "epoch": 0.74, + "learning_rate": 7.904806897366246e-08, + "logits/chosen": -2.604363203048706, + "logits/rejected": -2.3836793899536133, + "logps/chosen": -164.04202270507812, + "logps/rejected": -252.3910369873047, + "loss": 0.1998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7688165307044983, + "rewards/margins": 3.4030261039733887, + "rewards/rejected": -4.171842575073242, + "step": 6365 + }, + { + "epoch": 0.74, + "learning_rate": 7.901263729774418e-08, + "logits/chosen": -2.043614387512207, + "logits/rejected": -1.8067901134490967, + "logps/chosen": -402.77264404296875, + "logps/rejected": -450.8165588378906, + "loss": 0.3358, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6997337341308594, + "rewards/margins": 2.1948723793029785, + "rewards/rejected": -2.894606113433838, + "step": 6366 + }, + { + "epoch": 0.74, + "learning_rate": 7.89772056218259e-08, + "logits/chosen": -1.9815620183944702, + "logits/rejected": -2.1567678451538086, + "logps/chosen": -265.69305419921875, + "logps/rejected": -208.31370544433594, + "loss": 0.4901, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5322065353393555, + "rewards/margins": 1.6610689163208008, + "rewards/rejected": -3.1932754516601562, + "step": 6367 + }, + { + "epoch": 0.74, + "learning_rate": 7.894177394590764e-08, + "logits/chosen": -2.639808177947998, + "logits/rejected": -2.558666229248047, + "logps/chosen": -165.11106872558594, + "logps/rejected": -199.14105224609375, + "loss": 0.1576, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2951794862747192, + "rewards/margins": 2.61676025390625, + "rewards/rejected": -3.911939859390259, + "step": 6368 + }, + { + "epoch": 0.74, + "learning_rate": 7.890634226998937e-08, + "logits/chosen": -2.270467519760132, + "logits/rejected": -2.5941131114959717, + "logps/chosen": -367.6741943359375, + "logps/rejected": -426.695068359375, + "loss": 0.2144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8996294140815735, + "rewards/margins": 3.03814697265625, + "rewards/rejected": -3.9377760887145996, + "step": 6369 + }, + { + "epoch": 0.74, + "learning_rate": 7.88709105940711e-08, + "logits/chosen": -2.081770420074463, + "logits/rejected": -2.2126667499542236, + "logps/chosen": -312.7315368652344, + "logps/rejected": -335.0174255371094, + "loss": 0.4204, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0186822414398193, + "rewards/margins": 2.056854248046875, + "rewards/rejected": -3.0755362510681152, + "step": 6370 + }, + { + "epoch": 0.74, + "learning_rate": 7.883547891815283e-08, + "logits/chosen": -2.034918785095215, + "logits/rejected": -2.069779872894287, + "logps/chosen": -600.7816772460938, + "logps/rejected": -465.16864013671875, + "loss": 0.2051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8423968553543091, + "rewards/margins": 3.27105975151062, + "rewards/rejected": -4.113456726074219, + "step": 6371 + }, + { + "epoch": 0.74, + "learning_rate": 7.880004724223455e-08, + "logits/chosen": -1.668796420097351, + "logits/rejected": -1.5578563213348389, + "logps/chosen": -271.2153015136719, + "logps/rejected": -295.33282470703125, + "loss": 0.2592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01847708225250244, + "rewards/margins": 1.7359957695007324, + "rewards/rejected": -1.7544727325439453, + "step": 6372 + }, + { + "epoch": 0.74, + "learning_rate": 7.876461556631627e-08, + "logits/chosen": -1.6720739603042603, + "logits/rejected": -1.5310975313186646, + "logps/chosen": -506.5141906738281, + "logps/rejected": -433.57000732421875, + "loss": 0.3995, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40779513120651245, + "rewards/margins": 1.3797935247421265, + "rewards/rejected": -1.7875887155532837, + "step": 6373 + }, + { + "epoch": 0.74, + "learning_rate": 7.872918389039801e-08, + "logits/chosen": -2.6460795402526855, + "logits/rejected": -2.87593150138855, + "logps/chosen": -301.34161376953125, + "logps/rejected": -202.25306701660156, + "loss": 0.8871, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1275746822357178, + "rewards/margins": 1.0288233757019043, + "rewards/rejected": -3.156397819519043, + "step": 6374 + }, + { + "epoch": 0.74, + "learning_rate": 7.869375221447974e-08, + "logits/chosen": -2.8347349166870117, + "logits/rejected": -2.8491361141204834, + "logps/chosen": -135.2898712158203, + "logps/rejected": -229.7789306640625, + "loss": 0.9116, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6730844974517822, + "rewards/margins": 0.5685628056526184, + "rewards/rejected": -2.241647243499756, + "step": 6375 + }, + { + "epoch": 0.74, + "learning_rate": 7.865832053856148e-08, + "logits/chosen": -2.123122215270996, + "logits/rejected": -1.979169249534607, + "logps/chosen": -334.5379333496094, + "logps/rejected": -312.5858154296875, + "loss": 0.4361, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10387551039457321, + "rewards/margins": 2.380066156387329, + "rewards/rejected": -2.4839417934417725, + "step": 6376 + }, + { + "epoch": 0.74, + "learning_rate": 7.86228888626432e-08, + "logits/chosen": -2.435196876525879, + "logits/rejected": -2.5130269527435303, + "logps/chosen": -202.3519287109375, + "logps/rejected": -254.30059814453125, + "loss": 0.2889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41866984963417053, + "rewards/margins": 2.2714922428131104, + "rewards/rejected": -2.690162181854248, + "step": 6377 + }, + { + "epoch": 0.74, + "learning_rate": 7.858745718672492e-08, + "logits/chosen": -2.450761556625366, + "logits/rejected": -2.5789716243743896, + "logps/chosen": -259.0782775878906, + "logps/rejected": -327.1623229980469, + "loss": 0.3381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.956762433052063, + "rewards/margins": 1.993109941482544, + "rewards/rejected": -2.9498724937438965, + "step": 6378 + }, + { + "epoch": 0.74, + "learning_rate": 7.855202551080665e-08, + "logits/chosen": -1.9856858253479004, + "logits/rejected": -1.9470373392105103, + "logps/chosen": -226.54275512695312, + "logps/rejected": -293.525390625, + "loss": 0.3393, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3621931076049805, + "rewards/margins": 3.199445962905884, + "rewards/rejected": -4.561638832092285, + "step": 6379 + }, + { + "epoch": 0.74, + "learning_rate": 7.85165938348884e-08, + "logits/chosen": -1.936179280281067, + "logits/rejected": -1.8625948429107666, + "logps/chosen": -289.6942138671875, + "logps/rejected": -338.8097839355469, + "loss": 0.5706, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0653133392333984, + "rewards/margins": 2.964376926422119, + "rewards/rejected": -4.029690265655518, + "step": 6380 + }, + { + "epoch": 0.74, + "learning_rate": 7.848116215897012e-08, + "logits/chosen": -2.0657742023468018, + "logits/rejected": -2.1465325355529785, + "logps/chosen": -255.31570434570312, + "logps/rejected": -243.9123992919922, + "loss": 0.4989, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2228045463562012, + "rewards/margins": 1.5146889686584473, + "rewards/rejected": -2.7374935150146484, + "step": 6381 + }, + { + "epoch": 0.74, + "learning_rate": 7.844573048305185e-08, + "logits/chosen": -1.9133802652359009, + "logits/rejected": -2.108579158782959, + "logps/chosen": -346.1192626953125, + "logps/rejected": -322.2735900878906, + "loss": 0.252, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3973810076713562, + "rewards/margins": 2.286226272583008, + "rewards/rejected": -2.683607339859009, + "step": 6382 + }, + { + "epoch": 0.74, + "learning_rate": 7.841029880713357e-08, + "logits/chosen": -1.9985883235931396, + "logits/rejected": -2.173832416534424, + "logps/chosen": -180.30029296875, + "logps/rejected": -220.94223022460938, + "loss": 0.6336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6139665842056274, + "rewards/margins": 0.863246500492096, + "rewards/rejected": -2.477213144302368, + "step": 6383 + }, + { + "epoch": 0.74, + "learning_rate": 7.83748671312153e-08, + "logits/chosen": -1.8698720932006836, + "logits/rejected": -1.9429621696472168, + "logps/chosen": -354.42169189453125, + "logps/rejected": -258.96478271484375, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5393194556236267, + "rewards/margins": 1.4937331676483154, + "rewards/rejected": -2.033052444458008, + "step": 6384 + }, + { + "epoch": 0.74, + "learning_rate": 7.833943545529703e-08, + "logits/chosen": -2.1771397590637207, + "logits/rejected": -2.39392352104187, + "logps/chosen": -354.16107177734375, + "logps/rejected": -258.6745910644531, + "loss": 0.7984, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.092716097831726, + "rewards/margins": 0.891497015953064, + "rewards/rejected": -1.98421311378479, + "step": 6385 + }, + { + "epoch": 0.74, + "learning_rate": 7.830400377937877e-08, + "logits/chosen": -2.7067558765411377, + "logits/rejected": -2.6858816146850586, + "logps/chosen": -311.6806640625, + "logps/rejected": -280.38909912109375, + "loss": 0.3669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3288809061050415, + "rewards/margins": 2.27292537689209, + "rewards/rejected": -2.601806163787842, + "step": 6386 + }, + { + "epoch": 0.74, + "learning_rate": 7.826857210346049e-08, + "logits/chosen": -2.6012721061706543, + "logits/rejected": -2.686793327331543, + "logps/chosen": -195.41128540039062, + "logps/rejected": -327.2893371582031, + "loss": 0.2204, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3343936502933502, + "rewards/margins": 4.042865753173828, + "rewards/rejected": -4.377259731292725, + "step": 6387 + }, + { + "epoch": 0.74, + "learning_rate": 7.823314042754222e-08, + "logits/chosen": -2.145986318588257, + "logits/rejected": -2.257993459701538, + "logps/chosen": -393.15777587890625, + "logps/rejected": -252.71803283691406, + "loss": 0.4194, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4136618673801422, + "rewards/margins": 2.398505926132202, + "rewards/rejected": -2.8121676445007324, + "step": 6388 + }, + { + "epoch": 0.74, + "learning_rate": 7.819770875162395e-08, + "logits/chosen": -2.6908040046691895, + "logits/rejected": -2.8434133529663086, + "logps/chosen": -312.4591064453125, + "logps/rejected": -223.5443572998047, + "loss": 0.3814, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.596076250076294, + "rewards/margins": 1.5741448402404785, + "rewards/rejected": -2.1702210903167725, + "step": 6389 + }, + { + "epoch": 0.74, + "learning_rate": 7.816227707570567e-08, + "logits/chosen": -2.7513985633850098, + "logits/rejected": -2.401251792907715, + "logps/chosen": -268.22637939453125, + "logps/rejected": -204.35552978515625, + "loss": 0.0828, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0270477533340454, + "rewards/margins": 3.2404892444610596, + "rewards/rejected": -4.2675371170043945, + "step": 6390 + }, + { + "epoch": 0.74, + "learning_rate": 7.81268453997874e-08, + "logits/chosen": -2.588042974472046, + "logits/rejected": -2.3384296894073486, + "logps/chosen": -151.8711395263672, + "logps/rejected": -255.84030151367188, + "loss": 0.3645, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6730120182037354, + "rewards/margins": 2.4799699783325195, + "rewards/rejected": -3.152981996536255, + "step": 6391 + }, + { + "epoch": 0.74, + "learning_rate": 7.809141372386914e-08, + "logits/chosen": -1.8338249921798706, + "logits/rejected": -1.654573678970337, + "logps/chosen": -294.57196044921875, + "logps/rejected": -529.7262573242188, + "loss": 0.414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6450214385986328, + "rewards/margins": 2.026660442352295, + "rewards/rejected": -2.6716818809509277, + "step": 6392 + }, + { + "epoch": 0.74, + "learning_rate": 7.805598204795087e-08, + "logits/chosen": -2.3792786598205566, + "logits/rejected": -2.3096213340759277, + "logps/chosen": -371.8365173339844, + "logps/rejected": -470.0130920410156, + "loss": 0.2059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4234592914581299, + "rewards/margins": 2.2488715648651123, + "rewards/rejected": -2.672330856323242, + "step": 6393 + }, + { + "epoch": 0.74, + "learning_rate": 7.80205503720326e-08, + "logits/chosen": -2.5118401050567627, + "logits/rejected": -2.6777453422546387, + "logps/chosen": -163.86968994140625, + "logps/rejected": -195.02151489257812, + "loss": 0.2516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.495700478553772, + "rewards/margins": 2.8646626472473145, + "rewards/rejected": -3.360363245010376, + "step": 6394 + }, + { + "epoch": 0.74, + "learning_rate": 7.798511869611432e-08, + "logits/chosen": -2.5370490550994873, + "logits/rejected": -2.3993818759918213, + "logps/chosen": -258.4268798828125, + "logps/rejected": -257.0291748046875, + "loss": 0.4706, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4182193279266357, + "rewards/margins": 1.7299749851226807, + "rewards/rejected": -3.1481943130493164, + "step": 6395 + }, + { + "epoch": 0.74, + "learning_rate": 7.794968702019605e-08, + "logits/chosen": -2.118284225463867, + "logits/rejected": -1.9005517959594727, + "logps/chosen": -275.18121337890625, + "logps/rejected": -399.9444580078125, + "loss": 0.5522, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3262256383895874, + "rewards/margins": 1.8195912837982178, + "rewards/rejected": -3.1458168029785156, + "step": 6396 + }, + { + "epoch": 0.74, + "learning_rate": 7.791425534427778e-08, + "logits/chosen": -2.2186057567596436, + "logits/rejected": -2.0822126865386963, + "logps/chosen": -310.2674255371094, + "logps/rejected": -238.73013305664062, + "loss": 0.254, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37500691413879395, + "rewards/margins": 1.5949251651763916, + "rewards/rejected": -1.9699320793151855, + "step": 6397 + }, + { + "epoch": 0.74, + "learning_rate": 7.787882366835951e-08, + "logits/chosen": -2.501980781555176, + "logits/rejected": -2.134209156036377, + "logps/chosen": -452.5187683105469, + "logps/rejected": -440.0625, + "loss": 0.4548, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0395818948745728, + "rewards/margins": 1.407677173614502, + "rewards/rejected": -2.4472591876983643, + "step": 6398 + }, + { + "epoch": 0.74, + "learning_rate": 7.784339199244125e-08, + "logits/chosen": -2.3460328578948975, + "logits/rejected": -2.5737810134887695, + "logps/chosen": -152.9859619140625, + "logps/rejected": -145.36912536621094, + "loss": 0.3407, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2107190489768982, + "rewards/margins": 2.135066032409668, + "rewards/rejected": -2.345785140991211, + "step": 6399 + }, + { + "epoch": 0.74, + "learning_rate": 7.780796031652297e-08, + "logits/chosen": -2.001856803894043, + "logits/rejected": -2.4711577892303467, + "logps/chosen": -292.7585754394531, + "logps/rejected": -218.11392211914062, + "loss": 0.7948, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9619510173797607, + "rewards/margins": 2.713930368423462, + "rewards/rejected": -4.675881385803223, + "step": 6400 + }, + { + "epoch": 0.74, + "learning_rate": 7.777252864060469e-08, + "logits/chosen": -2.4192304611206055, + "logits/rejected": -2.420469045639038, + "logps/chosen": -276.3792419433594, + "logps/rejected": -426.84539794921875, + "loss": 0.319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37056198716163635, + "rewards/margins": 2.081132411956787, + "rewards/rejected": -2.4516944885253906, + "step": 6401 + }, + { + "epoch": 0.74, + "learning_rate": 7.773709696468643e-08, + "logits/chosen": -2.474398612976074, + "logits/rejected": -2.5094876289367676, + "logps/chosen": -348.28717041015625, + "logps/rejected": -382.49127197265625, + "loss": 0.0622, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3090416193008423, + "rewards/margins": 3.640346050262451, + "rewards/rejected": -3.949387311935425, + "step": 6402 + }, + { + "epoch": 0.74, + "learning_rate": 7.770166528876815e-08, + "logits/chosen": -2.181915044784546, + "logits/rejected": -2.7005882263183594, + "logps/chosen": -473.056396484375, + "logps/rejected": -364.496826171875, + "loss": 0.3907, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3759779930114746, + "rewards/margins": 2.50034761428833, + "rewards/rejected": -3.8763256072998047, + "step": 6403 + }, + { + "epoch": 0.74, + "learning_rate": 7.76662336128499e-08, + "logits/chosen": -2.3214123249053955, + "logits/rejected": -2.5783770084381104, + "logps/chosen": -331.99114990234375, + "logps/rejected": -243.52842712402344, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6402058005332947, + "rewards/margins": 2.6924171447753906, + "rewards/rejected": -3.332623243331909, + "step": 6404 + }, + { + "epoch": 0.75, + "learning_rate": 7.763080193693162e-08, + "logits/chosen": -2.486558437347412, + "logits/rejected": -2.4222195148468018, + "logps/chosen": -181.11451721191406, + "logps/rejected": -336.13665771484375, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23040178418159485, + "rewards/margins": 3.480703830718994, + "rewards/rejected": -3.2503018379211426, + "step": 6405 + }, + { + "epoch": 0.75, + "learning_rate": 7.759537026101334e-08, + "logits/chosen": -2.7107272148132324, + "logits/rejected": -2.5056614875793457, + "logps/chosen": -194.38168334960938, + "logps/rejected": -338.28448486328125, + "loss": 1.2051, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0191123485565186, + "rewards/margins": 0.20944470167160034, + "rewards/rejected": -1.2285569906234741, + "step": 6406 + }, + { + "epoch": 0.75, + "learning_rate": 7.755993858509506e-08, + "logits/chosen": -2.742217540740967, + "logits/rejected": -2.6435117721557617, + "logps/chosen": -233.5757293701172, + "logps/rejected": -204.10552978515625, + "loss": 0.5591, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7199912071228027, + "rewards/margins": 1.6379963159561157, + "rewards/rejected": -3.357987403869629, + "step": 6407 + }, + { + "epoch": 0.75, + "learning_rate": 7.75245069091768e-08, + "logits/chosen": -2.321730136871338, + "logits/rejected": -2.3149333000183105, + "logps/chosen": -98.77872467041016, + "logps/rejected": -178.81195068359375, + "loss": 0.1743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27893608808517456, + "rewards/margins": 2.057197093963623, + "rewards/rejected": -1.7782609462738037, + "step": 6408 + }, + { + "epoch": 0.75, + "learning_rate": 7.748907523325852e-08, + "logits/chosen": -2.0387425422668457, + "logits/rejected": -2.279232978820801, + "logps/chosen": -325.68994140625, + "logps/rejected": -253.56680297851562, + "loss": 0.309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7646862268447876, + "rewards/margins": 2.463909864425659, + "rewards/rejected": -3.2285962104797363, + "step": 6409 + }, + { + "epoch": 0.75, + "learning_rate": 7.745364355734027e-08, + "logits/chosen": -2.392296314239502, + "logits/rejected": -2.693480968475342, + "logps/chosen": -294.962890625, + "logps/rejected": -198.32708740234375, + "loss": 0.4708, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5032957792282104, + "rewards/margins": 0.8935557007789612, + "rewards/rejected": -1.3968515396118164, + "step": 6410 + }, + { + "epoch": 0.75, + "learning_rate": 7.741821188142199e-08, + "logits/chosen": -2.5389504432678223, + "logits/rejected": -2.484173536300659, + "logps/chosen": -142.568603515625, + "logps/rejected": -189.47430419921875, + "loss": 0.6796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7840370535850525, + "rewards/margins": 1.5526070594787598, + "rewards/rejected": -2.336644172668457, + "step": 6411 + }, + { + "epoch": 0.75, + "learning_rate": 7.738278020550371e-08, + "logits/chosen": -2.5575222969055176, + "logits/rejected": -2.5970146656036377, + "logps/chosen": -420.9395446777344, + "logps/rejected": -323.59466552734375, + "loss": 1.0558, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2047288417816162, + "rewards/margins": 0.5521663427352905, + "rewards/rejected": -1.7568954229354858, + "step": 6412 + }, + { + "epoch": 0.75, + "learning_rate": 7.734734852958545e-08, + "logits/chosen": -2.1747989654541016, + "logits/rejected": -2.170938014984131, + "logps/chosen": -249.02703857421875, + "logps/rejected": -217.4600830078125, + "loss": 0.602, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2801377773284912, + "rewards/margins": 1.802008867263794, + "rewards/rejected": -3.082146644592285, + "step": 6413 + }, + { + "epoch": 0.75, + "learning_rate": 7.731191685366717e-08, + "logits/chosen": -2.6273951530456543, + "logits/rejected": -2.6874468326568604, + "logps/chosen": -264.71185302734375, + "logps/rejected": -173.08937072753906, + "loss": 0.2643, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47029584646224976, + "rewards/margins": 2.6012015342712402, + "rewards/rejected": -3.0714974403381348, + "step": 6414 + }, + { + "epoch": 0.75, + "learning_rate": 7.72764851777489e-08, + "logits/chosen": -2.0252740383148193, + "logits/rejected": -2.1635279655456543, + "logps/chosen": -211.72442626953125, + "logps/rejected": -281.4804382324219, + "loss": 0.1336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7684502005577087, + "rewards/margins": 4.9723615646362305, + "rewards/rejected": -5.740812301635742, + "step": 6415 + }, + { + "epoch": 0.75, + "learning_rate": 7.724105350183064e-08, + "logits/chosen": -1.6997333765029907, + "logits/rejected": -1.5918294191360474, + "logps/chosen": -450.3856201171875, + "logps/rejected": -402.88134765625, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5704143047332764, + "rewards/margins": 2.1217641830444336, + "rewards/rejected": -2.69217848777771, + "step": 6416 + }, + { + "epoch": 0.75, + "learning_rate": 7.720562182591236e-08, + "logits/chosen": -2.4934637546539307, + "logits/rejected": -2.5155117511749268, + "logps/chosen": -360.11785888671875, + "logps/rejected": -241.81321716308594, + "loss": 0.8171, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.428935170173645, + "rewards/margins": 0.8101019859313965, + "rewards/rejected": -1.2390371561050415, + "step": 6417 + }, + { + "epoch": 0.75, + "learning_rate": 7.717019014999409e-08, + "logits/chosen": -2.6665194034576416, + "logits/rejected": -2.5604965686798096, + "logps/chosen": -116.65167236328125, + "logps/rejected": -168.26075744628906, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8757097125053406, + "rewards/margins": 2.071082592010498, + "rewards/rejected": -2.9467923641204834, + "step": 6418 + }, + { + "epoch": 0.75, + "learning_rate": 7.713475847407582e-08, + "logits/chosen": -2.6286911964416504, + "logits/rejected": -2.559898614883423, + "logps/chosen": -224.26950073242188, + "logps/rejected": -270.5161437988281, + "loss": 0.344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19065624475479126, + "rewards/margins": 1.9123352766036987, + "rewards/rejected": -2.1029915809631348, + "step": 6419 + }, + { + "epoch": 0.75, + "learning_rate": 7.709932679815754e-08, + "logits/chosen": -2.9494924545288086, + "logits/rejected": -2.9871902465820312, + "logps/chosen": -389.91729736328125, + "logps/rejected": -317.595703125, + "loss": 0.3525, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1283293962478638, + "rewards/margins": 2.1654815673828125, + "rewards/rejected": -3.293811321258545, + "step": 6420 + }, + { + "epoch": 0.75, + "learning_rate": 7.706389512223929e-08, + "logits/chosen": -2.4265518188476562, + "logits/rejected": -2.2038326263427734, + "logps/chosen": -130.63023376464844, + "logps/rejected": -225.9704132080078, + "loss": 0.1912, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1925289630889893, + "rewards/margins": 2.4350008964538574, + "rewards/rejected": -3.627530097961426, + "step": 6421 + }, + { + "epoch": 0.75, + "learning_rate": 7.702846344632101e-08, + "logits/chosen": -2.337998390197754, + "logits/rejected": -2.1579906940460205, + "logps/chosen": -350.9261474609375, + "logps/rejected": -283.45416259765625, + "loss": 0.334, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5537731647491455, + "rewards/margins": 2.0075488090515137, + "rewards/rejected": -3.561321973800659, + "step": 6422 + }, + { + "epoch": 0.75, + "learning_rate": 7.699303177040274e-08, + "logits/chosen": -1.7990086078643799, + "logits/rejected": -2.044326066970825, + "logps/chosen": -346.77459716796875, + "logps/rejected": -235.65347290039062, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7417292594909668, + "rewards/margins": 2.1745195388793945, + "rewards/rejected": -2.9162487983703613, + "step": 6423 + }, + { + "epoch": 0.75, + "learning_rate": 7.695760009448446e-08, + "logits/chosen": -2.50469970703125, + "logits/rejected": -2.839393138885498, + "logps/chosen": -178.4884490966797, + "logps/rejected": -188.29721069335938, + "loss": 0.7207, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0601043701171875, + "rewards/margins": 0.38692495226860046, + "rewards/rejected": -2.4470293521881104, + "step": 6424 + }, + { + "epoch": 0.75, + "learning_rate": 7.69221684185662e-08, + "logits/chosen": -2.257568120956421, + "logits/rejected": -2.3210902214050293, + "logps/chosen": -338.0167541503906, + "logps/rejected": -240.28793334960938, + "loss": 0.5292, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5461677312850952, + "rewards/margins": 1.6609265804290771, + "rewards/rejected": -2.2070939540863037, + "step": 6425 + }, + { + "epoch": 0.75, + "learning_rate": 7.688673674264792e-08, + "logits/chosen": -1.8621723651885986, + "logits/rejected": -2.1463358402252197, + "logps/chosen": -447.75689697265625, + "logps/rejected": -313.0927429199219, + "loss": 0.5084, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2757142782211304, + "rewards/margins": 0.9229774475097656, + "rewards/rejected": -2.1986918449401855, + "step": 6426 + }, + { + "epoch": 0.75, + "learning_rate": 7.685130506672966e-08, + "logits/chosen": -2.346403121948242, + "logits/rejected": -2.356208324432373, + "logps/chosen": -313.5272216796875, + "logps/rejected": -254.33682250976562, + "loss": 0.4091, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9742799997329712, + "rewards/margins": 1.9327505826950073, + "rewards/rejected": -2.9070305824279785, + "step": 6427 + }, + { + "epoch": 0.75, + "learning_rate": 7.681587339081139e-08, + "logits/chosen": -2.499162197113037, + "logits/rejected": -2.7812552452087402, + "logps/chosen": -125.84668731689453, + "logps/rejected": -144.59274291992188, + "loss": 0.306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3935560882091522, + "rewards/margins": 2.2597146034240723, + "rewards/rejected": -2.653270721435547, + "step": 6428 + }, + { + "epoch": 0.75, + "learning_rate": 7.678044171489311e-08, + "logits/chosen": -2.6562695503234863, + "logits/rejected": -2.642174482345581, + "logps/chosen": -191.4093017578125, + "logps/rejected": -149.96524047851562, + "loss": 0.3989, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6402041912078857, + "rewards/margins": 1.5469110012054443, + "rewards/rejected": -2.18711519241333, + "step": 6429 + }, + { + "epoch": 0.75, + "learning_rate": 7.674501003897484e-08, + "logits/chosen": -2.7760348320007324, + "logits/rejected": -2.6117167472839355, + "logps/chosen": -338.2925720214844, + "logps/rejected": -325.4942321777344, + "loss": 0.2399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0609721839427948, + "rewards/margins": 2.663550853729248, + "rewards/rejected": -2.602578639984131, + "step": 6430 + }, + { + "epoch": 0.75, + "learning_rate": 7.670957836305657e-08, + "logits/chosen": -2.445775270462036, + "logits/rejected": -2.4182183742523193, + "logps/chosen": -151.1334991455078, + "logps/rejected": -150.6818389892578, + "loss": 0.3694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7647528648376465, + "rewards/margins": 1.4589147567749023, + "rewards/rejected": -2.2236673831939697, + "step": 6431 + }, + { + "epoch": 0.75, + "learning_rate": 7.667414668713829e-08, + "logits/chosen": -1.8199172019958496, + "logits/rejected": -1.9016788005828857, + "logps/chosen": -292.3561706542969, + "logps/rejected": -237.92996215820312, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21134167909622192, + "rewards/margins": 3.099377155303955, + "rewards/rejected": -3.3107187747955322, + "step": 6432 + }, + { + "epoch": 0.75, + "learning_rate": 7.663871501122004e-08, + "logits/chosen": -1.7157964706420898, + "logits/rejected": -2.1576313972473145, + "logps/chosen": -525.5230102539062, + "logps/rejected": -398.78155517578125, + "loss": 0.0809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.654600203037262, + "rewards/margins": 3.6912384033203125, + "rewards/rejected": -4.34583854675293, + "step": 6433 + }, + { + "epoch": 0.75, + "learning_rate": 7.660328333530176e-08, + "logits/chosen": -2.387354612350464, + "logits/rejected": -2.482710599899292, + "logps/chosen": -183.86631774902344, + "logps/rejected": -237.038818359375, + "loss": 3.8958, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.453646659851074, + "rewards/margins": -2.402622699737549, + "rewards/rejected": -2.0510237216949463, + "step": 6434 + }, + { + "epoch": 0.75, + "learning_rate": 7.656785165938348e-08, + "logits/chosen": -2.186370611190796, + "logits/rejected": -2.560506820678711, + "logps/chosen": -278.1424560546875, + "logps/rejected": -204.10498046875, + "loss": 0.3552, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2696784734725952, + "rewards/margins": 2.2725183963775635, + "rewards/rejected": -2.542196750640869, + "step": 6435 + }, + { + "epoch": 0.75, + "learning_rate": 7.653241998346522e-08, + "logits/chosen": -2.4624102115631104, + "logits/rejected": -2.461409568786621, + "logps/chosen": -470.4972839355469, + "logps/rejected": -527.3855590820312, + "loss": 0.226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19695037603378296, + "rewards/margins": 2.706575393676758, + "rewards/rejected": -2.9035258293151855, + "step": 6436 + }, + { + "epoch": 0.75, + "learning_rate": 7.649698830754694e-08, + "logits/chosen": -2.0214905738830566, + "logits/rejected": -2.026721715927124, + "logps/chosen": -397.970947265625, + "logps/rejected": -259.406494140625, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.111090287566185, + "rewards/margins": 2.7056374549865723, + "rewards/rejected": -2.5945472717285156, + "step": 6437 + }, + { + "epoch": 0.75, + "learning_rate": 7.646155663162866e-08, + "logits/chosen": -2.2953052520751953, + "logits/rejected": -2.2925941944122314, + "logps/chosen": -282.2213134765625, + "logps/rejected": -339.807373046875, + "loss": 0.3698, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.836524486541748, + "rewards/margins": 2.700441837310791, + "rewards/rejected": -3.536966323852539, + "step": 6438 + }, + { + "epoch": 0.75, + "learning_rate": 7.642612495571041e-08, + "logits/chosen": -2.29888916015625, + "logits/rejected": -2.2941832542419434, + "logps/chosen": -148.8206787109375, + "logps/rejected": -221.51780700683594, + "loss": 0.4291, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5345407128334045, + "rewards/margins": 1.535630226135254, + "rewards/rejected": -2.0701708793640137, + "step": 6439 + }, + { + "epoch": 0.75, + "learning_rate": 7.639069327979213e-08, + "logits/chosen": -2.4630770683288574, + "logits/rejected": -2.332904815673828, + "logps/chosen": -193.40936279296875, + "logps/rejected": -259.61285400390625, + "loss": 0.2297, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4938666820526123, + "rewards/margins": 2.873671293258667, + "rewards/rejected": -3.3675379753112793, + "step": 6440 + }, + { + "epoch": 0.75, + "learning_rate": 7.635526160387385e-08, + "logits/chosen": -2.464574098587036, + "logits/rejected": -2.5408425331115723, + "logps/chosen": -431.915771484375, + "logps/rejected": -390.4723205566406, + "loss": 0.3606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05764177441596985, + "rewards/margins": 1.3085925579071045, + "rewards/rejected": -1.366234302520752, + "step": 6441 + }, + { + "epoch": 0.75, + "learning_rate": 7.631982992795559e-08, + "logits/chosen": -2.410548210144043, + "logits/rejected": -2.1704158782958984, + "logps/chosen": -212.90451049804688, + "logps/rejected": -332.1941223144531, + "loss": 0.2765, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0454578399658203, + "rewards/margins": 3.0149343013763428, + "rewards/rejected": -4.060391902923584, + "step": 6442 + }, + { + "epoch": 0.75, + "learning_rate": 7.628439825203731e-08, + "logits/chosen": -2.3366470336914062, + "logits/rejected": -2.1366610527038574, + "logps/chosen": -294.73858642578125, + "logps/rejected": -366.5589599609375, + "loss": 0.4117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.904717743396759, + "rewards/margins": 2.1983988285064697, + "rewards/rejected": -3.103116512298584, + "step": 6443 + }, + { + "epoch": 0.75, + "learning_rate": 7.624896657611903e-08, + "logits/chosen": -2.305436134338379, + "logits/rejected": -2.45635724067688, + "logps/chosen": -316.4794006347656, + "logps/rejected": -295.93994140625, + "loss": 0.533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.414783239364624, + "rewards/margins": 1.143951654434204, + "rewards/rejected": -2.558734893798828, + "step": 6444 + }, + { + "epoch": 0.75, + "learning_rate": 7.621353490020078e-08, + "logits/chosen": -2.4052436351776123, + "logits/rejected": -2.41888427734375, + "logps/chosen": -346.21722412109375, + "logps/rejected": -365.48455810546875, + "loss": 0.5674, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4423046112060547, + "rewards/margins": 1.6221472024917603, + "rewards/rejected": -3.0644521713256836, + "step": 6445 + }, + { + "epoch": 0.75, + "learning_rate": 7.61781032242825e-08, + "logits/chosen": -1.9310271739959717, + "logits/rejected": -2.4453656673431396, + "logps/chosen": -336.87689208984375, + "logps/rejected": -218.16835021972656, + "loss": 0.1854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1601131111383438, + "rewards/margins": 2.227339506149292, + "rewards/rejected": -2.3874526023864746, + "step": 6446 + }, + { + "epoch": 0.75, + "learning_rate": 7.614267154836424e-08, + "logits/chosen": -2.6300199031829834, + "logits/rejected": -2.7626144886016846, + "logps/chosen": -210.91806030273438, + "logps/rejected": -171.9352569580078, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5809270143508911, + "rewards/margins": 2.3892831802368164, + "rewards/rejected": -2.970210075378418, + "step": 6447 + }, + { + "epoch": 0.75, + "learning_rate": 7.610723987244596e-08, + "logits/chosen": -1.9328283071517944, + "logits/rejected": -2.085153818130493, + "logps/chosen": -388.857666015625, + "logps/rejected": -439.230712890625, + "loss": 0.2692, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22985783219337463, + "rewards/margins": 2.691563367843628, + "rewards/rejected": -2.921421527862549, + "step": 6448 + }, + { + "epoch": 0.75, + "learning_rate": 7.607180819652768e-08, + "logits/chosen": -1.8152461051940918, + "logits/rejected": -2.160717725753784, + "logps/chosen": -332.2568359375, + "logps/rejected": -251.8614959716797, + "loss": 0.1953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5463205575942993, + "rewards/margins": 2.6006813049316406, + "rewards/rejected": -3.1470019817352295, + "step": 6449 + }, + { + "epoch": 0.75, + "learning_rate": 7.603637652060942e-08, + "logits/chosen": -2.3659584522247314, + "logits/rejected": -2.3320837020874023, + "logps/chosen": -256.12530517578125, + "logps/rejected": -304.3139343261719, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5431153774261475, + "rewards/margins": 2.831453800201416, + "rewards/rejected": -3.3745691776275635, + "step": 6450 + }, + { + "epoch": 0.75, + "learning_rate": 7.600094484469115e-08, + "logits/chosen": -2.1952297687530518, + "logits/rejected": -2.3484854698181152, + "logps/chosen": -391.63580322265625, + "logps/rejected": -348.87298583984375, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3151053190231323, + "rewards/margins": 2.619378089904785, + "rewards/rejected": -2.934483289718628, + "step": 6451 + }, + { + "epoch": 0.75, + "learning_rate": 7.596551316877288e-08, + "logits/chosen": -1.8346936702728271, + "logits/rejected": -1.9064598083496094, + "logps/chosen": -361.21990966796875, + "logps/rejected": -258.1582336425781, + "loss": 0.7016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6663395166397095, + "rewards/margins": 0.3649175763130188, + "rewards/rejected": -1.0312570333480835, + "step": 6452 + }, + { + "epoch": 0.75, + "learning_rate": 7.593008149285461e-08, + "logits/chosen": -2.4647676944732666, + "logits/rejected": -2.600994110107422, + "logps/chosen": -109.36909484863281, + "logps/rejected": -134.81832885742188, + "loss": 0.2538, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7016472220420837, + "rewards/margins": 2.127311944961548, + "rewards/rejected": -2.8289592266082764, + "step": 6453 + }, + { + "epoch": 0.75, + "learning_rate": 7.589464981693633e-08, + "logits/chosen": -2.1688389778137207, + "logits/rejected": -2.4107134342193604, + "logps/chosen": -330.3179016113281, + "logps/rejected": -348.23565673828125, + "loss": 0.5562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.66881263256073, + "rewards/margins": 1.482250452041626, + "rewards/rejected": -2.1510632038116455, + "step": 6454 + }, + { + "epoch": 0.75, + "learning_rate": 7.585921814101806e-08, + "logits/chosen": -1.9878334999084473, + "logits/rejected": -2.5281176567077637, + "logps/chosen": -367.12762451171875, + "logps/rejected": -282.396484375, + "loss": 0.6584, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1726101636886597, + "rewards/margins": 1.6659901142120361, + "rewards/rejected": -2.8386001586914062, + "step": 6455 + }, + { + "epoch": 0.75, + "learning_rate": 7.58237864650998e-08, + "logits/chosen": -2.494420289993286, + "logits/rejected": -2.31838321685791, + "logps/chosen": -149.25677490234375, + "logps/rejected": -243.22354125976562, + "loss": 0.37, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4915926456451416, + "rewards/margins": 3.71329927444458, + "rewards/rejected": -4.204892158508301, + "step": 6456 + }, + { + "epoch": 0.75, + "learning_rate": 7.578835478918153e-08, + "logits/chosen": -2.2464678287506104, + "logits/rejected": -2.3555166721343994, + "logps/chosen": -304.54730224609375, + "logps/rejected": -330.56024169921875, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21251508593559265, + "rewards/margins": 2.277697801589966, + "rewards/rejected": -2.490212917327881, + "step": 6457 + }, + { + "epoch": 0.75, + "learning_rate": 7.575292311326326e-08, + "logits/chosen": -2.114300012588501, + "logits/rejected": -2.2568600177764893, + "logps/chosen": -394.3472595214844, + "logps/rejected": -306.275634765625, + "loss": 0.5306, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7053381204605103, + "rewards/margins": 0.5187211632728577, + "rewards/rejected": -2.2240591049194336, + "step": 6458 + }, + { + "epoch": 0.75, + "learning_rate": 7.571749143734498e-08, + "logits/chosen": -1.6774001121520996, + "logits/rejected": -2.349780797958374, + "logps/chosen": -380.2327880859375, + "logps/rejected": -201.6908416748047, + "loss": 1.0181, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2111611366271973, + "rewards/margins": 0.721416711807251, + "rewards/rejected": -1.9325779676437378, + "step": 6459 + }, + { + "epoch": 0.75, + "learning_rate": 7.568205976142671e-08, + "logits/chosen": -2.290689468383789, + "logits/rejected": -2.560246706008911, + "logps/chosen": -222.33877563476562, + "logps/rejected": -211.14617919921875, + "loss": 0.45, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.624109148979187, + "rewards/margins": 1.4265174865722656, + "rewards/rejected": -2.050626516342163, + "step": 6460 + }, + { + "epoch": 0.75, + "learning_rate": 7.564662808550843e-08, + "logits/chosen": -2.7059998512268066, + "logits/rejected": -2.596613883972168, + "logps/chosen": -225.66851806640625, + "logps/rejected": -224.09307861328125, + "loss": 1.3102, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1883246898651123, + "rewards/margins": 0.09823280572891235, + "rewards/rejected": -2.28655743598938, + "step": 6461 + }, + { + "epoch": 0.75, + "learning_rate": 7.561119640959018e-08, + "logits/chosen": -2.6661267280578613, + "logits/rejected": -2.522162914276123, + "logps/chosen": -132.64132690429688, + "logps/rejected": -168.64820861816406, + "loss": 0.2069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14964909851551056, + "rewards/margins": 2.1558678150177, + "rewards/rejected": -2.3055169582366943, + "step": 6462 + }, + { + "epoch": 0.75, + "learning_rate": 7.55757647336719e-08, + "logits/chosen": -2.6403417587280273, + "logits/rejected": -2.68473482131958, + "logps/chosen": -116.46459197998047, + "logps/rejected": -186.0068359375, + "loss": 0.1786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29007911682128906, + "rewards/margins": 2.6083991527557373, + "rewards/rejected": -2.8984785079956055, + "step": 6463 + }, + { + "epoch": 0.75, + "learning_rate": 7.554033305775363e-08, + "logits/chosen": -2.2794063091278076, + "logits/rejected": -1.8668127059936523, + "logps/chosen": -220.65646362304688, + "logps/rejected": -383.7742919921875, + "loss": 0.1164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4166777729988098, + "rewards/margins": 4.237817287445068, + "rewards/rejected": -4.654494762420654, + "step": 6464 + }, + { + "epoch": 0.75, + "learning_rate": 7.550490138183536e-08, + "logits/chosen": -2.253746747970581, + "logits/rejected": -2.491044282913208, + "logps/chosen": -283.82611083984375, + "logps/rejected": -162.2842254638672, + "loss": 0.7464, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5153189897537231, + "rewards/margins": 0.38248109817504883, + "rewards/rejected": -1.8977999687194824, + "step": 6465 + }, + { + "epoch": 0.75, + "learning_rate": 7.546946970591708e-08, + "logits/chosen": -2.421590566635132, + "logits/rejected": -2.5192604064941406, + "logps/chosen": -219.20285034179688, + "logps/rejected": -299.8850402832031, + "loss": 0.2194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5302232503890991, + "rewards/margins": 2.0074219703674316, + "rewards/rejected": -2.5376453399658203, + "step": 6466 + }, + { + "epoch": 0.75, + "learning_rate": 7.543403802999881e-08, + "logits/chosen": -2.5977070331573486, + "logits/rejected": -2.7787818908691406, + "logps/chosen": -263.13397216796875, + "logps/rejected": -258.5908203125, + "loss": 0.4478, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6950018405914307, + "rewards/margins": 2.1370444297790527, + "rewards/rejected": -3.8320462703704834, + "step": 6467 + }, + { + "epoch": 0.75, + "learning_rate": 7.539860635408055e-08, + "logits/chosen": -2.4104084968566895, + "logits/rejected": -2.624112844467163, + "logps/chosen": -298.2568359375, + "logps/rejected": -235.68045043945312, + "loss": 0.3528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14634397625923157, + "rewards/margins": 2.625641107559204, + "rewards/rejected": -2.7719852924346924, + "step": 6468 + }, + { + "epoch": 0.75, + "learning_rate": 7.536317467816227e-08, + "logits/chosen": -2.758101463317871, + "logits/rejected": -2.6431703567504883, + "logps/chosen": -203.6315155029297, + "logps/rejected": -213.75733947753906, + "loss": 0.4345, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1302729994058609, + "rewards/margins": 2.419203281402588, + "rewards/rejected": -2.2889304161071777, + "step": 6469 + }, + { + "epoch": 0.75, + "learning_rate": 7.532774300224401e-08, + "logits/chosen": -2.469794750213623, + "logits/rejected": -2.5683364868164062, + "logps/chosen": -205.78976440429688, + "logps/rejected": -168.04039001464844, + "loss": 0.6453, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6418776512145996, + "rewards/margins": 1.5474066734313965, + "rewards/rejected": -3.189284324645996, + "step": 6470 + }, + { + "epoch": 0.75, + "learning_rate": 7.529231132632573e-08, + "logits/chosen": -2.6583425998687744, + "logits/rejected": -2.584652900695801, + "logps/chosen": -189.99542236328125, + "logps/rejected": -222.5227813720703, + "loss": 0.4268, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4412004947662354, + "rewards/margins": 1.4033842086791992, + "rewards/rejected": -2.8445847034454346, + "step": 6471 + }, + { + "epoch": 0.75, + "learning_rate": 7.525687965040745e-08, + "logits/chosen": -1.9429912567138672, + "logits/rejected": -2.0021865367889404, + "logps/chosen": -382.5565185546875, + "logps/rejected": -391.878662109375, + "loss": 0.1949, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6977723240852356, + "rewards/margins": 2.8193814754486084, + "rewards/rejected": -3.517153739929199, + "step": 6472 + }, + { + "epoch": 0.75, + "learning_rate": 7.522144797448919e-08, + "logits/chosen": -1.653220295906067, + "logits/rejected": -1.8512948751449585, + "logps/chosen": -323.2754211425781, + "logps/rejected": -263.09332275390625, + "loss": 1.1886, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.051288604736328, + "rewards/margins": -0.09607559442520142, + "rewards/rejected": -1.955213189125061, + "step": 6473 + }, + { + "epoch": 0.75, + "learning_rate": 7.518601629857092e-08, + "logits/chosen": -2.3776702880859375, + "logits/rejected": -2.4249541759490967, + "logps/chosen": -222.36526489257812, + "logps/rejected": -221.4114990234375, + "loss": 0.2417, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09754163771867752, + "rewards/margins": 2.849870204925537, + "rewards/rejected": -2.752328634262085, + "step": 6474 + }, + { + "epoch": 0.75, + "learning_rate": 7.515058462265266e-08, + "logits/chosen": -2.602116823196411, + "logits/rejected": -2.6855695247650146, + "logps/chosen": -186.8363494873047, + "logps/rejected": -223.6293182373047, + "loss": 0.3513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9947190284729004, + "rewards/margins": 1.5746976137161255, + "rewards/rejected": -2.5694165229797363, + "step": 6475 + }, + { + "epoch": 0.75, + "learning_rate": 7.511515294673438e-08, + "logits/chosen": -2.1278703212738037, + "logits/rejected": -1.8477816581726074, + "logps/chosen": -187.11715698242188, + "logps/rejected": -214.281982421875, + "loss": 0.687, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0301921367645264, + "rewards/margins": 1.824438214302063, + "rewards/rejected": -2.854630470275879, + "step": 6476 + }, + { + "epoch": 0.75, + "learning_rate": 7.50797212708161e-08, + "logits/chosen": -2.460930824279785, + "logits/rejected": -2.3468692302703857, + "logps/chosen": -207.79335021972656, + "logps/rejected": -185.47250366210938, + "loss": 0.3304, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2658095359802246, + "rewards/margins": 1.3391714096069336, + "rewards/rejected": -2.604980945587158, + "step": 6477 + }, + { + "epoch": 0.75, + "learning_rate": 7.504428959489782e-08, + "logits/chosen": -2.850128650665283, + "logits/rejected": -2.8688979148864746, + "logps/chosen": -183.9222869873047, + "logps/rejected": -138.09597778320312, + "loss": 0.4537, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2794824838638306, + "rewards/margins": 1.0173484086990356, + "rewards/rejected": -2.296830654144287, + "step": 6478 + }, + { + "epoch": 0.75, + "learning_rate": 7.500885791897956e-08, + "logits/chosen": -2.754365921020508, + "logits/rejected": -2.6961374282836914, + "logps/chosen": -315.6653137207031, + "logps/rejected": -355.5334167480469, + "loss": 0.405, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6954002976417542, + "rewards/margins": 3.1540236473083496, + "rewards/rejected": -3.84942364692688, + "step": 6479 + }, + { + "epoch": 0.75, + "learning_rate": 7.49734262430613e-08, + "logits/chosen": -1.521028995513916, + "logits/rejected": -2.0557851791381836, + "logps/chosen": -678.4910888671875, + "logps/rejected": -382.72503662109375, + "loss": 0.7318, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8140151500701904, + "rewards/margins": 0.8725492358207703, + "rewards/rejected": -1.686564564704895, + "step": 6480 + }, + { + "epoch": 0.75, + "learning_rate": 7.493799456714303e-08, + "logits/chosen": -2.4138646125793457, + "logits/rejected": -2.2300660610198975, + "logps/chosen": -292.7588806152344, + "logps/rejected": -373.501953125, + "loss": 0.2313, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13556605577468872, + "rewards/margins": 3.559673309326172, + "rewards/rejected": -3.424107551574707, + "step": 6481 + }, + { + "epoch": 0.75, + "learning_rate": 7.490256289122475e-08, + "logits/chosen": -2.4603888988494873, + "logits/rejected": -2.487705707550049, + "logps/chosen": -320.8235778808594, + "logps/rejected": -287.2161560058594, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24512377381324768, + "rewards/margins": 3.4962949752807617, + "rewards/rejected": -3.7414186000823975, + "step": 6482 + }, + { + "epoch": 0.75, + "learning_rate": 7.486713121530647e-08, + "logits/chosen": -2.47711181640625, + "logits/rejected": -2.6055920124053955, + "logps/chosen": -309.1855773925781, + "logps/rejected": -372.4263610839844, + "loss": 0.3376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9842288494110107, + "rewards/margins": 2.6509792804718018, + "rewards/rejected": -3.6352081298828125, + "step": 6483 + }, + { + "epoch": 0.75, + "learning_rate": 7.483169953938821e-08, + "logits/chosen": -2.5217947959899902, + "logits/rejected": -2.4774439334869385, + "logps/chosen": -332.9872131347656, + "logps/rejected": -305.52606201171875, + "loss": 0.3967, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2651970386505127, + "rewards/margins": 1.6915512084960938, + "rewards/rejected": -2.9567484855651855, + "step": 6484 + }, + { + "epoch": 0.75, + "learning_rate": 7.479626786346995e-08, + "logits/chosen": -2.7126035690307617, + "logits/rejected": -2.6986136436462402, + "logps/chosen": -211.6145477294922, + "logps/rejected": -183.21795654296875, + "loss": 0.2613, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6984207630157471, + "rewards/margins": 1.9462628364562988, + "rewards/rejected": -2.644683837890625, + "step": 6485 + }, + { + "epoch": 0.75, + "learning_rate": 7.476083618755167e-08, + "logits/chosen": -2.9035329818725586, + "logits/rejected": -2.9082236289978027, + "logps/chosen": -169.04530334472656, + "logps/rejected": -215.0203857421875, + "loss": 0.3599, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44306913018226624, + "rewards/margins": 1.5387532711029053, + "rewards/rejected": -1.9818223714828491, + "step": 6486 + }, + { + "epoch": 0.75, + "learning_rate": 7.47254045116334e-08, + "logits/chosen": -2.186835527420044, + "logits/rejected": -1.8564000129699707, + "logps/chosen": -232.23985290527344, + "logps/rejected": -279.71630859375, + "loss": 0.5581, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3683847188949585, + "rewards/margins": 1.2653347253799438, + "rewards/rejected": -2.6337194442749023, + "step": 6487 + }, + { + "epoch": 0.75, + "learning_rate": 7.468997283571512e-08, + "logits/chosen": -2.484494209289551, + "logits/rejected": -2.596965789794922, + "logps/chosen": -171.68630981445312, + "logps/rejected": -259.25433349609375, + "loss": 0.3942, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8313173055648804, + "rewards/margins": 2.4513344764709473, + "rewards/rejected": -3.282651901245117, + "step": 6488 + }, + { + "epoch": 0.75, + "learning_rate": 7.465454115979685e-08, + "logits/chosen": -1.6246484518051147, + "logits/rejected": -2.0188868045806885, + "logps/chosen": -340.76995849609375, + "logps/rejected": -243.5684356689453, + "loss": 0.2812, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6674381494522095, + "rewards/margins": 1.9222631454467773, + "rewards/rejected": -2.5897014141082764, + "step": 6489 + }, + { + "epoch": 0.75, + "learning_rate": 7.461910948387858e-08, + "logits/chosen": -2.3743112087249756, + "logits/rejected": -2.3970508575439453, + "logps/chosen": -313.0654602050781, + "logps/rejected": -199.01651000976562, + "loss": 0.4237, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40202367305755615, + "rewards/margins": 1.7450857162475586, + "rewards/rejected": -2.147109270095825, + "step": 6490 + }, + { + "epoch": 0.76, + "learning_rate": 7.458367780796032e-08, + "logits/chosen": -2.2585835456848145, + "logits/rejected": -2.320622444152832, + "logps/chosen": -270.8371276855469, + "logps/rejected": -278.6273498535156, + "loss": 0.479, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08981799334287643, + "rewards/margins": 2.071437358856201, + "rewards/rejected": -2.161255359649658, + "step": 6491 + }, + { + "epoch": 0.76, + "learning_rate": 7.454824613204204e-08, + "logits/chosen": -2.8580784797668457, + "logits/rejected": -2.9435601234436035, + "logps/chosen": -288.48907470703125, + "logps/rejected": -270.0225830078125, + "loss": 0.5848, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9860184788703918, + "rewards/margins": 1.1633367538452148, + "rewards/rejected": -2.149355173110962, + "step": 6492 + }, + { + "epoch": 0.76, + "learning_rate": 7.451281445612378e-08, + "logits/chosen": -2.5036725997924805, + "logits/rejected": -2.624281883239746, + "logps/chosen": -289.745361328125, + "logps/rejected": -256.320068359375, + "loss": 0.3267, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1172726154327393, + "rewards/margins": 2.3018720149993896, + "rewards/rejected": -3.41914439201355, + "step": 6493 + }, + { + "epoch": 0.76, + "learning_rate": 7.44773827802055e-08, + "logits/chosen": -2.2382712364196777, + "logits/rejected": -2.2557499408721924, + "logps/chosen": -273.10577392578125, + "logps/rejected": -217.0933074951172, + "loss": 0.268, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48139986395835876, + "rewards/margins": 2.766129970550537, + "rewards/rejected": -3.2475295066833496, + "step": 6494 + }, + { + "epoch": 0.76, + "learning_rate": 7.444195110428723e-08, + "logits/chosen": -2.5953822135925293, + "logits/rejected": -2.470294952392578, + "logps/chosen": -170.13339233398438, + "logps/rejected": -200.04359436035156, + "loss": 0.269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9646235108375549, + "rewards/margins": 2.2494678497314453, + "rewards/rejected": -3.2140913009643555, + "step": 6495 + }, + { + "epoch": 0.76, + "learning_rate": 7.440651942836895e-08, + "logits/chosen": -2.607260227203369, + "logits/rejected": -2.6524176597595215, + "logps/chosen": -235.563232421875, + "logps/rejected": -363.0798034667969, + "loss": 0.3737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7814541459083557, + "rewards/margins": 2.5795629024505615, + "rewards/rejected": -3.3610172271728516, + "step": 6496 + }, + { + "epoch": 0.76, + "learning_rate": 7.437108775245069e-08, + "logits/chosen": -2.4380621910095215, + "logits/rejected": -2.32533597946167, + "logps/chosen": -258.1275634765625, + "logps/rejected": -243.79844665527344, + "loss": 0.7227, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.98960280418396, + "rewards/margins": 1.4333994388580322, + "rewards/rejected": -2.423002004623413, + "step": 6497 + }, + { + "epoch": 0.76, + "learning_rate": 7.433565607653241e-08, + "logits/chosen": -2.4614648818969727, + "logits/rejected": -2.4190120697021484, + "logps/chosen": -252.6791229248047, + "logps/rejected": -284.16650390625, + "loss": 0.154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24404442310333252, + "rewards/margins": 4.167491912841797, + "rewards/rejected": -4.41153621673584, + "step": 6498 + }, + { + "epoch": 0.76, + "learning_rate": 7.430022440061415e-08, + "logits/chosen": -1.7292258739471436, + "logits/rejected": -1.9843504428863525, + "logps/chosen": -441.6611633300781, + "logps/rejected": -319.1882019042969, + "loss": 0.6822, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3567826747894287, + "rewards/margins": 1.640333890914917, + "rewards/rejected": -2.997117042541504, + "step": 6499 + }, + { + "epoch": 0.76, + "learning_rate": 7.426479272469587e-08, + "logits/chosen": -2.569911003112793, + "logits/rejected": -2.304762125015259, + "logps/chosen": -259.9080505371094, + "logps/rejected": -316.58984375, + "loss": 0.6812, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2681901454925537, + "rewards/margins": 0.9844961166381836, + "rewards/rejected": -2.2526865005493164, + "step": 6500 + }, + { + "epoch": 0.76, + "learning_rate": 7.42293610487776e-08, + "logits/chosen": -1.911576271057129, + "logits/rejected": -2.0494470596313477, + "logps/chosen": -334.9373474121094, + "logps/rejected": -356.62994384765625, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.680385172367096, + "rewards/margins": 2.8933048248291016, + "rewards/rejected": -3.573690176010132, + "step": 6501 + }, + { + "epoch": 0.76, + "learning_rate": 7.419392937285934e-08, + "logits/chosen": -2.306076765060425, + "logits/rejected": -2.262850046157837, + "logps/chosen": -252.64849853515625, + "logps/rejected": -257.77490234375, + "loss": 0.4245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.955533504486084, + "rewards/margins": 1.8460835218429565, + "rewards/rejected": -2.801616907119751, + "step": 6502 + }, + { + "epoch": 0.76, + "learning_rate": 7.415849769694106e-08, + "logits/chosen": -2.7781872749328613, + "logits/rejected": -3.105736255645752, + "logps/chosen": -262.8927001953125, + "logps/rejected": -273.08935546875, + "loss": 0.8175, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3758583068847656, + "rewards/margins": 1.2751716375350952, + "rewards/rejected": -2.651029586791992, + "step": 6503 + }, + { + "epoch": 0.76, + "learning_rate": 7.412306602102278e-08, + "logits/chosen": -1.7319554090499878, + "logits/rejected": -1.9258675575256348, + "logps/chosen": -290.14031982421875, + "logps/rejected": -251.9043426513672, + "loss": 0.663, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.456110954284668, + "rewards/margins": 1.7998138666152954, + "rewards/rejected": -3.255924940109253, + "step": 6504 + }, + { + "epoch": 0.76, + "learning_rate": 7.408763434510452e-08, + "logits/chosen": -2.0228519439697266, + "logits/rejected": -2.0797502994537354, + "logps/chosen": -356.51446533203125, + "logps/rejected": -378.6701354980469, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1638134717941284, + "rewards/margins": 2.167304277420044, + "rewards/rejected": -3.331117630004883, + "step": 6505 + }, + { + "epoch": 0.76, + "learning_rate": 7.405220266918624e-08, + "logits/chosen": -2.0093495845794678, + "logits/rejected": -2.029608726501465, + "logps/chosen": -322.6050720214844, + "logps/rejected": -372.1376647949219, + "loss": 0.45, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5367717742919922, + "rewards/margins": 2.23799991607666, + "rewards/rejected": -3.7747714519500732, + "step": 6506 + }, + { + "epoch": 0.76, + "learning_rate": 7.401677099326798e-08, + "logits/chosen": -2.650331497192383, + "logits/rejected": -2.6581790447235107, + "logps/chosen": -207.1827392578125, + "logps/rejected": -280.2098388671875, + "loss": 0.2948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0037756264209747314, + "rewards/margins": 1.6332275867462158, + "rewards/rejected": -1.6370033025741577, + "step": 6507 + }, + { + "epoch": 0.76, + "learning_rate": 7.398133931734971e-08, + "logits/chosen": -2.2206897735595703, + "logits/rejected": -2.142420768737793, + "logps/chosen": -214.672607421875, + "logps/rejected": -295.1993713378906, + "loss": 0.2733, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1452040672302246, + "rewards/margins": 2.333329200744629, + "rewards/rejected": -3.4785332679748535, + "step": 6508 + }, + { + "epoch": 0.76, + "learning_rate": 7.394590764143143e-08, + "logits/chosen": -1.8460166454315186, + "logits/rejected": -2.1236777305603027, + "logps/chosen": -263.8135986328125, + "logps/rejected": -213.716796875, + "loss": 0.3003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1670980155467987, + "rewards/margins": 1.4465421438217163, + "rewards/rejected": -1.6136400699615479, + "step": 6509 + }, + { + "epoch": 0.76, + "learning_rate": 7.391047596551317e-08, + "logits/chosen": -1.7671496868133545, + "logits/rejected": -1.7532504796981812, + "logps/chosen": -343.1138916015625, + "logps/rejected": -382.9918518066406, + "loss": 0.4533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8129963874816895, + "rewards/margins": 0.8321951627731323, + "rewards/rejected": -1.6451914310455322, + "step": 6510 + }, + { + "epoch": 0.76, + "learning_rate": 7.387504428959489e-08, + "logits/chosen": -2.6106786727905273, + "logits/rejected": -2.6972742080688477, + "logps/chosen": -196.06707763671875, + "logps/rejected": -237.4906768798828, + "loss": 0.1733, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5211362838745117, + "rewards/margins": 3.398703098297119, + "rewards/rejected": -4.919839859008789, + "step": 6511 + }, + { + "epoch": 0.76, + "learning_rate": 7.383961261367663e-08, + "logits/chosen": -2.4326791763305664, + "logits/rejected": -2.3134677410125732, + "logps/chosen": -170.42840576171875, + "logps/rejected": -321.0561828613281, + "loss": 0.1045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6179792284965515, + "rewards/margins": 3.7009851932525635, + "rewards/rejected": -4.31896448135376, + "step": 6512 + }, + { + "epoch": 0.76, + "learning_rate": 7.380418093775835e-08, + "logits/chosen": -2.6463255882263184, + "logits/rejected": -2.819624900817871, + "logps/chosen": -233.36697387695312, + "logps/rejected": -218.2624969482422, + "loss": 1.3446, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8299775123596191, + "rewards/margins": 0.5131313800811768, + "rewards/rejected": -2.343108892440796, + "step": 6513 + }, + { + "epoch": 0.76, + "learning_rate": 7.376874926184009e-08, + "logits/chosen": -2.1878550052642822, + "logits/rejected": -2.1497457027435303, + "logps/chosen": -319.4171447753906, + "logps/rejected": -329.67529296875, + "loss": 0.9727, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5298254489898682, + "rewards/margins": 0.49959075450897217, + "rewards/rejected": -2.029416084289551, + "step": 6514 + }, + { + "epoch": 0.76, + "learning_rate": 7.373331758592181e-08, + "logits/chosen": -1.9755208492279053, + "logits/rejected": -1.9579623937606812, + "logps/chosen": -257.964599609375, + "logps/rejected": -247.97134399414062, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7390990853309631, + "rewards/margins": 2.092090129852295, + "rewards/rejected": -2.8311893939971924, + "step": 6515 + }, + { + "epoch": 0.76, + "learning_rate": 7.369788591000354e-08, + "logits/chosen": -2.7085702419281006, + "logits/rejected": -2.7637767791748047, + "logps/chosen": -134.40591430664062, + "logps/rejected": -186.67678833007812, + "loss": 0.3419, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6995807886123657, + "rewards/margins": 2.4577202796936035, + "rewards/rejected": -4.157301425933838, + "step": 6516 + }, + { + "epoch": 0.76, + "learning_rate": 7.366245423408526e-08, + "logits/chosen": -2.745403528213501, + "logits/rejected": -2.747579574584961, + "logps/chosen": -172.17137145996094, + "logps/rejected": -264.71221923828125, + "loss": 0.1932, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5493249893188477, + "rewards/margins": 3.3155744075775146, + "rewards/rejected": -4.864899158477783, + "step": 6517 + }, + { + "epoch": 0.76, + "learning_rate": 7.3627022558167e-08, + "logits/chosen": -2.5278282165527344, + "logits/rejected": -2.596538543701172, + "logps/chosen": -277.4001159667969, + "logps/rejected": -209.38958740234375, + "loss": 1.3385, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1616735458374023, + "rewards/margins": -0.027311623096466064, + "rewards/rejected": -2.134361743927002, + "step": 6518 + }, + { + "epoch": 0.76, + "learning_rate": 7.359159088224874e-08, + "logits/chosen": -2.228825092315674, + "logits/rejected": -2.0385007858276367, + "logps/chosen": -256.73638916015625, + "logps/rejected": -294.7371520996094, + "loss": 0.38, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4170072078704834, + "rewards/margins": 1.9229167699813843, + "rewards/rejected": -3.339923858642578, + "step": 6519 + }, + { + "epoch": 0.76, + "learning_rate": 7.355615920633046e-08, + "logits/chosen": -2.2051901817321777, + "logits/rejected": -2.163444995880127, + "logps/chosen": -182.35267639160156, + "logps/rejected": -273.6107482910156, + "loss": 0.2058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.648921489715576, + "rewards/margins": 1.814222812652588, + "rewards/rejected": -4.463144302368164, + "step": 6520 + }, + { + "epoch": 0.76, + "learning_rate": 7.352072753041218e-08, + "logits/chosen": -1.9406752586364746, + "logits/rejected": -2.0569047927856445, + "logps/chosen": -507.5339660644531, + "logps/rejected": -287.4808349609375, + "loss": 0.2596, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2752528786659241, + "rewards/margins": 2.776866912841797, + "rewards/rejected": -3.052119731903076, + "step": 6521 + }, + { + "epoch": 0.76, + "learning_rate": 7.348529585449392e-08, + "logits/chosen": -2.3453402519226074, + "logits/rejected": -2.4058632850646973, + "logps/chosen": -265.2162780761719, + "logps/rejected": -269.7513122558594, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3292612135410309, + "rewards/margins": 3.143617630004883, + "rewards/rejected": -2.814356565475464, + "step": 6522 + }, + { + "epoch": 0.76, + "learning_rate": 7.344986417857564e-08, + "logits/chosen": -1.975771188735962, + "logits/rejected": -1.5246907472610474, + "logps/chosen": -294.74359130859375, + "logps/rejected": -443.364013671875, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06991385668516159, + "rewards/margins": 4.122983932495117, + "rewards/rejected": -4.192897796630859, + "step": 6523 + }, + { + "epoch": 0.76, + "learning_rate": 7.341443250265737e-08, + "logits/chosen": -2.279090166091919, + "logits/rejected": -2.301064968109131, + "logps/chosen": -298.11285400390625, + "logps/rejected": -366.35723876953125, + "loss": 0.4457, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5600953102111816, + "rewards/margins": 2.197056531906128, + "rewards/rejected": -2.7571516036987305, + "step": 6524 + }, + { + "epoch": 0.76, + "learning_rate": 7.337900082673911e-08, + "logits/chosen": -2.778961658477783, + "logits/rejected": -2.559969425201416, + "logps/chosen": -165.39889526367188, + "logps/rejected": -429.42169189453125, + "loss": 0.2889, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0781649351119995, + "rewards/margins": 2.2567379474639893, + "rewards/rejected": -3.3349030017852783, + "step": 6525 + }, + { + "epoch": 0.76, + "learning_rate": 7.334356915082083e-08, + "logits/chosen": -2.564197540283203, + "logits/rejected": -2.8193860054016113, + "logps/chosen": -188.641845703125, + "logps/rejected": -189.2569580078125, + "loss": 0.4698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7985535860061646, + "rewards/margins": 1.819006323814392, + "rewards/rejected": -2.6175599098205566, + "step": 6526 + }, + { + "epoch": 0.76, + "learning_rate": 7.330813747490255e-08, + "logits/chosen": -2.369932174682617, + "logits/rejected": -2.3503100872039795, + "logps/chosen": -188.45352172851562, + "logps/rejected": -193.16091918945312, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7510823011398315, + "rewards/margins": 3.372767448425293, + "rewards/rejected": -4.123849868774414, + "step": 6527 + }, + { + "epoch": 0.76, + "learning_rate": 7.327270579898429e-08, + "logits/chosen": -2.2970075607299805, + "logits/rejected": -2.196621894836426, + "logps/chosen": -239.46925354003906, + "logps/rejected": -288.114501953125, + "loss": 0.1946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7004674077033997, + "rewards/margins": 3.2607064247131348, + "rewards/rejected": -3.9611740112304688, + "step": 6528 + }, + { + "epoch": 0.76, + "learning_rate": 7.323727412306602e-08, + "logits/chosen": -2.274451732635498, + "logits/rejected": -1.9620574712753296, + "logps/chosen": -259.2524108886719, + "logps/rejected": -325.619384765625, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.021093726158142, + "rewards/margins": 3.362720251083374, + "rewards/rejected": -4.383813858032227, + "step": 6529 + }, + { + "epoch": 0.76, + "learning_rate": 7.320184244714775e-08, + "logits/chosen": -2.456428050994873, + "logits/rejected": -2.4305083751678467, + "logps/chosen": -235.43374633789062, + "logps/rejected": -337.66326904296875, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1129276305437088, + "rewards/margins": 3.2768592834472656, + "rewards/rejected": -3.389786720275879, + "step": 6530 + }, + { + "epoch": 0.76, + "learning_rate": 7.316641077122948e-08, + "logits/chosen": -2.3559694290161133, + "logits/rejected": -2.574814796447754, + "logps/chosen": -359.7582092285156, + "logps/rejected": -262.01507568359375, + "loss": 0.2465, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34616971015930176, + "rewards/margins": 2.019064426422119, + "rewards/rejected": -2.365234136581421, + "step": 6531 + }, + { + "epoch": 0.76, + "learning_rate": 7.31309790953112e-08, + "logits/chosen": -1.6165971755981445, + "logits/rejected": -2.1380906105041504, + "logps/chosen": -400.5142822265625, + "logps/rejected": -342.49346923828125, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6343805193901062, + "rewards/margins": 3.116079092025757, + "rewards/rejected": -3.750459671020508, + "step": 6532 + }, + { + "epoch": 0.76, + "learning_rate": 7.309554741939292e-08, + "logits/chosen": -2.138234853744507, + "logits/rejected": -2.3391356468200684, + "logps/chosen": -272.2114562988281, + "logps/rejected": -210.33538818359375, + "loss": 0.7787, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0656989812850952, + "rewards/margins": 0.9475712180137634, + "rewards/rejected": -2.013270139694214, + "step": 6533 + }, + { + "epoch": 0.76, + "learning_rate": 7.306011574347466e-08, + "logits/chosen": -2.5343878269195557, + "logits/rejected": -2.4906201362609863, + "logps/chosen": -287.34747314453125, + "logps/rejected": -251.7882537841797, + "loss": 0.2721, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8745272159576416, + "rewards/margins": 2.5609042644500732, + "rewards/rejected": -3.435431480407715, + "step": 6534 + }, + { + "epoch": 0.76, + "learning_rate": 7.30246840675564e-08, + "logits/chosen": -1.9182782173156738, + "logits/rejected": -2.0620129108428955, + "logps/chosen": -251.80648803710938, + "logps/rejected": -338.21917724609375, + "loss": 0.1969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6208652853965759, + "rewards/margins": 2.8699145317077637, + "rewards/rejected": -3.4907798767089844, + "step": 6535 + }, + { + "epoch": 0.76, + "learning_rate": 7.298925239163812e-08, + "logits/chosen": -2.469118118286133, + "logits/rejected": -2.538001298904419, + "logps/chosen": -274.2197265625, + "logps/rejected": -211.62066650390625, + "loss": 0.1373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5934911966323853, + "rewards/margins": 3.318878650665283, + "rewards/rejected": -3.912369728088379, + "step": 6536 + }, + { + "epoch": 0.76, + "learning_rate": 7.295382071571985e-08, + "logits/chosen": -2.654019355773926, + "logits/rejected": -2.769404411315918, + "logps/chosen": -339.317626953125, + "logps/rejected": -276.64801025390625, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7511558532714844, + "rewards/margins": 2.7480406761169434, + "rewards/rejected": -3.4991965293884277, + "step": 6537 + }, + { + "epoch": 0.76, + "learning_rate": 7.291838903980158e-08, + "logits/chosen": -2.148364543914795, + "logits/rejected": -2.12152099609375, + "logps/chosen": -414.82781982421875, + "logps/rejected": -387.35693359375, + "loss": 0.1954, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.724144458770752, + "rewards/margins": 2.728975296020508, + "rewards/rejected": -4.45311975479126, + "step": 6538 + }, + { + "epoch": 0.76, + "learning_rate": 7.288295736388331e-08, + "logits/chosen": -1.9216448068618774, + "logits/rejected": -1.9350512027740479, + "logps/chosen": -268.9171142578125, + "logps/rejected": -289.7937316894531, + "loss": 0.6139, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0308666229248047, + "rewards/margins": 1.491780400276184, + "rewards/rejected": -2.522646903991699, + "step": 6539 + }, + { + "epoch": 0.76, + "learning_rate": 7.284752568796503e-08, + "logits/chosen": -2.0315539836883545, + "logits/rejected": -2.1437599658966064, + "logps/chosen": -549.8056640625, + "logps/rejected": -399.4963073730469, + "loss": 0.6171, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.253898561000824, + "rewards/margins": 2.779489755630493, + "rewards/rejected": -3.033388137817383, + "step": 6540 + }, + { + "epoch": 0.76, + "learning_rate": 7.281209401204677e-08, + "logits/chosen": -2.057417154312134, + "logits/rejected": -2.329926013946533, + "logps/chosen": -170.92593383789062, + "logps/rejected": -102.81556701660156, + "loss": 3.8151, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.620354175567627, + "rewards/margins": -2.6292402744293213, + "rewards/rejected": -0.99111407995224, + "step": 6541 + }, + { + "epoch": 0.76, + "learning_rate": 7.277666233612849e-08, + "logits/chosen": -2.69047474861145, + "logits/rejected": -2.429795742034912, + "logps/chosen": -130.8695831298828, + "logps/rejected": -271.32086181640625, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13336829841136932, + "rewards/margins": 2.939639091491699, + "rewards/rejected": -3.073007583618164, + "step": 6542 + }, + { + "epoch": 0.76, + "learning_rate": 7.274123066021023e-08, + "logits/chosen": -2.475344657897949, + "logits/rejected": -2.7696638107299805, + "logps/chosen": -365.8242492675781, + "logps/rejected": -272.079833984375, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5095143914222717, + "rewards/margins": 3.026102304458618, + "rewards/rejected": -3.535616874694824, + "step": 6543 + }, + { + "epoch": 0.76, + "learning_rate": 7.270579898429195e-08, + "logits/chosen": -2.818424701690674, + "logits/rejected": -2.856477737426758, + "logps/chosen": -165.74758911132812, + "logps/rejected": -210.67074584960938, + "loss": 0.1723, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.248735785484314, + "rewards/margins": 4.049625873565674, + "rewards/rejected": -5.298361301422119, + "step": 6544 + }, + { + "epoch": 0.76, + "learning_rate": 7.267036730837368e-08, + "logits/chosen": -2.2270748615264893, + "logits/rejected": -2.580418348312378, + "logps/chosen": -287.2328186035156, + "logps/rejected": -143.59609985351562, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9970101714134216, + "rewards/margins": 1.3176482915878296, + "rewards/rejected": -2.3146584033966064, + "step": 6545 + }, + { + "epoch": 0.76, + "learning_rate": 7.263493563245542e-08, + "logits/chosen": -2.325033664703369, + "logits/rejected": -2.0952320098876953, + "logps/chosen": -222.8415069580078, + "logps/rejected": -271.2603454589844, + "loss": 0.3789, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.636660099029541, + "rewards/margins": 2.155402660369873, + "rewards/rejected": -2.792062997817993, + "step": 6546 + }, + { + "epoch": 0.76, + "learning_rate": 7.259950395653714e-08, + "logits/chosen": -2.231644630432129, + "logits/rejected": -2.341620683670044, + "logps/chosen": -233.1407012939453, + "logps/rejected": -157.40293884277344, + "loss": 0.3366, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1038765907287598, + "rewards/margins": 1.2512096166610718, + "rewards/rejected": -2.355086326599121, + "step": 6547 + }, + { + "epoch": 0.76, + "learning_rate": 7.256407228061888e-08, + "logits/chosen": -2.0113120079040527, + "logits/rejected": -1.5310449600219727, + "logps/chosen": -179.986328125, + "logps/rejected": -297.4427490234375, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7868562340736389, + "rewards/margins": 1.7069090604782104, + "rewards/rejected": -2.493765354156494, + "step": 6548 + }, + { + "epoch": 0.76, + "learning_rate": 7.25286406047006e-08, + "logits/chosen": -2.325350284576416, + "logits/rejected": -2.3683652877807617, + "logps/chosen": -137.22653198242188, + "logps/rejected": -226.56143188476562, + "loss": 0.2318, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8646396398544312, + "rewards/margins": 3.303483724594116, + "rewards/rejected": -4.168123245239258, + "step": 6549 + }, + { + "epoch": 0.76, + "learning_rate": 7.249320892878232e-08, + "logits/chosen": -2.4055593013763428, + "logits/rejected": -2.288334846496582, + "logps/chosen": -253.2880859375, + "logps/rejected": -185.30796813964844, + "loss": 0.713, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3272736072540283, + "rewards/margins": 0.6200856566429138, + "rewards/rejected": -1.9473592042922974, + "step": 6550 + }, + { + "epoch": 0.76, + "learning_rate": 7.245777725286406e-08, + "logits/chosen": -2.0152196884155273, + "logits/rejected": -2.1809940338134766, + "logps/chosen": -271.6258850097656, + "logps/rejected": -233.04685974121094, + "loss": 0.5223, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.702997088432312, + "rewards/margins": 1.1633658409118652, + "rewards/rejected": -1.8663629293441772, + "step": 6551 + }, + { + "epoch": 0.76, + "learning_rate": 7.242234557694579e-08, + "logits/chosen": -2.5214412212371826, + "logits/rejected": -2.3518805503845215, + "logps/chosen": -250.47854614257812, + "logps/rejected": -261.0081787109375, + "loss": 0.3925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7221649885177612, + "rewards/margins": 1.0089610815048218, + "rewards/rejected": -1.7311261892318726, + "step": 6552 + }, + { + "epoch": 0.76, + "learning_rate": 7.238691390102751e-08, + "logits/chosen": -2.7128942012786865, + "logits/rejected": -2.670668125152588, + "logps/chosen": -316.68060302734375, + "logps/rejected": -354.77789306640625, + "loss": 0.8776, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4352983236312866, + "rewards/margins": 0.9401544332504272, + "rewards/rejected": -2.375452756881714, + "step": 6553 + }, + { + "epoch": 0.76, + "learning_rate": 7.235148222510925e-08, + "logits/chosen": -2.9136030673980713, + "logits/rejected": -2.933730125427246, + "logps/chosen": -264.0575256347656, + "logps/rejected": -364.34808349609375, + "loss": 0.4619, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2657017707824707, + "rewards/margins": 2.445497989654541, + "rewards/rejected": -3.7111997604370117, + "step": 6554 + }, + { + "epoch": 0.76, + "learning_rate": 7.231605054919097e-08, + "logits/chosen": -2.730557918548584, + "logits/rejected": -2.598778009414673, + "logps/chosen": -116.2453842163086, + "logps/rejected": -258.6522216796875, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0272077322006226, + "rewards/margins": 3.154448986053467, + "rewards/rejected": -4.181656837463379, + "step": 6555 + }, + { + "epoch": 0.76, + "learning_rate": 7.22806188732727e-08, + "logits/chosen": -2.5023436546325684, + "logits/rejected": -2.43769907951355, + "logps/chosen": -205.6953582763672, + "logps/rejected": -327.6824035644531, + "loss": 0.3144, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9908139705657959, + "rewards/margins": 3.6888599395751953, + "rewards/rejected": -4.67967414855957, + "step": 6556 + }, + { + "epoch": 0.76, + "learning_rate": 7.224518719735444e-08, + "logits/chosen": -1.8455619812011719, + "logits/rejected": -2.047776222229004, + "logps/chosen": -548.744873046875, + "logps/rejected": -445.59625244140625, + "loss": 0.3119, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6766484975814819, + "rewards/margins": 1.7968112230300903, + "rewards/rejected": -2.4734597206115723, + "step": 6557 + }, + { + "epoch": 0.76, + "learning_rate": 7.220975552143616e-08, + "logits/chosen": -2.4191226959228516, + "logits/rejected": -2.3122975826263428, + "logps/chosen": -317.31927490234375, + "logps/rejected": -300.4651794433594, + "loss": 0.1803, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0931577682495117, + "rewards/margins": 2.896366596221924, + "rewards/rejected": -3.9895246028900146, + "step": 6558 + }, + { + "epoch": 0.76, + "learning_rate": 7.217432384551789e-08, + "logits/chosen": -2.0525755882263184, + "logits/rejected": -2.3356125354766846, + "logps/chosen": -545.1234130859375, + "logps/rejected": -360.16107177734375, + "loss": 0.1789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5032755136489868, + "rewards/margins": 2.242983341217041, + "rewards/rejected": -2.746258497238159, + "step": 6559 + }, + { + "epoch": 0.76, + "learning_rate": 7.213889216959962e-08, + "logits/chosen": -2.060914993286133, + "logits/rejected": -2.2822837829589844, + "logps/chosen": -336.3218994140625, + "logps/rejected": -211.91015625, + "loss": 0.6174, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8327117562294006, + "rewards/margins": 2.033830165863037, + "rewards/rejected": -2.866541862487793, + "step": 6560 + }, + { + "epoch": 0.76, + "learning_rate": 7.210346049368134e-08, + "logits/chosen": -2.302398204803467, + "logits/rejected": -2.2340269088745117, + "logps/chosen": -305.50543212890625, + "logps/rejected": -337.9438781738281, + "loss": 0.5812, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1847318410873413, + "rewards/margins": 1.9427093267440796, + "rewards/rejected": -3.127440929412842, + "step": 6561 + }, + { + "epoch": 0.76, + "learning_rate": 7.206802881776308e-08, + "logits/chosen": -2.2075300216674805, + "logits/rejected": -2.400376558303833, + "logps/chosen": -300.6115417480469, + "logps/rejected": -170.13043212890625, + "loss": 0.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4874149560928345, + "rewards/margins": 2.8852169513702393, + "rewards/rejected": -3.3726320266723633, + "step": 6562 + }, + { + "epoch": 0.76, + "learning_rate": 7.203259714184481e-08, + "logits/chosen": -2.06742262840271, + "logits/rejected": -2.191110610961914, + "logps/chosen": -474.23370361328125, + "logps/rejected": -427.57342529296875, + "loss": 1.037, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.7416523694992065, + "rewards/margins": -0.12721076607704163, + "rewards/rejected": -1.6144416332244873, + "step": 6563 + }, + { + "epoch": 0.76, + "learning_rate": 7.199716546592654e-08, + "logits/chosen": -2.5993423461914062, + "logits/rejected": -2.7527365684509277, + "logps/chosen": -427.892333984375, + "logps/rejected": -271.1238708496094, + "loss": 0.4168, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0618658065795898, + "rewards/margins": 2.125018835067749, + "rewards/rejected": -3.186884641647339, + "step": 6564 + }, + { + "epoch": 0.76, + "learning_rate": 7.196173379000826e-08, + "logits/chosen": -2.2372584342956543, + "logits/rejected": -1.7887210845947266, + "logps/chosen": -353.0359191894531, + "logps/rejected": -455.6588134765625, + "loss": 0.7099, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2904033660888672, + "rewards/margins": 1.6828110218048096, + "rewards/rejected": -2.973214626312256, + "step": 6565 + }, + { + "epoch": 0.76, + "learning_rate": 7.192630211408999e-08, + "logits/chosen": -2.256141424179077, + "logits/rejected": -2.159013271331787, + "logps/chosen": -354.660400390625, + "logps/rejected": -271.24468994140625, + "loss": 0.5736, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6704725623130798, + "rewards/margins": 0.9750857353210449, + "rewards/rejected": -1.6455583572387695, + "step": 6566 + }, + { + "epoch": 0.76, + "learning_rate": 7.189087043817173e-08, + "logits/chosen": -2.3853988647460938, + "logits/rejected": -2.662398338317871, + "logps/chosen": -323.795166015625, + "logps/rejected": -200.51747131347656, + "loss": 0.2398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6135047078132629, + "rewards/margins": 2.204486846923828, + "rewards/rejected": -2.8179914951324463, + "step": 6567 + }, + { + "epoch": 0.76, + "learning_rate": 7.185543876225345e-08, + "logits/chosen": -2.290132999420166, + "logits/rejected": -2.2598729133605957, + "logps/chosen": -222.91253662109375, + "logps/rejected": -205.90487670898438, + "loss": 0.484, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.144146680831909, + "rewards/margins": 1.8360090255737305, + "rewards/rejected": -3.9801554679870605, + "step": 6568 + }, + { + "epoch": 0.76, + "learning_rate": 7.182000708633519e-08, + "logits/chosen": -2.860623359680176, + "logits/rejected": -2.7459938526153564, + "logps/chosen": -190.53921508789062, + "logps/rejected": -266.0608215332031, + "loss": 0.2989, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4529485702514648, + "rewards/margins": 2.4082086086273193, + "rewards/rejected": -3.861157178878784, + "step": 6569 + }, + { + "epoch": 0.76, + "learning_rate": 7.178457541041691e-08, + "logits/chosen": -2.4549155235290527, + "logits/rejected": -2.3483550548553467, + "logps/chosen": -169.5719451904297, + "logps/rejected": -232.28244018554688, + "loss": 0.4462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5750154256820679, + "rewards/margins": 1.4062734842300415, + "rewards/rejected": -1.981289029121399, + "step": 6570 + }, + { + "epoch": 0.76, + "learning_rate": 7.174914373449863e-08, + "logits/chosen": -2.220867395401001, + "logits/rejected": -2.1686244010925293, + "logps/chosen": -208.12989807128906, + "logps/rejected": -219.7139434814453, + "loss": 0.2391, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2352497577667236, + "rewards/margins": 2.3534114360809326, + "rewards/rejected": -3.5886611938476562, + "step": 6571 + }, + { + "epoch": 0.76, + "learning_rate": 7.171371205858037e-08, + "logits/chosen": -1.8048017024993896, + "logits/rejected": -2.104722499847412, + "logps/chosen": -303.0167541503906, + "logps/rejected": -291.9480895996094, + "loss": 0.7084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6916234493255615, + "rewards/margins": 0.26465147733688354, + "rewards/rejected": -0.9562749266624451, + "step": 6572 + }, + { + "epoch": 0.76, + "learning_rate": 7.16782803826621e-08, + "logits/chosen": -2.6052513122558594, + "logits/rejected": -2.61708927154541, + "logps/chosen": -339.950439453125, + "logps/rejected": -247.12716674804688, + "loss": 0.1938, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10415807366371155, + "rewards/margins": 2.412104368209839, + "rewards/rejected": -2.5162622928619385, + "step": 6573 + }, + { + "epoch": 0.76, + "learning_rate": 7.164284870674382e-08, + "logits/chosen": -2.5525941848754883, + "logits/rejected": -2.7941062450408936, + "logps/chosen": -333.68682861328125, + "logps/rejected": -148.64871215820312, + "loss": 0.5338, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8634580969810486, + "rewards/margins": 0.998272180557251, + "rewards/rejected": -1.8617304563522339, + "step": 6574 + }, + { + "epoch": 0.76, + "learning_rate": 7.160741703082556e-08, + "logits/chosen": -2.4997150897979736, + "logits/rejected": -2.441781759262085, + "logps/chosen": -345.038818359375, + "logps/rejected": -232.8472900390625, + "loss": 0.2771, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7323188185691833, + "rewards/margins": 1.831019639968872, + "rewards/rejected": -2.5633385181427, + "step": 6575 + }, + { + "epoch": 0.76, + "learning_rate": 7.157198535490728e-08, + "logits/chosen": -2.5994954109191895, + "logits/rejected": -2.64284086227417, + "logps/chosen": -156.81898498535156, + "logps/rejected": -256.114990234375, + "loss": 0.2208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.678633451461792, + "rewards/margins": 1.6605322360992432, + "rewards/rejected": -2.339165687561035, + "step": 6576 + }, + { + "epoch": 0.77, + "learning_rate": 7.1536553678989e-08, + "logits/chosen": -2.334977149963379, + "logits/rejected": -2.39477801322937, + "logps/chosen": -242.10418701171875, + "logps/rejected": -324.3295593261719, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0180972814559937, + "rewards/margins": 3.5532448291778564, + "rewards/rejected": -4.571342468261719, + "step": 6577 + }, + { + "epoch": 0.77, + "learning_rate": 7.150112200307074e-08, + "logits/chosen": -1.5605220794677734, + "logits/rejected": -1.657948613166809, + "logps/chosen": -451.6317443847656, + "logps/rejected": -399.4571838378906, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8863317966461182, + "rewards/margins": 1.616328239440918, + "rewards/rejected": -2.502660036087036, + "step": 6578 + }, + { + "epoch": 0.77, + "learning_rate": 7.146569032715247e-08, + "logits/chosen": -1.1152442693710327, + "logits/rejected": -1.1567585468292236, + "logps/chosen": -710.7568359375, + "logps/rejected": -590.7899169921875, + "loss": 0.4131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.427842915058136, + "rewards/margins": 1.3298442363739014, + "rewards/rejected": -1.7576872110366821, + "step": 6579 + }, + { + "epoch": 0.77, + "learning_rate": 7.143025865123421e-08, + "logits/chosen": -2.792476177215576, + "logits/rejected": -2.8484408855438232, + "logps/chosen": -211.41357421875, + "logps/rejected": -292.6558837890625, + "loss": 0.3843, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2674305438995361, + "rewards/margins": 3.8567700386047363, + "rewards/rejected": -5.124200344085693, + "step": 6580 + }, + { + "epoch": 0.77, + "learning_rate": 7.139482697531593e-08, + "logits/chosen": -2.7501626014709473, + "logits/rejected": -2.411219835281372, + "logps/chosen": -183.0274658203125, + "logps/rejected": -390.5455627441406, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5011011362075806, + "rewards/margins": 3.47036075592041, + "rewards/rejected": -3.9714620113372803, + "step": 6581 + }, + { + "epoch": 0.77, + "learning_rate": 7.135939529939765e-08, + "logits/chosen": -2.694293737411499, + "logits/rejected": -2.8904948234558105, + "logps/chosen": -274.9427795410156, + "logps/rejected": -187.0045166015625, + "loss": 0.6538, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6009187698364258, + "rewards/margins": 2.1450629234313965, + "rewards/rejected": -3.7459819316864014, + "step": 6582 + }, + { + "epoch": 0.77, + "learning_rate": 7.132396362347939e-08, + "logits/chosen": -2.3319268226623535, + "logits/rejected": -2.4902915954589844, + "logps/chosen": -364.5555725097656, + "logps/rejected": -335.962158203125, + "loss": 0.7266, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6324841380119324, + "rewards/margins": 1.6029672622680664, + "rewards/rejected": -2.2354514598846436, + "step": 6583 + }, + { + "epoch": 0.77, + "learning_rate": 7.128853194756112e-08, + "logits/chosen": -2.3043508529663086, + "logits/rejected": -2.4165549278259277, + "logps/chosen": -183.96328735351562, + "logps/rejected": -138.2550048828125, + "loss": 0.9518, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9003181457519531, + "rewards/margins": 0.7257276773452759, + "rewards/rejected": -2.6260459423065186, + "step": 6584 + }, + { + "epoch": 0.77, + "learning_rate": 7.125310027164285e-08, + "logits/chosen": -2.490055561065674, + "logits/rejected": -2.765590190887451, + "logps/chosen": -543.8170776367188, + "logps/rejected": -296.1157531738281, + "loss": 0.286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32808917760849, + "rewards/margins": 3.615591526031494, + "rewards/rejected": -3.943680763244629, + "step": 6585 + }, + { + "epoch": 0.77, + "learning_rate": 7.121766859572458e-08, + "logits/chosen": -2.9766478538513184, + "logits/rejected": -2.91683292388916, + "logps/chosen": -268.734619140625, + "logps/rejected": -335.5603332519531, + "loss": 0.1984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5018095970153809, + "rewards/margins": 2.518181085586548, + "rewards/rejected": -3.0199904441833496, + "step": 6586 + }, + { + "epoch": 0.77, + "learning_rate": 7.11822369198063e-08, + "logits/chosen": -2.635106325149536, + "logits/rejected": -2.7410998344421387, + "logps/chosen": -214.9885711669922, + "logps/rejected": -257.1778259277344, + "loss": 0.1906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6152777671813965, + "rewards/margins": 2.673250913619995, + "rewards/rejected": -3.2885289192199707, + "step": 6587 + }, + { + "epoch": 0.77, + "learning_rate": 7.114680524388803e-08, + "logits/chosen": -2.546565294265747, + "logits/rejected": -2.1516318321228027, + "logps/chosen": -109.921142578125, + "logps/rejected": -257.1788330078125, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6235591769218445, + "rewards/margins": 1.2545857429504395, + "rewards/rejected": -1.8781448602676392, + "step": 6588 + }, + { + "epoch": 0.77, + "learning_rate": 7.111137356796976e-08, + "logits/chosen": -1.69341242313385, + "logits/rejected": -2.005821943283081, + "logps/chosen": -472.2279968261719, + "logps/rejected": -377.54583740234375, + "loss": 0.8055, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.806959629058838, + "rewards/margins": 1.255922794342041, + "rewards/rejected": -3.062882661819458, + "step": 6589 + }, + { + "epoch": 0.77, + "learning_rate": 7.10759418920515e-08, + "logits/chosen": -2.194896936416626, + "logits/rejected": -2.074000597000122, + "logps/chosen": -194.71420288085938, + "logps/rejected": -327.8777770996094, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5948743224143982, + "rewards/margins": 2.3299942016601562, + "rewards/rejected": -2.924868583679199, + "step": 6590 + }, + { + "epoch": 0.77, + "learning_rate": 7.104051021613322e-08, + "logits/chosen": -2.6353819370269775, + "logits/rejected": -2.858490228652954, + "logps/chosen": -237.4617462158203, + "logps/rejected": -233.78521728515625, + "loss": 0.1916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5409172177314758, + "rewards/margins": 2.4213995933532715, + "rewards/rejected": -2.9623167514801025, + "step": 6591 + }, + { + "epoch": 0.77, + "learning_rate": 7.100507854021495e-08, + "logits/chosen": -2.4676883220672607, + "logits/rejected": -2.6945886611938477, + "logps/chosen": -222.08116149902344, + "logps/rejected": -220.3759307861328, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03765710070729256, + "rewards/margins": 3.4931957721710205, + "rewards/rejected": -3.530852794647217, + "step": 6592 + }, + { + "epoch": 0.77, + "learning_rate": 7.096964686429668e-08, + "logits/chosen": -2.5589213371276855, + "logits/rejected": -2.537661075592041, + "logps/chosen": -180.3143768310547, + "logps/rejected": -246.37892150878906, + "loss": 0.2642, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7748106718063354, + "rewards/margins": 2.6238901615142822, + "rewards/rejected": -4.398700714111328, + "step": 6593 + }, + { + "epoch": 0.77, + "learning_rate": 7.093421518837841e-08, + "logits/chosen": -2.1803622245788574, + "logits/rejected": -2.0115742683410645, + "logps/chosen": -361.5255432128906, + "logps/rejected": -371.6014709472656, + "loss": 0.4742, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4732470512390137, + "rewards/margins": 1.8863335847854614, + "rewards/rejected": -3.3595809936523438, + "step": 6594 + }, + { + "epoch": 0.77, + "learning_rate": 7.089878351246013e-08, + "logits/chosen": -2.73527193069458, + "logits/rejected": -2.818406105041504, + "logps/chosen": -371.71600341796875, + "logps/rejected": -255.74227905273438, + "loss": 0.4158, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5244619846343994, + "rewards/margins": 1.3789600133895874, + "rewards/rejected": -1.9034219980239868, + "step": 6595 + }, + { + "epoch": 0.77, + "learning_rate": 7.086335183654187e-08, + "logits/chosen": -2.0440053939819336, + "logits/rejected": -2.263322353363037, + "logps/chosen": -373.58203125, + "logps/rejected": -199.83987426757812, + "loss": 0.2796, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4240535497665405, + "rewards/margins": 2.214442729949951, + "rewards/rejected": -1.790389060974121, + "step": 6596 + }, + { + "epoch": 0.77, + "learning_rate": 7.082792016062359e-08, + "logits/chosen": -1.8701245784759521, + "logits/rejected": -1.865479588508606, + "logps/chosen": -176.17059326171875, + "logps/rejected": -251.68515014648438, + "loss": 0.7459, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8355588912963867, + "rewards/margins": 1.9148268699645996, + "rewards/rejected": -2.7503857612609863, + "step": 6597 + }, + { + "epoch": 0.77, + "learning_rate": 7.079248848470533e-08, + "logits/chosen": -1.4631054401397705, + "logits/rejected": -1.7176802158355713, + "logps/chosen": -341.572265625, + "logps/rejected": -336.7872009277344, + "loss": 0.361, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5201863646507263, + "rewards/margins": 1.8873575925827026, + "rewards/rejected": -2.407543897628784, + "step": 6598 + }, + { + "epoch": 0.77, + "learning_rate": 7.075705680878705e-08, + "logits/chosen": -2.033832550048828, + "logits/rejected": -1.8602389097213745, + "logps/chosen": -216.70803833007812, + "logps/rejected": -278.1370544433594, + "loss": 0.6953, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0389900207519531, + "rewards/margins": 1.6648831367492676, + "rewards/rejected": -2.7038731575012207, + "step": 6599 + }, + { + "epoch": 0.77, + "learning_rate": 7.072162513286878e-08, + "logits/chosen": -2.198885679244995, + "logits/rejected": -2.1142377853393555, + "logps/chosen": -294.97149658203125, + "logps/rejected": -352.531982421875, + "loss": 0.3151, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42281007766723633, + "rewards/margins": 1.930802583694458, + "rewards/rejected": -2.3536126613616943, + "step": 6600 + }, + { + "epoch": 0.77, + "learning_rate": 7.068619345695052e-08, + "logits/chosen": -2.3773207664489746, + "logits/rejected": -2.327993392944336, + "logps/chosen": -266.50506591796875, + "logps/rejected": -209.75372314453125, + "loss": 0.3627, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.10751211643219, + "rewards/margins": 2.5054850578308105, + "rewards/rejected": -3.61299729347229, + "step": 6601 + }, + { + "epoch": 0.77, + "learning_rate": 7.065076178103224e-08, + "logits/chosen": -2.049816370010376, + "logits/rejected": -1.9778817892074585, + "logps/chosen": -292.8638610839844, + "logps/rejected": -254.61312866210938, + "loss": 0.7338, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8768731355667114, + "rewards/margins": 0.7110817432403564, + "rewards/rejected": -1.5879547595977783, + "step": 6602 + }, + { + "epoch": 0.77, + "learning_rate": 7.061533010511396e-08, + "logits/chosen": -1.6834138631820679, + "logits/rejected": -1.8715546131134033, + "logps/chosen": -494.77545166015625, + "logps/rejected": -351.3647766113281, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24862122535705566, + "rewards/margins": 1.687313437461853, + "rewards/rejected": -1.9359346628189087, + "step": 6603 + }, + { + "epoch": 0.77, + "learning_rate": 7.05798984291957e-08, + "logits/chosen": -2.500033378601074, + "logits/rejected": -2.347038984298706, + "logps/chosen": -234.99172973632812, + "logps/rejected": -239.53709411621094, + "loss": 0.4528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8218315839767456, + "rewards/margins": 2.3247923851013184, + "rewards/rejected": -3.1466240882873535, + "step": 6604 + }, + { + "epoch": 0.77, + "learning_rate": 7.054446675327742e-08, + "logits/chosen": -2.469059467315674, + "logits/rejected": -2.4501309394836426, + "logps/chosen": -184.72787475585938, + "logps/rejected": -215.57427978515625, + "loss": 0.3123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1057450771331787, + "rewards/margins": 1.7908668518066406, + "rewards/rejected": -2.8966116905212402, + "step": 6605 + }, + { + "epoch": 0.77, + "learning_rate": 7.050903507735916e-08, + "logits/chosen": -2.194666862487793, + "logits/rejected": -2.451421022415161, + "logps/chosen": -410.2794189453125, + "logps/rejected": -358.3693542480469, + "loss": 0.8041, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.081912636756897, + "rewards/margins": 1.566514015197754, + "rewards/rejected": -2.6484267711639404, + "step": 6606 + }, + { + "epoch": 0.77, + "learning_rate": 7.047360340144089e-08, + "logits/chosen": -2.5707454681396484, + "logits/rejected": -2.5349347591400146, + "logps/chosen": -166.31765747070312, + "logps/rejected": -204.29615783691406, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1405258178710938, + "rewards/margins": 2.2832295894622803, + "rewards/rejected": -3.423755645751953, + "step": 6607 + }, + { + "epoch": 0.77, + "learning_rate": 7.043817172552261e-08, + "logits/chosen": -2.4307923316955566, + "logits/rejected": -2.410396099090576, + "logps/chosen": -269.4496154785156, + "logps/rejected": -274.9190673828125, + "loss": 0.6383, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.236451506614685, + "rewards/margins": 2.384723663330078, + "rewards/rejected": -3.6211748123168945, + "step": 6608 + }, + { + "epoch": 0.77, + "learning_rate": 7.040274004960434e-08, + "logits/chosen": -2.2702419757843018, + "logits/rejected": -2.4160943031311035, + "logps/chosen": -516.3233642578125, + "logps/rejected": -362.15142822265625, + "loss": 0.3267, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.327873945236206, + "rewards/margins": 1.3431696891784668, + "rewards/rejected": -2.6710433959960938, + "step": 6609 + }, + { + "epoch": 0.77, + "learning_rate": 7.036730837368607e-08, + "logits/chosen": -2.109980583190918, + "logits/rejected": -2.179483413696289, + "logps/chosen": -322.4153137207031, + "logps/rejected": -289.8718566894531, + "loss": 0.3486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03851597011089325, + "rewards/margins": 1.5667411088943481, + "rewards/rejected": -1.6052570343017578, + "step": 6610 + }, + { + "epoch": 0.77, + "learning_rate": 7.03318766977678e-08, + "logits/chosen": -1.738944172859192, + "logits/rejected": -1.9524295330047607, + "logps/chosen": -385.84002685546875, + "logps/rejected": -394.3967590332031, + "loss": 0.1048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8301822543144226, + "rewards/margins": 3.6284518241882324, + "rewards/rejected": -4.458634376525879, + "step": 6611 + }, + { + "epoch": 0.77, + "learning_rate": 7.029644502184953e-08, + "logits/chosen": -3.090388059616089, + "logits/rejected": -2.977144241333008, + "logps/chosen": -375.7198791503906, + "logps/rejected": -250.50637817382812, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6453310251235962, + "rewards/margins": 2.2055253982543945, + "rewards/rejected": -2.8508565425872803, + "step": 6612 + }, + { + "epoch": 0.77, + "learning_rate": 7.026101334593126e-08, + "logits/chosen": -2.1728622913360596, + "logits/rejected": -2.492575168609619, + "logps/chosen": -327.7327880859375, + "logps/rejected": -198.7549591064453, + "loss": 0.9959, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.0687782764434814, + "rewards/margins": 0.7634240388870239, + "rewards/rejected": -2.832202434539795, + "step": 6613 + }, + { + "epoch": 0.77, + "learning_rate": 7.022558167001299e-08, + "logits/chosen": -2.4572174549102783, + "logits/rejected": -2.4759702682495117, + "logps/chosen": -218.84423828125, + "logps/rejected": -327.53033447265625, + "loss": 0.1461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7455686926841736, + "rewards/margins": 3.667435884475708, + "rewards/rejected": -4.413004398345947, + "step": 6614 + }, + { + "epoch": 0.77, + "learning_rate": 7.019014999409471e-08, + "logits/chosen": -2.533139228820801, + "logits/rejected": -2.720991849899292, + "logps/chosen": -349.4258728027344, + "logps/rejected": -335.9798583984375, + "loss": 0.1059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9443877935409546, + "rewards/margins": 2.964670181274414, + "rewards/rejected": -3.9090576171875, + "step": 6615 + }, + { + "epoch": 0.77, + "learning_rate": 7.015471831817644e-08, + "logits/chosen": -2.3158645629882812, + "logits/rejected": -2.364549398422241, + "logps/chosen": -257.8260498046875, + "logps/rejected": -339.10137939453125, + "loss": 0.2317, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2057769298553467, + "rewards/margins": 2.7890753746032715, + "rewards/rejected": -3.9948525428771973, + "step": 6616 + }, + { + "epoch": 0.77, + "learning_rate": 7.011928664225818e-08, + "logits/chosen": -2.2050368785858154, + "logits/rejected": -2.327218532562256, + "logps/chosen": -268.4022216796875, + "logps/rejected": -225.6710968017578, + "loss": 0.2627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38610756397247314, + "rewards/margins": 1.8269723653793335, + "rewards/rejected": -2.2130799293518066, + "step": 6617 + }, + { + "epoch": 0.77, + "learning_rate": 7.008385496633991e-08, + "logits/chosen": -2.3171496391296387, + "logits/rejected": -2.2513980865478516, + "logps/chosen": -216.95068359375, + "logps/rejected": -284.6390686035156, + "loss": 0.4528, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9000439047813416, + "rewards/margins": 0.9235094785690308, + "rewards/rejected": -1.8235533237457275, + "step": 6618 + }, + { + "epoch": 0.77, + "learning_rate": 7.004842329042164e-08, + "logits/chosen": -2.234891414642334, + "logits/rejected": -2.241267442703247, + "logps/chosen": -207.60003662109375, + "logps/rejected": -217.52725219726562, + "loss": 0.3256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7794975638389587, + "rewards/margins": 2.5186290740966797, + "rewards/rejected": -3.298126697540283, + "step": 6619 + }, + { + "epoch": 0.77, + "learning_rate": 7.001299161450336e-08, + "logits/chosen": -2.447984218597412, + "logits/rejected": -2.419877767562866, + "logps/chosen": -301.27825927734375, + "logps/rejected": -343.28167724609375, + "loss": 0.1857, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3788779973983765, + "rewards/margins": 2.88132381439209, + "rewards/rejected": -4.260201930999756, + "step": 6620 + }, + { + "epoch": 0.77, + "learning_rate": 6.99775599385851e-08, + "logits/chosen": -2.5448317527770996, + "logits/rejected": -2.803438186645508, + "logps/chosen": -352.8631591796875, + "logps/rejected": -167.7257843017578, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2308075726032257, + "rewards/margins": 2.5865988731384277, + "rewards/rejected": -2.817406177520752, + "step": 6621 + }, + { + "epoch": 0.77, + "learning_rate": 6.994212826266682e-08, + "logits/chosen": -2.257390022277832, + "logits/rejected": -2.3664021492004395, + "logps/chosen": -378.28887939453125, + "logps/rejected": -180.0978546142578, + "loss": 0.4339, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6106510758399963, + "rewards/margins": 1.7156684398651123, + "rewards/rejected": -2.326319456100464, + "step": 6622 + }, + { + "epoch": 0.77, + "learning_rate": 6.990669658674855e-08, + "logits/chosen": -2.1238322257995605, + "logits/rejected": -1.9127647876739502, + "logps/chosen": -363.7259521484375, + "logps/rejected": -441.3682556152344, + "loss": 0.2292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2712361216545105, + "rewards/margins": 4.4554033279418945, + "rewards/rejected": -4.726639270782471, + "step": 6623 + }, + { + "epoch": 0.77, + "learning_rate": 6.987126491083029e-08, + "logits/chosen": -2.550433397293091, + "logits/rejected": -2.5088014602661133, + "logps/chosen": -366.5540466308594, + "logps/rejected": -306.4691162109375, + "loss": 0.2515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7718722224235535, + "rewards/margins": 2.192201614379883, + "rewards/rejected": -2.964073896408081, + "step": 6624 + }, + { + "epoch": 0.77, + "learning_rate": 6.983583323491201e-08, + "logits/chosen": -2.8775973320007324, + "logits/rejected": -2.8634979724884033, + "logps/chosen": -126.82150268554688, + "logps/rejected": -235.63772583007812, + "loss": 0.3977, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1370368003845215, + "rewards/margins": 2.8018836975097656, + "rewards/rejected": -3.938920497894287, + "step": 6625 + }, + { + "epoch": 0.77, + "learning_rate": 6.980040155899373e-08, + "logits/chosen": -2.6823110580444336, + "logits/rejected": -2.273869037628174, + "logps/chosen": -69.99015045166016, + "logps/rejected": -295.0671691894531, + "loss": 0.711, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2944557666778564, + "rewards/margins": 1.178027868270874, + "rewards/rejected": -2.4724836349487305, + "step": 6626 + }, + { + "epoch": 0.77, + "learning_rate": 6.976496988307547e-08, + "logits/chosen": -1.9078502655029297, + "logits/rejected": -2.0522212982177734, + "logps/chosen": -323.8881530761719, + "logps/rejected": -236.24331665039062, + "loss": 0.4388, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01991422474384308, + "rewards/margins": 1.0969030857086182, + "rewards/rejected": -1.076988935470581, + "step": 6627 + }, + { + "epoch": 0.77, + "learning_rate": 6.97295382071572e-08, + "logits/chosen": -2.1701769828796387, + "logits/rejected": -2.3331563472747803, + "logps/chosen": -247.45120239257812, + "logps/rejected": -179.26004028320312, + "loss": 0.2787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5168095827102661, + "rewards/margins": 1.7797433137893677, + "rewards/rejected": -2.296552896499634, + "step": 6628 + }, + { + "epoch": 0.77, + "learning_rate": 6.969410653123892e-08, + "logits/chosen": -2.079460859298706, + "logits/rejected": -2.389756202697754, + "logps/chosen": -440.46453857421875, + "logps/rejected": -297.7349853515625, + "loss": 0.3677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.693283200263977, + "rewards/margins": 1.3064268827438354, + "rewards/rejected": -1.9997100830078125, + "step": 6629 + }, + { + "epoch": 0.77, + "learning_rate": 6.965867485532066e-08, + "logits/chosen": -2.77360200881958, + "logits/rejected": -2.557802438735962, + "logps/chosen": -159.20547485351562, + "logps/rejected": -187.08859252929688, + "loss": 0.7045, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3333966732025146, + "rewards/margins": 1.133470058441162, + "rewards/rejected": -2.4668667316436768, + "step": 6630 + }, + { + "epoch": 0.77, + "learning_rate": 6.962324317940238e-08, + "logits/chosen": -2.184325933456421, + "logits/rejected": -2.1934454441070557, + "logps/chosen": -174.13169860839844, + "logps/rejected": -255.92047119140625, + "loss": 0.6279, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4359121322631836, + "rewards/margins": 1.796686053276062, + "rewards/rejected": -3.232598304748535, + "step": 6631 + }, + { + "epoch": 0.77, + "learning_rate": 6.95878115034841e-08, + "logits/chosen": -1.9716644287109375, + "logits/rejected": -1.8746100664138794, + "logps/chosen": -437.4822998046875, + "logps/rejected": -484.39215087890625, + "loss": 0.2563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7362415194511414, + "rewards/margins": 2.0270400047302246, + "rewards/rejected": -2.76328182220459, + "step": 6632 + }, + { + "epoch": 0.77, + "learning_rate": 6.955237982756584e-08, + "logits/chosen": -1.991597294807434, + "logits/rejected": -2.3162038326263428, + "logps/chosen": -237.528564453125, + "logps/rejected": -197.452392578125, + "loss": 0.1455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3604428768157959, + "rewards/margins": 2.5374393463134766, + "rewards/rejected": -2.8978824615478516, + "step": 6633 + }, + { + "epoch": 0.77, + "learning_rate": 6.951694815164757e-08, + "logits/chosen": -2.631809711456299, + "logits/rejected": -2.59686279296875, + "logps/chosen": -209.438720703125, + "logps/rejected": -223.6019744873047, + "loss": 0.1834, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3835806846618652, + "rewards/margins": 3.4554567337036133, + "rewards/rejected": -4.8390374183654785, + "step": 6634 + }, + { + "epoch": 0.77, + "learning_rate": 6.94815164757293e-08, + "logits/chosen": -2.712904930114746, + "logits/rejected": -2.690114974975586, + "logps/chosen": -306.30010986328125, + "logps/rejected": -263.168701171875, + "loss": 0.6519, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7241472601890564, + "rewards/margins": 3.0463032722473145, + "rewards/rejected": -3.7704505920410156, + "step": 6635 + }, + { + "epoch": 0.77, + "learning_rate": 6.944608479981103e-08, + "logits/chosen": -2.3657848834991455, + "logits/rejected": -2.300935745239258, + "logps/chosen": -250.40768432617188, + "logps/rejected": -277.20428466796875, + "loss": 0.3891, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2417380809783936, + "rewards/margins": 1.886415958404541, + "rewards/rejected": -3.1281542778015137, + "step": 6636 + }, + { + "epoch": 0.77, + "learning_rate": 6.941065312389275e-08, + "logits/chosen": -2.375091314315796, + "logits/rejected": -2.6809215545654297, + "logps/chosen": -490.8929138183594, + "logps/rejected": -370.1326904296875, + "loss": 0.461, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6053032279014587, + "rewards/margins": 1.198777675628662, + "rewards/rejected": -1.804080843925476, + "step": 6637 + }, + { + "epoch": 0.77, + "learning_rate": 6.937522144797449e-08, + "logits/chosen": -2.343369960784912, + "logits/rejected": -2.0317327976226807, + "logps/chosen": -120.5345458984375, + "logps/rejected": -186.42825317382812, + "loss": 0.3822, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2421795129776, + "rewards/margins": 1.5862599611282349, + "rewards/rejected": -2.828439474105835, + "step": 6638 + }, + { + "epoch": 0.77, + "learning_rate": 6.933978977205621e-08, + "logits/chosen": -2.389347553253174, + "logits/rejected": -2.49420166015625, + "logps/chosen": -223.27484130859375, + "logps/rejected": -290.1815185546875, + "loss": 0.6958, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6868993043899536, + "rewards/margins": 0.7938140630722046, + "rewards/rejected": -2.480713367462158, + "step": 6639 + }, + { + "epoch": 0.77, + "learning_rate": 6.930435809613795e-08, + "logits/chosen": -2.6270298957824707, + "logits/rejected": -3.0052294731140137, + "logps/chosen": -398.4481506347656, + "logps/rejected": -293.1113586425781, + "loss": 0.2372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028123825788497925, + "rewards/margins": 1.5148711204528809, + "rewards/rejected": -1.5429949760437012, + "step": 6640 + }, + { + "epoch": 0.77, + "learning_rate": 6.926892642021967e-08, + "logits/chosen": -2.1066699028015137, + "logits/rejected": -1.9926878213882446, + "logps/chosen": -142.7261962890625, + "logps/rejected": -305.0911560058594, + "loss": 0.1722, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.90076744556427, + "rewards/margins": 3.0268542766571045, + "rewards/rejected": -3.927621364593506, + "step": 6641 + }, + { + "epoch": 0.77, + "learning_rate": 6.92334947443014e-08, + "logits/chosen": -2.714205265045166, + "logits/rejected": -2.7946505546569824, + "logps/chosen": -290.4112548828125, + "logps/rejected": -258.45343017578125, + "loss": 0.9009, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.517221450805664, + "rewards/margins": 0.4512033760547638, + "rewards/rejected": -2.9684245586395264, + "step": 6642 + }, + { + "epoch": 0.77, + "learning_rate": 6.919806306838313e-08, + "logits/chosen": -1.948014497756958, + "logits/rejected": -1.98294198513031, + "logps/chosen": -267.6580810546875, + "logps/rejected": -304.2770080566406, + "loss": 0.2801, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8639314770698547, + "rewards/margins": 2.2468276023864746, + "rewards/rejected": -3.1107590198516846, + "step": 6643 + }, + { + "epoch": 0.77, + "learning_rate": 6.916263139246486e-08, + "logits/chosen": -2.8469736576080322, + "logits/rejected": -2.9345526695251465, + "logps/chosen": -243.15455627441406, + "logps/rejected": -352.44757080078125, + "loss": 0.5488, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4624139070510864, + "rewards/margins": 2.533154249191284, + "rewards/rejected": -2.995568037033081, + "step": 6644 + }, + { + "epoch": 0.77, + "learning_rate": 6.91271997165466e-08, + "logits/chosen": -2.5226283073425293, + "logits/rejected": -2.5264055728912354, + "logps/chosen": -137.54177856445312, + "logps/rejected": -175.3719482421875, + "loss": 0.215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0016009137034416199, + "rewards/margins": 2.48758602142334, + "rewards/rejected": -2.489187002182007, + "step": 6645 + }, + { + "epoch": 0.77, + "learning_rate": 6.909176804062832e-08, + "logits/chosen": -1.4381839036941528, + "logits/rejected": -1.8904484510421753, + "logps/chosen": -512.6334838867188, + "logps/rejected": -367.06787109375, + "loss": 0.3859, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7148163318634033, + "rewards/margins": 1.0443284511566162, + "rewards/rejected": -1.759144902229309, + "step": 6646 + }, + { + "epoch": 0.77, + "learning_rate": 6.905633636471004e-08, + "logits/chosen": -2.646623373031616, + "logits/rejected": -2.2689201831817627, + "logps/chosen": -206.1192626953125, + "logps/rejected": -330.0472717285156, + "loss": 0.3317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6966134905815125, + "rewards/margins": 3.713118076324463, + "rewards/rejected": -4.409731864929199, + "step": 6647 + }, + { + "epoch": 0.77, + "learning_rate": 6.902090468879178e-08, + "logits/chosen": -2.775387763977051, + "logits/rejected": -2.6166491508483887, + "logps/chosen": -292.8073425292969, + "logps/rejected": -184.85568237304688, + "loss": 1.0769, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5371967554092407, + "rewards/margins": 0.8607674241065979, + "rewards/rejected": -2.3979642391204834, + "step": 6648 + }, + { + "epoch": 0.77, + "learning_rate": 6.89854730128735e-08, + "logits/chosen": -2.610664129257202, + "logits/rejected": -2.658414125442505, + "logps/chosen": -178.27243041992188, + "logps/rejected": -213.05398559570312, + "loss": 0.8761, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3539060354232788, + "rewards/margins": 0.8860634565353394, + "rewards/rejected": -2.239969491958618, + "step": 6649 + }, + { + "epoch": 0.77, + "learning_rate": 6.895004133695523e-08, + "logits/chosen": -2.6043927669525146, + "logits/rejected": -2.740880012512207, + "logps/chosen": -400.2502136230469, + "logps/rejected": -282.388427734375, + "loss": 0.9478, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.113663911819458, + "rewards/margins": 1.8005231618881226, + "rewards/rejected": -2.914186954498291, + "step": 6650 + }, + { + "epoch": 0.77, + "learning_rate": 6.891460966103697e-08, + "logits/chosen": -2.4223198890686035, + "logits/rejected": -2.3453238010406494, + "logps/chosen": -349.73974609375, + "logps/rejected": -426.7125549316406, + "loss": 0.5692, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.442206621170044, + "rewards/margins": 0.47975996136665344, + "rewards/rejected": -1.921966552734375, + "step": 6651 + }, + { + "epoch": 0.77, + "learning_rate": 6.887917798511869e-08, + "logits/chosen": -2.3321826457977295, + "logits/rejected": -2.3653907775878906, + "logps/chosen": -414.3248596191406, + "logps/rejected": -273.0155029296875, + "loss": 0.1649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9409549832344055, + "rewards/margins": 2.9121716022491455, + "rewards/rejected": -3.8531267642974854, + "step": 6652 + }, + { + "epoch": 0.77, + "learning_rate": 6.884374630920043e-08, + "logits/chosen": -2.1017701625823975, + "logits/rejected": -1.9043891429901123, + "logps/chosen": -176.669921875, + "logps/rejected": -276.97674560546875, + "loss": 0.5942, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2469433546066284, + "rewards/margins": 1.6676008701324463, + "rewards/rejected": -2.914544105529785, + "step": 6653 + }, + { + "epoch": 0.77, + "learning_rate": 6.880831463328215e-08, + "logits/chosen": -1.904212474822998, + "logits/rejected": -1.9556723833084106, + "logps/chosen": -387.9800720214844, + "logps/rejected": -287.9755554199219, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6587207317352295, + "rewards/margins": 2.8442280292510986, + "rewards/rejected": -3.502948760986328, + "step": 6654 + }, + { + "epoch": 0.77, + "learning_rate": 6.877288295736388e-08, + "logits/chosen": -2.134296178817749, + "logits/rejected": -2.0261964797973633, + "logps/chosen": -481.0274658203125, + "logps/rejected": -387.65643310546875, + "loss": 0.426, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8048465251922607, + "rewards/margins": 3.173562526702881, + "rewards/rejected": -3.9784088134765625, + "step": 6655 + }, + { + "epoch": 0.77, + "learning_rate": 6.873745128144562e-08, + "logits/chosen": -2.0005249977111816, + "logits/rejected": -1.683049201965332, + "logps/chosen": -197.4270477294922, + "logps/rejected": -326.55743408203125, + "loss": 0.2772, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3897382020950317, + "rewards/margins": 1.4603462219238281, + "rewards/rejected": -2.8500845432281494, + "step": 6656 + }, + { + "epoch": 0.77, + "learning_rate": 6.870201960552734e-08, + "logits/chosen": -2.3564882278442383, + "logits/rejected": -2.4877281188964844, + "logps/chosen": -257.4177551269531, + "logps/rejected": -268.4561767578125, + "loss": 0.3436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4151650071144104, + "rewards/margins": 2.5974178314208984, + "rewards/rejected": -3.012583017349243, + "step": 6657 + }, + { + "epoch": 0.77, + "learning_rate": 6.866658792960906e-08, + "logits/chosen": -2.0950613021850586, + "logits/rejected": -2.270035743713379, + "logps/chosen": -425.1199951171875, + "logps/rejected": -263.08819580078125, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4403303265571594, + "rewards/margins": 1.696149230003357, + "rewards/rejected": -2.136479616165161, + "step": 6658 + }, + { + "epoch": 0.77, + "learning_rate": 6.86311562536908e-08, + "logits/chosen": -2.5671989917755127, + "logits/rejected": -2.3682990074157715, + "logps/chosen": -269.3314208984375, + "logps/rejected": -314.81243896484375, + "loss": 0.3753, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1224250793457031, + "rewards/margins": 1.9260385036468506, + "rewards/rejected": -3.0484635829925537, + "step": 6659 + }, + { + "epoch": 0.77, + "learning_rate": 6.859572457777252e-08, + "logits/chosen": -2.1609649658203125, + "logits/rejected": -2.1049904823303223, + "logps/chosen": -311.01080322265625, + "logps/rejected": -272.4715576171875, + "loss": 0.4828, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5274262428283691, + "rewards/margins": 2.12687611579895, + "rewards/rejected": -3.6543025970458984, + "step": 6660 + }, + { + "epoch": 0.77, + "learning_rate": 6.856029290185426e-08, + "logits/chosen": -2.559229612350464, + "logits/rejected": -2.6232171058654785, + "logps/chosen": -296.31939697265625, + "logps/rejected": -170.80902099609375, + "loss": 0.4741, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.071327805519104, + "rewards/margins": 0.8896474242210388, + "rewards/rejected": -1.9609752893447876, + "step": 6661 + }, + { + "epoch": 0.77, + "learning_rate": 6.852486122593599e-08, + "logits/chosen": -2.2304036617279053, + "logits/rejected": -2.469721555709839, + "logps/chosen": -293.23968505859375, + "logps/rejected": -224.3767547607422, + "loss": 0.3781, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9680774807929993, + "rewards/margins": 1.1710808277130127, + "rewards/rejected": -2.139158248901367, + "step": 6662 + }, + { + "epoch": 0.78, + "learning_rate": 6.848942955001771e-08, + "logits/chosen": -2.056396245956421, + "logits/rejected": -2.124464750289917, + "logps/chosen": -283.32275390625, + "logps/rejected": -322.185302734375, + "loss": 0.4609, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0718042850494385, + "rewards/margins": 2.5476369857788086, + "rewards/rejected": -3.619441032409668, + "step": 6663 + }, + { + "epoch": 0.78, + "learning_rate": 6.845399787409944e-08, + "logits/chosen": -2.842240810394287, + "logits/rejected": -2.8054933547973633, + "logps/chosen": -312.9918212890625, + "logps/rejected": -305.90753173828125, + "loss": 0.7551, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9745824337005615, + "rewards/margins": 0.6597601771354675, + "rewards/rejected": -1.6343426704406738, + "step": 6664 + }, + { + "epoch": 0.78, + "learning_rate": 6.841856619818117e-08, + "logits/chosen": -2.2440812587738037, + "logits/rejected": -2.3041956424713135, + "logps/chosen": -183.93267822265625, + "logps/rejected": -174.135498046875, + "loss": 0.3045, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7480471134185791, + "rewards/margins": 3.1588854789733887, + "rewards/rejected": -3.906932830810547, + "step": 6665 + }, + { + "epoch": 0.78, + "learning_rate": 6.83831345222629e-08, + "logits/chosen": -2.231405258178711, + "logits/rejected": -2.4036526679992676, + "logps/chosen": -255.26280212402344, + "logps/rejected": -190.95584106445312, + "loss": 0.6998, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8631519079208374, + "rewards/margins": 0.9314168691635132, + "rewards/rejected": -2.7945687770843506, + "step": 6666 + }, + { + "epoch": 0.78, + "learning_rate": 6.834770284634463e-08, + "logits/chosen": -1.7355539798736572, + "logits/rejected": -1.5476696491241455, + "logps/chosen": -183.87754821777344, + "logps/rejected": -269.5453796386719, + "loss": 0.277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6058818101882935, + "rewards/margins": 1.688905119895935, + "rewards/rejected": -2.2947866916656494, + "step": 6667 + }, + { + "epoch": 0.78, + "learning_rate": 6.831227117042636e-08, + "logits/chosen": -2.950035810470581, + "logits/rejected": -2.900320529937744, + "logps/chosen": -165.6063232421875, + "logps/rejected": -186.03823852539062, + "loss": 0.2848, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1759132146835327, + "rewards/margins": 2.7915596961975098, + "rewards/rejected": -3.967472791671753, + "step": 6668 + }, + { + "epoch": 0.78, + "learning_rate": 6.827683949450809e-08, + "logits/chosen": -2.3502089977264404, + "logits/rejected": -2.443232297897339, + "logps/chosen": -228.5415496826172, + "logps/rejected": -241.6712646484375, + "loss": 0.4101, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5140676498413086, + "rewards/margins": 1.5131011009216309, + "rewards/rejected": -3.0271687507629395, + "step": 6669 + }, + { + "epoch": 0.78, + "learning_rate": 6.824140781858981e-08, + "logits/chosen": -2.016116142272949, + "logits/rejected": -1.896512508392334, + "logps/chosen": -252.22604370117188, + "logps/rejected": -343.3431701660156, + "loss": 0.3498, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3217052221298218, + "rewards/margins": 2.5496010780334473, + "rewards/rejected": -3.8713064193725586, + "step": 6670 + }, + { + "epoch": 0.78, + "learning_rate": 6.820597614267154e-08, + "logits/chosen": -2.4583587646484375, + "logits/rejected": -2.8200106620788574, + "logps/chosen": -287.8721618652344, + "logps/rejected": -245.5226593017578, + "loss": 0.6136, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4895684719085693, + "rewards/margins": 0.7056936025619507, + "rewards/rejected": -2.1952619552612305, + "step": 6671 + }, + { + "epoch": 0.78, + "learning_rate": 6.817054446675328e-08, + "logits/chosen": -2.352027416229248, + "logits/rejected": -2.35127329826355, + "logps/chosen": -330.3799133300781, + "logps/rejected": -276.8382873535156, + "loss": 0.2105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5801509022712708, + "rewards/margins": 2.4024109840393066, + "rewards/rejected": -2.9825618267059326, + "step": 6672 + }, + { + "epoch": 0.78, + "learning_rate": 6.8135112790835e-08, + "logits/chosen": -2.2304534912109375, + "logits/rejected": -1.8882079124450684, + "logps/chosen": -217.9007568359375, + "logps/rejected": -310.2322082519531, + "loss": 0.2342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15801197290420532, + "rewards/margins": 3.6213507652282715, + "rewards/rejected": -3.779362678527832, + "step": 6673 + }, + { + "epoch": 0.78, + "learning_rate": 6.809968111491674e-08, + "logits/chosen": -1.7277190685272217, + "logits/rejected": -2.197317123413086, + "logps/chosen": -499.27569580078125, + "logps/rejected": -372.884033203125, + "loss": 0.5511, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.793287754058838, + "rewards/margins": 1.4433727264404297, + "rewards/rejected": -3.2366604804992676, + "step": 6674 + }, + { + "epoch": 0.78, + "learning_rate": 6.806424943899846e-08, + "logits/chosen": -2.9848339557647705, + "logits/rejected": -2.984494209289551, + "logps/chosen": -121.56941223144531, + "logps/rejected": -215.59518432617188, + "loss": 0.3409, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6516674757003784, + "rewards/margins": 3.237811326980591, + "rewards/rejected": -3.889478921890259, + "step": 6675 + }, + { + "epoch": 0.78, + "learning_rate": 6.802881776308018e-08, + "logits/chosen": -1.9355971813201904, + "logits/rejected": -1.8923416137695312, + "logps/chosen": -270.706298828125, + "logps/rejected": -251.49169921875, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4284957647323608, + "rewards/margins": 1.7913737297058105, + "rewards/rejected": -3.219869375228882, + "step": 6676 + }, + { + "epoch": 0.78, + "learning_rate": 6.799338608716192e-08, + "logits/chosen": -2.196194648742676, + "logits/rejected": -2.3535146713256836, + "logps/chosen": -229.96652221679688, + "logps/rejected": -230.62355041503906, + "loss": 0.1636, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18463164567947388, + "rewards/margins": 2.3111119270324707, + "rewards/rejected": -2.495743751525879, + "step": 6677 + }, + { + "epoch": 0.78, + "learning_rate": 6.795795441124365e-08, + "logits/chosen": -1.701321005821228, + "logits/rejected": -1.894420862197876, + "logps/chosen": -365.467041015625, + "logps/rejected": -319.93438720703125, + "loss": 0.3965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8441025614738464, + "rewards/margins": 2.462033987045288, + "rewards/rejected": -3.3061366081237793, + "step": 6678 + }, + { + "epoch": 0.78, + "learning_rate": 6.792252273532537e-08, + "logits/chosen": -2.285198926925659, + "logits/rejected": -2.0963997840881348, + "logps/chosen": -229.80361938476562, + "logps/rejected": -368.8497314453125, + "loss": 0.4179, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5706879496574402, + "rewards/margins": 1.7221283912658691, + "rewards/rejected": -2.292816400527954, + "step": 6679 + }, + { + "epoch": 0.78, + "learning_rate": 6.788709105940711e-08, + "logits/chosen": -2.1315906047821045, + "logits/rejected": -2.2837376594543457, + "logps/chosen": -511.73431396484375, + "logps/rejected": -443.95025634765625, + "loss": 0.3399, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1352139711380005, + "rewards/margins": 2.3367979526519775, + "rewards/rejected": -2.4720120429992676, + "step": 6680 + }, + { + "epoch": 0.78, + "learning_rate": 6.785165938348883e-08, + "logits/chosen": -2.438467025756836, + "logits/rejected": -2.4087517261505127, + "logps/chosen": -266.48663330078125, + "logps/rejected": -205.9776611328125, + "loss": 0.5589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45648103952407837, + "rewards/margins": 0.9591766595840454, + "rewards/rejected": -1.415657639503479, + "step": 6681 + }, + { + "epoch": 0.78, + "learning_rate": 6.781622770757057e-08, + "logits/chosen": -2.1352956295013428, + "logits/rejected": -2.089787483215332, + "logps/chosen": -466.389404296875, + "logps/rejected": -463.48272705078125, + "loss": 0.4255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1629711240530014, + "rewards/margins": 2.187103271484375, + "rewards/rejected": -2.350074291229248, + "step": 6682 + }, + { + "epoch": 0.78, + "learning_rate": 6.77807960316523e-08, + "logits/chosen": -2.502530813217163, + "logits/rejected": -2.5864686965942383, + "logps/chosen": -163.0996551513672, + "logps/rejected": -230.57554626464844, + "loss": 0.2212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5594675540924072, + "rewards/margins": 3.2722225189208984, + "rewards/rejected": -3.8316903114318848, + "step": 6683 + }, + { + "epoch": 0.78, + "learning_rate": 6.774536435573402e-08, + "logits/chosen": -2.7706050872802734, + "logits/rejected": -2.7820358276367188, + "logps/chosen": -381.8843078613281, + "logps/rejected": -342.4682922363281, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2220343351364136, + "rewards/margins": 2.320164203643799, + "rewards/rejected": -3.542198419570923, + "step": 6684 + }, + { + "epoch": 0.78, + "learning_rate": 6.770993267981575e-08, + "logits/chosen": -2.5427920818328857, + "logits/rejected": -2.403404951095581, + "logps/chosen": -229.47494506835938, + "logps/rejected": -282.0889892578125, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9079654216766357, + "rewards/margins": 2.157979726791382, + "rewards/rejected": -3.0659451484680176, + "step": 6685 + }, + { + "epoch": 0.78, + "learning_rate": 6.767450100389748e-08, + "logits/chosen": -2.4850566387176514, + "logits/rejected": -2.378840923309326, + "logps/chosen": -236.47410583496094, + "logps/rejected": -327.4798889160156, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0551799535751343, + "rewards/margins": 4.095218181610107, + "rewards/rejected": -5.150398254394531, + "step": 6686 + }, + { + "epoch": 0.78, + "learning_rate": 6.76390693279792e-08, + "logits/chosen": -2.2919516563415527, + "logits/rejected": -2.0618233680725098, + "logps/chosen": -250.63177490234375, + "logps/rejected": -235.53341674804688, + "loss": 1.0357, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7706544399261475, + "rewards/margins": 0.5728428363800049, + "rewards/rejected": -2.3434972763061523, + "step": 6687 + }, + { + "epoch": 0.78, + "learning_rate": 6.760363765206094e-08, + "logits/chosen": -2.4397132396698, + "logits/rejected": -2.470994472503662, + "logps/chosen": -246.7538299560547, + "logps/rejected": -250.6220703125, + "loss": 0.3559, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.259779930114746, + "rewards/margins": 2.5538787841796875, + "rewards/rejected": -3.8136587142944336, + "step": 6688 + }, + { + "epoch": 0.78, + "learning_rate": 6.756820597614267e-08, + "logits/chosen": -2.0749871730804443, + "logits/rejected": -2.177999496459961, + "logps/chosen": -261.2310485839844, + "logps/rejected": -327.1963195800781, + "loss": 0.4146, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0801362991333008, + "rewards/margins": 2.3057026863098145, + "rewards/rejected": -3.385838747024536, + "step": 6689 + }, + { + "epoch": 0.78, + "learning_rate": 6.75327743002244e-08, + "logits/chosen": -2.0340328216552734, + "logits/rejected": -2.05511474609375, + "logps/chosen": -262.8029479980469, + "logps/rejected": -274.64764404296875, + "loss": 0.5585, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9783518314361572, + "rewards/margins": 1.1480052471160889, + "rewards/rejected": -2.126357078552246, + "step": 6690 + }, + { + "epoch": 0.78, + "learning_rate": 6.749734262430613e-08, + "logits/chosen": -2.8588266372680664, + "logits/rejected": -2.785094976425171, + "logps/chosen": -214.4049072265625, + "logps/rejected": -226.07699584960938, + "loss": 0.3339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5854496955871582, + "rewards/margins": 1.4948749542236328, + "rewards/rejected": -2.080324649810791, + "step": 6691 + }, + { + "epoch": 0.78, + "learning_rate": 6.746191094838785e-08, + "logits/chosen": -2.3237717151641846, + "logits/rejected": -2.3030593395233154, + "logps/chosen": -223.0485382080078, + "logps/rejected": -257.0408935546875, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3497526943683624, + "rewards/margins": 3.204129695892334, + "rewards/rejected": -3.553882360458374, + "step": 6692 + }, + { + "epoch": 0.78, + "learning_rate": 6.742647927246959e-08, + "logits/chosen": -2.815141201019287, + "logits/rejected": -2.7521286010742188, + "logps/chosen": -136.90203857421875, + "logps/rejected": -230.65548706054688, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7323436737060547, + "rewards/margins": 2.256695508956909, + "rewards/rejected": -2.989039421081543, + "step": 6693 + }, + { + "epoch": 0.78, + "learning_rate": 6.739104759655131e-08, + "logits/chosen": -2.3654043674468994, + "logits/rejected": -2.1271324157714844, + "logps/chosen": -298.1686706542969, + "logps/rejected": -392.1276550292969, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2456430196762085, + "rewards/margins": 2.0732266902923584, + "rewards/rejected": -3.3188695907592773, + "step": 6694 + }, + { + "epoch": 0.78, + "learning_rate": 6.735561592063305e-08, + "logits/chosen": -2.842491626739502, + "logits/rejected": -2.674410104751587, + "logps/chosen": -173.41233825683594, + "logps/rejected": -288.1602783203125, + "loss": 0.1001, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27922865748405457, + "rewards/margins": 4.741949081420898, + "rewards/rejected": -5.021177768707275, + "step": 6695 + }, + { + "epoch": 0.78, + "learning_rate": 6.732018424471477e-08, + "logits/chosen": -2.092898368835449, + "logits/rejected": -1.9460290670394897, + "logps/chosen": -123.1417007446289, + "logps/rejected": -409.05474853515625, + "loss": 0.9271, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5931000709533691, + "rewards/margins": 1.0334882736206055, + "rewards/rejected": -2.6265883445739746, + "step": 6696 + }, + { + "epoch": 0.78, + "learning_rate": 6.72847525687965e-08, + "logits/chosen": -2.388728380203247, + "logits/rejected": -2.531466007232666, + "logps/chosen": -185.8707733154297, + "logps/rejected": -198.40872192382812, + "loss": 0.6583, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.013964295387268, + "rewards/margins": 0.8661527633666992, + "rewards/rejected": -1.8801169395446777, + "step": 6697 + }, + { + "epoch": 0.78, + "learning_rate": 6.724932089287823e-08, + "logits/chosen": -2.474527597427368, + "logits/rejected": -2.4738364219665527, + "logps/chosen": -516.7196655273438, + "logps/rejected": -367.37628173828125, + "loss": 0.1967, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6396868228912354, + "rewards/margins": 2.2974131107330322, + "rewards/rejected": -2.9370996952056885, + "step": 6698 + }, + { + "epoch": 0.78, + "learning_rate": 6.721388921695996e-08, + "logits/chosen": -1.6957606077194214, + "logits/rejected": -1.4716038703918457, + "logps/chosen": -177.84158325195312, + "logps/rejected": -292.15850830078125, + "loss": 0.2089, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2286207675933838, + "rewards/margins": 2.5625884532928467, + "rewards/rejected": -3.7912087440490723, + "step": 6699 + }, + { + "epoch": 0.78, + "learning_rate": 6.71784575410417e-08, + "logits/chosen": -2.2870731353759766, + "logits/rejected": -2.3567264080047607, + "logps/chosen": -269.99822998046875, + "logps/rejected": -248.5194549560547, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1575675755739212, + "rewards/margins": 2.3558287620544434, + "rewards/rejected": -2.5133962631225586, + "step": 6700 + }, + { + "epoch": 0.78, + "learning_rate": 6.714302586512342e-08, + "logits/chosen": -2.222492218017578, + "logits/rejected": -2.4547367095947266, + "logps/chosen": -284.49078369140625, + "logps/rejected": -326.5968017578125, + "loss": 0.3943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9901636242866516, + "rewards/margins": 2.006624460220337, + "rewards/rejected": -2.996788263320923, + "step": 6701 + }, + { + "epoch": 0.78, + "learning_rate": 6.710759418920514e-08, + "logits/chosen": -2.125469207763672, + "logits/rejected": -1.943100929260254, + "logps/chosen": -284.98944091796875, + "logps/rejected": -355.3919677734375, + "loss": 0.2669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.607336163520813, + "rewards/margins": 1.6198086738586426, + "rewards/rejected": -2.227144718170166, + "step": 6702 + }, + { + "epoch": 0.78, + "learning_rate": 6.707216251328688e-08, + "logits/chosen": -2.67234468460083, + "logits/rejected": -2.604809522628784, + "logps/chosen": -278.9767150878906, + "logps/rejected": -284.6295471191406, + "loss": 0.2739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8595300912857056, + "rewards/margins": 2.480501174926758, + "rewards/rejected": -3.340031623840332, + "step": 6703 + }, + { + "epoch": 0.78, + "learning_rate": 6.70367308373686e-08, + "logits/chosen": -2.54579496383667, + "logits/rejected": -2.729091167449951, + "logps/chosen": -90.99949645996094, + "logps/rejected": -124.462158203125, + "loss": 0.364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.47379645705223083, + "rewards/margins": 1.7775765657424927, + "rewards/rejected": -2.251373052597046, + "step": 6704 + }, + { + "epoch": 0.78, + "learning_rate": 6.700129916145033e-08, + "logits/chosen": -1.987938404083252, + "logits/rejected": -2.2770819664001465, + "logps/chosen": -379.1482238769531, + "logps/rejected": -316.0445861816406, + "loss": 0.4018, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0910412073135376, + "rewards/margins": 2.3174967765808105, + "rewards/rejected": -3.4085376262664795, + "step": 6705 + }, + { + "epoch": 0.78, + "learning_rate": 6.696586748553207e-08, + "logits/chosen": -2.6607789993286133, + "logits/rejected": -2.796546220779419, + "logps/chosen": -283.2216491699219, + "logps/rejected": -510.07379150390625, + "loss": 0.6347, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3854193687438965, + "rewards/margins": 2.472313404083252, + "rewards/rejected": -4.857732772827148, + "step": 6706 + }, + { + "epoch": 0.78, + "learning_rate": 6.693043580961379e-08, + "logits/chosen": -2.7860565185546875, + "logits/rejected": -2.7219154834747314, + "logps/chosen": -302.9002685546875, + "logps/rejected": -207.79586791992188, + "loss": 0.2426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2785945534706116, + "rewards/margins": 2.64985728263855, + "rewards/rejected": -2.9284520149230957, + "step": 6707 + }, + { + "epoch": 0.78, + "learning_rate": 6.689500413369551e-08, + "logits/chosen": -2.933516025543213, + "logits/rejected": -2.913712501525879, + "logps/chosen": -310.345458984375, + "logps/rejected": -244.89085388183594, + "loss": 0.4463, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3036630153656006, + "rewards/margins": 2.6693365573883057, + "rewards/rejected": -3.9729995727539062, + "step": 6708 + }, + { + "epoch": 0.78, + "learning_rate": 6.685957245777725e-08, + "logits/chosen": -2.9953761100769043, + "logits/rejected": -2.976691484451294, + "logps/chosen": -191.99363708496094, + "logps/rejected": -232.1956024169922, + "loss": 0.8601, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2406907081604004, + "rewards/margins": 1.1798081398010254, + "rewards/rejected": -2.420498847961426, + "step": 6709 + }, + { + "epoch": 0.78, + "learning_rate": 6.682414078185898e-08, + "logits/chosen": -2.3260185718536377, + "logits/rejected": -2.6103432178497314, + "logps/chosen": -294.9425048828125, + "logps/rejected": -228.21963500976562, + "loss": 0.2102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7171142101287842, + "rewards/margins": 2.6876914501190186, + "rewards/rejected": -3.4048056602478027, + "step": 6710 + }, + { + "epoch": 0.78, + "learning_rate": 6.678870910594071e-08, + "logits/chosen": -1.967081904411316, + "logits/rejected": -1.7084155082702637, + "logps/chosen": -206.1607666015625, + "logps/rejected": -359.88720703125, + "loss": 0.2116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1750939041376114, + "rewards/margins": 2.833411693572998, + "rewards/rejected": -3.0085058212280273, + "step": 6711 + }, + { + "epoch": 0.78, + "learning_rate": 6.675327743002244e-08, + "logits/chosen": -2.3091394901275635, + "logits/rejected": -2.2618730068206787, + "logps/chosen": -222.5257568359375, + "logps/rejected": -231.15777587890625, + "loss": 0.3866, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42745906114578247, + "rewards/margins": 2.213589668273926, + "rewards/rejected": -2.6410486698150635, + "step": 6712 + }, + { + "epoch": 0.78, + "learning_rate": 6.671784575410416e-08, + "logits/chosen": -2.0033977031707764, + "logits/rejected": -2.1826815605163574, + "logps/chosen": -254.77870178222656, + "logps/rejected": -271.9248046875, + "loss": 0.1936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8561174273490906, + "rewards/margins": 2.539294958114624, + "rewards/rejected": -3.3954124450683594, + "step": 6713 + }, + { + "epoch": 0.78, + "learning_rate": 6.668241407818589e-08, + "logits/chosen": -1.9096633195877075, + "logits/rejected": -2.057343006134033, + "logps/chosen": -340.69976806640625, + "logps/rejected": -294.840576171875, + "loss": 0.2965, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2418824434280396, + "rewards/margins": 1.7994009256362915, + "rewards/rejected": -3.041283369064331, + "step": 6714 + }, + { + "epoch": 0.78, + "learning_rate": 6.664698240226762e-08, + "logits/chosen": -2.017667531967163, + "logits/rejected": -2.1800107955932617, + "logps/chosen": -573.7841796875, + "logps/rejected": -382.7703857421875, + "loss": 1.0998, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8655719757080078, + "rewards/margins": 0.43704816699028015, + "rewards/rejected": -2.3026201725006104, + "step": 6715 + }, + { + "epoch": 0.78, + "learning_rate": 6.661155072634936e-08, + "logits/chosen": -2.0943005084991455, + "logits/rejected": -1.7638336420059204, + "logps/chosen": -193.97933959960938, + "logps/rejected": -364.96185302734375, + "loss": 0.1382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7888104915618896, + "rewards/margins": 2.637986660003662, + "rewards/rejected": -3.4267971515655518, + "step": 6716 + }, + { + "epoch": 0.78, + "learning_rate": 6.657611905043108e-08, + "logits/chosen": -2.4395244121551514, + "logits/rejected": -2.6528139114379883, + "logps/chosen": -236.4577178955078, + "logps/rejected": -333.0775451660156, + "loss": 0.3902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4057982861995697, + "rewards/margins": 1.909897804260254, + "rewards/rejected": -2.3156960010528564, + "step": 6717 + }, + { + "epoch": 0.78, + "learning_rate": 6.654068737451281e-08, + "logits/chosen": -1.967890977859497, + "logits/rejected": -1.7734661102294922, + "logps/chosen": -335.6396484375, + "logps/rejected": -358.78277587890625, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3551292419433594, + "rewards/margins": 2.6152760982513428, + "rewards/rejected": -2.970405340194702, + "step": 6718 + }, + { + "epoch": 0.78, + "learning_rate": 6.650525569859454e-08, + "logits/chosen": -2.302067756652832, + "logits/rejected": -2.425612211227417, + "logps/chosen": -119.6279296875, + "logps/rejected": -200.06536865234375, + "loss": 0.6904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.73972487449646, + "rewards/margins": 0.6601678133010864, + "rewards/rejected": -2.399892568588257, + "step": 6719 + }, + { + "epoch": 0.78, + "learning_rate": 6.646982402267627e-08, + "logits/chosen": -2.418684959411621, + "logits/rejected": -2.5878515243530273, + "logps/chosen": -435.0817565917969, + "logps/rejected": -296.0639343261719, + "loss": 0.2157, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7551548480987549, + "rewards/margins": 3.0221855640411377, + "rewards/rejected": -3.7773404121398926, + "step": 6720 + }, + { + "epoch": 0.78, + "learning_rate": 6.6434392346758e-08, + "logits/chosen": -2.2393128871917725, + "logits/rejected": -2.2634389400482178, + "logps/chosen": -256.0483703613281, + "logps/rejected": -178.4649658203125, + "loss": 0.4936, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.778351902961731, + "rewards/margins": 2.2833328247070312, + "rewards/rejected": -3.0616846084594727, + "step": 6721 + }, + { + "epoch": 0.78, + "learning_rate": 6.639896067083973e-08, + "logits/chosen": -1.8561320304870605, + "logits/rejected": -2.142460584640503, + "logps/chosen": -320.501708984375, + "logps/rejected": -363.2616882324219, + "loss": 0.1555, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3523080050945282, + "rewards/margins": 3.58445143699646, + "rewards/rejected": -3.9367592334747314, + "step": 6722 + }, + { + "epoch": 0.78, + "learning_rate": 6.636352899492145e-08, + "logits/chosen": -1.8482023477554321, + "logits/rejected": -2.105743646621704, + "logps/chosen": -364.515869140625, + "logps/rejected": -323.3650817871094, + "loss": 0.3709, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04167512059211731, + "rewards/margins": 2.946732997894287, + "rewards/rejected": -2.988408088684082, + "step": 6723 + }, + { + "epoch": 0.78, + "learning_rate": 6.632809731900319e-08, + "logits/chosen": -2.152111053466797, + "logits/rejected": -2.317734718322754, + "logps/chosen": -440.8103332519531, + "logps/rejected": -370.03790283203125, + "loss": 0.9016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8296585083007812, + "rewards/margins": 0.4981834888458252, + "rewards/rejected": -1.3278419971466064, + "step": 6724 + }, + { + "epoch": 0.78, + "learning_rate": 6.629266564308491e-08, + "logits/chosen": -2.3289875984191895, + "logits/rejected": -2.611556053161621, + "logps/chosen": -279.58648681640625, + "logps/rejected": -255.20599365234375, + "loss": 0.2886, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6124484539031982, + "rewards/margins": 2.6644556522369385, + "rewards/rejected": -4.276904106140137, + "step": 6725 + }, + { + "epoch": 0.78, + "learning_rate": 6.625723396716664e-08, + "logits/chosen": -1.9364712238311768, + "logits/rejected": -2.001591682434082, + "logps/chosen": -332.02203369140625, + "logps/rejected": -363.69232177734375, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47412994503974915, + "rewards/margins": 1.4747166633605957, + "rewards/rejected": -1.9488468170166016, + "step": 6726 + }, + { + "epoch": 0.78, + "learning_rate": 6.622180229124838e-08, + "logits/chosen": -2.468430995941162, + "logits/rejected": -2.4510247707366943, + "logps/chosen": -196.44134521484375, + "logps/rejected": -186.54351806640625, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9392768144607544, + "rewards/margins": 2.258169651031494, + "rewards/rejected": -3.197446346282959, + "step": 6727 + }, + { + "epoch": 0.78, + "learning_rate": 6.61863706153301e-08, + "logits/chosen": -2.9351646900177, + "logits/rejected": -2.616594076156616, + "logps/chosen": -193.66583251953125, + "logps/rejected": -276.6463317871094, + "loss": 0.2623, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6879783868789673, + "rewards/margins": 2.3686537742614746, + "rewards/rejected": -3.0566320419311523, + "step": 6728 + }, + { + "epoch": 0.78, + "learning_rate": 6.615093893941184e-08, + "logits/chosen": -1.8479408025741577, + "logits/rejected": -2.0661375522613525, + "logps/chosen": -340.1581726074219, + "logps/rejected": -311.9808654785156, + "loss": 0.6654, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.392877221107483, + "rewards/margins": 2.0435659885406494, + "rewards/rejected": -3.436443328857422, + "step": 6729 + }, + { + "epoch": 0.78, + "learning_rate": 6.611550726349356e-08, + "logits/chosen": -2.0385022163391113, + "logits/rejected": -1.8074243068695068, + "logps/chosen": -309.35235595703125, + "logps/rejected": -307.7257995605469, + "loss": 1.0232, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.6712918281555176, + "rewards/margins": -0.20161934196949005, + "rewards/rejected": -1.469672441482544, + "step": 6730 + }, + { + "epoch": 0.78, + "learning_rate": 6.608007558757528e-08, + "logits/chosen": -1.737768530845642, + "logits/rejected": -1.9827021360397339, + "logps/chosen": -512.064697265625, + "logps/rejected": -301.81939697265625, + "loss": 1.0681, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2946124076843262, + "rewards/margins": 0.734893798828125, + "rewards/rejected": -2.029506206512451, + "step": 6731 + }, + { + "epoch": 0.78, + "learning_rate": 6.604464391165702e-08, + "logits/chosen": -2.710054874420166, + "logits/rejected": -2.653728485107422, + "logps/chosen": -243.56912231445312, + "logps/rejected": -320.6463928222656, + "loss": 0.2123, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0831300020217896, + "rewards/margins": 3.838249683380127, + "rewards/rejected": -4.921379566192627, + "step": 6732 + }, + { + "epoch": 0.78, + "learning_rate": 6.600921223573875e-08, + "logits/chosen": -2.6883115768432617, + "logits/rejected": -2.9124701023101807, + "logps/chosen": -321.918212890625, + "logps/rejected": -278.0485534667969, + "loss": 0.2667, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.841445803642273, + "rewards/margins": 3.0533792972564697, + "rewards/rejected": -3.894824981689453, + "step": 6733 + }, + { + "epoch": 0.78, + "learning_rate": 6.597378055982047e-08, + "logits/chosen": -2.188117027282715, + "logits/rejected": -2.0665595531463623, + "logps/chosen": -321.9722900390625, + "logps/rejected": -375.6776123046875, + "loss": 0.5807, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1069436073303223, + "rewards/margins": 1.1814830303192139, + "rewards/rejected": -2.288426399230957, + "step": 6734 + }, + { + "epoch": 0.78, + "learning_rate": 6.593834888390221e-08, + "logits/chosen": -2.5359866619110107, + "logits/rejected": -2.3362865447998047, + "logps/chosen": -182.16632080078125, + "logps/rejected": -289.39312744140625, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5255894660949707, + "rewards/margins": 3.344526529312134, + "rewards/rejected": -3.8701162338256836, + "step": 6735 + }, + { + "epoch": 0.78, + "learning_rate": 6.590291720798393e-08, + "logits/chosen": -2.0730690956115723, + "logits/rejected": -2.156136989593506, + "logps/chosen": -183.74127197265625, + "logps/rejected": -231.93020629882812, + "loss": 0.6229, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4540839195251465, + "rewards/margins": 1.9810066223144531, + "rewards/rejected": -2.4350903034210205, + "step": 6736 + }, + { + "epoch": 0.78, + "learning_rate": 6.586748553206567e-08, + "logits/chosen": -2.4043474197387695, + "logits/rejected": -2.217194080352783, + "logps/chosen": -141.60353088378906, + "logps/rejected": -209.29141235351562, + "loss": 0.4534, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6602283716201782, + "rewards/margins": 2.5609889030456543, + "rewards/rejected": -3.221217155456543, + "step": 6737 + }, + { + "epoch": 0.78, + "learning_rate": 6.583205385614739e-08, + "logits/chosen": -2.0044758319854736, + "logits/rejected": -1.8788104057312012, + "logps/chosen": -303.8839111328125, + "logps/rejected": -352.0296630859375, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7515566945075989, + "rewards/margins": 1.7527194023132324, + "rewards/rejected": -2.5042760372161865, + "step": 6738 + }, + { + "epoch": 0.78, + "learning_rate": 6.579662218022912e-08, + "logits/chosen": -2.1231765747070312, + "logits/rejected": -2.0586905479431152, + "logps/chosen": -345.1786804199219, + "logps/rejected": -350.0381774902344, + "loss": 1.0571, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7826244831085205, + "rewards/margins": 1.353762149810791, + "rewards/rejected": -3.1363868713378906, + "step": 6739 + }, + { + "epoch": 0.78, + "learning_rate": 6.576119050431085e-08, + "logits/chosen": -2.665330171585083, + "logits/rejected": -2.7096450328826904, + "logps/chosen": -362.310302734375, + "logps/rejected": -286.77593994140625, + "loss": 0.4386, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5667153000831604, + "rewards/margins": 1.836995005607605, + "rewards/rejected": -2.4037106037139893, + "step": 6740 + }, + { + "epoch": 0.78, + "learning_rate": 6.572575882839258e-08, + "logits/chosen": -2.1973042488098145, + "logits/rejected": -2.223388195037842, + "logps/chosen": -212.58102416992188, + "logps/rejected": -277.4917297363281, + "loss": 0.3324, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32471635937690735, + "rewards/margins": 2.4254143238067627, + "rewards/rejected": -2.7501308917999268, + "step": 6741 + }, + { + "epoch": 0.78, + "learning_rate": 6.56903271524743e-08, + "logits/chosen": -2.749851703643799, + "logits/rejected": -2.685868978500366, + "logps/chosen": -195.6599884033203, + "logps/rejected": -191.56588745117188, + "loss": 0.4973, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.046813726425171, + "rewards/margins": 2.3936359882354736, + "rewards/rejected": -3.4404499530792236, + "step": 6742 + }, + { + "epoch": 0.78, + "learning_rate": 6.565489547655604e-08, + "logits/chosen": -2.497851610183716, + "logits/rejected": -2.4638240337371826, + "logps/chosen": -230.641357421875, + "logps/rejected": -258.274169921875, + "loss": 0.4202, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6937828063964844, + "rewards/margins": 1.394040822982788, + "rewards/rejected": -2.0878236293792725, + "step": 6743 + }, + { + "epoch": 0.78, + "learning_rate": 6.561946380063778e-08, + "logits/chosen": -1.594895839691162, + "logits/rejected": -2.0362868309020996, + "logps/chosen": -444.515625, + "logps/rejected": -308.9933166503906, + "loss": 0.3211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41105130314826965, + "rewards/margins": 2.552729845046997, + "rewards/rejected": -2.9637811183929443, + "step": 6744 + }, + { + "epoch": 0.78, + "learning_rate": 6.55840321247195e-08, + "logits/chosen": -2.069666624069214, + "logits/rejected": -2.523656129837036, + "logps/chosen": -236.4113006591797, + "logps/rejected": -259.1937255859375, + "loss": 1.0425, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.043017029762268, + "rewards/margins": 1.61940598487854, + "rewards/rejected": -2.6624231338500977, + "step": 6745 + }, + { + "epoch": 0.78, + "learning_rate": 6.554860044880122e-08, + "logits/chosen": -2.1396474838256836, + "logits/rejected": -2.4101409912109375, + "logps/chosen": -258.7208251953125, + "logps/rejected": -254.325927734375, + "loss": 0.5451, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3880400657653809, + "rewards/margins": 1.569046974182129, + "rewards/rejected": -2.9570870399475098, + "step": 6746 + }, + { + "epoch": 0.78, + "learning_rate": 6.551316877288295e-08, + "logits/chosen": -2.376088857650757, + "logits/rejected": -2.5357284545898438, + "logps/chosen": -126.98919677734375, + "logps/rejected": -200.66676330566406, + "loss": 1.4096, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6329371929168701, + "rewards/margins": 1.0077791213989258, + "rewards/rejected": -2.640716552734375, + "step": 6747 + }, + { + "epoch": 0.78, + "learning_rate": 6.547773709696468e-08, + "logits/chosen": -1.579930305480957, + "logits/rejected": -1.797488808631897, + "logps/chosen": -420.4679260253906, + "logps/rejected": -288.0094299316406, + "loss": 1.2377, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.564877986907959, + "rewards/margins": 0.03185093402862549, + "rewards/rejected": -1.5967289209365845, + "step": 6748 + }, + { + "epoch": 0.79, + "learning_rate": 6.544230542104641e-08, + "logits/chosen": -2.333695411682129, + "logits/rejected": -2.538177490234375, + "logps/chosen": -253.01019287109375, + "logps/rejected": -283.5047302246094, + "loss": 0.1498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6573278903961182, + "rewards/margins": 3.010467529296875, + "rewards/rejected": -3.667795419692993, + "step": 6749 + }, + { + "epoch": 0.79, + "learning_rate": 6.540687374512815e-08, + "logits/chosen": -2.2079274654388428, + "logits/rejected": -2.2318241596221924, + "logps/chosen": -363.25457763671875, + "logps/rejected": -224.43641662597656, + "loss": 0.5239, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2910170555114746, + "rewards/margins": 2.611124038696289, + "rewards/rejected": -3.9021408557891846, + "step": 6750 + }, + { + "epoch": 0.79, + "learning_rate": 6.537144206920987e-08, + "logits/chosen": -2.1962552070617676, + "logits/rejected": -2.102534532546997, + "logps/chosen": -308.8802795410156, + "logps/rejected": -392.2976379394531, + "loss": 0.4778, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10515855252742767, + "rewards/margins": 2.554295778274536, + "rewards/rejected": -2.4491372108459473, + "step": 6751 + }, + { + "epoch": 0.79, + "learning_rate": 6.533601039329159e-08, + "logits/chosen": -2.3547756671905518, + "logits/rejected": -2.259152889251709, + "logps/chosen": -209.7338409423828, + "logps/rejected": -281.8211364746094, + "loss": 0.4909, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0186517238616943, + "rewards/margins": 1.631777048110962, + "rewards/rejected": -2.6504287719726562, + "step": 6752 + }, + { + "epoch": 0.79, + "learning_rate": 6.530057871737333e-08, + "logits/chosen": -2.0333974361419678, + "logits/rejected": -1.9305593967437744, + "logps/chosen": -406.3330078125, + "logps/rejected": -414.7881774902344, + "loss": 0.466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9523491859436035, + "rewards/margins": 1.9146692752838135, + "rewards/rejected": -2.867018699645996, + "step": 6753 + }, + { + "epoch": 0.79, + "learning_rate": 6.526514704145506e-08, + "logits/chosen": -1.991227626800537, + "logits/rejected": -1.8974330425262451, + "logps/chosen": -149.16363525390625, + "logps/rejected": -166.6828155517578, + "loss": 1.0408, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3483424186706543, + "rewards/margins": -0.11009010672569275, + "rewards/rejected": -2.2382521629333496, + "step": 6754 + }, + { + "epoch": 0.79, + "learning_rate": 6.522971536553678e-08, + "logits/chosen": -2.4581847190856934, + "logits/rejected": -2.5660510063171387, + "logps/chosen": -213.33958435058594, + "logps/rejected": -422.2435607910156, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3090536594390869, + "rewards/margins": 4.743819713592529, + "rewards/rejected": -5.052873134613037, + "step": 6755 + }, + { + "epoch": 0.79, + "learning_rate": 6.519428368961852e-08, + "logits/chosen": -2.933178186416626, + "logits/rejected": -2.9832205772399902, + "logps/chosen": -290.4124755859375, + "logps/rejected": -283.08917236328125, + "loss": 0.1929, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5617907643318176, + "rewards/margins": 4.2436323165893555, + "rewards/rejected": -4.805422782897949, + "step": 6756 + }, + { + "epoch": 0.79, + "learning_rate": 6.515885201370024e-08, + "logits/chosen": -2.020195245742798, + "logits/rejected": -2.2006397247314453, + "logps/chosen": -379.0953369140625, + "logps/rejected": -355.92523193359375, + "loss": 0.4079, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13003234565258026, + "rewards/margins": 2.3635714054107666, + "rewards/rejected": -2.493603467941284, + "step": 6757 + }, + { + "epoch": 0.79, + "learning_rate": 6.512342033778196e-08, + "logits/chosen": -2.332660436630249, + "logits/rejected": -2.279651403427124, + "logps/chosen": -245.4786834716797, + "logps/rejected": -274.9839782714844, + "loss": 0.4827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4531765282154083, + "rewards/margins": 2.3407700061798096, + "rewards/rejected": -2.7939465045928955, + "step": 6758 + }, + { + "epoch": 0.79, + "learning_rate": 6.50879886618637e-08, + "logits/chosen": -2.2957258224487305, + "logits/rejected": -2.3555660247802734, + "logps/chosen": -258.70166015625, + "logps/rejected": -413.1977233886719, + "loss": 0.1403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4180722236633301, + "rewards/margins": 12.504809379577637, + "rewards/rejected": -12.922881126403809, + "step": 6759 + }, + { + "epoch": 0.79, + "learning_rate": 6.505255698594544e-08, + "logits/chosen": -2.1647658348083496, + "logits/rejected": -2.176680326461792, + "logps/chosen": -403.1396789550781, + "logps/rejected": -441.18951416015625, + "loss": 0.0772, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0489552021026611, + "rewards/margins": 4.99904727935791, + "rewards/rejected": -6.048002243041992, + "step": 6760 + }, + { + "epoch": 0.79, + "learning_rate": 6.501712531002717e-08, + "logits/chosen": -2.440051555633545, + "logits/rejected": -2.5826523303985596, + "logps/chosen": -241.02601623535156, + "logps/rejected": -267.2476501464844, + "loss": 0.4929, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9369781017303467, + "rewards/margins": 1.6223808526992798, + "rewards/rejected": -2.559359073638916, + "step": 6761 + }, + { + "epoch": 0.79, + "learning_rate": 6.498169363410889e-08, + "logits/chosen": -2.602499485015869, + "logits/rejected": -2.636160135269165, + "logps/chosen": -446.03240966796875, + "logps/rejected": -333.2652587890625, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2011590003967285, + "rewards/margins": 2.303955078125, + "rewards/rejected": -3.5051138401031494, + "step": 6762 + }, + { + "epoch": 0.79, + "learning_rate": 6.494626195819061e-08, + "logits/chosen": -1.7616400718688965, + "logits/rejected": -1.9467805624008179, + "logps/chosen": -374.210205078125, + "logps/rejected": -429.8559875488281, + "loss": 0.3652, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7973393201828003, + "rewards/margins": 2.6581778526306152, + "rewards/rejected": -4.455517292022705, + "step": 6763 + }, + { + "epoch": 0.79, + "learning_rate": 6.491083028227235e-08, + "logits/chosen": -2.1555492877960205, + "logits/rejected": -2.1077823638916016, + "logps/chosen": -313.660888671875, + "logps/rejected": -344.238525390625, + "loss": 0.5436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17069600522518158, + "rewards/margins": 1.8394380807876587, + "rewards/rejected": -2.010133981704712, + "step": 6764 + }, + { + "epoch": 0.79, + "learning_rate": 6.487539860635407e-08, + "logits/chosen": -1.8631126880645752, + "logits/rejected": -1.885514259338379, + "logps/chosen": -704.1841430664062, + "logps/rejected": -585.5087890625, + "loss": 0.6706, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.569437861442566, + "rewards/margins": 1.4699018001556396, + "rewards/rejected": -3.039339780807495, + "step": 6765 + }, + { + "epoch": 0.79, + "learning_rate": 6.483996693043581e-08, + "logits/chosen": -2.69454026222229, + "logits/rejected": -2.587406635284424, + "logps/chosen": -269.1883544921875, + "logps/rejected": -232.35519409179688, + "loss": 0.2584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7365078926086426, + "rewards/margins": 2.3860931396484375, + "rewards/rejected": -3.122601270675659, + "step": 6766 + }, + { + "epoch": 0.79, + "learning_rate": 6.480453525451754e-08, + "logits/chosen": -2.0686826705932617, + "logits/rejected": -1.9851720333099365, + "logps/chosen": -257.35552978515625, + "logps/rejected": -403.6017761230469, + "loss": 0.6672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.689854621887207, + "rewards/margins": 1.8388233184814453, + "rewards/rejected": -2.5286779403686523, + "step": 6767 + }, + { + "epoch": 0.79, + "learning_rate": 6.476910357859927e-08, + "logits/chosen": -2.481163740158081, + "logits/rejected": -2.213897228240967, + "logps/chosen": -101.82785034179688, + "logps/rejected": -144.59585571289062, + "loss": 0.3622, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19704991579055786, + "rewards/margins": 2.106851100921631, + "rewards/rejected": -2.303901195526123, + "step": 6768 + }, + { + "epoch": 0.79, + "learning_rate": 6.473367190268099e-08, + "logits/chosen": -2.824014186859131, + "logits/rejected": -2.9293911457061768, + "logps/chosen": -348.09173583984375, + "logps/rejected": -270.203369140625, + "loss": 0.2389, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1321749687194824, + "rewards/margins": 2.141580104827881, + "rewards/rejected": -3.2737550735473633, + "step": 6769 + }, + { + "epoch": 0.79, + "learning_rate": 6.469824022676272e-08, + "logits/chosen": -1.855371117591858, + "logits/rejected": -1.72642183303833, + "logps/chosen": -246.8117218017578, + "logps/rejected": -232.2223358154297, + "loss": 0.5009, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1168028116226196, + "rewards/margins": 1.14151132106781, + "rewards/rejected": -2.2583141326904297, + "step": 6770 + }, + { + "epoch": 0.79, + "learning_rate": 6.466280855084446e-08, + "logits/chosen": -2.3918545246124268, + "logits/rejected": -2.523070812225342, + "logps/chosen": -404.99725341796875, + "logps/rejected": -282.2174377441406, + "loss": 0.4269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5646174550056458, + "rewards/margins": 1.5465744733810425, + "rewards/rejected": -2.111191987991333, + "step": 6771 + }, + { + "epoch": 0.79, + "learning_rate": 6.462737687492618e-08, + "logits/chosen": -2.271946430206299, + "logits/rejected": -2.6504709720611572, + "logps/chosen": -324.5555419921875, + "logps/rejected": -150.41871643066406, + "loss": 0.1563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07543468475341797, + "rewards/margins": 2.293001890182495, + "rewards/rejected": -2.368436813354492, + "step": 6772 + }, + { + "epoch": 0.79, + "learning_rate": 6.459194519900792e-08, + "logits/chosen": -1.6467218399047852, + "logits/rejected": -2.00667142868042, + "logps/chosen": -335.9797668457031, + "logps/rejected": -309.2889099121094, + "loss": 0.7189, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9041330814361572, + "rewards/margins": 1.2284350395202637, + "rewards/rejected": -3.132567882537842, + "step": 6773 + }, + { + "epoch": 0.79, + "learning_rate": 6.455651352308964e-08, + "logits/chosen": -2.5751824378967285, + "logits/rejected": -2.3824427127838135, + "logps/chosen": -336.43975830078125, + "logps/rejected": -329.6060791015625, + "loss": 0.6016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7487530708312988, + "rewards/margins": 1.679397463798523, + "rewards/rejected": -2.4281504154205322, + "step": 6774 + }, + { + "epoch": 0.79, + "learning_rate": 6.452108184717136e-08, + "logits/chosen": -2.3123629093170166, + "logits/rejected": -2.4909796714782715, + "logps/chosen": -276.26409912109375, + "logps/rejected": -217.53536987304688, + "loss": 0.4042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.295357584953308, + "rewards/margins": 1.583902359008789, + "rewards/rejected": -2.8792598247528076, + "step": 6775 + }, + { + "epoch": 0.79, + "learning_rate": 6.44856501712531e-08, + "logits/chosen": -2.4778575897216797, + "logits/rejected": -2.470651865005493, + "logps/chosen": -296.90399169921875, + "logps/rejected": -416.0457458496094, + "loss": 0.4017, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5988209247589111, + "rewards/margins": 2.1861300468444824, + "rewards/rejected": -2.7849507331848145, + "step": 6776 + }, + { + "epoch": 0.79, + "learning_rate": 6.445021849533483e-08, + "logits/chosen": -2.9068360328674316, + "logits/rejected": -2.645242214202881, + "logps/chosen": -159.5697784423828, + "logps/rejected": -189.36062622070312, + "loss": 0.4952, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9467595815658569, + "rewards/margins": 1.4570311307907104, + "rewards/rejected": -2.4037909507751465, + "step": 6777 + }, + { + "epoch": 0.79, + "learning_rate": 6.441478681941655e-08, + "logits/chosen": -1.7599766254425049, + "logits/rejected": -2.1193909645080566, + "logps/chosen": -480.92724609375, + "logps/rejected": -513.0261840820312, + "loss": 0.3123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.893574595451355, + "rewards/margins": 3.4860892295837402, + "rewards/rejected": -4.379663944244385, + "step": 6778 + }, + { + "epoch": 0.79, + "learning_rate": 6.437935514349829e-08, + "logits/chosen": -2.5956902503967285, + "logits/rejected": -2.714261054992676, + "logps/chosen": -316.0840759277344, + "logps/rejected": -274.8338623046875, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.717914879322052, + "rewards/margins": 2.4928011894226074, + "rewards/rejected": -3.2107162475585938, + "step": 6779 + }, + { + "epoch": 0.79, + "learning_rate": 6.434392346758001e-08, + "logits/chosen": -2.7722506523132324, + "logits/rejected": -2.7597031593322754, + "logps/chosen": -253.97442626953125, + "logps/rejected": -227.812744140625, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3998329639434814, + "rewards/margins": 2.868847370147705, + "rewards/rejected": -4.268680095672607, + "step": 6780 + }, + { + "epoch": 0.79, + "learning_rate": 6.430849179166175e-08, + "logits/chosen": -2.7589733600616455, + "logits/rejected": -2.6167430877685547, + "logps/chosen": -247.87600708007812, + "logps/rejected": -302.966552734375, + "loss": 0.3008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2275225818157196, + "rewards/margins": 1.8940273523330688, + "rewards/rejected": -2.1215500831604004, + "step": 6781 + }, + { + "epoch": 0.79, + "learning_rate": 6.427306011574348e-08, + "logits/chosen": -2.5680832862854004, + "logits/rejected": -2.6700496673583984, + "logps/chosen": -269.5735168457031, + "logps/rejected": -270.3863830566406, + "loss": 0.3527, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4740527868270874, + "rewards/margins": 3.1395316123962402, + "rewards/rejected": -4.613584518432617, + "step": 6782 + }, + { + "epoch": 0.79, + "learning_rate": 6.42376284398252e-08, + "logits/chosen": -2.1120963096618652, + "logits/rejected": -2.222761631011963, + "logps/chosen": -323.16033935546875, + "logps/rejected": -225.1178436279297, + "loss": 0.4928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8396785259246826, + "rewards/margins": 1.0267994403839111, + "rewards/rejected": -1.8664780855178833, + "step": 6783 + }, + { + "epoch": 0.79, + "learning_rate": 6.420219676390692e-08, + "logits/chosen": -2.378286600112915, + "logits/rejected": -2.6183524131774902, + "logps/chosen": -206.07606506347656, + "logps/rejected": -240.38479614257812, + "loss": 0.5128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5946887731552124, + "rewards/margins": 3.5106632709503174, + "rewards/rejected": -4.10535192489624, + "step": 6784 + }, + { + "epoch": 0.79, + "learning_rate": 6.416676508798866e-08, + "logits/chosen": -1.9542427062988281, + "logits/rejected": -1.672656536102295, + "logps/chosen": -368.3193664550781, + "logps/rejected": -360.6171875, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0096635818481445, + "rewards/margins": 0.9955824017524719, + "rewards/rejected": -2.0052459239959717, + "step": 6785 + }, + { + "epoch": 0.79, + "learning_rate": 6.413133341207038e-08, + "logits/chosen": -1.9322152137756348, + "logits/rejected": -1.8752424716949463, + "logps/chosen": -147.2918701171875, + "logps/rejected": -238.3200225830078, + "loss": 0.8416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6621087789535522, + "rewards/margins": 2.421107769012451, + "rewards/rejected": -3.0832161903381348, + "step": 6786 + }, + { + "epoch": 0.79, + "learning_rate": 6.409590173615212e-08, + "logits/chosen": -2.230856418609619, + "logits/rejected": -2.345964193344116, + "logps/chosen": -236.75242614746094, + "logps/rejected": -246.5378875732422, + "loss": 0.8337, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4754624366760254, + "rewards/margins": 0.5911222696304321, + "rewards/rejected": -2.066584825515747, + "step": 6787 + }, + { + "epoch": 0.79, + "learning_rate": 6.406047006023385e-08, + "logits/chosen": -2.1282143592834473, + "logits/rejected": -1.9664204120635986, + "logps/chosen": -406.5665283203125, + "logps/rejected": -376.43115234375, + "loss": 0.4449, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2480473518371582, + "rewards/margins": 0.8421512842178345, + "rewards/rejected": -2.090198516845703, + "step": 6788 + }, + { + "epoch": 0.79, + "learning_rate": 6.402503838431558e-08, + "logits/chosen": -2.765913963317871, + "logits/rejected": -2.871213912963867, + "logps/chosen": -158.09007263183594, + "logps/rejected": -188.76287841796875, + "loss": 0.3622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.694503903388977, + "rewards/margins": 2.385610818862915, + "rewards/rejected": -3.0801146030426025, + "step": 6789 + }, + { + "epoch": 0.79, + "learning_rate": 6.39896067083973e-08, + "logits/chosen": -1.8214585781097412, + "logits/rejected": -1.9915573596954346, + "logps/chosen": -603.442626953125, + "logps/rejected": -488.077880859375, + "loss": 0.4965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3335566520690918, + "rewards/margins": 0.9505004286766052, + "rewards/rejected": -1.2840571403503418, + "step": 6790 + }, + { + "epoch": 0.79, + "learning_rate": 6.395417503247903e-08, + "logits/chosen": -2.566884994506836, + "logits/rejected": -2.858926296234131, + "logps/chosen": -216.67279052734375, + "logps/rejected": -153.56576538085938, + "loss": 1.8635, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.2564234733581543, + "rewards/margins": 0.19750335812568665, + "rewards/rejected": -3.4539268016815186, + "step": 6791 + }, + { + "epoch": 0.79, + "learning_rate": 6.391874335656077e-08, + "logits/chosen": -2.8240928649902344, + "logits/rejected": -2.744661808013916, + "logps/chosen": -276.2595520019531, + "logps/rejected": -210.21902465820312, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.290804386138916, + "rewards/margins": 2.221125841140747, + "rewards/rejected": -4.511929988861084, + "step": 6792 + }, + { + "epoch": 0.79, + "learning_rate": 6.388331168064249e-08, + "logits/chosen": -2.2747035026550293, + "logits/rejected": -2.4926605224609375, + "logps/chosen": -425.50567626953125, + "logps/rejected": -408.43560791015625, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09721925854682922, + "rewards/margins": 3.4144649505615234, + "rewards/rejected": -3.3172454833984375, + "step": 6793 + }, + { + "epoch": 0.79, + "learning_rate": 6.384788000472423e-08, + "logits/chosen": -2.4956142902374268, + "logits/rejected": -2.4524619579315186, + "logps/chosen": -281.15789794921875, + "logps/rejected": -392.2569274902344, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0813252180814743, + "rewards/margins": 2.829874277114868, + "rewards/rejected": -2.748548984527588, + "step": 6794 + }, + { + "epoch": 0.79, + "learning_rate": 6.381244832880595e-08, + "logits/chosen": -1.7882614135742188, + "logits/rejected": -1.9740612506866455, + "logps/chosen": -252.18954467773438, + "logps/rejected": -240.942626953125, + "loss": 0.3267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7134333848953247, + "rewards/margins": 2.48065447807312, + "rewards/rejected": -3.1940877437591553, + "step": 6795 + }, + { + "epoch": 0.79, + "learning_rate": 6.377701665288767e-08, + "logits/chosen": -2.222358226776123, + "logits/rejected": -2.0604336261749268, + "logps/chosen": -347.1020812988281, + "logps/rejected": -296.7547302246094, + "loss": 0.3148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9110134243965149, + "rewards/margins": 2.1604127883911133, + "rewards/rejected": -3.0714261531829834, + "step": 6796 + }, + { + "epoch": 0.79, + "learning_rate": 6.37415849769694e-08, + "logits/chosen": -2.1205124855041504, + "logits/rejected": -2.520047664642334, + "logps/chosen": -460.56463623046875, + "logps/rejected": -215.44024658203125, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7175145149230957, + "rewards/margins": 1.0822789669036865, + "rewards/rejected": -1.7997934818267822, + "step": 6797 + }, + { + "epoch": 0.79, + "learning_rate": 6.370615330105114e-08, + "logits/chosen": -1.941310167312622, + "logits/rejected": -1.7959825992584229, + "logps/chosen": -172.890869140625, + "logps/rejected": -218.49435424804688, + "loss": 0.3668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0585466623306274, + "rewards/margins": 2.230484962463379, + "rewards/rejected": -3.289031982421875, + "step": 6798 + }, + { + "epoch": 0.79, + "learning_rate": 6.367072162513288e-08, + "logits/chosen": -2.1557884216308594, + "logits/rejected": -2.217745780944824, + "logps/chosen": -265.44488525390625, + "logps/rejected": -317.3211669921875, + "loss": 0.6724, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.370278239250183, + "rewards/margins": 1.152424693107605, + "rewards/rejected": -2.522702932357788, + "step": 6799 + }, + { + "epoch": 0.79, + "learning_rate": 6.36352899492146e-08, + "logits/chosen": -2.3434767723083496, + "logits/rejected": -2.5845749378204346, + "logps/chosen": -189.81007385253906, + "logps/rejected": -188.2084503173828, + "loss": 0.2135, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13010673224925995, + "rewards/margins": 2.838980197906494, + "rewards/rejected": -2.9690871238708496, + "step": 6800 + }, + { + "epoch": 0.79, + "learning_rate": 6.359985827329632e-08, + "logits/chosen": -2.397975444793701, + "logits/rejected": -2.4049019813537598, + "logps/chosen": -312.78070068359375, + "logps/rejected": -349.8260498046875, + "loss": 0.143, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21491481363773346, + "rewards/margins": 3.8566417694091797, + "rewards/rejected": -3.6417269706726074, + "step": 6801 + }, + { + "epoch": 0.79, + "learning_rate": 6.356442659737806e-08, + "logits/chosen": -1.8943442106246948, + "logits/rejected": -1.9619600772857666, + "logps/chosen": -199.3164825439453, + "logps/rejected": -214.37164306640625, + "loss": 0.6279, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.964583158493042, + "rewards/margins": 1.8225607872009277, + "rewards/rejected": -2.7871437072753906, + "step": 6802 + }, + { + "epoch": 0.79, + "learning_rate": 6.352899492145978e-08, + "logits/chosen": -2.821326732635498, + "logits/rejected": -2.658748149871826, + "logps/chosen": -192.15103149414062, + "logps/rejected": -305.13214111328125, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7400562167167664, + "rewards/margins": 4.641158580780029, + "rewards/rejected": -5.381214618682861, + "step": 6803 + }, + { + "epoch": 0.79, + "learning_rate": 6.349356324554151e-08, + "logits/chosen": -2.022153377532959, + "logits/rejected": -1.8650825023651123, + "logps/chosen": -158.96315002441406, + "logps/rejected": -202.23785400390625, + "loss": 0.3173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6663171052932739, + "rewards/margins": 1.7247976064682007, + "rewards/rejected": -2.3911147117614746, + "step": 6804 + }, + { + "epoch": 0.79, + "learning_rate": 6.345813156962325e-08, + "logits/chosen": -1.7799378633499146, + "logits/rejected": -1.8087536096572876, + "logps/chosen": -243.47442626953125, + "logps/rejected": -286.40008544921875, + "loss": 0.2598, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8991287350654602, + "rewards/margins": 2.0799508094787598, + "rewards/rejected": -2.979079484939575, + "step": 6805 + }, + { + "epoch": 0.79, + "learning_rate": 6.342269989370497e-08, + "logits/chosen": -2.2313826084136963, + "logits/rejected": -2.3537559509277344, + "logps/chosen": -380.25433349609375, + "logps/rejected": -233.24615478515625, + "loss": 0.7489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6211779117584229, + "rewards/margins": 0.3604320287704468, + "rewards/rejected": -1.9816100597381592, + "step": 6806 + }, + { + "epoch": 0.79, + "learning_rate": 6.338726821778669e-08, + "logits/chosen": -2.1844873428344727, + "logits/rejected": -2.022939920425415, + "logps/chosen": -446.2575378417969, + "logps/rejected": -484.958251953125, + "loss": 0.9916, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4694340229034424, + "rewards/margins": 0.6024514436721802, + "rewards/rejected": -2.071885585784912, + "step": 6807 + }, + { + "epoch": 0.79, + "learning_rate": 6.335183654186843e-08, + "logits/chosen": -2.7030932903289795, + "logits/rejected": -2.5498275756835938, + "logps/chosen": -355.859130859375, + "logps/rejected": -273.297119140625, + "loss": 0.462, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5947198867797852, + "rewards/margins": 2.4618163108825684, + "rewards/rejected": -4.056536674499512, + "step": 6808 + }, + { + "epoch": 0.79, + "learning_rate": 6.331640486595016e-08, + "logits/chosen": -2.745579242706299, + "logits/rejected": -2.642775535583496, + "logps/chosen": -243.49252319335938, + "logps/rejected": -238.3797149658203, + "loss": 0.7094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6358884572982788, + "rewards/margins": 1.5563158988952637, + "rewards/rejected": -3.192204475402832, + "step": 6809 + }, + { + "epoch": 0.79, + "learning_rate": 6.328097319003189e-08, + "logits/chosen": -2.2355241775512695, + "logits/rejected": -2.4976754188537598, + "logps/chosen": -290.7921142578125, + "logps/rejected": -256.57861328125, + "loss": 0.4579, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3895573616027832, + "rewards/margins": 1.275899887084961, + "rewards/rejected": -2.665457248687744, + "step": 6810 + }, + { + "epoch": 0.79, + "learning_rate": 6.324554151411362e-08, + "logits/chosen": -2.307816505432129, + "logits/rejected": -2.5519042015075684, + "logps/chosen": -297.6355285644531, + "logps/rejected": -257.4981689453125, + "loss": 0.4953, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3308672904968262, + "rewards/margins": 1.0841656923294067, + "rewards/rejected": -2.4150328636169434, + "step": 6811 + }, + { + "epoch": 0.79, + "learning_rate": 6.321010983819534e-08, + "logits/chosen": -2.3852977752685547, + "logits/rejected": -2.1939754486083984, + "logps/chosen": -419.3787536621094, + "logps/rejected": -341.34808349609375, + "loss": 0.4648, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2085882425308228, + "rewards/margins": 1.0502952337265015, + "rewards/rejected": -2.258883476257324, + "step": 6812 + }, + { + "epoch": 0.79, + "learning_rate": 6.317467816227706e-08, + "logits/chosen": -2.2341983318328857, + "logits/rejected": -1.988534927368164, + "logps/chosen": -337.31170654296875, + "logps/rejected": -292.7730712890625, + "loss": 0.3777, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7908974885940552, + "rewards/margins": 1.66379976272583, + "rewards/rejected": -2.4546971321105957, + "step": 6813 + }, + { + "epoch": 0.79, + "learning_rate": 6.31392464863588e-08, + "logits/chosen": -1.9050772190093994, + "logits/rejected": -2.3762130737304688, + "logps/chosen": -419.4976501464844, + "logps/rejected": -325.50958251953125, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5835886001586914, + "rewards/margins": 2.633004665374756, + "rewards/rejected": -3.2165932655334473, + "step": 6814 + }, + { + "epoch": 0.79, + "learning_rate": 6.310381481044054e-08, + "logits/chosen": -2.432161331176758, + "logits/rejected": -2.4981040954589844, + "logps/chosen": -308.74908447265625, + "logps/rejected": -221.52450561523438, + "loss": 0.4929, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9778757691383362, + "rewards/margins": 1.6180200576782227, + "rewards/rejected": -2.595895767211914, + "step": 6815 + }, + { + "epoch": 0.79, + "learning_rate": 6.306838313452226e-08, + "logits/chosen": -1.7220635414123535, + "logits/rejected": -2.052182674407959, + "logps/chosen": -606.8417358398438, + "logps/rejected": -369.26898193359375, + "loss": 0.6589, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.31477779150009155, + "rewards/margins": 1.3013675212860107, + "rewards/rejected": -1.6161452531814575, + "step": 6816 + }, + { + "epoch": 0.79, + "learning_rate": 6.303295145860399e-08, + "logits/chosen": -2.374607563018799, + "logits/rejected": -2.3857107162475586, + "logps/chosen": -272.97906494140625, + "logps/rejected": -264.08770751953125, + "loss": 0.3251, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.95330810546875, + "rewards/margins": 1.4115948677062988, + "rewards/rejected": -3.364902973175049, + "step": 6817 + }, + { + "epoch": 0.79, + "learning_rate": 6.299751978268572e-08, + "logits/chosen": -1.8138853311538696, + "logits/rejected": -1.7799879312515259, + "logps/chosen": -366.2053527832031, + "logps/rejected": -377.4897155761719, + "loss": 0.5335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2974322438240051, + "rewards/margins": 0.9137157201766968, + "rewards/rejected": -1.2111480236053467, + "step": 6818 + }, + { + "epoch": 0.79, + "learning_rate": 6.296208810676745e-08, + "logits/chosen": -2.273588180541992, + "logits/rejected": -2.2245569229125977, + "logps/chosen": -182.46163940429688, + "logps/rejected": -156.8542022705078, + "loss": 0.8632, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9658499360084534, + "rewards/margins": 0.8314752578735352, + "rewards/rejected": -1.7973252534866333, + "step": 6819 + }, + { + "epoch": 0.79, + "learning_rate": 6.292665643084917e-08, + "logits/chosen": -2.700573205947876, + "logits/rejected": -2.5909647941589355, + "logps/chosen": -179.2833709716797, + "logps/rejected": -229.57565307617188, + "loss": 0.3743, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6661924123764038, + "rewards/margins": 2.598938226699829, + "rewards/rejected": -3.2651309967041016, + "step": 6820 + }, + { + "epoch": 0.79, + "learning_rate": 6.289122475493091e-08, + "logits/chosen": -2.6604418754577637, + "logits/rejected": -2.654819965362549, + "logps/chosen": -231.9023895263672, + "logps/rejected": -239.58074951171875, + "loss": 0.4162, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8056078553199768, + "rewards/margins": 2.2858316898345947, + "rewards/rejected": -3.091439723968506, + "step": 6821 + }, + { + "epoch": 0.79, + "learning_rate": 6.285579307901263e-08, + "logits/chosen": -2.2104830741882324, + "logits/rejected": -2.429827928543091, + "logps/chosen": -363.30902099609375, + "logps/rejected": -244.61651611328125, + "loss": 0.3707, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6656179428100586, + "rewards/margins": 1.4062573909759521, + "rewards/rejected": -2.0718753337860107, + "step": 6822 + }, + { + "epoch": 0.79, + "learning_rate": 6.282036140309437e-08, + "logits/chosen": -2.3316798210144043, + "logits/rejected": -2.3102171421051025, + "logps/chosen": -159.2357940673828, + "logps/rejected": -244.04794311523438, + "loss": 0.2649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8664047718048096, + "rewards/margins": 2.4343175888061523, + "rewards/rejected": -3.300722360610962, + "step": 6823 + }, + { + "epoch": 0.79, + "learning_rate": 6.278492972717609e-08, + "logits/chosen": -1.8175790309906006, + "logits/rejected": -1.6613408327102661, + "logps/chosen": -280.41351318359375, + "logps/rejected": -386.1141357421875, + "loss": 0.2723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5777278542518616, + "rewards/margins": 1.8316383361816406, + "rewards/rejected": -2.4093661308288574, + "step": 6824 + }, + { + "epoch": 0.79, + "learning_rate": 6.274949805125782e-08, + "logits/chosen": -2.522156000137329, + "logits/rejected": -2.5817315578460693, + "logps/chosen": -357.47296142578125, + "logps/rejected": -207.1848602294922, + "loss": 0.2351, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03694732487201691, + "rewards/margins": 2.083955764770508, + "rewards/rejected": -2.1209030151367188, + "step": 6825 + }, + { + "epoch": 0.79, + "learning_rate": 6.271406637533956e-08, + "logits/chosen": -2.4609618186950684, + "logits/rejected": -2.9065542221069336, + "logps/chosen": -282.47607421875, + "logps/rejected": -194.25625610351562, + "loss": 0.2975, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8086976408958435, + "rewards/margins": 3.447981595993042, + "rewards/rejected": -4.256679534912109, + "step": 6826 + }, + { + "epoch": 0.79, + "learning_rate": 6.267863469942128e-08, + "logits/chosen": -1.9748719930648804, + "logits/rejected": -2.0269923210144043, + "logps/chosen": -361.55914306640625, + "logps/rejected": -323.3653564453125, + "loss": 0.5786, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0930440425872803, + "rewards/margins": 1.2704932689666748, + "rewards/rejected": -2.363537311553955, + "step": 6827 + }, + { + "epoch": 0.79, + "learning_rate": 6.2643203023503e-08, + "logits/chosen": -2.0246243476867676, + "logits/rejected": -2.1423563957214355, + "logps/chosen": -461.56298828125, + "logps/rejected": -403.35369873046875, + "loss": 0.1873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027841851115226746, + "rewards/margins": 2.0495471954345703, + "rewards/rejected": -2.021705389022827, + "step": 6828 + }, + { + "epoch": 0.79, + "learning_rate": 6.260777134758474e-08, + "logits/chosen": -2.49320650100708, + "logits/rejected": -2.464782238006592, + "logps/chosen": -258.2237854003906, + "logps/rejected": -258.4863586425781, + "loss": 0.3258, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.063245177268982, + "rewards/margins": 2.5312910079956055, + "rewards/rejected": -3.594536542892456, + "step": 6829 + }, + { + "epoch": 0.79, + "learning_rate": 6.257233967166646e-08, + "logits/chosen": -2.026853084564209, + "logits/rejected": -1.725264549255371, + "logps/chosen": -244.90615844726562, + "logps/rejected": -314.2957458496094, + "loss": 0.7398, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3865164518356323, + "rewards/margins": 0.1506587266921997, + "rewards/rejected": -1.537175178527832, + "step": 6830 + }, + { + "epoch": 0.79, + "learning_rate": 6.25369079957482e-08, + "logits/chosen": -2.2648966312408447, + "logits/rejected": -2.23026704788208, + "logps/chosen": -416.9146728515625, + "logps/rejected": -357.97650146484375, + "loss": 0.5728, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.69813072681427, + "rewards/margins": 2.026193618774414, + "rewards/rejected": -3.7243242263793945, + "step": 6831 + }, + { + "epoch": 0.79, + "learning_rate": 6.250147631982993e-08, + "logits/chosen": -2.4307215213775635, + "logits/rejected": -2.595226287841797, + "logps/chosen": -235.8336181640625, + "logps/rejected": -254.2156982421875, + "loss": 0.3972, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3966517448425293, + "rewards/margins": 3.0202438831329346, + "rewards/rejected": -4.416895866394043, + "step": 6832 + }, + { + "epoch": 0.79, + "learning_rate": 6.246604464391165e-08, + "logits/chosen": -2.3299412727355957, + "logits/rejected": -2.471790313720703, + "logps/chosen": -477.599365234375, + "logps/rejected": -320.89849853515625, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5683954358100891, + "rewards/margins": 3.033669948577881, + "rewards/rejected": -3.602065324783325, + "step": 6833 + }, + { + "epoch": 0.79, + "learning_rate": 6.243061296799339e-08, + "logits/chosen": -2.185275077819824, + "logits/rejected": -2.119556427001953, + "logps/chosen": -158.6759033203125, + "logps/rejected": -133.61337280273438, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7501013875007629, + "rewards/margins": 1.1915045976638794, + "rewards/rejected": -1.941606044769287, + "step": 6834 + }, + { + "epoch": 0.8, + "learning_rate": 6.239518129207511e-08, + "logits/chosen": -1.6929683685302734, + "logits/rejected": -1.9497895240783691, + "logps/chosen": -438.0854187011719, + "logps/rejected": -390.5484924316406, + "loss": 0.4832, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.362844467163086, + "rewards/margins": 1.0009136199951172, + "rewards/rejected": -2.363758087158203, + "step": 6835 + }, + { + "epoch": 0.8, + "learning_rate": 6.235974961615685e-08, + "logits/chosen": -2.5380823612213135, + "logits/rejected": -2.3468170166015625, + "logps/chosen": -320.11224365234375, + "logps/rejected": -277.92669677734375, + "loss": 0.3031, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7744513154029846, + "rewards/margins": 2.240217685699463, + "rewards/rejected": -3.0146689414978027, + "step": 6836 + }, + { + "epoch": 0.8, + "learning_rate": 6.232431794023857e-08, + "logits/chosen": -2.2293930053710938, + "logits/rejected": -2.4432146549224854, + "logps/chosen": -431.3406066894531, + "logps/rejected": -213.21697998046875, + "loss": 0.1883, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.039171814918518, + "rewards/margins": 2.1666224002838135, + "rewards/rejected": -3.205794334411621, + "step": 6837 + }, + { + "epoch": 0.8, + "learning_rate": 6.22888862643203e-08, + "logits/chosen": -2.434499502182007, + "logits/rejected": -2.6210217475891113, + "logps/chosen": -297.30731201171875, + "logps/rejected": -280.2550048828125, + "loss": 0.5575, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.225843906402588, + "rewards/margins": 1.1421735286712646, + "rewards/rejected": -2.3680176734924316, + "step": 6838 + }, + { + "epoch": 0.8, + "learning_rate": 6.225345458840203e-08, + "logits/chosen": -2.186605215072632, + "logits/rejected": -2.5406644344329834, + "logps/chosen": -468.1019592285156, + "logps/rejected": -256.7169189453125, + "loss": 0.4812, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8386433720588684, + "rewards/margins": 1.7743651866912842, + "rewards/rejected": -2.613008499145508, + "step": 6839 + }, + { + "epoch": 0.8, + "learning_rate": 6.221802291248376e-08, + "logits/chosen": -2.2246546745300293, + "logits/rejected": -2.299765110015869, + "logps/chosen": -341.9164733886719, + "logps/rejected": -340.49298095703125, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5369409322738647, + "rewards/margins": 2.6214704513549805, + "rewards/rejected": -4.158411502838135, + "step": 6840 + }, + { + "epoch": 0.8, + "learning_rate": 6.218259123656548e-08, + "logits/chosen": -1.8745458126068115, + "logits/rejected": -2.033921241760254, + "logps/chosen": -421.938232421875, + "logps/rejected": -391.81048583984375, + "loss": 0.1919, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.565506100654602, + "rewards/margins": 2.0856711864471436, + "rewards/rejected": -2.651177406311035, + "step": 6841 + }, + { + "epoch": 0.8, + "learning_rate": 6.214715956064722e-08, + "logits/chosen": -2.764787435531616, + "logits/rejected": -2.7290735244750977, + "logps/chosen": -234.2088623046875, + "logps/rejected": -227.45156860351562, + "loss": 0.3506, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.610879123210907, + "rewards/margins": 1.4026906490325928, + "rewards/rejected": -2.0135698318481445, + "step": 6842 + }, + { + "epoch": 0.8, + "learning_rate": 6.211172788472895e-08, + "logits/chosen": -2.291232109069824, + "logits/rejected": -2.5643715858459473, + "logps/chosen": -166.26429748535156, + "logps/rejected": -137.162353515625, + "loss": 0.2425, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12492899596691132, + "rewards/margins": 2.385897397994995, + "rewards/rejected": -2.2609682083129883, + "step": 6843 + }, + { + "epoch": 0.8, + "learning_rate": 6.207629620881068e-08, + "logits/chosen": -2.946488618850708, + "logits/rejected": -2.908271551132202, + "logps/chosen": -129.52249145507812, + "logps/rejected": -155.7763671875, + "loss": 0.458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5405779480934143, + "rewards/margins": 2.5716850757598877, + "rewards/rejected": -3.1122629642486572, + "step": 6844 + }, + { + "epoch": 0.8, + "learning_rate": 6.20408645328924e-08, + "logits/chosen": -1.786116361618042, + "logits/rejected": -2.055964469909668, + "logps/chosen": -228.19223022460938, + "logps/rejected": -227.5196533203125, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4102916717529297, + "rewards/margins": 1.517952561378479, + "rewards/rejected": -2.928244113922119, + "step": 6845 + }, + { + "epoch": 0.8, + "learning_rate": 6.200543285697413e-08, + "logits/chosen": -2.354393243789673, + "logits/rejected": -2.3393170833587646, + "logps/chosen": -307.44219970703125, + "logps/rejected": -300.79290771484375, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.558704137802124, + "rewards/margins": 2.447596549987793, + "rewards/rejected": -4.006300926208496, + "step": 6846 + }, + { + "epoch": 0.8, + "learning_rate": 6.197000118105586e-08, + "logits/chosen": -1.7318826913833618, + "logits/rejected": -1.960352897644043, + "logps/chosen": -287.6160583496094, + "logps/rejected": -210.08221435546875, + "loss": 0.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0149390697479248, + "rewards/margins": 1.2997474670410156, + "rewards/rejected": -2.3146865367889404, + "step": 6847 + }, + { + "epoch": 0.8, + "learning_rate": 6.193456950513759e-08, + "logits/chosen": -2.3671865463256836, + "logits/rejected": -1.932340383529663, + "logps/chosen": -232.91368103027344, + "logps/rejected": -325.377197265625, + "loss": 0.3741, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7375699281692505, + "rewards/margins": 2.8285973072052, + "rewards/rejected": -3.5661673545837402, + "step": 6848 + }, + { + "epoch": 0.8, + "learning_rate": 6.189913782921933e-08, + "logits/chosen": -2.748404026031494, + "logits/rejected": -2.7693657875061035, + "logps/chosen": -98.08004760742188, + "logps/rejected": -149.220458984375, + "loss": 0.7063, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2909510135650635, + "rewards/margins": 0.8454074263572693, + "rewards/rejected": -2.1363582611083984, + "step": 6849 + }, + { + "epoch": 0.8, + "learning_rate": 6.186370615330105e-08, + "logits/chosen": -2.862389326095581, + "logits/rejected": -2.8285460472106934, + "logps/chosen": -174.32081604003906, + "logps/rejected": -120.73200225830078, + "loss": 0.3054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8009113073348999, + "rewards/margins": 1.2189006805419922, + "rewards/rejected": -2.0198121070861816, + "step": 6850 + }, + { + "epoch": 0.8, + "learning_rate": 6.182827447738277e-08, + "logits/chosen": -2.3116378784179688, + "logits/rejected": -2.594020366668701, + "logps/chosen": -387.82135009765625, + "logps/rejected": -310.3623046875, + "loss": 0.2908, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2876507639884949, + "rewards/margins": 2.0654802322387695, + "rewards/rejected": -2.35313081741333, + "step": 6851 + }, + { + "epoch": 0.8, + "learning_rate": 6.17928428014645e-08, + "logits/chosen": -1.7338718175888062, + "logits/rejected": -1.6062952280044556, + "logps/chosen": -498.030517578125, + "logps/rejected": -500.8822021484375, + "loss": 0.6348, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5327322483062744, + "rewards/margins": 1.397261381149292, + "rewards/rejected": -2.9299936294555664, + "step": 6852 + }, + { + "epoch": 0.8, + "learning_rate": 6.175741112554624e-08, + "logits/chosen": -1.3945882320404053, + "logits/rejected": -1.9716272354125977, + "logps/chosen": -225.59814453125, + "logps/rejected": -212.77056884765625, + "loss": 0.5621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8643045425415039, + "rewards/margins": 2.4510693550109863, + "rewards/rejected": -3.3153738975524902, + "step": 6853 + }, + { + "epoch": 0.8, + "learning_rate": 6.172197944962796e-08, + "logits/chosen": -2.2953336238861084, + "logits/rejected": -2.371824026107788, + "logps/chosen": -362.1795959472656, + "logps/rejected": -311.12274169921875, + "loss": 0.3431, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9477490186691284, + "rewards/margins": 1.445358157157898, + "rewards/rejected": -2.3931071758270264, + "step": 6854 + }, + { + "epoch": 0.8, + "learning_rate": 6.16865477737097e-08, + "logits/chosen": -2.47581148147583, + "logits/rejected": -2.6453781127929688, + "logps/chosen": -347.84246826171875, + "logps/rejected": -260.4224548339844, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5556889772415161, + "rewards/margins": 2.5416297912597656, + "rewards/rejected": -3.0973191261291504, + "step": 6855 + }, + { + "epoch": 0.8, + "learning_rate": 6.165111609779142e-08, + "logits/chosen": -1.9120556116104126, + "logits/rejected": -2.1209051609039307, + "logps/chosen": -323.7871398925781, + "logps/rejected": -283.9405212402344, + "loss": 0.8298, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4211859703063965, + "rewards/margins": 1.362655758857727, + "rewards/rejected": -3.783841609954834, + "step": 6856 + }, + { + "epoch": 0.8, + "learning_rate": 6.161568442187314e-08, + "logits/chosen": -2.549255609512329, + "logits/rejected": -2.5862505435943604, + "logps/chosen": -200.94708251953125, + "logps/rejected": -317.8486328125, + "loss": 0.4339, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8365788459777832, + "rewards/margins": 1.6901180744171143, + "rewards/rejected": -3.5266969203948975, + "step": 6857 + }, + { + "epoch": 0.8, + "learning_rate": 6.158025274595488e-08, + "logits/chosen": -2.5728507041931152, + "logits/rejected": -2.581162452697754, + "logps/chosen": -214.61648559570312, + "logps/rejected": -284.29290771484375, + "loss": 0.4778, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2478594779968262, + "rewards/margins": 2.533503532409668, + "rewards/rejected": -3.781362771987915, + "step": 6858 + }, + { + "epoch": 0.8, + "learning_rate": 6.154482107003661e-08, + "logits/chosen": -2.4273595809936523, + "logits/rejected": -2.2840187549591064, + "logps/chosen": -200.69815063476562, + "logps/rejected": -214.00950622558594, + "loss": 0.6355, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2133837938308716, + "rewards/margins": 1.240335464477539, + "rewards/rejected": -2.453719139099121, + "step": 6859 + }, + { + "epoch": 0.8, + "learning_rate": 6.150938939411834e-08, + "logits/chosen": -2.4676544666290283, + "logits/rejected": -2.589242458343506, + "logps/chosen": -243.68450927734375, + "logps/rejected": -238.80548095703125, + "loss": 0.4162, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1741483211517334, + "rewards/margins": 3.063234329223633, + "rewards/rejected": -4.237382888793945, + "step": 6860 + }, + { + "epoch": 0.8, + "learning_rate": 6.147395771820007e-08, + "logits/chosen": -2.4629135131835938, + "logits/rejected": -2.2211174964904785, + "logps/chosen": -292.4907531738281, + "logps/rejected": -330.314697265625, + "loss": 0.5679, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.192375898361206, + "rewards/margins": 0.8940994739532471, + "rewards/rejected": -2.086475372314453, + "step": 6861 + }, + { + "epoch": 0.8, + "learning_rate": 6.143852604228179e-08, + "logits/chosen": -1.8257081508636475, + "logits/rejected": -2.1106345653533936, + "logps/chosen": -425.1636962890625, + "logps/rejected": -377.98974609375, + "loss": 0.1489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08446913957595825, + "rewards/margins": 2.6171042919158936, + "rewards/rejected": -2.532634973526001, + "step": 6862 + }, + { + "epoch": 0.8, + "learning_rate": 6.140309436636353e-08, + "logits/chosen": -2.8049685955047607, + "logits/rejected": -2.829469680786133, + "logps/chosen": -82.49890899658203, + "logps/rejected": -154.82427978515625, + "loss": 0.4006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7296584844589233, + "rewards/margins": 1.6011145114898682, + "rewards/rejected": -2.330772876739502, + "step": 6863 + }, + { + "epoch": 0.8, + "learning_rate": 6.136766269044525e-08, + "logits/chosen": -2.3144547939300537, + "logits/rejected": -2.388270378112793, + "logps/chosen": -528.3441162109375, + "logps/rejected": -454.0592956542969, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2696225047111511, + "rewards/margins": 3.642622709274292, + "rewards/rejected": -3.912245512008667, + "step": 6864 + }, + { + "epoch": 0.8, + "learning_rate": 6.133223101452699e-08, + "logits/chosen": -1.9052088260650635, + "logits/rejected": -2.3367867469787598, + "logps/chosen": -316.1722412109375, + "logps/rejected": -244.171142578125, + "loss": 0.4858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9609666466712952, + "rewards/margins": 1.7262738943099976, + "rewards/rejected": -2.6872406005859375, + "step": 6865 + }, + { + "epoch": 0.8, + "learning_rate": 6.129679933860871e-08, + "logits/chosen": -2.9436936378479004, + "logits/rejected": -2.9177675247192383, + "logps/chosen": -253.65330505371094, + "logps/rejected": -240.0859832763672, + "loss": 0.1414, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01873358152806759, + "rewards/margins": 2.813572883605957, + "rewards/rejected": -2.832306385040283, + "step": 6866 + }, + { + "epoch": 0.8, + "learning_rate": 6.126136766269044e-08, + "logits/chosen": -2.4977145195007324, + "logits/rejected": -2.3757247924804688, + "logps/chosen": -136.78271484375, + "logps/rejected": -196.136962890625, + "loss": 0.2955, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12349767982959747, + "rewards/margins": 2.740003824234009, + "rewards/rejected": -2.86350154876709, + "step": 6867 + }, + { + "epoch": 0.8, + "learning_rate": 6.122593598677217e-08, + "logits/chosen": -2.9151084423065186, + "logits/rejected": -2.8006560802459717, + "logps/chosen": -321.87518310546875, + "logps/rejected": -257.6304016113281, + "loss": 0.416, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0966020822525024, + "rewards/margins": 1.8581135272979736, + "rewards/rejected": -2.9547154903411865, + "step": 6868 + }, + { + "epoch": 0.8, + "learning_rate": 6.11905043108539e-08, + "logits/chosen": -2.9206643104553223, + "logits/rejected": -2.98946213722229, + "logps/chosen": -114.8738784790039, + "logps/rejected": -182.10032653808594, + "loss": 0.6, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4498250484466553, + "rewards/margins": 2.6918790340423584, + "rewards/rejected": -4.141704082489014, + "step": 6869 + }, + { + "epoch": 0.8, + "learning_rate": 6.115507263493564e-08, + "logits/chosen": -2.2308616638183594, + "logits/rejected": -2.428957462310791, + "logps/chosen": -320.9161376953125, + "logps/rejected": -110.08751678466797, + "loss": 0.2773, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37987589836120605, + "rewards/margins": 1.940679907798767, + "rewards/rejected": -2.3205556869506836, + "step": 6870 + }, + { + "epoch": 0.8, + "learning_rate": 6.111964095901736e-08, + "logits/chosen": -2.43497371673584, + "logits/rejected": -2.551339626312256, + "logps/chosen": -428.1810607910156, + "logps/rejected": -388.3935852050781, + "loss": 0.5103, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.712624192237854, + "rewards/margins": 1.0983455181121826, + "rewards/rejected": -2.810969829559326, + "step": 6871 + }, + { + "epoch": 0.8, + "learning_rate": 6.10842092830991e-08, + "logits/chosen": -1.9998674392700195, + "logits/rejected": -1.9360617399215698, + "logps/chosen": -192.01104736328125, + "logps/rejected": -219.0188446044922, + "loss": 0.8399, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3340623378753662, + "rewards/margins": 1.8400330543518066, + "rewards/rejected": -3.174095630645752, + "step": 6872 + }, + { + "epoch": 0.8, + "learning_rate": 6.104877760718082e-08, + "logits/chosen": -1.8257018327713013, + "logits/rejected": -2.1537225246429443, + "logps/chosen": -312.02484130859375, + "logps/rejected": -270.003662109375, + "loss": 0.4332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8723500370979309, + "rewards/margins": 1.6541929244995117, + "rewards/rejected": -2.5265426635742188, + "step": 6873 + }, + { + "epoch": 0.8, + "learning_rate": 6.101334593126254e-08, + "logits/chosen": -1.9987155199050903, + "logits/rejected": -2.323665142059326, + "logps/chosen": -421.58154296875, + "logps/rejected": -302.4698486328125, + "loss": 0.3925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.841350257396698, + "rewards/margins": 1.399461030960083, + "rewards/rejected": -2.2408111095428467, + "step": 6874 + }, + { + "epoch": 0.8, + "learning_rate": 6.097791425534427e-08, + "logits/chosen": -1.8710644245147705, + "logits/rejected": -1.7614995241165161, + "logps/chosen": -416.59625244140625, + "logps/rejected": -417.6948547363281, + "loss": 0.2111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3255206346511841, + "rewards/margins": 3.144794225692749, + "rewards/rejected": -3.4703147411346436, + "step": 6875 + }, + { + "epoch": 0.8, + "learning_rate": 6.094248257942601e-08, + "logits/chosen": -2.692440986633301, + "logits/rejected": -2.3176722526550293, + "logps/chosen": -108.86640167236328, + "logps/rejected": -180.07888793945312, + "loss": 0.3629, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5793875455856323, + "rewards/margins": 1.8129773139953613, + "rewards/rejected": -2.392364978790283, + "step": 6876 + }, + { + "epoch": 0.8, + "learning_rate": 6.090705090350773e-08, + "logits/chosen": -2.173902750015259, + "logits/rejected": -2.6319642066955566, + "logps/chosen": -404.4335021972656, + "logps/rejected": -231.0235595703125, + "loss": 0.4182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6690205335617065, + "rewards/margins": 1.4047249555587769, + "rewards/rejected": -2.0737454891204834, + "step": 6877 + }, + { + "epoch": 0.8, + "learning_rate": 6.087161922758947e-08, + "logits/chosen": -2.621583938598633, + "logits/rejected": -2.739276647567749, + "logps/chosen": -173.18295288085938, + "logps/rejected": -236.97312927246094, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.818861722946167, + "rewards/margins": 1.3644630908966064, + "rewards/rejected": -2.1833250522613525, + "step": 6878 + }, + { + "epoch": 0.8, + "learning_rate": 6.083618755167119e-08, + "logits/chosen": -2.077061653137207, + "logits/rejected": -2.056602716445923, + "logps/chosen": -336.3263854980469, + "logps/rejected": -302.77972412109375, + "loss": 0.32, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1751642227172852, + "rewards/margins": 2.348365306854248, + "rewards/rejected": -3.523529529571533, + "step": 6879 + }, + { + "epoch": 0.8, + "learning_rate": 6.080075587575292e-08, + "logits/chosen": -2.639233350753784, + "logits/rejected": -2.7530274391174316, + "logps/chosen": -277.5287170410156, + "logps/rejected": -201.5216064453125, + "loss": 0.9407, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.13041353225708, + "rewards/margins": 1.434485673904419, + "rewards/rejected": -2.564899206161499, + "step": 6880 + }, + { + "epoch": 0.8, + "learning_rate": 6.076532419983466e-08, + "logits/chosen": -2.67049503326416, + "logits/rejected": -2.67189884185791, + "logps/chosen": -410.9122314453125, + "logps/rejected": -325.05987548828125, + "loss": 0.1916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.547398567199707, + "rewards/margins": 2.428257465362549, + "rewards/rejected": -2.975656032562256, + "step": 6881 + }, + { + "epoch": 0.8, + "learning_rate": 6.072989252391638e-08, + "logits/chosen": -1.8763231039047241, + "logits/rejected": -2.1768198013305664, + "logps/chosen": -239.5070343017578, + "logps/rejected": -156.99130249023438, + "loss": 0.9637, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4590353965759277, + "rewards/margins": 0.6330040097236633, + "rewards/rejected": -2.0920395851135254, + "step": 6882 + }, + { + "epoch": 0.8, + "learning_rate": 6.06944608479981e-08, + "logits/chosen": -2.3112926483154297, + "logits/rejected": -2.269711971282959, + "logps/chosen": -456.79998779296875, + "logps/rejected": -344.9031677246094, + "loss": 0.5201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7449369430541992, + "rewards/margins": 1.3485500812530518, + "rewards/rejected": -2.09348726272583, + "step": 6883 + }, + { + "epoch": 0.8, + "learning_rate": 6.065902917207984e-08, + "logits/chosen": -2.15946626663208, + "logits/rejected": -2.2022767066955566, + "logps/chosen": -322.4681396484375, + "logps/rejected": -349.84295654296875, + "loss": 0.2276, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.19395112991333, + "rewards/margins": 4.320577144622803, + "rewards/rejected": -5.514528274536133, + "step": 6884 + }, + { + "epoch": 0.8, + "learning_rate": 6.062359749616156e-08, + "logits/chosen": -2.631561756134033, + "logits/rejected": -2.4968762397766113, + "logps/chosen": -348.88671875, + "logps/rejected": -359.77557373046875, + "loss": 0.3809, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2477723360061646, + "rewards/margins": 1.8995622396469116, + "rewards/rejected": -3.147334575653076, + "step": 6885 + }, + { + "epoch": 0.8, + "learning_rate": 6.05881658202433e-08, + "logits/chosen": -2.746532917022705, + "logits/rejected": -2.66607928276062, + "logps/chosen": -170.00213623046875, + "logps/rejected": -262.0600280761719, + "loss": 0.2595, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095134496688843, + "rewards/margins": 3.477121114730835, + "rewards/rejected": -4.28663444519043, + "step": 6886 + }, + { + "epoch": 0.8, + "learning_rate": 6.055273414432503e-08, + "logits/chosen": -2.581150770187378, + "logits/rejected": -2.6428232192993164, + "logps/chosen": -302.81158447265625, + "logps/rejected": -339.23876953125, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9259889721870422, + "rewards/margins": 4.471812725067139, + "rewards/rejected": -5.397801876068115, + "step": 6887 + }, + { + "epoch": 0.8, + "learning_rate": 6.051730246840675e-08, + "logits/chosen": -2.2816524505615234, + "logits/rejected": -2.3479833602905273, + "logps/chosen": -325.6483459472656, + "logps/rejected": -348.9211730957031, + "loss": 0.4061, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.160871982574463, + "rewards/margins": 3.039816379547119, + "rewards/rejected": -4.20068883895874, + "step": 6888 + }, + { + "epoch": 0.8, + "learning_rate": 6.048187079248848e-08, + "logits/chosen": -2.1469123363494873, + "logits/rejected": -2.1613872051239014, + "logps/chosen": -188.59136962890625, + "logps/rejected": -170.01254272460938, + "loss": 0.2822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42905956506729126, + "rewards/margins": 2.2298736572265625, + "rewards/rejected": -2.658933401107788, + "step": 6889 + }, + { + "epoch": 0.8, + "learning_rate": 6.044643911657021e-08, + "logits/chosen": -2.4381847381591797, + "logits/rejected": -2.5352230072021484, + "logps/chosen": -335.6871337890625, + "logps/rejected": -371.23065185546875, + "loss": 0.249, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6556386947631836, + "rewards/margins": 2.150097608566284, + "rewards/rejected": -2.8057360649108887, + "step": 6890 + }, + { + "epoch": 0.8, + "learning_rate": 6.041100744065193e-08, + "logits/chosen": -2.400900363922119, + "logits/rejected": -2.4083051681518555, + "logps/chosen": -184.46250915527344, + "logps/rejected": -306.98779296875, + "loss": 0.1856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7026311159133911, + "rewards/margins": 4.589041233062744, + "rewards/rejected": -5.291672229766846, + "step": 6891 + }, + { + "epoch": 0.8, + "learning_rate": 6.037557576473367e-08, + "logits/chosen": -2.5259649753570557, + "logits/rejected": -2.3968427181243896, + "logps/chosen": -235.30447387695312, + "logps/rejected": -289.27667236328125, + "loss": 0.2002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3481628894805908, + "rewards/margins": 3.3171744346618652, + "rewards/rejected": -4.665337562561035, + "step": 6892 + }, + { + "epoch": 0.8, + "learning_rate": 6.03401440888154e-08, + "logits/chosen": -2.6868913173675537, + "logits/rejected": -2.7782206535339355, + "logps/chosen": -124.67086029052734, + "logps/rejected": -188.67047119140625, + "loss": 0.521, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2644999921321869, + "rewards/margins": 2.063897132873535, + "rewards/rejected": -2.328397035598755, + "step": 6893 + }, + { + "epoch": 0.8, + "learning_rate": 6.030471241289713e-08, + "logits/chosen": -2.60872745513916, + "logits/rejected": -2.807612657546997, + "logps/chosen": -264.4891357421875, + "logps/rejected": -250.44589233398438, + "loss": 0.4321, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3954761028289795, + "rewards/margins": 1.415717363357544, + "rewards/rejected": -2.8111929893493652, + "step": 6894 + }, + { + "epoch": 0.8, + "learning_rate": 6.026928073697885e-08, + "logits/chosen": -2.2900028228759766, + "logits/rejected": -2.34365177154541, + "logps/chosen": -109.8130111694336, + "logps/rejected": -121.01927185058594, + "loss": 0.4934, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.916411817073822, + "rewards/margins": 1.3454954624176025, + "rewards/rejected": -2.2619073390960693, + "step": 6895 + }, + { + "epoch": 0.8, + "learning_rate": 6.023384906106058e-08, + "logits/chosen": -2.478156089782715, + "logits/rejected": -2.4685797691345215, + "logps/chosen": -249.77191162109375, + "logps/rejected": -352.72705078125, + "loss": 0.2777, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6337382197380066, + "rewards/margins": 2.5091664791107178, + "rewards/rejected": -3.142904758453369, + "step": 6896 + }, + { + "epoch": 0.8, + "learning_rate": 6.019841738514232e-08, + "logits/chosen": -2.439596652984619, + "logits/rejected": -2.407973527908325, + "logps/chosen": -394.96710205078125, + "logps/rejected": -421.0386657714844, + "loss": 0.7773, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2945051193237305, + "rewards/margins": 2.7945199012756348, + "rewards/rejected": -4.089025020599365, + "step": 6897 + }, + { + "epoch": 0.8, + "learning_rate": 6.016298570922404e-08, + "logits/chosen": -2.16998028755188, + "logits/rejected": -2.49896240234375, + "logps/chosen": -280.694091796875, + "logps/rejected": -320.14532470703125, + "loss": 0.5565, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0702264308929443, + "rewards/margins": 1.3476715087890625, + "rewards/rejected": -2.4178977012634277, + "step": 6898 + }, + { + "epoch": 0.8, + "learning_rate": 6.012755403330578e-08, + "logits/chosen": -1.5615559816360474, + "logits/rejected": -1.4065842628479004, + "logps/chosen": -499.06158447265625, + "logps/rejected": -544.5966796875, + "loss": 0.7561, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.456490397453308, + "rewards/margins": 1.456023931503296, + "rewards/rejected": -2.9125142097473145, + "step": 6899 + }, + { + "epoch": 0.8, + "learning_rate": 6.00921223573875e-08, + "logits/chosen": -2.5633387565612793, + "logits/rejected": -2.640937566757202, + "logps/chosen": -300.31878662109375, + "logps/rejected": -244.67337036132812, + "loss": 0.4527, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6285059452056885, + "rewards/margins": 1.580533742904663, + "rewards/rejected": -2.2090396881103516, + "step": 6900 + }, + { + "epoch": 0.8, + "learning_rate": 6.005669068146922e-08, + "logits/chosen": -2.7684342861175537, + "logits/rejected": -2.8154263496398926, + "logps/chosen": -194.79095458984375, + "logps/rejected": -184.09600830078125, + "loss": 0.6013, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0689170360565186, + "rewards/margins": 1.790940761566162, + "rewards/rejected": -2.8598575592041016, + "step": 6901 + }, + { + "epoch": 0.8, + "learning_rate": 6.002125900555096e-08, + "logits/chosen": -2.1259872913360596, + "logits/rejected": -2.275294780731201, + "logps/chosen": -375.4989013671875, + "logps/rejected": -231.77810668945312, + "loss": 0.4962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6594857573509216, + "rewards/margins": 1.0236425399780273, + "rewards/rejected": -1.6831282377243042, + "step": 6902 + }, + { + "epoch": 0.8, + "learning_rate": 5.998582732963269e-08, + "logits/chosen": -2.1287176609039307, + "logits/rejected": -2.1135940551757812, + "logps/chosen": -261.378173828125, + "logps/rejected": -345.83843994140625, + "loss": 0.2099, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8118993043899536, + "rewards/margins": 2.3929967880249023, + "rewards/rejected": -3.2048959732055664, + "step": 6903 + }, + { + "epoch": 0.8, + "learning_rate": 5.995039565371441e-08, + "logits/chosen": -2.7018275260925293, + "logits/rejected": -2.4646379947662354, + "logps/chosen": -301.34576416015625, + "logps/rejected": -209.955322265625, + "loss": 0.5246, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4101365804672241, + "rewards/margins": 1.3803424835205078, + "rewards/rejected": -2.7904791831970215, + "step": 6904 + }, + { + "epoch": 0.8, + "learning_rate": 5.991496397779615e-08, + "logits/chosen": -2.3076164722442627, + "logits/rejected": -2.240375280380249, + "logps/chosen": -168.90597534179688, + "logps/rejected": -220.26681518554688, + "loss": 0.6269, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6896852254867554, + "rewards/margins": 1.234555721282959, + "rewards/rejected": -1.9242409467697144, + "step": 6905 + }, + { + "epoch": 0.8, + "learning_rate": 5.987953230187787e-08, + "logits/chosen": -2.423234701156616, + "logits/rejected": -2.4396116733551025, + "logps/chosen": -296.5135803222656, + "logps/rejected": -269.93292236328125, + "loss": 0.1656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3353526592254639, + "rewards/margins": 3.023375988006592, + "rewards/rejected": -4.358728408813477, + "step": 6906 + }, + { + "epoch": 0.8, + "learning_rate": 5.98441006259596e-08, + "logits/chosen": -2.534740447998047, + "logits/rejected": -2.324251890182495, + "logps/chosen": -163.45994567871094, + "logps/rejected": -303.6251525878906, + "loss": 0.1717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11256885528564453, + "rewards/margins": 3.5816407203674316, + "rewards/rejected": -3.6942098140716553, + "step": 6907 + }, + { + "epoch": 0.8, + "learning_rate": 5.980866895004134e-08, + "logits/chosen": -2.337916374206543, + "logits/rejected": -2.1289572715759277, + "logps/chosen": -295.1421203613281, + "logps/rejected": -289.46453857421875, + "loss": 0.5309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4490537643432617, + "rewards/margins": 1.2827553749084473, + "rewards/rejected": -1.7318089008331299, + "step": 6908 + }, + { + "epoch": 0.8, + "learning_rate": 5.977323727412306e-08, + "logits/chosen": -2.850249767303467, + "logits/rejected": -2.6536471843719482, + "logps/chosen": -118.08811950683594, + "logps/rejected": -290.046875, + "loss": 0.3509, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9690921306610107, + "rewards/margins": 2.583361864089966, + "rewards/rejected": -3.5524539947509766, + "step": 6909 + }, + { + "epoch": 0.8, + "learning_rate": 5.97378055982048e-08, + "logits/chosen": -2.5368776321411133, + "logits/rejected": -2.599597454071045, + "logps/chosen": -149.15277099609375, + "logps/rejected": -160.67559814453125, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6243939995765686, + "rewards/margins": 2.425112247467041, + "rewards/rejected": -3.049506425857544, + "step": 6910 + }, + { + "epoch": 0.8, + "learning_rate": 5.970237392228652e-08, + "logits/chosen": -2.1605288982391357, + "logits/rejected": -2.1874160766601562, + "logps/chosen": -267.1117248535156, + "logps/rejected": -298.08941650390625, + "loss": 0.6071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3768365383148193, + "rewards/margins": 1.0210864543914795, + "rewards/rejected": -2.397922992706299, + "step": 6911 + }, + { + "epoch": 0.8, + "learning_rate": 5.966694224636824e-08, + "logits/chosen": -2.272624969482422, + "logits/rejected": -2.5979700088500977, + "logps/chosen": -515.2598876953125, + "logps/rejected": -382.1220703125, + "loss": 0.6146, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5498842000961304, + "rewards/margins": 0.9148114919662476, + "rewards/rejected": -2.464695930480957, + "step": 6912 + }, + { + "epoch": 0.8, + "learning_rate": 5.963151057044998e-08, + "logits/chosen": -2.249556064605713, + "logits/rejected": -1.9992293119430542, + "logps/chosen": -179.02218627929688, + "logps/rejected": -248.80575561523438, + "loss": 0.1893, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3414685726165771, + "rewards/margins": 2.912838935852051, + "rewards/rejected": -4.254307270050049, + "step": 6913 + }, + { + "epoch": 0.8, + "learning_rate": 5.959607889453171e-08, + "logits/chosen": -1.873137354850769, + "logits/rejected": -2.4454219341278076, + "logps/chosen": -437.222412109375, + "logps/rejected": -202.480712890625, + "loss": 1.4339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.558374285697937, + "rewards/margins": -0.436688095331192, + "rewards/rejected": -1.1216862201690674, + "step": 6914 + }, + { + "epoch": 0.8, + "learning_rate": 5.9560647218613436e-08, + "logits/chosen": -2.159658193588257, + "logits/rejected": -2.299187660217285, + "logps/chosen": -267.0484924316406, + "logps/rejected": -309.9268798828125, + "loss": 0.5349, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2032082080841064, + "rewards/margins": 1.8264163732528687, + "rewards/rejected": -3.0296247005462646, + "step": 6915 + }, + { + "epoch": 0.8, + "learning_rate": 5.952521554269517e-08, + "logits/chosen": -2.1431057453155518, + "logits/rejected": -2.173724889755249, + "logps/chosen": -494.3200378417969, + "logps/rejected": -407.2119140625, + "loss": 0.2259, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7035886645317078, + "rewards/margins": 4.577124118804932, + "rewards/rejected": -5.280712604522705, + "step": 6916 + }, + { + "epoch": 0.8, + "learning_rate": 5.9489783866776894e-08, + "logits/chosen": -2.368222236633301, + "logits/rejected": -2.5181570053100586, + "logps/chosen": -282.7748107910156, + "logps/rejected": -372.24163818359375, + "loss": 0.739, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1213910579681396, + "rewards/margins": 2.2174861431121826, + "rewards/rejected": -3.3388772010803223, + "step": 6917 + }, + { + "epoch": 0.8, + "learning_rate": 5.945435219085862e-08, + "logits/chosen": -2.977031707763672, + "logits/rejected": -3.0282797813415527, + "logps/chosen": -223.34271240234375, + "logps/rejected": -192.7584991455078, + "loss": 0.2758, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1252799034118652, + "rewards/margins": 2.472568988800049, + "rewards/rejected": -3.597848653793335, + "step": 6918 + }, + { + "epoch": 0.8, + "learning_rate": 5.941892051494036e-08, + "logits/chosen": -2.1648623943328857, + "logits/rejected": -2.0049774646759033, + "logps/chosen": -219.85833740234375, + "logps/rejected": -366.86444091796875, + "loss": 0.4562, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8938494324684143, + "rewards/margins": 1.1907633543014526, + "rewards/rejected": -2.0846128463745117, + "step": 6919 + }, + { + "epoch": 0.81, + "learning_rate": 5.9383488839022087e-08, + "logits/chosen": -2.8112521171569824, + "logits/rejected": -2.4352900981903076, + "logps/chosen": -160.96536254882812, + "logps/rejected": -240.36102294921875, + "loss": 0.4124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6394641399383545, + "rewards/margins": 1.5401849746704102, + "rewards/rejected": -2.1796491146087646, + "step": 6920 + }, + { + "epoch": 0.81, + "learning_rate": 5.934805716310381e-08, + "logits/chosen": -1.9146720170974731, + "logits/rejected": -1.785881757736206, + "logps/chosen": -223.04067993164062, + "logps/rejected": -344.9404296875, + "loss": 0.3095, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0031036138534546, + "rewards/margins": 2.157834053039551, + "rewards/rejected": -3.160937786102295, + "step": 6921 + }, + { + "epoch": 0.81, + "learning_rate": 5.9312625487185544e-08, + "logits/chosen": -2.2965614795684814, + "logits/rejected": -2.4639945030212402, + "logps/chosen": -383.33642578125, + "logps/rejected": -358.50543212890625, + "loss": 0.1284, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8799735307693481, + "rewards/margins": 3.522183418273926, + "rewards/rejected": -4.402156829833984, + "step": 6922 + }, + { + "epoch": 0.81, + "learning_rate": 5.927719381126727e-08, + "logits/chosen": -1.5388253927230835, + "logits/rejected": -1.7968069314956665, + "logps/chosen": -333.13311767578125, + "logps/rejected": -273.1431884765625, + "loss": 0.3751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9445818662643433, + "rewards/margins": 3.225402593612671, + "rewards/rejected": -4.169984340667725, + "step": 6923 + }, + { + "epoch": 0.81, + "learning_rate": 5.9241762135348995e-08, + "logits/chosen": -2.956477165222168, + "logits/rejected": -2.992743968963623, + "logps/chosen": -257.56658935546875, + "logps/rejected": -248.71029663085938, + "loss": 0.3286, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39945876598358154, + "rewards/margins": 2.502977132797241, + "rewards/rejected": -2.902435779571533, + "step": 6924 + }, + { + "epoch": 0.81, + "learning_rate": 5.920633045943073e-08, + "logits/chosen": -2.698195457458496, + "logits/rejected": -2.6481029987335205, + "logps/chosen": -122.56902313232422, + "logps/rejected": -185.09597778320312, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9146978259086609, + "rewards/margins": 2.650869131088257, + "rewards/rejected": -3.5655667781829834, + "step": 6925 + }, + { + "epoch": 0.81, + "learning_rate": 5.917089878351246e-08, + "logits/chosen": -2.4099297523498535, + "logits/rejected": -2.5513081550598145, + "logps/chosen": -296.16790771484375, + "logps/rejected": -227.76943969726562, + "loss": 0.4471, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2814745903015137, + "rewards/margins": 1.2777289152145386, + "rewards/rejected": -2.559203624725342, + "step": 6926 + }, + { + "epoch": 0.81, + "learning_rate": 5.913546710759418e-08, + "logits/chosen": -2.475584030151367, + "logits/rejected": -2.472195863723755, + "logps/chosen": -274.46026611328125, + "logps/rejected": -248.33135986328125, + "loss": 0.2885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44025570154190063, + "rewards/margins": 2.531245231628418, + "rewards/rejected": -2.971500873565674, + "step": 6927 + }, + { + "epoch": 0.81, + "learning_rate": 5.9100035431675917e-08, + "logits/chosen": -2.474320650100708, + "logits/rejected": -2.594431161880493, + "logps/chosen": -346.873046875, + "logps/rejected": -299.5714416503906, + "loss": 0.1501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03139614313840866, + "rewards/margins": 3.1379268169403076, + "rewards/rejected": -3.169322967529297, + "step": 6928 + }, + { + "epoch": 0.81, + "learning_rate": 5.9064603755757645e-08, + "logits/chosen": -2.1676878929138184, + "logits/rejected": -2.3060238361358643, + "logps/chosen": -427.7969970703125, + "logps/rejected": -309.10467529296875, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3296813666820526, + "rewards/margins": 3.145228147506714, + "rewards/rejected": -3.4749093055725098, + "step": 6929 + }, + { + "epoch": 0.81, + "learning_rate": 5.902917207983937e-08, + "logits/chosen": -2.9843757152557373, + "logits/rejected": -3.0082621574401855, + "logps/chosen": -170.35476684570312, + "logps/rejected": -226.33067321777344, + "loss": 0.4306, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5126721858978271, + "rewards/margins": 2.8143670558929443, + "rewards/rejected": -3.3270392417907715, + "step": 6930 + }, + { + "epoch": 0.81, + "learning_rate": 5.89937404039211e-08, + "logits/chosen": -2.053554058074951, + "logits/rejected": -2.3966469764709473, + "logps/chosen": -324.0078430175781, + "logps/rejected": -178.91098022460938, + "loss": 0.5117, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.945521354675293, + "rewards/margins": 0.8974461555480957, + "rewards/rejected": -1.8429676294326782, + "step": 6931 + }, + { + "epoch": 0.81, + "learning_rate": 5.895830872800283e-08, + "logits/chosen": -2.3926193714141846, + "logits/rejected": -2.1329751014709473, + "logps/chosen": -118.62220001220703, + "logps/rejected": -220.97366333007812, + "loss": 0.4281, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8356322050094604, + "rewards/margins": 1.2915868759155273, + "rewards/rejected": -2.1272189617156982, + "step": 6932 + }, + { + "epoch": 0.81, + "learning_rate": 5.892287705208456e-08, + "logits/chosen": -2.2731852531433105, + "logits/rejected": -2.4036998748779297, + "logps/chosen": -357.1069030761719, + "logps/rejected": -248.55514526367188, + "loss": 0.8515, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4061845541000366, + "rewards/margins": 0.8615427613258362, + "rewards/rejected": -2.2677271366119385, + "step": 6933 + }, + { + "epoch": 0.81, + "learning_rate": 5.888744537616629e-08, + "logits/chosen": -2.847926378250122, + "logits/rejected": -2.5771467685699463, + "logps/chosen": -261.82373046875, + "logps/rejected": -373.1896057128906, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9082393646240234, + "rewards/margins": 2.187087059020996, + "rewards/rejected": -3.0953264236450195, + "step": 6934 + }, + { + "epoch": 0.81, + "learning_rate": 5.885201370024802e-08, + "logits/chosen": -2.449023962020874, + "logits/rejected": -2.432584524154663, + "logps/chosen": -357.9031677246094, + "logps/rejected": -322.2187194824219, + "loss": 0.1107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08927503228187561, + "rewards/margins": 4.092731475830078, + "rewards/rejected": -4.182006359100342, + "step": 6935 + }, + { + "epoch": 0.81, + "learning_rate": 5.8816582024329747e-08, + "logits/chosen": -2.0975115299224854, + "logits/rejected": -2.222388982772827, + "logps/chosen": -396.83636474609375, + "logps/rejected": -346.7358093261719, + "loss": 0.4213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018300257623195648, + "rewards/margins": 1.7928481101989746, + "rewards/rejected": -1.774547815322876, + "step": 6936 + }, + { + "epoch": 0.81, + "learning_rate": 5.878115034841148e-08, + "logits/chosen": -2.14475417137146, + "logits/rejected": -2.1213645935058594, + "logps/chosen": -229.7939453125, + "logps/rejected": -331.1851501464844, + "loss": 0.7505, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4984033107757568, + "rewards/margins": 2.2515532970428467, + "rewards/rejected": -3.7499568462371826, + "step": 6937 + }, + { + "epoch": 0.81, + "learning_rate": 5.8745718672493204e-08, + "logits/chosen": -2.193063735961914, + "logits/rejected": -2.479222297668457, + "logps/chosen": -302.35992431640625, + "logps/rejected": -292.68658447265625, + "loss": 0.1759, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8758623600006104, + "rewards/margins": 2.327329397201538, + "rewards/rejected": -4.203191757202148, + "step": 6938 + }, + { + "epoch": 0.81, + "learning_rate": 5.871028699657493e-08, + "logits/chosen": -2.63381028175354, + "logits/rejected": -2.87282395362854, + "logps/chosen": -468.68096923828125, + "logps/rejected": -314.24664306640625, + "loss": 0.2902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5661655068397522, + "rewards/margins": 1.6350233554840088, + "rewards/rejected": -2.201188802719116, + "step": 6939 + }, + { + "epoch": 0.81, + "learning_rate": 5.867485532065667e-08, + "logits/chosen": -1.9521119594573975, + "logits/rejected": -2.0112180709838867, + "logps/chosen": -237.43124389648438, + "logps/rejected": -287.5439453125, + "loss": 0.284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5958864092826843, + "rewards/margins": 4.474835395812988, + "rewards/rejected": -5.0707221031188965, + "step": 6940 + }, + { + "epoch": 0.81, + "learning_rate": 5.863942364473839e-08, + "logits/chosen": -1.8490170240402222, + "logits/rejected": -2.0813403129577637, + "logps/chosen": -215.92764282226562, + "logps/rejected": -170.9210205078125, + "loss": 0.5767, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9153019785881042, + "rewards/margins": 0.8074325919151306, + "rewards/rejected": -1.7227346897125244, + "step": 6941 + }, + { + "epoch": 0.81, + "learning_rate": 5.860399196882012e-08, + "logits/chosen": -1.4860576391220093, + "logits/rejected": -1.8136402368545532, + "logps/chosen": -513.1156616210938, + "logps/rejected": -463.6302490234375, + "loss": 0.5689, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6015863418579102, + "rewards/margins": 2.1121268272399902, + "rewards/rejected": -2.7137131690979004, + "step": 6942 + }, + { + "epoch": 0.81, + "learning_rate": 5.8568560292901854e-08, + "logits/chosen": -2.519143581390381, + "logits/rejected": -2.3967480659484863, + "logps/chosen": -160.36354064941406, + "logps/rejected": -215.4005126953125, + "loss": 0.4802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6472029685974121, + "rewards/margins": 1.3045904636383057, + "rewards/rejected": -1.9517934322357178, + "step": 6943 + }, + { + "epoch": 0.81, + "learning_rate": 5.8533128616983576e-08, + "logits/chosen": -2.1394565105438232, + "logits/rejected": -2.2423088550567627, + "logps/chosen": -222.90554809570312, + "logps/rejected": -276.9129943847656, + "loss": 0.1413, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5134353637695312, + "rewards/margins": 2.8941218852996826, + "rewards/rejected": -4.407557487487793, + "step": 6944 + }, + { + "epoch": 0.81, + "learning_rate": 5.849769694106531e-08, + "logits/chosen": -2.6992509365081787, + "logits/rejected": -2.342311143875122, + "logps/chosen": -262.9388427734375, + "logps/rejected": -321.6473388671875, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.887996256351471, + "rewards/margins": 3.1814281940460205, + "rewards/rejected": -4.069424629211426, + "step": 6945 + }, + { + "epoch": 0.81, + "learning_rate": 5.846226526514704e-08, + "logits/chosen": -2.5511715412139893, + "logits/rejected": -2.504384756088257, + "logps/chosen": -440.843017578125, + "logps/rejected": -360.93505859375, + "loss": 0.444, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2203733921051025, + "rewards/margins": 2.7069571018218994, + "rewards/rejected": -3.927330493927002, + "step": 6946 + }, + { + "epoch": 0.81, + "learning_rate": 5.842683358922877e-08, + "logits/chosen": -2.394871950149536, + "logits/rejected": -2.6926422119140625, + "logps/chosen": -404.0493469238281, + "logps/rejected": -354.7840576171875, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4859156608581543, + "rewards/margins": 1.930237889289856, + "rewards/rejected": -2.4161536693573, + "step": 6947 + }, + { + "epoch": 0.81, + "learning_rate": 5.83914019133105e-08, + "logits/chosen": -2.629350423812866, + "logits/rejected": -2.7744522094726562, + "logps/chosen": -218.7945556640625, + "logps/rejected": -234.1666717529297, + "loss": 1.0267, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.618085503578186, + "rewards/margins": 2.453368663787842, + "rewards/rejected": -4.071454048156738, + "step": 6948 + }, + { + "epoch": 0.81, + "learning_rate": 5.835597023739223e-08, + "logits/chosen": -2.0143070220947266, + "logits/rejected": -2.1031734943389893, + "logps/chosen": -273.558349609375, + "logps/rejected": -304.2055969238281, + "loss": 0.3932, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32279425859451294, + "rewards/margins": 2.863375663757324, + "rewards/rejected": -3.1861696243286133, + "step": 6949 + }, + { + "epoch": 0.81, + "learning_rate": 5.8320538561473956e-08, + "logits/chosen": -1.755435585975647, + "logits/rejected": -1.8585846424102783, + "logps/chosen": -453.4921875, + "logps/rejected": -412.88421630859375, + "loss": 0.2282, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14517706632614136, + "rewards/margins": 2.7922329902648926, + "rewards/rejected": -2.9374101161956787, + "step": 6950 + }, + { + "epoch": 0.81, + "learning_rate": 5.828510688555569e-08, + "logits/chosen": -2.8303415775299072, + "logits/rejected": -2.6690359115600586, + "logps/chosen": -114.41722106933594, + "logps/rejected": -144.63766479492188, + "loss": 0.4695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6630533337593079, + "rewards/margins": 1.654935598373413, + "rewards/rejected": -2.317988872528076, + "step": 6951 + }, + { + "epoch": 0.81, + "learning_rate": 5.824967520963741e-08, + "logits/chosen": -1.5581797361373901, + "logits/rejected": -2.032939910888672, + "logps/chosen": -546.8501586914062, + "logps/rejected": -352.4197998046875, + "loss": 0.2493, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.253819465637207, + "rewards/margins": 1.9726307392120361, + "rewards/rejected": -3.226449966430664, + "step": 6952 + }, + { + "epoch": 0.81, + "learning_rate": 5.821424353371914e-08, + "logits/chosen": -1.9849361181259155, + "logits/rejected": -1.9882879257202148, + "logps/chosen": -262.4668884277344, + "logps/rejected": -340.5745849609375, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4014494717121124, + "rewards/margins": 4.52146053314209, + "rewards/rejected": -4.120010852813721, + "step": 6953 + }, + { + "epoch": 0.81, + "learning_rate": 5.817881185780088e-08, + "logits/chosen": -2.539609670639038, + "logits/rejected": -2.3371968269348145, + "logps/chosen": -211.05699157714844, + "logps/rejected": -311.50787353515625, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6640783548355103, + "rewards/margins": 2.834036111831665, + "rewards/rejected": -3.4981143474578857, + "step": 6954 + }, + { + "epoch": 0.81, + "learning_rate": 5.81433801818826e-08, + "logits/chosen": -2.537095546722412, + "logits/rejected": -2.7251555919647217, + "logps/chosen": -160.5239715576172, + "logps/rejected": -142.87669372558594, + "loss": 0.2907, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25215625762939453, + "rewards/margins": 1.9948405027389526, + "rewards/rejected": -2.2469968795776367, + "step": 6955 + }, + { + "epoch": 0.81, + "learning_rate": 5.810794850596433e-08, + "logits/chosen": -2.027521848678589, + "logits/rejected": -2.292147159576416, + "logps/chosen": -182.48348999023438, + "logps/rejected": -222.15223693847656, + "loss": 0.4966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6400614976882935, + "rewards/margins": 0.633939266204834, + "rewards/rejected": -1.274000644683838, + "step": 6956 + }, + { + "epoch": 0.81, + "learning_rate": 5.8072516830046063e-08, + "logits/chosen": -2.4824366569519043, + "logits/rejected": -2.710918664932251, + "logps/chosen": -543.2318115234375, + "logps/rejected": -336.8566589355469, + "loss": 0.1946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6931426525115967, + "rewards/margins": 2.9788331985473633, + "rewards/rejected": -3.671975612640381, + "step": 6957 + }, + { + "epoch": 0.81, + "learning_rate": 5.8037085154127785e-08, + "logits/chosen": -2.1887872219085693, + "logits/rejected": -2.159700393676758, + "logps/chosen": -253.4881134033203, + "logps/rejected": -321.21331787109375, + "loss": 0.6739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8223419189453125, + "rewards/margins": 1.7957539558410645, + "rewards/rejected": -2.618095636367798, + "step": 6958 + }, + { + "epoch": 0.81, + "learning_rate": 5.8001653478209514e-08, + "logits/chosen": -2.410140037536621, + "logits/rejected": -2.2583718299865723, + "logps/chosen": -144.30934143066406, + "logps/rejected": -254.79275512695312, + "loss": 0.2984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7381219267845154, + "rewards/margins": 1.9606542587280273, + "rewards/rejected": -2.6987760066986084, + "step": 6959 + }, + { + "epoch": 0.81, + "learning_rate": 5.796622180229125e-08, + "logits/chosen": -2.351207971572876, + "logits/rejected": -2.4365150928497314, + "logps/chosen": -176.10784912109375, + "logps/rejected": -222.60726928710938, + "loss": 0.2394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9226801991462708, + "rewards/margins": 3.5574963092803955, + "rewards/rejected": -4.48017692565918, + "step": 6960 + }, + { + "epoch": 0.81, + "learning_rate": 5.793079012637297e-08, + "logits/chosen": -2.2367289066314697, + "logits/rejected": -2.1401097774505615, + "logps/chosen": -192.02655029296875, + "logps/rejected": -235.5335693359375, + "loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.66761314868927, + "rewards/margins": 3.285983085632324, + "rewards/rejected": -3.953596591949463, + "step": 6961 + }, + { + "epoch": 0.81, + "learning_rate": 5.78953584504547e-08, + "logits/chosen": -2.0699310302734375, + "logits/rejected": -2.4004082679748535, + "logps/chosen": -423.71343994140625, + "logps/rejected": -211.35227966308594, + "loss": 0.6446, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3338202238082886, + "rewards/margins": 0.5856020450592041, + "rewards/rejected": -1.9194222688674927, + "step": 6962 + }, + { + "epoch": 0.81, + "learning_rate": 5.7859926774536436e-08, + "logits/chosen": -2.4559712409973145, + "logits/rejected": -2.4202823638916016, + "logps/chosen": -236.87326049804688, + "logps/rejected": -281.6687316894531, + "loss": 0.6556, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2997667193412781, + "rewards/margins": 0.7654582262039185, + "rewards/rejected": -1.0652248859405518, + "step": 6963 + }, + { + "epoch": 0.81, + "learning_rate": 5.7824495098618165e-08, + "logits/chosen": -2.825956106185913, + "logits/rejected": -2.5842437744140625, + "logps/chosen": -317.8209228515625, + "logps/rejected": -286.7721862792969, + "loss": 0.7433, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.102046012878418, + "rewards/margins": 0.9493186473846436, + "rewards/rejected": -2.0513644218444824, + "step": 6964 + }, + { + "epoch": 0.81, + "learning_rate": 5.7789063422699887e-08, + "logits/chosen": -2.3499741554260254, + "logits/rejected": -2.1329143047332764, + "logps/chosen": -338.93853759765625, + "logps/rejected": -415.06304931640625, + "loss": 0.3845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5139556527137756, + "rewards/margins": 1.5393109321594238, + "rewards/rejected": -2.0532665252685547, + "step": 6965 + }, + { + "epoch": 0.81, + "learning_rate": 5.775363174678162e-08, + "logits/chosen": -2.7221386432647705, + "logits/rejected": -2.8251757621765137, + "logps/chosen": -336.4991149902344, + "logps/rejected": -396.70465087890625, + "loss": 0.2425, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.231213927268982, + "rewards/margins": 3.9743237495422363, + "rewards/rejected": -5.205537796020508, + "step": 6966 + }, + { + "epoch": 0.81, + "learning_rate": 5.771820007086335e-08, + "logits/chosen": -1.8360257148742676, + "logits/rejected": -1.5685789585113525, + "logps/chosen": -148.1580047607422, + "logps/rejected": -249.30223083496094, + "loss": 0.5354, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6783114075660706, + "rewards/margins": 1.4771463871002197, + "rewards/rejected": -2.1554577350616455, + "step": 6967 + }, + { + "epoch": 0.81, + "learning_rate": 5.768276839494507e-08, + "logits/chosen": -2.081731081008911, + "logits/rejected": -2.281965970993042, + "logps/chosen": -515.6261596679688, + "logps/rejected": -361.91741943359375, + "loss": 0.4484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6904502511024475, + "rewards/margins": 3.29288649559021, + "rewards/rejected": -3.983336925506592, + "step": 6968 + }, + { + "epoch": 0.81, + "learning_rate": 5.764733671902681e-08, + "logits/chosen": -1.909963846206665, + "logits/rejected": -2.1901423931121826, + "logps/chosen": -425.1414794921875, + "logps/rejected": -275.903076171875, + "loss": 0.2499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19101133942604065, + "rewards/margins": 1.5922093391418457, + "rewards/rejected": -1.7832207679748535, + "step": 6969 + }, + { + "epoch": 0.81, + "learning_rate": 5.761190504310854e-08, + "logits/chosen": -1.9031894207000732, + "logits/rejected": -1.932436227798462, + "logps/chosen": -294.8926086425781, + "logps/rejected": -273.4130859375, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4738729000091553, + "rewards/margins": 0.09090644121170044, + "rewards/rejected": -1.56477952003479, + "step": 6970 + }, + { + "epoch": 0.81, + "learning_rate": 5.757647336719026e-08, + "logits/chosen": -2.0417063236236572, + "logits/rejected": -2.119220018386841, + "logps/chosen": -365.8646545410156, + "logps/rejected": -301.26751708984375, + "loss": 0.3706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5905189514160156, + "rewards/margins": 1.6636261940002441, + "rewards/rejected": -2.2541451454162598, + "step": 6971 + }, + { + "epoch": 0.81, + "learning_rate": 5.7541041691271994e-08, + "logits/chosen": -1.8935199975967407, + "logits/rejected": -2.1771726608276367, + "logps/chosen": -347.3951416015625, + "logps/rejected": -276.03814697265625, + "loss": 0.2335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.032300978899002075, + "rewards/margins": 2.722151517868042, + "rewards/rejected": -2.7544524669647217, + "step": 6972 + }, + { + "epoch": 0.81, + "learning_rate": 5.750561001535372e-08, + "logits/chosen": -3.028144359588623, + "logits/rejected": -3.010406017303467, + "logps/chosen": -86.32718658447266, + "logps/rejected": -138.75921630859375, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4675876796245575, + "rewards/margins": 2.6173954010009766, + "rewards/rejected": -3.0849833488464355, + "step": 6973 + }, + { + "epoch": 0.81, + "learning_rate": 5.747017833943545e-08, + "logits/chosen": -2.205203056335449, + "logits/rejected": -2.6454505920410156, + "logps/chosen": -347.1150817871094, + "logps/rejected": -281.47564697265625, + "loss": 0.3951, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5696680545806885, + "rewards/margins": 1.8272948265075684, + "rewards/rejected": -3.3969626426696777, + "step": 6974 + }, + { + "epoch": 0.81, + "learning_rate": 5.743474666351718e-08, + "logits/chosen": -2.48429799079895, + "logits/rejected": -2.5060150623321533, + "logps/chosen": -224.866455078125, + "logps/rejected": -264.07171630859375, + "loss": 0.2524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5502662062644958, + "rewards/margins": 2.526099681854248, + "rewards/rejected": -3.0763659477233887, + "step": 6975 + }, + { + "epoch": 0.81, + "learning_rate": 5.739931498759891e-08, + "logits/chosen": -2.0368740558624268, + "logits/rejected": -1.939455270767212, + "logps/chosen": -359.8154296875, + "logps/rejected": -361.9418029785156, + "loss": 0.7506, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8221428394317627, + "rewards/margins": 3.0155766010284424, + "rewards/rejected": -3.837719202041626, + "step": 6976 + }, + { + "epoch": 0.81, + "learning_rate": 5.736388331168064e-08, + "logits/chosen": -2.4694252014160156, + "logits/rejected": -2.4764902591705322, + "logps/chosen": -334.063720703125, + "logps/rejected": -431.93634033203125, + "loss": 0.1617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7500292658805847, + "rewards/margins": 6.188802242279053, + "rewards/rejected": -6.938831329345703, + "step": 6977 + }, + { + "epoch": 0.81, + "learning_rate": 5.7328451635762374e-08, + "logits/chosen": -2.681795120239258, + "logits/rejected": -2.66719913482666, + "logps/chosen": -293.13446044921875, + "logps/rejected": -292.93341064453125, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28624939918518066, + "rewards/margins": 2.820626735687256, + "rewards/rejected": -3.1068761348724365, + "step": 6978 + }, + { + "epoch": 0.81, + "learning_rate": 5.7293019959844096e-08, + "logits/chosen": -2.2025866508483887, + "logits/rejected": -2.769169330596924, + "logps/chosen": -187.2576904296875, + "logps/rejected": -163.6160125732422, + "loss": 0.3085, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9636594653129578, + "rewards/margins": 2.108764171600342, + "rewards/rejected": -3.0724236965179443, + "step": 6979 + }, + { + "epoch": 0.81, + "learning_rate": 5.725758828392583e-08, + "logits/chosen": -2.7277700901031494, + "logits/rejected": -2.8943960666656494, + "logps/chosen": -342.13336181640625, + "logps/rejected": -241.7371368408203, + "loss": 0.1962, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4444226026535034, + "rewards/margins": 3.274862766265869, + "rewards/rejected": -3.719285011291504, + "step": 6980 + }, + { + "epoch": 0.81, + "learning_rate": 5.722215660800756e-08, + "logits/chosen": -2.5525505542755127, + "logits/rejected": -2.734325647354126, + "logps/chosen": -472.78790283203125, + "logps/rejected": -593.6925048828125, + "loss": 0.4059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4447324872016907, + "rewards/margins": 1.5820753574371338, + "rewards/rejected": -2.0268077850341797, + "step": 6981 + }, + { + "epoch": 0.81, + "learning_rate": 5.718672493208928e-08, + "logits/chosen": -2.4413208961486816, + "logits/rejected": -2.4746201038360596, + "logps/chosen": -173.83233642578125, + "logps/rejected": -183.9649200439453, + "loss": 0.5021, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5750913619995117, + "rewards/margins": 0.8024642467498779, + "rewards/rejected": -1.3775556087493896, + "step": 6982 + }, + { + "epoch": 0.81, + "learning_rate": 5.715129325617102e-08, + "logits/chosen": -2.652353048324585, + "logits/rejected": -2.5170845985412598, + "logps/chosen": -278.14178466796875, + "logps/rejected": -317.2783203125, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0704457759857178, + "rewards/margins": 3.7457871437072754, + "rewards/rejected": -4.816232681274414, + "step": 6983 + }, + { + "epoch": 0.81, + "learning_rate": 5.7115861580252746e-08, + "logits/chosen": -2.3314056396484375, + "logits/rejected": -2.5843279361724854, + "logps/chosen": -298.65740966796875, + "logps/rejected": -256.2784118652344, + "loss": 0.7606, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.331590175628662, + "rewards/margins": 1.5347625017166138, + "rewards/rejected": -2.8663525581359863, + "step": 6984 + }, + { + "epoch": 0.81, + "learning_rate": 5.708042990433447e-08, + "logits/chosen": -2.110058307647705, + "logits/rejected": -2.342405080795288, + "logps/chosen": -254.48764038085938, + "logps/rejected": -217.17752075195312, + "loss": 0.2598, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8226504921913147, + "rewards/margins": 1.8612288236618042, + "rewards/rejected": -2.6838793754577637, + "step": 6985 + }, + { + "epoch": 0.81, + "learning_rate": 5.7044998228416203e-08, + "logits/chosen": -2.2351136207580566, + "logits/rejected": -2.243980646133423, + "logps/chosen": -218.61761474609375, + "logps/rejected": -141.64718627929688, + "loss": 0.7025, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.918641984462738, + "rewards/margins": 0.7701011300086975, + "rewards/rejected": -1.688743233680725, + "step": 6986 + }, + { + "epoch": 0.81, + "learning_rate": 5.700956655249793e-08, + "logits/chosen": -2.4341351985931396, + "logits/rejected": -2.427731513977051, + "logps/chosen": -289.51953125, + "logps/rejected": -417.5943298339844, + "loss": 0.7309, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1060125827789307, + "rewards/margins": 0.12258338928222656, + "rewards/rejected": -1.2285959720611572, + "step": 6987 + }, + { + "epoch": 0.81, + "learning_rate": 5.6974134876579654e-08, + "logits/chosen": -2.3366141319274902, + "logits/rejected": -2.684650421142578, + "logps/chosen": -194.67318725585938, + "logps/rejected": -231.68618774414062, + "loss": 0.1379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8100523948669434, + "rewards/margins": 4.066460132598877, + "rewards/rejected": -4.87651252746582, + "step": 6988 + }, + { + "epoch": 0.81, + "learning_rate": 5.693870320066139e-08, + "logits/chosen": -2.3096301555633545, + "logits/rejected": -2.2615575790405273, + "logps/chosen": -282.6059265136719, + "logps/rejected": -341.27813720703125, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4463968276977539, + "rewards/margins": 4.804548740386963, + "rewards/rejected": -5.250945568084717, + "step": 6989 + }, + { + "epoch": 0.81, + "learning_rate": 5.690327152474312e-08, + "logits/chosen": -2.8228652477264404, + "logits/rejected": -2.753466844558716, + "logps/chosen": -333.6346130371094, + "logps/rejected": -189.7689208984375, + "loss": 1.0208, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1062802076339722, + "rewards/margins": 1.4289377927780151, + "rewards/rejected": -2.5352180004119873, + "step": 6990 + }, + { + "epoch": 0.81, + "learning_rate": 5.686783984882485e-08, + "logits/chosen": -2.28448486328125, + "logits/rejected": -2.429816484451294, + "logps/chosen": -344.054443359375, + "logps/rejected": -302.84002685546875, + "loss": 0.296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8743220567703247, + "rewards/margins": 1.6704249382019043, + "rewards/rejected": -2.5447468757629395, + "step": 6991 + }, + { + "epoch": 0.81, + "learning_rate": 5.6832408172906576e-08, + "logits/chosen": -1.7741941213607788, + "logits/rejected": -1.9213895797729492, + "logps/chosen": -356.81390380859375, + "logps/rejected": -326.2787170410156, + "loss": 0.4512, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2742231488227844, + "rewards/margins": 2.159553050994873, + "rewards/rejected": -2.4337761402130127, + "step": 6992 + }, + { + "epoch": 0.81, + "learning_rate": 5.6796976496988305e-08, + "logits/chosen": -2.385751724243164, + "logits/rejected": -2.6124587059020996, + "logps/chosen": -289.2515869140625, + "logps/rejected": -192.94003295898438, + "loss": 0.5771, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6496453285217285, + "rewards/margins": 1.4024345874786377, + "rewards/rejected": -2.052079916000366, + "step": 6993 + }, + { + "epoch": 0.81, + "learning_rate": 5.6761544821070033e-08, + "logits/chosen": -2.5040743350982666, + "logits/rejected": -2.459688663482666, + "logps/chosen": -94.85856628417969, + "logps/rejected": -174.40863037109375, + "loss": 0.1239, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1902483701705933, + "rewards/margins": 3.0593738555908203, + "rewards/rejected": -4.249621868133545, + "step": 6994 + }, + { + "epoch": 0.81, + "learning_rate": 5.672611314515177e-08, + "logits/chosen": -2.192959785461426, + "logits/rejected": -2.2895851135253906, + "logps/chosen": -411.8631591796875, + "logps/rejected": -376.2235412597656, + "loss": 0.7831, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6120821237564087, + "rewards/margins": 1.1904537677764893, + "rewards/rejected": -2.8025360107421875, + "step": 6995 + }, + { + "epoch": 0.81, + "learning_rate": 5.669068146923349e-08, + "logits/chosen": -2.1508328914642334, + "logits/rejected": -2.013137102127075, + "logps/chosen": -330.58209228515625, + "logps/rejected": -349.7351989746094, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23129160702228546, + "rewards/margins": 2.1678214073181152, + "rewards/rejected": -2.399113178253174, + "step": 6996 + }, + { + "epoch": 0.81, + "learning_rate": 5.665524979331522e-08, + "logits/chosen": -1.7649085521697998, + "logits/rejected": -2.03485369682312, + "logps/chosen": -335.2139587402344, + "logps/rejected": -347.0346374511719, + "loss": 0.351, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1062867641448975, + "rewards/margins": 2.589789390563965, + "rewards/rejected": -3.6960763931274414, + "step": 6997 + }, + { + "epoch": 0.81, + "learning_rate": 5.6619818117396955e-08, + "logits/chosen": -2.5292820930480957, + "logits/rejected": -2.210775136947632, + "logps/chosen": -297.22491455078125, + "logps/rejected": -345.84759521484375, + "loss": 0.8757, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2189902067184448, + "rewards/margins": 0.3260708451271057, + "rewards/rejected": -1.5450611114501953, + "step": 6998 + }, + { + "epoch": 0.81, + "learning_rate": 5.658438644147868e-08, + "logits/chosen": -1.854620337486267, + "logits/rejected": -2.031958818435669, + "logps/chosen": -281.6073303222656, + "logps/rejected": -204.7700653076172, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7884589433670044, + "rewards/margins": 1.7698538303375244, + "rewards/rejected": -2.5583126544952393, + "step": 6999 + }, + { + "epoch": 0.81, + "learning_rate": 5.6548954765560406e-08, + "logits/chosen": -2.5460143089294434, + "logits/rejected": -2.656071901321411, + "logps/chosen": -207.1037139892578, + "logps/rejected": -218.82284545898438, + "loss": 0.2963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7719509601593018, + "rewards/margins": 2.0140271186828613, + "rewards/rejected": -2.785978317260742, + "step": 7000 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -1.753435730934143, + "eval_logits/rejected": -1.7546595335006714, + "eval_logps/chosen": -278.61065673828125, + "eval_logps/rejected": -279.0486145019531, + "eval_loss": 0.3661438524723053, + "eval_rewards/accuracies": 0.8477011322975159, + "eval_rewards/chosen": -0.6445215344429016, + "eval_rewards/margins": 2.1596851348876953, + "eval_rewards/rejected": -2.8042068481445312, + "eval_runtime": 238.2166, + "eval_samples_per_second": 2.918, + "eval_steps_per_second": 1.461, + "step": 7000 + }, + { + "epoch": 0.81, + "learning_rate": 5.651352308964214e-08, + "logits/chosen": -2.743354320526123, + "logits/rejected": -2.6290884017944336, + "logps/chosen": -310.0019226074219, + "logps/rejected": -380.6521911621094, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19075919687747955, + "rewards/margins": 2.9790518283843994, + "rewards/rejected": -3.1698110103607178, + "step": 7001 + }, + { + "epoch": 0.81, + "learning_rate": 5.647809141372386e-08, + "logits/chosen": -2.221585750579834, + "logits/rejected": -2.1962692737579346, + "logps/chosen": -124.87165832519531, + "logps/rejected": -226.46905517578125, + "loss": 0.3811, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0817298889160156, + "rewards/margins": 2.3608558177948, + "rewards/rejected": -3.4425857067108154, + "step": 7002 + }, + { + "epoch": 0.81, + "learning_rate": 5.644265973780559e-08, + "logits/chosen": -2.042593002319336, + "logits/rejected": -1.8850102424621582, + "logps/chosen": -208.573974609375, + "logps/rejected": -140.6519012451172, + "loss": 0.9635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0641510486602783, + "rewards/margins": 0.6988165378570557, + "rewards/rejected": -1.762967586517334, + "step": 7003 + }, + { + "epoch": 0.81, + "learning_rate": 5.640722806188733e-08, + "logits/chosen": -2.2030110359191895, + "logits/rejected": -1.848442792892456, + "logps/chosen": -286.32025146484375, + "logps/rejected": -423.1905517578125, + "loss": 0.4171, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1458908319473267, + "rewards/margins": 2.1772255897521973, + "rewards/rejected": -3.3231163024902344, + "step": 7004 + }, + { + "epoch": 0.81, + "learning_rate": 5.6371796385969056e-08, + "logits/chosen": -2.3112523555755615, + "logits/rejected": -2.5608091354370117, + "logps/chosen": -550.8265991210938, + "logps/rejected": -299.2526550292969, + "loss": 0.2359, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0494734048843384, + "rewards/margins": 2.820650577545166, + "rewards/rejected": -3.870123863220215, + "step": 7005 + }, + { + "epoch": 0.82, + "learning_rate": 5.633636471005078e-08, + "logits/chosen": -2.3773574829101562, + "logits/rejected": -2.5010077953338623, + "logps/chosen": -334.86102294921875, + "logps/rejected": -260.7117919921875, + "loss": 1.0688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8414567708969116, + "rewards/margins": 0.5532335042953491, + "rewards/rejected": -1.3946901559829712, + "step": 7006 + }, + { + "epoch": 0.82, + "learning_rate": 5.6300933034132514e-08, + "logits/chosen": -2.6330654621124268, + "logits/rejected": -2.5602517127990723, + "logps/chosen": -247.51730346679688, + "logps/rejected": -169.3246612548828, + "loss": 0.1369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8064134120941162, + "rewards/margins": 2.7514054775238037, + "rewards/rejected": -3.55781888961792, + "step": 7007 + }, + { + "epoch": 0.82, + "learning_rate": 5.626550135821424e-08, + "logits/chosen": -2.245460271835327, + "logits/rejected": -2.1803512573242188, + "logps/chosen": -277.1117858886719, + "logps/rejected": -203.94395446777344, + "loss": 0.7664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5660412311553955, + "rewards/margins": 0.39756739139556885, + "rewards/rejected": -0.9636086225509644, + "step": 7008 + }, + { + "epoch": 0.82, + "learning_rate": 5.6230069682295964e-08, + "logits/chosen": -2.5432655811309814, + "logits/rejected": -2.6728250980377197, + "logps/chosen": -155.08990478515625, + "logps/rejected": -159.86756896972656, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9497049450874329, + "rewards/margins": 0.6727081537246704, + "rewards/rejected": -1.6224130392074585, + "step": 7009 + }, + { + "epoch": 0.82, + "learning_rate": 5.61946380063777e-08, + "logits/chosen": -2.2363059520721436, + "logits/rejected": -2.3456993103027344, + "logps/chosen": -383.2632751464844, + "logps/rejected": -263.16253662109375, + "loss": 0.451, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0815014839172363, + "rewards/margins": 0.9606536626815796, + "rewards/rejected": -2.0421552658081055, + "step": 7010 + }, + { + "epoch": 0.82, + "learning_rate": 5.615920633045943e-08, + "logits/chosen": -2.629729986190796, + "logits/rejected": -2.7211196422576904, + "logps/chosen": -224.36520385742188, + "logps/rejected": -229.0583953857422, + "loss": 0.2655, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8444919586181641, + "rewards/margins": 3.0238380432128906, + "rewards/rejected": -3.868330478668213, + "step": 7011 + }, + { + "epoch": 0.82, + "learning_rate": 5.612377465454115e-08, + "logits/chosen": -2.5971012115478516, + "logits/rejected": -2.702526330947876, + "logps/chosen": -181.5945587158203, + "logps/rejected": -137.58251953125, + "loss": 0.709, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8414369225502014, + "rewards/margins": 0.41044557094573975, + "rewards/rejected": -1.2518823146820068, + "step": 7012 + }, + { + "epoch": 0.82, + "learning_rate": 5.6088342978622886e-08, + "logits/chosen": -2.225965738296509, + "logits/rejected": -2.447756052017212, + "logps/chosen": -319.2898864746094, + "logps/rejected": -259.48431396484375, + "loss": 0.2653, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0668734312057495, + "rewards/margins": 2.6885147094726562, + "rewards/rejected": -3.7553884983062744, + "step": 7013 + }, + { + "epoch": 0.82, + "learning_rate": 5.6052911302704615e-08, + "logits/chosen": -1.9482793807983398, + "logits/rejected": -2.2718496322631836, + "logps/chosen": -287.05230712890625, + "logps/rejected": -219.4978790283203, + "loss": 0.4544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5420180559158325, + "rewards/margins": 1.1432679891586304, + "rewards/rejected": -1.685286045074463, + "step": 7014 + }, + { + "epoch": 0.82, + "learning_rate": 5.6017479626786344e-08, + "logits/chosen": -2.3629205226898193, + "logits/rejected": -2.2595181465148926, + "logps/chosen": -171.73695373535156, + "logps/rejected": -192.44476318359375, + "loss": 0.446, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4151574969291687, + "rewards/margins": 2.3737101554870605, + "rewards/rejected": -2.788867473602295, + "step": 7015 + }, + { + "epoch": 0.82, + "learning_rate": 5.598204795086807e-08, + "logits/chosen": -2.5353453159332275, + "logits/rejected": -2.3594565391540527, + "logps/chosen": -529.4903564453125, + "logps/rejected": -267.05755615234375, + "loss": 0.5616, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03328150510787964, + "rewards/margins": 1.236585259437561, + "rewards/rejected": -1.2033036947250366, + "step": 7016 + }, + { + "epoch": 0.82, + "learning_rate": 5.59466162749498e-08, + "logits/chosen": -2.343073606491089, + "logits/rejected": -2.3654329776763916, + "logps/chosen": -326.5641174316406, + "logps/rejected": -298.3931884765625, + "loss": 0.416, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9496845602989197, + "rewards/margins": 2.2766273021698, + "rewards/rejected": -3.226311683654785, + "step": 7017 + }, + { + "epoch": 0.82, + "learning_rate": 5.5911184599031536e-08, + "logits/chosen": -2.478144884109497, + "logits/rejected": -2.4560585021972656, + "logps/chosen": -233.76255798339844, + "logps/rejected": -224.95260620117188, + "loss": 0.4352, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4216960668563843, + "rewards/margins": 1.595523476600647, + "rewards/rejected": -3.0172197818756104, + "step": 7018 + }, + { + "epoch": 0.82, + "learning_rate": 5.587575292311326e-08, + "logits/chosen": -1.8755110502243042, + "logits/rejected": -1.983034372329712, + "logps/chosen": -324.5708923339844, + "logps/rejected": -320.1654357910156, + "loss": 0.4062, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8342398405075073, + "rewards/margins": 2.1243019104003906, + "rewards/rejected": -2.9585416316986084, + "step": 7019 + }, + { + "epoch": 0.82, + "learning_rate": 5.584032124719499e-08, + "logits/chosen": -1.7058281898498535, + "logits/rejected": -1.7686514854431152, + "logps/chosen": -289.7044677734375, + "logps/rejected": -196.11717224121094, + "loss": 0.2019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28644055128097534, + "rewards/margins": 1.9918581247329712, + "rewards/rejected": -1.705417513847351, + "step": 7020 + }, + { + "epoch": 0.82, + "learning_rate": 5.580488957127672e-08, + "logits/chosen": -2.622615098953247, + "logits/rejected": -2.713175058364868, + "logps/chosen": -391.7327880859375, + "logps/rejected": -347.2658386230469, + "loss": 0.139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6111325621604919, + "rewards/margins": 3.5223305225372314, + "rewards/rejected": -4.133462905883789, + "step": 7021 + }, + { + "epoch": 0.82, + "learning_rate": 5.576945789535845e-08, + "logits/chosen": -2.152167320251465, + "logits/rejected": -2.2246313095092773, + "logps/chosen": -353.34466552734375, + "logps/rejected": -236.0394287109375, + "loss": 0.5148, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0257201194763184, + "rewards/margins": 1.7952520847320557, + "rewards/rejected": -2.820972204208374, + "step": 7022 + }, + { + "epoch": 0.82, + "learning_rate": 5.5734026219440173e-08, + "logits/chosen": -2.176028251647949, + "logits/rejected": -2.0144805908203125, + "logps/chosen": -129.39865112304688, + "logps/rejected": -237.68597412109375, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6744954586029053, + "rewards/margins": 1.2841802835464478, + "rewards/rejected": -2.9586758613586426, + "step": 7023 + }, + { + "epoch": 0.82, + "learning_rate": 5.569859454352191e-08, + "logits/chosen": -2.55669903755188, + "logits/rejected": -2.5182044506073, + "logps/chosen": -172.07899475097656, + "logps/rejected": -322.5699462890625, + "loss": 0.2786, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5288358926773071, + "rewards/margins": 2.447700023651123, + "rewards/rejected": -2.9765360355377197, + "step": 7024 + }, + { + "epoch": 0.82, + "learning_rate": 5.566316286760364e-08, + "logits/chosen": -1.7736670970916748, + "logits/rejected": -1.938071370124817, + "logps/chosen": -460.9855651855469, + "logps/rejected": -420.30926513671875, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5788826942443848, + "rewards/margins": 3.5513153076171875, + "rewards/rejected": -4.130197525024414, + "step": 7025 + }, + { + "epoch": 0.82, + "learning_rate": 5.562773119168536e-08, + "logits/chosen": -2.717526912689209, + "logits/rejected": -2.8961517810821533, + "logps/chosen": -462.6636962890625, + "logps/rejected": -312.0180358886719, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.041585952043533325, + "rewards/margins": 2.156134843826294, + "rewards/rejected": -2.197720766067505, + "step": 7026 + }, + { + "epoch": 0.82, + "learning_rate": 5.5592299515767095e-08, + "logits/chosen": -2.22467041015625, + "logits/rejected": -2.1989736557006836, + "logps/chosen": -119.86175537109375, + "logps/rejected": -230.80935668945312, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4714958667755127, + "rewards/margins": 3.5385360717773438, + "rewards/rejected": -4.010031700134277, + "step": 7027 + }, + { + "epoch": 0.82, + "learning_rate": 5.5556867839848824e-08, + "logits/chosen": -2.4772493839263916, + "logits/rejected": -2.5116524696350098, + "logps/chosen": -290.0716552734375, + "logps/rejected": -306.0292053222656, + "loss": 0.3861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9326963424682617, + "rewards/margins": 3.4104819297790527, + "rewards/rejected": -4.343177795410156, + "step": 7028 + }, + { + "epoch": 0.82, + "learning_rate": 5.5521436163930546e-08, + "logits/chosen": -2.9221065044403076, + "logits/rejected": -2.913454294204712, + "logps/chosen": -147.1915740966797, + "logps/rejected": -143.05372619628906, + "loss": 0.2602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.759903073310852, + "rewards/margins": 2.213841199874878, + "rewards/rejected": -2.9737439155578613, + "step": 7029 + }, + { + "epoch": 0.82, + "learning_rate": 5.548600448801228e-08, + "logits/chosen": -2.2329931259155273, + "logits/rejected": -2.1490471363067627, + "logps/chosen": -112.16510009765625, + "logps/rejected": -187.5654754638672, + "loss": 0.3718, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37015512585639954, + "rewards/margins": 3.1372101306915283, + "rewards/rejected": -3.5073654651641846, + "step": 7030 + }, + { + "epoch": 0.82, + "learning_rate": 5.545057281209401e-08, + "logits/chosen": -2.2402210235595703, + "logits/rejected": -2.347588062286377, + "logps/chosen": -270.578369140625, + "logps/rejected": -285.31292724609375, + "loss": 0.431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5898099541664124, + "rewards/margins": 2.670896530151367, + "rewards/rejected": -3.260706663131714, + "step": 7031 + }, + { + "epoch": 0.82, + "learning_rate": 5.541514113617574e-08, + "logits/chosen": -2.4829442501068115, + "logits/rejected": -2.5313754081726074, + "logps/chosen": -329.05706787109375, + "logps/rejected": -371.6969909667969, + "loss": 0.3045, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4001174569129944, + "rewards/margins": 2.428717613220215, + "rewards/rejected": -2.8288350105285645, + "step": 7032 + }, + { + "epoch": 0.82, + "learning_rate": 5.537970946025747e-08, + "logits/chosen": -2.606682300567627, + "logits/rejected": -2.4086666107177734, + "logps/chosen": -106.96056365966797, + "logps/rejected": -249.67955017089844, + "loss": 0.5171, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7274596691131592, + "rewards/margins": 3.0058183670043945, + "rewards/rejected": -3.733278512954712, + "step": 7033 + }, + { + "epoch": 0.82, + "learning_rate": 5.5344277784339196e-08, + "logits/chosen": -2.0438098907470703, + "logits/rejected": -2.070625066757202, + "logps/chosen": -241.16282653808594, + "logps/rejected": -185.55960083007812, + "loss": 0.3916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8095307350158691, + "rewards/margins": 2.1999406814575195, + "rewards/rejected": -3.0094711780548096, + "step": 7034 + }, + { + "epoch": 0.82, + "learning_rate": 5.5308846108420925e-08, + "logits/chosen": -2.70963716506958, + "logits/rejected": -2.7681126594543457, + "logps/chosen": -330.4197692871094, + "logps/rejected": -348.00201416015625, + "loss": 0.2456, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.312230110168457, + "rewards/margins": 2.532076358795166, + "rewards/rejected": -3.844306468963623, + "step": 7035 + }, + { + "epoch": 0.82, + "learning_rate": 5.527341443250266e-08, + "logits/chosen": -2.6366851329803467, + "logits/rejected": -2.638327121734619, + "logps/chosen": -248.55072021484375, + "logps/rejected": -247.57936096191406, + "loss": 0.2866, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2234104871749878, + "rewards/margins": 1.2410589456558228, + "rewards/rejected": -2.4644694328308105, + "step": 7036 + }, + { + "epoch": 0.82, + "learning_rate": 5.523798275658438e-08, + "logits/chosen": -2.486292839050293, + "logits/rejected": -2.495518445968628, + "logps/chosen": -190.3717041015625, + "logps/rejected": -179.67111206054688, + "loss": 0.4753, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1203267574310303, + "rewards/margins": 0.7707808017730713, + "rewards/rejected": -1.8911075592041016, + "step": 7037 + }, + { + "epoch": 0.82, + "learning_rate": 5.520255108066611e-08, + "logits/chosen": -2.41007661819458, + "logits/rejected": -2.3945388793945312, + "logps/chosen": -156.92514038085938, + "logps/rejected": -333.22021484375, + "loss": 0.4126, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.296011209487915, + "rewards/margins": 2.5217299461364746, + "rewards/rejected": -3.8177411556243896, + "step": 7038 + }, + { + "epoch": 0.82, + "learning_rate": 5.5167119404747847e-08, + "logits/chosen": -2.670694351196289, + "logits/rejected": -2.8264575004577637, + "logps/chosen": -317.3256530761719, + "logps/rejected": -305.6164245605469, + "loss": 0.5345, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2397688627243042, + "rewards/margins": 1.3730835914611816, + "rewards/rejected": -2.6128523349761963, + "step": 7039 + }, + { + "epoch": 0.82, + "learning_rate": 5.513168772882957e-08, + "logits/chosen": -2.3833930492401123, + "logits/rejected": -2.590618848800659, + "logps/chosen": -333.7469482421875, + "logps/rejected": -300.4892578125, + "loss": 0.1947, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0649809837341309, + "rewards/margins": 2.1104536056518555, + "rewards/rejected": -3.1754345893859863, + "step": 7040 + }, + { + "epoch": 0.82, + "learning_rate": 5.50962560529113e-08, + "logits/chosen": -2.313477039337158, + "logits/rejected": -2.2493460178375244, + "logps/chosen": -240.555908203125, + "logps/rejected": -345.084716796875, + "loss": 0.1956, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7639114856719971, + "rewards/margins": 2.477419137954712, + "rewards/rejected": -3.24133038520813, + "step": 7041 + }, + { + "epoch": 0.82, + "learning_rate": 5.506082437699303e-08, + "logits/chosen": -1.6895332336425781, + "logits/rejected": -1.8515617847442627, + "logps/chosen": -353.5566101074219, + "logps/rejected": -286.52374267578125, + "loss": 0.4794, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41857266426086426, + "rewards/margins": 2.7628512382507324, + "rewards/rejected": -3.1814239025115967, + "step": 7042 + }, + { + "epoch": 0.82, + "learning_rate": 5.5025392701074755e-08, + "logits/chosen": -2.565199851989746, + "logits/rejected": -2.536207675933838, + "logps/chosen": -103.2051010131836, + "logps/rejected": -129.1919403076172, + "loss": 0.3441, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4302772283554077, + "rewards/margins": 1.1429134607315063, + "rewards/rejected": -1.5731905698776245, + "step": 7043 + }, + { + "epoch": 0.82, + "learning_rate": 5.4989961025156484e-08, + "logits/chosen": -1.9578036069869995, + "logits/rejected": -1.9700123071670532, + "logps/chosen": -213.4305877685547, + "logps/rejected": -243.66778564453125, + "loss": 0.5272, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1600946187973022, + "rewards/margins": 1.013960599899292, + "rewards/rejected": -2.1740550994873047, + "step": 7044 + }, + { + "epoch": 0.82, + "learning_rate": 5.495452934923822e-08, + "logits/chosen": -2.321535587310791, + "logits/rejected": -2.4288549423217773, + "logps/chosen": -417.1219482421875, + "logps/rejected": -320.987060546875, + "loss": 0.1949, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44053545594215393, + "rewards/margins": 2.9639081954956055, + "rewards/rejected": -3.4044437408447266, + "step": 7045 + }, + { + "epoch": 0.82, + "learning_rate": 5.491909767331995e-08, + "logits/chosen": -2.1864218711853027, + "logits/rejected": -2.347222089767456, + "logps/chosen": -143.6334686279297, + "logps/rejected": -134.71881103515625, + "loss": 0.3098, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3129240870475769, + "rewards/margins": 1.6892471313476562, + "rewards/rejected": -2.002171277999878, + "step": 7046 + }, + { + "epoch": 0.82, + "learning_rate": 5.488366599740167e-08, + "logits/chosen": -2.1925387382507324, + "logits/rejected": -2.4267027378082275, + "logps/chosen": -480.58013916015625, + "logps/rejected": -379.0689392089844, + "loss": 0.5811, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.161381483078003, + "rewards/margins": 1.4451416730880737, + "rewards/rejected": -2.606523275375366, + "step": 7047 + }, + { + "epoch": 0.82, + "learning_rate": 5.4848234321483405e-08, + "logits/chosen": -1.9338150024414062, + "logits/rejected": -2.2091760635375977, + "logps/chosen": -444.70703125, + "logps/rejected": -289.3113708496094, + "loss": 0.4883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3758866786956787, + "rewards/margins": 1.1049458980560303, + "rewards/rejected": -1.480832576751709, + "step": 7048 + }, + { + "epoch": 0.82, + "learning_rate": 5.4812802645565134e-08, + "logits/chosen": -2.5573129653930664, + "logits/rejected": -2.515550136566162, + "logps/chosen": -169.90814208984375, + "logps/rejected": -291.0963439941406, + "loss": 0.2956, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7596360445022583, + "rewards/margins": 2.345292806625366, + "rewards/rejected": -3.104928493499756, + "step": 7049 + }, + { + "epoch": 0.82, + "learning_rate": 5.4777370969646856e-08, + "logits/chosen": -1.9606139659881592, + "logits/rejected": -2.335003614425659, + "logps/chosen": -489.306396484375, + "logps/rejected": -321.69708251953125, + "loss": 0.7757, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4326328039169312, + "rewards/margins": 1.1716562509536743, + "rewards/rejected": -2.6042890548706055, + "step": 7050 + }, + { + "epoch": 0.82, + "learning_rate": 5.474193929372859e-08, + "logits/chosen": -2.680314540863037, + "logits/rejected": -2.506558418273926, + "logps/chosen": -192.56076049804688, + "logps/rejected": -215.64962768554688, + "loss": 0.2666, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7757855653762817, + "rewards/margins": 2.044389247894287, + "rewards/rejected": -2.8201746940612793, + "step": 7051 + }, + { + "epoch": 0.82, + "learning_rate": 5.470650761781032e-08, + "logits/chosen": -2.650207281112671, + "logits/rejected": -2.495008945465088, + "logps/chosen": -69.27442932128906, + "logps/rejected": -211.58041381835938, + "loss": 0.1116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09619168937206268, + "rewards/margins": 3.1443819999694824, + "rewards/rejected": -3.0481905937194824, + "step": 7052 + }, + { + "epoch": 0.82, + "learning_rate": 5.4671075941892056e-08, + "logits/chosen": -3.022217035293579, + "logits/rejected": -3.001046895980835, + "logps/chosen": -316.37213134765625, + "logps/rejected": -283.6966552734375, + "loss": 0.3275, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6815279722213745, + "rewards/margins": 2.5745835304260254, + "rewards/rejected": -3.2561116218566895, + "step": 7053 + }, + { + "epoch": 0.82, + "learning_rate": 5.463564426597378e-08, + "logits/chosen": -2.23233699798584, + "logits/rejected": -2.266061305999756, + "logps/chosen": -167.78421020507812, + "logps/rejected": -242.25592041015625, + "loss": 0.2896, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.020991563796997, + "rewards/margins": 1.9985542297363281, + "rewards/rejected": -3.0195460319519043, + "step": 7054 + }, + { + "epoch": 0.82, + "learning_rate": 5.4600212590055506e-08, + "logits/chosen": -2.1596860885620117, + "logits/rejected": -2.171708822250366, + "logps/chosen": -359.32415771484375, + "logps/rejected": -377.4686584472656, + "loss": 0.3065, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0693836212158203, + "rewards/margins": 2.970337390899658, + "rewards/rejected": -4.0397210121154785, + "step": 7055 + }, + { + "epoch": 0.82, + "learning_rate": 5.456478091413724e-08, + "logits/chosen": -2.1927618980407715, + "logits/rejected": -2.477351427078247, + "logps/chosen": -289.1364440917969, + "logps/rejected": -281.2858581542969, + "loss": 0.2641, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8507543206214905, + "rewards/margins": 2.660212755203247, + "rewards/rejected": -3.5109670162200928, + "step": 7056 + }, + { + "epoch": 0.82, + "learning_rate": 5.4529349238218964e-08, + "logits/chosen": -2.792039394378662, + "logits/rejected": -2.7392797470092773, + "logps/chosen": -200.67559814453125, + "logps/rejected": -153.92906188964844, + "loss": 6.4108, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.523861885070801, + "rewards/margins": -4.526762008666992, + "rewards/rejected": -1.9970991611480713, + "step": 7057 + }, + { + "epoch": 0.82, + "learning_rate": 5.449391756230069e-08, + "logits/chosen": -1.8294763565063477, + "logits/rejected": -1.7825604677200317, + "logps/chosen": -226.99118041992188, + "logps/rejected": -272.90509033203125, + "loss": 0.3458, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10530853271484375, + "rewards/margins": 1.6039793491363525, + "rewards/rejected": -1.7092878818511963, + "step": 7058 + }, + { + "epoch": 0.82, + "learning_rate": 5.445848588638243e-08, + "logits/chosen": -2.4309864044189453, + "logits/rejected": -2.496431827545166, + "logps/chosen": -122.91911315917969, + "logps/rejected": -162.16622924804688, + "loss": 0.4084, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23282170295715332, + "rewards/margins": 1.5953443050384521, + "rewards/rejected": -1.8281662464141846, + "step": 7059 + }, + { + "epoch": 0.82, + "learning_rate": 5.442305421046415e-08, + "logits/chosen": -2.3069398403167725, + "logits/rejected": -2.5516245365142822, + "logps/chosen": -516.5227661132812, + "logps/rejected": -342.8563232421875, + "loss": 0.4918, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7194671034812927, + "rewards/margins": 2.299372911453247, + "rewards/rejected": -3.0188400745391846, + "step": 7060 + }, + { + "epoch": 0.82, + "learning_rate": 5.438762253454588e-08, + "logits/chosen": -2.560863971710205, + "logits/rejected": -2.7848618030548096, + "logps/chosen": -417.44146728515625, + "logps/rejected": -391.58563232421875, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8829959630966187, + "rewards/margins": 2.0923233032226562, + "rewards/rejected": -2.9753193855285645, + "step": 7061 + }, + { + "epoch": 0.82, + "learning_rate": 5.4352190858627614e-08, + "logits/chosen": -2.5167698860168457, + "logits/rejected": -2.6787829399108887, + "logps/chosen": -357.91510009765625, + "logps/rejected": -343.748291015625, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5590354204177856, + "rewards/margins": 3.041914463043213, + "rewards/rejected": -3.600950002670288, + "step": 7062 + }, + { + "epoch": 0.82, + "learning_rate": 5.431675918270934e-08, + "logits/chosen": -2.1628222465515137, + "logits/rejected": -2.306668758392334, + "logps/chosen": -228.98416137695312, + "logps/rejected": -192.12281799316406, + "loss": 0.3633, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6939672231674194, + "rewards/margins": 2.430783987045288, + "rewards/rejected": -3.124751091003418, + "step": 7063 + }, + { + "epoch": 0.82, + "learning_rate": 5.4281327506791065e-08, + "logits/chosen": -2.004894256591797, + "logits/rejected": -2.173497200012207, + "logps/chosen": -329.82891845703125, + "logps/rejected": -228.1202850341797, + "loss": 0.7309, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5039427280426025, + "rewards/margins": 1.6251156330108643, + "rewards/rejected": -2.129058361053467, + "step": 7064 + }, + { + "epoch": 0.82, + "learning_rate": 5.42458958308728e-08, + "logits/chosen": -2.516885757446289, + "logits/rejected": -2.3299057483673096, + "logps/chosen": -128.3290252685547, + "logps/rejected": -296.0967712402344, + "loss": 0.6783, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9761713743209839, + "rewards/margins": 2.1150224208831787, + "rewards/rejected": -3.091193675994873, + "step": 7065 + }, + { + "epoch": 0.82, + "learning_rate": 5.421046415495453e-08, + "logits/chosen": -2.446737289428711, + "logits/rejected": -2.1466989517211914, + "logps/chosen": -153.33035278320312, + "logps/rejected": -314.0165100097656, + "loss": 0.6291, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.405031442642212, + "rewards/margins": 0.8808308243751526, + "rewards/rejected": -2.2858622074127197, + "step": 7066 + }, + { + "epoch": 0.82, + "learning_rate": 5.417503247903625e-08, + "logits/chosen": -2.529860258102417, + "logits/rejected": -2.192335367202759, + "logps/chosen": -217.42800903320312, + "logps/rejected": -242.1964569091797, + "loss": 0.2936, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9606083631515503, + "rewards/margins": 2.1548354625701904, + "rewards/rejected": -3.1154439449310303, + "step": 7067 + }, + { + "epoch": 0.82, + "learning_rate": 5.413960080311799e-08, + "logits/chosen": -2.1980345249176025, + "logits/rejected": -2.250220775604248, + "logps/chosen": -382.2289733886719, + "logps/rejected": -328.2364196777344, + "loss": 0.1077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6693390011787415, + "rewards/margins": 3.673095703125, + "rewards/rejected": -4.342434406280518, + "step": 7068 + }, + { + "epoch": 0.82, + "learning_rate": 5.4104169127199715e-08, + "logits/chosen": -2.8791427612304688, + "logits/rejected": -2.7693488597869873, + "logps/chosen": -218.0260009765625, + "logps/rejected": -270.9942932128906, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4256408214569092, + "rewards/margins": 2.8898119926452637, + "rewards/rejected": -3.3154525756835938, + "step": 7069 + }, + { + "epoch": 0.82, + "learning_rate": 5.406873745128144e-08, + "logits/chosen": -1.9729505777359009, + "logits/rejected": -2.14333438873291, + "logps/chosen": -264.78900146484375, + "logps/rejected": -279.6859130859375, + "loss": 0.3105, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.896172285079956, + "rewards/margins": 2.4416356086730957, + "rewards/rejected": -3.3378076553344727, + "step": 7070 + }, + { + "epoch": 0.82, + "learning_rate": 5.403330577536317e-08, + "logits/chosen": -2.6455044746398926, + "logits/rejected": -2.810408592224121, + "logps/chosen": -158.95437622070312, + "logps/rejected": -187.54678344726562, + "loss": 0.5346, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7297312617301941, + "rewards/margins": 3.1554696559906006, + "rewards/rejected": -3.8852009773254395, + "step": 7071 + }, + { + "epoch": 0.82, + "learning_rate": 5.39978740994449e-08, + "logits/chosen": -2.7388885021209717, + "logits/rejected": -2.6037793159484863, + "logps/chosen": -237.68621826171875, + "logps/rejected": -99.4521713256836, + "loss": 0.8618, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3324000835418701, + "rewards/margins": 0.2650715708732605, + "rewards/rejected": -1.5974715948104858, + "step": 7072 + }, + { + "epoch": 0.82, + "learning_rate": 5.396244242352663e-08, + "logits/chosen": -2.2858903408050537, + "logits/rejected": -2.4309725761413574, + "logps/chosen": -563.9614868164062, + "logps/rejected": -439.88623046875, + "loss": 0.11, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38197243213653564, + "rewards/margins": 4.131682395935059, + "rewards/rejected": -4.513655185699463, + "step": 7073 + }, + { + "epoch": 0.82, + "learning_rate": 5.392701074760836e-08, + "logits/chosen": -1.8651714324951172, + "logits/rejected": -1.9832913875579834, + "logps/chosen": -244.70516967773438, + "logps/rejected": -182.74102783203125, + "loss": 0.428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0378506183624268, + "rewards/margins": 1.6991101503372192, + "rewards/rejected": -2.7369608879089355, + "step": 7074 + }, + { + "epoch": 0.82, + "learning_rate": 5.389157907169009e-08, + "logits/chosen": -2.3190481662750244, + "logits/rejected": -2.268003225326538, + "logps/chosen": -372.76898193359375, + "logps/rejected": -419.2706604003906, + "loss": 0.4622, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4713338613510132, + "rewards/margins": 1.4608826637268066, + "rewards/rejected": -2.9322166442871094, + "step": 7075 + }, + { + "epoch": 0.82, + "learning_rate": 5.3856147395771817e-08, + "logits/chosen": -2.6701719760894775, + "logits/rejected": -2.7600643634796143, + "logps/chosen": -355.6355895996094, + "logps/rejected": -349.1263427734375, + "loss": 0.1948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8950479626655579, + "rewards/margins": 3.884957790374756, + "rewards/rejected": -4.78000545501709, + "step": 7076 + }, + { + "epoch": 0.82, + "learning_rate": 5.382071571985355e-08, + "logits/chosen": -2.001457452774048, + "logits/rejected": -2.150477647781372, + "logps/chosen": -637.1976928710938, + "logps/rejected": -514.8309326171875, + "loss": 0.2426, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36089959740638733, + "rewards/margins": 2.400705337524414, + "rewards/rejected": -2.7616047859191895, + "step": 7077 + }, + { + "epoch": 0.82, + "learning_rate": 5.3785284043935274e-08, + "logits/chosen": -2.2319910526275635, + "logits/rejected": -2.357527256011963, + "logps/chosen": -206.341796875, + "logps/rejected": -159.36843872070312, + "loss": 0.3757, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9682274460792542, + "rewards/margins": 1.93678617477417, + "rewards/rejected": -2.9050137996673584, + "step": 7078 + }, + { + "epoch": 0.82, + "learning_rate": 5.3749852368017e-08, + "logits/chosen": -2.6746723651885986, + "logits/rejected": -2.7257237434387207, + "logps/chosen": -158.10011291503906, + "logps/rejected": -199.669189453125, + "loss": 0.5038, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.032411813735962, + "rewards/margins": 1.9509447813034058, + "rewards/rejected": -2.9833567142486572, + "step": 7079 + }, + { + "epoch": 0.82, + "learning_rate": 5.371442069209874e-08, + "logits/chosen": -2.362666368484497, + "logits/rejected": -2.563931465148926, + "logps/chosen": -309.92230224609375, + "logps/rejected": -312.2405700683594, + "loss": 0.352, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2518518269062042, + "rewards/margins": 1.8158719539642334, + "rewards/rejected": -2.0677239894866943, + "step": 7080 + }, + { + "epoch": 0.82, + "learning_rate": 5.367898901618046e-08, + "logits/chosen": -2.4716134071350098, + "logits/rejected": -2.5042073726654053, + "logps/chosen": -234.95396423339844, + "logps/rejected": -186.0421600341797, + "loss": 0.9593, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4034150838851929, + "rewards/margins": 1.4118218421936035, + "rewards/rejected": -2.815236806869507, + "step": 7081 + }, + { + "epoch": 0.82, + "learning_rate": 5.364355734026219e-08, + "logits/chosen": -1.5371363162994385, + "logits/rejected": -1.668870449066162, + "logps/chosen": -273.50115966796875, + "logps/rejected": -268.65576171875, + "loss": 0.1855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5098941326141357, + "rewards/margins": 2.4756059646606445, + "rewards/rejected": -2.9855000972747803, + "step": 7082 + }, + { + "epoch": 0.82, + "learning_rate": 5.3608125664343924e-08, + "logits/chosen": -2.7036499977111816, + "logits/rejected": -2.818734884262085, + "logps/chosen": -373.56597900390625, + "logps/rejected": -261.6390380859375, + "loss": 0.5994, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.492471694946289, + "rewards/margins": 1.2840293645858765, + "rewards/rejected": -2.776501178741455, + "step": 7083 + }, + { + "epoch": 0.82, + "learning_rate": 5.3572693988425647e-08, + "logits/chosen": -2.3826260566711426, + "logits/rejected": -2.7383384704589844, + "logps/chosen": -358.42498779296875, + "logps/rejected": -286.14654541015625, + "loss": 0.1654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.690256655216217, + "rewards/margins": 3.972853422164917, + "rewards/rejected": -4.66310977935791, + "step": 7084 + }, + { + "epoch": 0.82, + "learning_rate": 5.3537262312507375e-08, + "logits/chosen": -2.790360927581787, + "logits/rejected": -2.649407148361206, + "logps/chosen": -371.5672302246094, + "logps/rejected": -262.5574035644531, + "loss": 0.305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5344383716583252, + "rewards/margins": 2.247753381729126, + "rewards/rejected": -2.782191753387451, + "step": 7085 + }, + { + "epoch": 0.82, + "learning_rate": 5.350183063658911e-08, + "logits/chosen": -2.1201939582824707, + "logits/rejected": -2.172301769256592, + "logps/chosen": -389.88775634765625, + "logps/rejected": -379.9013366699219, + "loss": 0.6, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7162358164787292, + "rewards/margins": 2.2213447093963623, + "rewards/rejected": -2.9375805854797363, + "step": 7086 + }, + { + "epoch": 0.82, + "learning_rate": 5.346639896067083e-08, + "logits/chosen": -2.4434680938720703, + "logits/rejected": -2.224916458129883, + "logps/chosen": -269.817138671875, + "logps/rejected": -325.4612121582031, + "loss": 0.2035, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.025089979171753, + "rewards/margins": 5.2941694259643555, + "rewards/rejected": -6.3192596435546875, + "step": 7087 + }, + { + "epoch": 0.82, + "learning_rate": 5.343096728475257e-08, + "logits/chosen": -2.4680843353271484, + "logits/rejected": -2.592451333999634, + "logps/chosen": -189.1216278076172, + "logps/rejected": -208.7321014404297, + "loss": 0.6215, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0934727191925049, + "rewards/margins": 0.8470551371574402, + "rewards/rejected": -1.9405279159545898, + "step": 7088 + }, + { + "epoch": 0.82, + "learning_rate": 5.33955356088343e-08, + "logits/chosen": -2.4609482288360596, + "logits/rejected": -1.7658753395080566, + "logps/chosen": -103.26201629638672, + "logps/rejected": -258.08026123046875, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23919722437858582, + "rewards/margins": 2.8364298343658447, + "rewards/rejected": -2.5972328186035156, + "step": 7089 + }, + { + "epoch": 0.82, + "learning_rate": 5.3360103932916026e-08, + "logits/chosen": -2.339012861251831, + "logits/rejected": -2.3758983612060547, + "logps/chosen": -204.26614379882812, + "logps/rejected": -335.383056640625, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4241714477539062, + "rewards/margins": 3.163961887359619, + "rewards/rejected": -4.588133335113525, + "step": 7090 + }, + { + "epoch": 0.82, + "learning_rate": 5.3324672256997754e-08, + "logits/chosen": -2.141118288040161, + "logits/rejected": -2.330446481704712, + "logps/chosen": -203.93617248535156, + "logps/rejected": -136.40234375, + "loss": 0.7345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8865020871162415, + "rewards/margins": 0.2946810722351074, + "rewards/rejected": -1.181183099746704, + "step": 7091 + }, + { + "epoch": 0.83, + "learning_rate": 5.328924058107948e-08, + "logits/chosen": -2.4861788749694824, + "logits/rejected": -2.318962574005127, + "logps/chosen": -137.71517944335938, + "logps/rejected": -237.64352416992188, + "loss": 0.554, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7130979895591736, + "rewards/margins": 1.3904852867126465, + "rewards/rejected": -2.103583335876465, + "step": 7092 + }, + { + "epoch": 0.83, + "learning_rate": 5.325380890516121e-08, + "logits/chosen": -2.3129074573516846, + "logits/rejected": -2.226057767868042, + "logps/chosen": -259.1691589355469, + "logps/rejected": -350.78704833984375, + "loss": 0.4509, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7950044274330139, + "rewards/margins": 2.8067405223846436, + "rewards/rejected": -3.601745128631592, + "step": 7093 + }, + { + "epoch": 0.83, + "learning_rate": 5.321837722924295e-08, + "logits/chosen": -2.583815097808838, + "logits/rejected": -2.628852367401123, + "logps/chosen": -396.88885498046875, + "logps/rejected": -356.92535400390625, + "loss": 0.2169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6225636005401611, + "rewards/margins": 3.8207473754882812, + "rewards/rejected": -4.4433112144470215, + "step": 7094 + }, + { + "epoch": 0.83, + "learning_rate": 5.318294555332467e-08, + "logits/chosen": -2.6331756114959717, + "logits/rejected": -2.3665621280670166, + "logps/chosen": -257.5335998535156, + "logps/rejected": -227.23883056640625, + "loss": 0.3647, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2546592950820923, + "rewards/margins": 2.0340769290924072, + "rewards/rejected": -3.288736343383789, + "step": 7095 + }, + { + "epoch": 0.83, + "learning_rate": 5.31475138774064e-08, + "logits/chosen": -2.642678737640381, + "logits/rejected": -2.7263565063476562, + "logps/chosen": -288.19384765625, + "logps/rejected": -363.0511779785156, + "loss": 0.2518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6055020093917847, + "rewards/margins": 1.730758786201477, + "rewards/rejected": -2.3362607955932617, + "step": 7096 + }, + { + "epoch": 0.83, + "learning_rate": 5.3112082201488133e-08, + "logits/chosen": -1.9828640222549438, + "logits/rejected": -2.0624942779541016, + "logps/chosen": -292.2423095703125, + "logps/rejected": -277.7435302734375, + "loss": 0.1915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8394526243209839, + "rewards/margins": 2.8338265419006348, + "rewards/rejected": -3.67327880859375, + "step": 7097 + }, + { + "epoch": 0.83, + "learning_rate": 5.3076650525569856e-08, + "logits/chosen": -2.0936989784240723, + "logits/rejected": -2.0842397212982178, + "logps/chosen": -242.422607421875, + "logps/rejected": -298.00054931640625, + "loss": 0.2136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29614442586898804, + "rewards/margins": 3.2604801654815674, + "rewards/rejected": -3.5566246509552, + "step": 7098 + }, + { + "epoch": 0.83, + "learning_rate": 5.3041218849651584e-08, + "logits/chosen": -1.8181008100509644, + "logits/rejected": -2.0407612323760986, + "logps/chosen": -385.093994140625, + "logps/rejected": -340.95123291015625, + "loss": 0.448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2682061791419983, + "rewards/margins": 1.8506900072097778, + "rewards/rejected": -2.118896245956421, + "step": 7099 + }, + { + "epoch": 0.83, + "learning_rate": 5.300578717373332e-08, + "logits/chosen": -2.5803332328796387, + "logits/rejected": -2.231133460998535, + "logps/chosen": -185.06626892089844, + "logps/rejected": -300.57666015625, + "loss": 0.4812, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.085295557975769, + "rewards/margins": 2.1651248931884766, + "rewards/rejected": -3.250420570373535, + "step": 7100 + }, + { + "epoch": 0.83, + "learning_rate": 5.297035549781504e-08, + "logits/chosen": -1.9587132930755615, + "logits/rejected": -1.5557818412780762, + "logps/chosen": -138.3531951904297, + "logps/rejected": -219.65618896484375, + "loss": 0.4115, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.062540054321289, + "rewards/margins": 1.2229641675949097, + "rewards/rejected": -2.285504102706909, + "step": 7101 + }, + { + "epoch": 0.83, + "learning_rate": 5.293492382189677e-08, + "logits/chosen": -2.3611083030700684, + "logits/rejected": -2.3948662281036377, + "logps/chosen": -291.96807861328125, + "logps/rejected": -314.0717468261719, + "loss": 0.2487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7211892604827881, + "rewards/margins": 2.2462873458862305, + "rewards/rejected": -2.9674768447875977, + "step": 7102 + }, + { + "epoch": 0.83, + "learning_rate": 5.2899492145978506e-08, + "logits/chosen": -2.1996397972106934, + "logits/rejected": -2.4936602115631104, + "logps/chosen": -540.9266357421875, + "logps/rejected": -379.2435607910156, + "loss": 0.635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.330552339553833, + "rewards/margins": 1.200603723526001, + "rewards/rejected": -2.531156063079834, + "step": 7103 + }, + { + "epoch": 0.83, + "learning_rate": 5.2864060470060235e-08, + "logits/chosen": -2.1967883110046387, + "logits/rejected": -2.2393414974212646, + "logps/chosen": -179.24169921875, + "logps/rejected": -194.68667602539062, + "loss": 0.6276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7349981665611267, + "rewards/margins": 0.4179682731628418, + "rewards/rejected": -1.1529663801193237, + "step": 7104 + }, + { + "epoch": 0.83, + "learning_rate": 5.282862879414196e-08, + "logits/chosen": -2.30183744430542, + "logits/rejected": -1.9764306545257568, + "logps/chosen": -299.2536926269531, + "logps/rejected": -293.5069580078125, + "loss": 0.1517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10910052806138992, + "rewards/margins": 2.657681941986084, + "rewards/rejected": -2.548581600189209, + "step": 7105 + }, + { + "epoch": 0.83, + "learning_rate": 5.279319711822369e-08, + "logits/chosen": -2.1251134872436523, + "logits/rejected": -2.4199092388153076, + "logps/chosen": -406.0937194824219, + "logps/rejected": -220.78990173339844, + "loss": 0.7876, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2227082252502441, + "rewards/margins": 0.6219390630722046, + "rewards/rejected": -1.8446474075317383, + "step": 7106 + }, + { + "epoch": 0.83, + "learning_rate": 5.275776544230542e-08, + "logits/chosen": -2.24761700630188, + "logits/rejected": -1.5899224281311035, + "logps/chosen": -235.53355407714844, + "logps/rejected": -478.6787414550781, + "loss": 0.6746, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9214428067207336, + "rewards/margins": 2.0289621353149414, + "rewards/rejected": -2.9504051208496094, + "step": 7107 + }, + { + "epoch": 0.83, + "learning_rate": 5.272233376638714e-08, + "logits/chosen": -2.1261441707611084, + "logits/rejected": -2.2054073810577393, + "logps/chosen": -381.4080810546875, + "logps/rejected": -303.0703125, + "loss": 0.4313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27069926261901855, + "rewards/margins": 3.5682663917541504, + "rewards/rejected": -3.838965892791748, + "step": 7108 + }, + { + "epoch": 0.83, + "learning_rate": 5.268690209046888e-08, + "logits/chosen": -2.049912452697754, + "logits/rejected": -2.379183292388916, + "logps/chosen": -309.290283203125, + "logps/rejected": -252.06906127929688, + "loss": 0.2223, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3155606985092163, + "rewards/margins": 2.5110762119293213, + "rewards/rejected": -2.1955153942108154, + "step": 7109 + }, + { + "epoch": 0.83, + "learning_rate": 5.265147041455061e-08, + "logits/chosen": -2.629753828048706, + "logits/rejected": -2.403799057006836, + "logps/chosen": -475.8435363769531, + "logps/rejected": -289.5596923828125, + "loss": 0.5425, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1470344066619873, + "rewards/margins": 2.065884590148926, + "rewards/rejected": -3.212918996810913, + "step": 7110 + }, + { + "epoch": 0.83, + "learning_rate": 5.261603873863233e-08, + "logits/chosen": -1.710662603378296, + "logits/rejected": -2.0135138034820557, + "logps/chosen": -326.976806640625, + "logps/rejected": -223.577392578125, + "loss": 0.4374, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5686733722686768, + "rewards/margins": 1.847362756729126, + "rewards/rejected": -3.4160361289978027, + "step": 7111 + }, + { + "epoch": 0.83, + "learning_rate": 5.2580607062714065e-08, + "logits/chosen": -2.407397747039795, + "logits/rejected": -2.5939388275146484, + "logps/chosen": -129.783935546875, + "logps/rejected": -175.2313232421875, + "loss": 1.0041, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1112239360809326, + "rewards/margins": 2.2663252353668213, + "rewards/rejected": -3.377549409866333, + "step": 7112 + }, + { + "epoch": 0.83, + "learning_rate": 5.2545175386795793e-08, + "logits/chosen": -3.0449020862579346, + "logits/rejected": -3.007037878036499, + "logps/chosen": -193.508544921875, + "logps/rejected": -228.6929931640625, + "loss": 0.4054, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3500027656555176, + "rewards/margins": 1.4030952453613281, + "rewards/rejected": -2.7530980110168457, + "step": 7113 + }, + { + "epoch": 0.83, + "learning_rate": 5.250974371087752e-08, + "logits/chosen": -2.0530645847320557, + "logits/rejected": -2.142463445663452, + "logps/chosen": -192.70541381835938, + "logps/rejected": -231.62042236328125, + "loss": 0.3753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7710584998130798, + "rewards/margins": 2.796257257461548, + "rewards/rejected": -3.5673155784606934, + "step": 7114 + }, + { + "epoch": 0.83, + "learning_rate": 5.247431203495925e-08, + "logits/chosen": -1.9830033779144287, + "logits/rejected": -1.831479787826538, + "logps/chosen": -145.20343017578125, + "logps/rejected": -314.56793212890625, + "loss": 0.3766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7598315477371216, + "rewards/margins": 1.8874708414077759, + "rewards/rejected": -2.6473026275634766, + "step": 7115 + }, + { + "epoch": 0.83, + "learning_rate": 5.243888035904098e-08, + "logits/chosen": -2.559929370880127, + "logits/rejected": -2.4740145206451416, + "logps/chosen": -353.0787353515625, + "logps/rejected": -348.5948181152344, + "loss": 0.3863, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6061197519302368, + "rewards/margins": 2.7493093013763428, + "rewards/rejected": -4.355429172515869, + "step": 7116 + }, + { + "epoch": 0.83, + "learning_rate": 5.240344868312271e-08, + "logits/chosen": -2.2936782836914062, + "logits/rejected": -2.2760701179504395, + "logps/chosen": -164.6866455078125, + "logps/rejected": -354.23040771484375, + "loss": 0.5602, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1644459962844849, + "rewards/margins": 1.8261011838912964, + "rewards/rejected": -2.9905471801757812, + "step": 7117 + }, + { + "epoch": 0.83, + "learning_rate": 5.236801700720444e-08, + "logits/chosen": -1.8714359998703003, + "logits/rejected": -2.095311403274536, + "logps/chosen": -218.4193115234375, + "logps/rejected": -240.65237426757812, + "loss": 0.3942, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9698383808135986, + "rewards/margins": 2.593071460723877, + "rewards/rejected": -3.5629096031188965, + "step": 7118 + }, + { + "epoch": 0.83, + "learning_rate": 5.2332585331286166e-08, + "logits/chosen": -2.37638521194458, + "logits/rejected": -2.144479751586914, + "logps/chosen": -138.06277465820312, + "logps/rejected": -355.3026123046875, + "loss": 0.2489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.782085657119751, + "rewards/margins": 2.7575955390930176, + "rewards/rejected": -3.5396811962127686, + "step": 7119 + }, + { + "epoch": 0.83, + "learning_rate": 5.2297153655367895e-08, + "logits/chosen": -2.777553081512451, + "logits/rejected": -2.975330352783203, + "logps/chosen": -212.51296997070312, + "logps/rejected": -181.91847229003906, + "loss": 0.5504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6085493564605713, + "rewards/margins": 0.9484121203422546, + "rewards/rejected": -1.5569615364074707, + "step": 7120 + }, + { + "epoch": 0.83, + "learning_rate": 5.226172197944963e-08, + "logits/chosen": -2.4897820949554443, + "logits/rejected": -2.5976147651672363, + "logps/chosen": -258.4271545410156, + "logps/rejected": -303.53082275390625, + "loss": 0.1486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09243802726268768, + "rewards/margins": 3.0199313163757324, + "rewards/rejected": -2.9274935722351074, + "step": 7121 + }, + { + "epoch": 0.83, + "learning_rate": 5.222629030353135e-08, + "logits/chosen": -2.3382625579833984, + "logits/rejected": -2.6639628410339355, + "logps/chosen": -453.81890869140625, + "logps/rejected": -344.62725830078125, + "loss": 0.3791, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9182524681091309, + "rewards/margins": 3.4141435623168945, + "rewards/rejected": -5.332396030426025, + "step": 7122 + }, + { + "epoch": 0.83, + "learning_rate": 5.219085862761308e-08, + "logits/chosen": -2.4912688732147217, + "logits/rejected": -2.4850656986236572, + "logps/chosen": -305.2012634277344, + "logps/rejected": -347.4366760253906, + "loss": 0.3105, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3404896259307861, + "rewards/margins": 3.113051652908325, + "rewards/rejected": -4.453541278839111, + "step": 7123 + }, + { + "epoch": 0.83, + "learning_rate": 5.2155426951694816e-08, + "logits/chosen": -2.6170952320098877, + "logits/rejected": -2.4079811573028564, + "logps/chosen": -126.78227233886719, + "logps/rejected": -120.48257446289062, + "loss": 0.8418, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4164729118347168, + "rewards/margins": 1.6044985055923462, + "rewards/rejected": -3.0209715366363525, + "step": 7124 + }, + { + "epoch": 0.83, + "learning_rate": 5.211999527577654e-08, + "logits/chosen": -2.3039863109588623, + "logits/rejected": -2.3902111053466797, + "logps/chosen": -243.43431091308594, + "logps/rejected": -227.1172332763672, + "loss": 0.182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6218520402908325, + "rewards/margins": 2.103602647781372, + "rewards/rejected": -2.725454330444336, + "step": 7125 + }, + { + "epoch": 0.83, + "learning_rate": 5.2084563599858274e-08, + "logits/chosen": -1.9037690162658691, + "logits/rejected": -2.4086854457855225, + "logps/chosen": -473.8895263671875, + "logps/rejected": -291.5407409667969, + "loss": 0.1924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40019655227661133, + "rewards/margins": 3.169877052307129, + "rewards/rejected": -3.5700736045837402, + "step": 7126 + }, + { + "epoch": 0.83, + "learning_rate": 5.204913192394e-08, + "logits/chosen": -2.224423408508301, + "logits/rejected": -2.2290871143341064, + "logps/chosen": -175.00750732421875, + "logps/rejected": -387.51458740234375, + "loss": 0.0701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.359252393245697, + "rewards/margins": 4.04508113861084, + "rewards/rejected": -4.404333114624023, + "step": 7127 + }, + { + "epoch": 0.83, + "learning_rate": 5.2013700248021724e-08, + "logits/chosen": -2.1778805255889893, + "logits/rejected": -2.509572982788086, + "logps/chosen": -400.7255859375, + "logps/rejected": -368.87921142578125, + "loss": 0.2304, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4659647941589355, + "rewards/margins": 3.3027243614196777, + "rewards/rejected": -4.768689155578613, + "step": 7128 + }, + { + "epoch": 0.83, + "learning_rate": 5.197826857210346e-08, + "logits/chosen": -2.360635995864868, + "logits/rejected": -2.468200206756592, + "logps/chosen": -503.52496337890625, + "logps/rejected": -258.1363220214844, + "loss": 0.1711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20617343485355377, + "rewards/margins": 3.0621237754821777, + "rewards/rejected": -3.2682974338531494, + "step": 7129 + }, + { + "epoch": 0.83, + "learning_rate": 5.194283689618519e-08, + "logits/chosen": -2.414518117904663, + "logits/rejected": -2.243349313735962, + "logps/chosen": -77.71697235107422, + "logps/rejected": -214.50732421875, + "loss": 0.4391, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4718064069747925, + "rewards/margins": 1.063778281211853, + "rewards/rejected": -2.5355846881866455, + "step": 7130 + }, + { + "epoch": 0.83, + "learning_rate": 5.190740522026692e-08, + "logits/chosen": -1.850909948348999, + "logits/rejected": -1.955790400505066, + "logps/chosen": -218.07704162597656, + "logps/rejected": -235.55929565429688, + "loss": 0.7141, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.369465947151184, + "rewards/margins": 1.011781096458435, + "rewards/rejected": -2.381247043609619, + "step": 7131 + }, + { + "epoch": 0.83, + "learning_rate": 5.1871973544348646e-08, + "logits/chosen": -1.9200178384780884, + "logits/rejected": -2.216301918029785, + "logps/chosen": -552.276611328125, + "logps/rejected": -313.0879211425781, + "loss": 0.3146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5562664866447449, + "rewards/margins": 2.2277512550354004, + "rewards/rejected": -2.784017562866211, + "step": 7132 + }, + { + "epoch": 0.83, + "learning_rate": 5.1836541868430375e-08, + "logits/chosen": -2.3273820877075195, + "logits/rejected": -2.1277942657470703, + "logps/chosen": -351.7817077636719, + "logps/rejected": -436.43023681640625, + "loss": 0.2971, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2669453620910645, + "rewards/margins": 3.033970832824707, + "rewards/rejected": -4.3009161949157715, + "step": 7133 + }, + { + "epoch": 0.83, + "learning_rate": 5.1801110192512104e-08, + "logits/chosen": -2.4098262786865234, + "logits/rejected": -2.4733762741088867, + "logps/chosen": -407.6121826171875, + "logps/rejected": -322.1366271972656, + "loss": 0.2559, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42694389820098877, + "rewards/margins": 2.8572802543640137, + "rewards/rejected": -3.284224033355713, + "step": 7134 + }, + { + "epoch": 0.83, + "learning_rate": 5.176567851659384e-08, + "logits/chosen": -2.110743522644043, + "logits/rejected": -2.147102117538452, + "logps/chosen": -215.52012634277344, + "logps/rejected": -250.29898071289062, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06611805409193039, + "rewards/margins": 3.8928465843200684, + "rewards/rejected": -3.9589648246765137, + "step": 7135 + }, + { + "epoch": 0.83, + "learning_rate": 5.173024684067556e-08, + "logits/chosen": -2.006789207458496, + "logits/rejected": -2.4439101219177246, + "logps/chosen": -232.86648559570312, + "logps/rejected": -220.13894653320312, + "loss": 0.3379, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1531846523284912, + "rewards/margins": 2.6109726428985596, + "rewards/rejected": -3.764157295227051, + "step": 7136 + }, + { + "epoch": 0.83, + "learning_rate": 5.169481516475729e-08, + "logits/chosen": -2.392191171646118, + "logits/rejected": -2.6274940967559814, + "logps/chosen": -201.02752685546875, + "logps/rejected": -309.7099304199219, + "loss": 0.5647, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1566517353057861, + "rewards/margins": 1.7430888414382935, + "rewards/rejected": -2.899740695953369, + "step": 7137 + }, + { + "epoch": 0.83, + "learning_rate": 5.1659383488839025e-08, + "logits/chosen": -2.993943691253662, + "logits/rejected": -2.783445358276367, + "logps/chosen": -241.85784912109375, + "logps/rejected": -256.97027587890625, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4644942879676819, + "rewards/margins": 2.2968029975891113, + "rewards/rejected": -2.7612972259521484, + "step": 7138 + }, + { + "epoch": 0.83, + "learning_rate": 5.162395181292075e-08, + "logits/chosen": -2.372363567352295, + "logits/rejected": -2.3671679496765137, + "logps/chosen": -206.7469482421875, + "logps/rejected": -243.5825958251953, + "loss": 0.2725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8097165822982788, + "rewards/margins": 2.364013433456421, + "rewards/rejected": -3.1737303733825684, + "step": 7139 + }, + { + "epoch": 0.83, + "learning_rate": 5.1588520137002476e-08, + "logits/chosen": -2.9474503993988037, + "logits/rejected": -2.6426734924316406, + "logps/chosen": -221.73826599121094, + "logps/rejected": -168.39122009277344, + "loss": 0.9695, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4745543003082275, + "rewards/margins": 0.2975813150405884, + "rewards/rejected": -1.772135615348816, + "step": 7140 + }, + { + "epoch": 0.83, + "learning_rate": 5.155308846108421e-08, + "logits/chosen": -2.1463799476623535, + "logits/rejected": -2.132744789123535, + "logps/chosen": -340.314453125, + "logps/rejected": -400.5849609375, + "loss": 0.2743, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9550794363021851, + "rewards/margins": 2.17525053024292, + "rewards/rejected": -3.1303300857543945, + "step": 7141 + }, + { + "epoch": 0.83, + "learning_rate": 5.1517656785165933e-08, + "logits/chosen": -2.672795534133911, + "logits/rejected": -2.8036561012268066, + "logps/chosen": -374.78564453125, + "logps/rejected": -391.685546875, + "loss": 0.2941, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9270328879356384, + "rewards/margins": 2.5095014572143555, + "rewards/rejected": -3.4365344047546387, + "step": 7142 + }, + { + "epoch": 0.83, + "learning_rate": 5.148222510924766e-08, + "logits/chosen": -2.15981125831604, + "logits/rejected": -2.021361827850342, + "logps/chosen": -315.4383239746094, + "logps/rejected": -268.4542236328125, + "loss": 0.2956, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8392565846443176, + "rewards/margins": 1.8211842775344849, + "rewards/rejected": -2.6604409217834473, + "step": 7143 + }, + { + "epoch": 0.83, + "learning_rate": 5.14467934333294e-08, + "logits/chosen": -2.3296287059783936, + "logits/rejected": -2.503373146057129, + "logps/chosen": -287.1605224609375, + "logps/rejected": -270.7717590332031, + "loss": 0.4074, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.949783205986023, + "rewards/margins": 1.634054183959961, + "rewards/rejected": -2.5838375091552734, + "step": 7144 + }, + { + "epoch": 0.83, + "learning_rate": 5.1411361757411126e-08, + "logits/chosen": -2.215186595916748, + "logits/rejected": -2.501162052154541, + "logps/chosen": -219.5772705078125, + "logps/rejected": -223.25341796875, + "loss": 0.3561, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29239922761917114, + "rewards/margins": 2.107778549194336, + "rewards/rejected": -2.400177478790283, + "step": 7145 + }, + { + "epoch": 0.83, + "learning_rate": 5.137593008149285e-08, + "logits/chosen": -2.605314254760742, + "logits/rejected": -2.510509729385376, + "logps/chosen": -76.70619201660156, + "logps/rejected": -322.2023010253906, + "loss": 0.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7887746095657349, + "rewards/margins": 3.3386518955230713, + "rewards/rejected": -4.127426624298096, + "step": 7146 + }, + { + "epoch": 0.83, + "learning_rate": 5.1340498405574584e-08, + "logits/chosen": -2.6901488304138184, + "logits/rejected": -2.615410327911377, + "logps/chosen": -173.4413299560547, + "logps/rejected": -675.906982421875, + "loss": 0.1696, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13563568890094757, + "rewards/margins": 6.192337989807129, + "rewards/rejected": -6.327974319458008, + "step": 7147 + }, + { + "epoch": 0.83, + "learning_rate": 5.130506672965631e-08, + "logits/chosen": -2.385394334793091, + "logits/rejected": -2.5995168685913086, + "logps/chosen": -426.51837158203125, + "logps/rejected": -264.0852355957031, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13870924711227417, + "rewards/margins": 3.5070152282714844, + "rewards/rejected": -3.6457247734069824, + "step": 7148 + }, + { + "epoch": 0.83, + "learning_rate": 5.1269635053738035e-08, + "logits/chosen": -2.1776702404022217, + "logits/rejected": -2.2907612323760986, + "logps/chosen": -319.604736328125, + "logps/rejected": -248.90548706054688, + "loss": 0.2767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16602492332458496, + "rewards/margins": 2.040645122528076, + "rewards/rejected": -2.2066702842712402, + "step": 7149 + }, + { + "epoch": 0.83, + "learning_rate": 5.123420337781977e-08, + "logits/chosen": -2.3177404403686523, + "logits/rejected": -2.776045560836792, + "logps/chosen": -252.09771728515625, + "logps/rejected": -245.4468536376953, + "loss": 0.2456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6787501573562622, + "rewards/margins": 2.2814877033233643, + "rewards/rejected": -2.960237979888916, + "step": 7150 + }, + { + "epoch": 0.83, + "learning_rate": 5.11987717019015e-08, + "logits/chosen": -2.3517916202545166, + "logits/rejected": -2.2051470279693604, + "logps/chosen": -141.220947265625, + "logps/rejected": -222.5313262939453, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7227376699447632, + "rewards/margins": 4.761120319366455, + "rewards/rejected": -5.483858108520508, + "step": 7151 + }, + { + "epoch": 0.83, + "learning_rate": 5.116334002598322e-08, + "logits/chosen": -2.587747573852539, + "logits/rejected": -2.5372321605682373, + "logps/chosen": -444.88525390625, + "logps/rejected": -241.22023010253906, + "loss": 0.4512, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5273010730743408, + "rewards/margins": 2.3331494331359863, + "rewards/rejected": -3.8604507446289062, + "step": 7152 + }, + { + "epoch": 0.83, + "learning_rate": 5.1127908350064956e-08, + "logits/chosen": -2.506044387817383, + "logits/rejected": -2.6744484901428223, + "logps/chosen": -229.06698608398438, + "logps/rejected": -197.02169799804688, + "loss": 0.9063, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4961929321289062, + "rewards/margins": 0.6000464558601379, + "rewards/rejected": -2.0962395668029785, + "step": 7153 + }, + { + "epoch": 0.83, + "learning_rate": 5.1092476674146685e-08, + "logits/chosen": -1.8982584476470947, + "logits/rejected": -1.9668335914611816, + "logps/chosen": -434.50445556640625, + "logps/rejected": -317.6068115234375, + "loss": 0.3489, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8629757165908813, + "rewards/margins": 2.1247472763061523, + "rewards/rejected": -2.9877231121063232, + "step": 7154 + }, + { + "epoch": 0.83, + "learning_rate": 5.105704499822841e-08, + "logits/chosen": -2.5622966289520264, + "logits/rejected": -2.8973584175109863, + "logps/chosen": -189.73658752441406, + "logps/rejected": -252.08628845214844, + "loss": 0.3926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6282339096069336, + "rewards/margins": 3.343384265899658, + "rewards/rejected": -3.9716179370880127, + "step": 7155 + }, + { + "epoch": 0.83, + "learning_rate": 5.102161332231014e-08, + "logits/chosen": -2.7972865104675293, + "logits/rejected": -2.7413339614868164, + "logps/chosen": -171.08070373535156, + "logps/rejected": -195.12828063964844, + "loss": 0.3023, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7089846134185791, + "rewards/margins": 2.895352840423584, + "rewards/rejected": -3.604337692260742, + "step": 7156 + }, + { + "epoch": 0.83, + "learning_rate": 5.098618164639187e-08, + "logits/chosen": -2.4722471237182617, + "logits/rejected": -2.3154401779174805, + "logps/chosen": -241.64678955078125, + "logps/rejected": -382.11260986328125, + "loss": 0.5492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2628529071807861, + "rewards/margins": 1.4469188451766968, + "rewards/rejected": -2.7097716331481934, + "step": 7157 + }, + { + "epoch": 0.83, + "learning_rate": 5.09507499704736e-08, + "logits/chosen": -2.951364278793335, + "logits/rejected": -2.9887197017669678, + "logps/chosen": -243.45172119140625, + "logps/rejected": -261.17108154296875, + "loss": 0.3008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7828085422515869, + "rewards/margins": 2.0841400623321533, + "rewards/rejected": -2.8669486045837402, + "step": 7158 + }, + { + "epoch": 0.83, + "learning_rate": 5.091531829455533e-08, + "logits/chosen": -1.874549150466919, + "logits/rejected": -1.7882063388824463, + "logps/chosen": -382.13726806640625, + "logps/rejected": -405.1980285644531, + "loss": 0.2678, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08454647660255432, + "rewards/margins": 2.0672011375427246, + "rewards/rejected": -2.151747703552246, + "step": 7159 + }, + { + "epoch": 0.83, + "learning_rate": 5.087988661863706e-08, + "logits/chosen": -2.4699203968048096, + "logits/rejected": -2.263108730316162, + "logps/chosen": -312.77398681640625, + "logps/rejected": -350.5931396484375, + "loss": 0.174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6216564178466797, + "rewards/margins": 2.995387554168701, + "rewards/rejected": -3.617043972015381, + "step": 7160 + }, + { + "epoch": 0.83, + "learning_rate": 5.084445494271879e-08, + "logits/chosen": -2.2951393127441406, + "logits/rejected": -2.4687626361846924, + "logps/chosen": -268.2995910644531, + "logps/rejected": -252.38473510742188, + "loss": 0.3676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.840825080871582, + "rewards/margins": 2.29970121383667, + "rewards/rejected": -3.140526533126831, + "step": 7161 + }, + { + "epoch": 0.83, + "learning_rate": 5.080902326680052e-08, + "logits/chosen": -2.5599539279937744, + "logits/rejected": -2.4940667152404785, + "logps/chosen": -261.1133117675781, + "logps/rejected": -296.9208068847656, + "loss": 0.311, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9520969986915588, + "rewards/margins": 2.2137787342071533, + "rewards/rejected": -3.1658756732940674, + "step": 7162 + }, + { + "epoch": 0.83, + "learning_rate": 5.0773591590882244e-08, + "logits/chosen": -2.516817331314087, + "logits/rejected": -2.5644359588623047, + "logps/chosen": -227.2720947265625, + "logps/rejected": -203.6341094970703, + "loss": 0.6328, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3923768997192383, + "rewards/margins": 0.536953866481781, + "rewards/rejected": -1.9293309450149536, + "step": 7163 + }, + { + "epoch": 0.83, + "learning_rate": 5.073815991496398e-08, + "logits/chosen": -2.5586860179901123, + "logits/rejected": -2.5136899948120117, + "logps/chosen": -284.9708251953125, + "logps/rejected": -298.86895751953125, + "loss": 0.1438, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.693950891494751, + "rewards/margins": 2.832962989807129, + "rewards/rejected": -4.526914119720459, + "step": 7164 + }, + { + "epoch": 0.83, + "learning_rate": 5.070272823904571e-08, + "logits/chosen": -2.2489173412323, + "logits/rejected": -2.240694999694824, + "logps/chosen": -226.05239868164062, + "logps/rejected": -212.68222045898438, + "loss": 0.4273, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.041130542755127, + "rewards/margins": 1.3085906505584717, + "rewards/rejected": -2.3497209548950195, + "step": 7165 + }, + { + "epoch": 0.83, + "learning_rate": 5.066729656312743e-08, + "logits/chosen": -2.773956298828125, + "logits/rejected": -2.6638193130493164, + "logps/chosen": -163.5788116455078, + "logps/rejected": -178.07530212402344, + "loss": 0.2226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5373913645744324, + "rewards/margins": 2.481682777404785, + "rewards/rejected": -3.0190742015838623, + "step": 7166 + }, + { + "epoch": 0.83, + "learning_rate": 5.0631864887209165e-08, + "logits/chosen": -2.089329957962036, + "logits/rejected": -2.106614828109741, + "logps/chosen": -314.02593994140625, + "logps/rejected": -308.55975341796875, + "loss": 0.2756, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0466296523809433, + "rewards/margins": 2.3091797828674316, + "rewards/rejected": -2.355809450149536, + "step": 7167 + }, + { + "epoch": 0.83, + "learning_rate": 5.0596433211290894e-08, + "logits/chosen": -2.0777924060821533, + "logits/rejected": -2.2606258392333984, + "logps/chosen": -218.17251586914062, + "logps/rejected": -155.02638244628906, + "loss": 0.8018, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7421526908874512, + "rewards/margins": 0.9888541102409363, + "rewards/rejected": -2.731006622314453, + "step": 7168 + }, + { + "epoch": 0.83, + "learning_rate": 5.0561001535372616e-08, + "logits/chosen": -2.3042526245117188, + "logits/rejected": -2.597909927368164, + "logps/chosen": -423.0379638671875, + "logps/rejected": -337.80157470703125, + "loss": 0.4513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5529161691665649, + "rewards/margins": 1.489574909210205, + "rewards/rejected": -2.0424911975860596, + "step": 7169 + }, + { + "epoch": 0.83, + "learning_rate": 5.052556985945435e-08, + "logits/chosen": -2.178675889968872, + "logits/rejected": -2.3537189960479736, + "logps/chosen": -336.49029541015625, + "logps/rejected": -406.6915588378906, + "loss": 0.7927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7185924053192139, + "rewards/margins": 0.5311200618743896, + "rewards/rejected": -1.249712586402893, + "step": 7170 + }, + { + "epoch": 0.83, + "learning_rate": 5.049013818353608e-08, + "logits/chosen": -2.5046677589416504, + "logits/rejected": -2.4086878299713135, + "logps/chosen": -336.1894226074219, + "logps/rejected": -276.52630615234375, + "loss": 0.2861, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04625406488776207, + "rewards/margins": 4.149484634399414, + "rewards/rejected": -4.103230953216553, + "step": 7171 + }, + { + "epoch": 0.83, + "learning_rate": 5.045470650761781e-08, + "logits/chosen": -1.7698475122451782, + "logits/rejected": -2.3421552181243896, + "logps/chosen": -446.08209228515625, + "logps/rejected": -261.667724609375, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8496297597885132, + "rewards/margins": 1.5830228328704834, + "rewards/rejected": -2.432652473449707, + "step": 7172 + }, + { + "epoch": 0.83, + "learning_rate": 5.041927483169954e-08, + "logits/chosen": -2.478815793991089, + "logits/rejected": -2.5498456954956055, + "logps/chosen": -245.66696166992188, + "logps/rejected": -190.73684692382812, + "loss": 0.1608, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6056062579154968, + "rewards/margins": 2.8390073776245117, + "rewards/rejected": -3.4446136951446533, + "step": 7173 + }, + { + "epoch": 0.83, + "learning_rate": 5.0383843155781266e-08, + "logits/chosen": -2.5619640350341797, + "logits/rejected": -2.5106375217437744, + "logps/chosen": -219.90704345703125, + "logps/rejected": -196.73098754882812, + "loss": 0.3631, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8623713254928589, + "rewards/margins": 1.038101315498352, + "rewards/rejected": -1.900472640991211, + "step": 7174 + }, + { + "epoch": 0.83, + "learning_rate": 5.0348411479862995e-08, + "logits/chosen": -2.482436418533325, + "logits/rejected": -2.0035512447357178, + "logps/chosen": -292.5877685546875, + "logps/rejected": -322.6624755859375, + "loss": 0.3634, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2757464051246643, + "rewards/margins": 1.472292184829712, + "rewards/rejected": -1.7480387687683105, + "step": 7175 + }, + { + "epoch": 0.83, + "learning_rate": 5.031297980394473e-08, + "logits/chosen": -1.9767649173736572, + "logits/rejected": -2.434264659881592, + "logps/chosen": -517.81689453125, + "logps/rejected": -351.4700927734375, + "loss": 0.2437, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31724196672439575, + "rewards/margins": 2.4893441200256348, + "rewards/rejected": -2.8065860271453857, + "step": 7176 + }, + { + "epoch": 0.83, + "learning_rate": 5.027754812802645e-08, + "logits/chosen": -2.3275210857391357, + "logits/rejected": -2.5004940032958984, + "logps/chosen": -152.16619873046875, + "logps/rejected": -206.15072631835938, + "loss": 0.3789, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9410203099250793, + "rewards/margins": 3.1446657180786133, + "rewards/rejected": -4.085686206817627, + "step": 7177 + }, + { + "epoch": 0.84, + "learning_rate": 5.024211645210818e-08, + "logits/chosen": -2.4360506534576416, + "logits/rejected": -2.1562728881835938, + "logps/chosen": -303.3389587402344, + "logps/rejected": -338.8268737792969, + "loss": 0.9539, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5906689167022705, + "rewards/margins": 0.5633884072303772, + "rewards/rejected": -2.154057502746582, + "step": 7178 + }, + { + "epoch": 0.84, + "learning_rate": 5.020668477618992e-08, + "logits/chosen": -2.391761302947998, + "logits/rejected": -2.5671191215515137, + "logps/chosen": -218.13296508789062, + "logps/rejected": -162.55575561523438, + "loss": 0.3875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8603320717811584, + "rewards/margins": 1.600976586341858, + "rewards/rejected": -2.461308717727661, + "step": 7179 + }, + { + "epoch": 0.84, + "learning_rate": 5.017125310027164e-08, + "logits/chosen": -2.1369900703430176, + "logits/rejected": -2.0169990062713623, + "logps/chosen": -139.8245849609375, + "logps/rejected": -195.13633728027344, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9784184694290161, + "rewards/margins": 0.9666426777839661, + "rewards/rejected": -1.945061206817627, + "step": 7180 + }, + { + "epoch": 0.84, + "learning_rate": 5.013582142435337e-08, + "logits/chosen": -2.1509475708007812, + "logits/rejected": -2.068617582321167, + "logps/chosen": -305.7303161621094, + "logps/rejected": -352.71502685546875, + "loss": 0.205, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32909831404685974, + "rewards/margins": 3.2757890224456787, + "rewards/rejected": -3.6048872470855713, + "step": 7181 + }, + { + "epoch": 0.84, + "learning_rate": 5.01003897484351e-08, + "logits/chosen": -2.522418260574341, + "logits/rejected": -2.5333523750305176, + "logps/chosen": -234.79428100585938, + "logps/rejected": -253.84617614746094, + "loss": 0.5002, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44380030035972595, + "rewards/margins": 2.1627559661865234, + "rewards/rejected": -2.6065564155578613, + "step": 7182 + }, + { + "epoch": 0.84, + "learning_rate": 5.0064958072516825e-08, + "logits/chosen": -2.705954074859619, + "logits/rejected": -2.9651198387145996, + "logps/chosen": -177.33502197265625, + "logps/rejected": -161.70809936523438, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1856221854686737, + "rewards/margins": 2.831040382385254, + "rewards/rejected": -2.645418405532837, + "step": 7183 + }, + { + "epoch": 0.84, + "learning_rate": 5.0029526396598554e-08, + "logits/chosen": -2.6874961853027344, + "logits/rejected": -2.7397801876068115, + "logps/chosen": -346.8856201171875, + "logps/rejected": -291.56292724609375, + "loss": 0.2783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9492874145507812, + "rewards/margins": 2.2561867237091064, + "rewards/rejected": -3.2054741382598877, + "step": 7184 + }, + { + "epoch": 0.84, + "learning_rate": 4.999409472068029e-08, + "logits/chosen": -2.4974446296691895, + "logits/rejected": -2.765751600265503, + "logps/chosen": -331.3799133300781, + "logps/rejected": -276.6502990722656, + "loss": 0.2776, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2131173610687256, + "rewards/margins": 3.040496349334717, + "rewards/rejected": -4.253613471984863, + "step": 7185 + }, + { + "epoch": 0.84, + "learning_rate": 4.995866304476201e-08, + "logits/chosen": -2.320000171661377, + "logits/rejected": -2.18325138092041, + "logps/chosen": -108.53807830810547, + "logps/rejected": -187.33087158203125, + "loss": 0.5806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8670979142189026, + "rewards/margins": 1.4657410383224487, + "rewards/rejected": -2.332839012145996, + "step": 7186 + }, + { + "epoch": 0.84, + "learning_rate": 4.992323136884374e-08, + "logits/chosen": -2.62608003616333, + "logits/rejected": -2.6441810131073, + "logps/chosen": -307.62060546875, + "logps/rejected": -298.3954772949219, + "loss": 0.6692, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3183602094650269, + "rewards/margins": 1.5325745344161987, + "rewards/rejected": -2.8509347438812256, + "step": 7187 + }, + { + "epoch": 0.84, + "learning_rate": 4.9887799692925475e-08, + "logits/chosen": -2.8479621410369873, + "logits/rejected": -2.7824292182922363, + "logps/chosen": -280.92852783203125, + "logps/rejected": -285.9387512207031, + "loss": 0.1593, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1127455234527588, + "rewards/margins": 4.150631904602051, + "rewards/rejected": -5.263377666473389, + "step": 7188 + }, + { + "epoch": 0.84, + "learning_rate": 4.9852368017007204e-08, + "logits/chosen": -2.352708101272583, + "logits/rejected": -2.2586896419525146, + "logps/chosen": -107.604248046875, + "logps/rejected": -279.6939697265625, + "loss": 0.3735, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7553883790969849, + "rewards/margins": 2.1009490489959717, + "rewards/rejected": -2.856337308883667, + "step": 7189 + }, + { + "epoch": 0.84, + "learning_rate": 4.9816936341088926e-08, + "logits/chosen": -2.1253318786621094, + "logits/rejected": -2.1744556427001953, + "logps/chosen": -430.69720458984375, + "logps/rejected": -511.3611145019531, + "loss": 0.6715, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2242827415466309, + "rewards/margins": 3.07436466217041, + "rewards/rejected": -4.298647403717041, + "step": 7190 + }, + { + "epoch": 0.84, + "learning_rate": 4.978150466517066e-08, + "logits/chosen": -2.7410764694213867, + "logits/rejected": -3.0196354389190674, + "logps/chosen": -326.5492858886719, + "logps/rejected": -219.04574584960938, + "loss": 0.6187, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7710084915161133, + "rewards/margins": 0.9648841619491577, + "rewards/rejected": -2.7358927726745605, + "step": 7191 + }, + { + "epoch": 0.84, + "learning_rate": 4.974607298925239e-08, + "logits/chosen": -2.66976261138916, + "logits/rejected": -2.663522720336914, + "logps/chosen": -251.55154418945312, + "logps/rejected": -265.1261291503906, + "loss": 0.8267, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3706448078155518, + "rewards/margins": 1.996922254562378, + "rewards/rejected": -3.3675670623779297, + "step": 7192 + }, + { + "epoch": 0.84, + "learning_rate": 4.971064131333411e-08, + "logits/chosen": -2.4355952739715576, + "logits/rejected": -2.437936544418335, + "logps/chosen": -301.8266296386719, + "logps/rejected": -327.0881652832031, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7147289514541626, + "rewards/margins": 3.5094902515411377, + "rewards/rejected": -4.224218845367432, + "step": 7193 + }, + { + "epoch": 0.84, + "learning_rate": 4.967520963741585e-08, + "logits/chosen": -2.662003993988037, + "logits/rejected": -2.5239040851593018, + "logps/chosen": -364.93341064453125, + "logps/rejected": -322.0084228515625, + "loss": 0.3727, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7634075880050659, + "rewards/margins": 1.1943705081939697, + "rewards/rejected": -1.957777976989746, + "step": 7194 + }, + { + "epoch": 0.84, + "learning_rate": 4.9639777961497577e-08, + "logits/chosen": -2.3027572631835938, + "logits/rejected": -2.4071929454803467, + "logps/chosen": -292.1983642578125, + "logps/rejected": -237.1532745361328, + "loss": 0.4694, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.157797336578369, + "rewards/margins": 3.4086198806762695, + "rewards/rejected": -5.566417694091797, + "step": 7195 + }, + { + "epoch": 0.84, + "learning_rate": 4.96043462855793e-08, + "logits/chosen": -1.7962408065795898, + "logits/rejected": -1.6954057216644287, + "logps/chosen": -462.31573486328125, + "logps/rejected": -485.83935546875, + "loss": 0.6266, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.277230978012085, + "rewards/margins": 0.83509361743927, + "rewards/rejected": -2.1123244762420654, + "step": 7196 + }, + { + "epoch": 0.84, + "learning_rate": 4.9568914609661034e-08, + "logits/chosen": -2.9365549087524414, + "logits/rejected": -2.8682456016540527, + "logps/chosen": -259.52410888671875, + "logps/rejected": -356.9435119628906, + "loss": 0.2886, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5328123569488525, + "rewards/margins": 4.5790486335754395, + "rewards/rejected": -5.111860752105713, + "step": 7197 + }, + { + "epoch": 0.84, + "learning_rate": 4.953348293374276e-08, + "logits/chosen": -2.2652716636657715, + "logits/rejected": -1.9189568758010864, + "logps/chosen": -226.55422973632812, + "logps/rejected": -301.32574462890625, + "loss": 0.3436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9521443843841553, + "rewards/margins": 2.1342403888702393, + "rewards/rejected": -3.0863845348358154, + "step": 7198 + }, + { + "epoch": 0.84, + "learning_rate": 4.94980512578245e-08, + "logits/chosen": -2.505087375640869, + "logits/rejected": -2.429898262023926, + "logps/chosen": -279.6246643066406, + "logps/rejected": -380.42462158203125, + "loss": 0.3023, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5368125438690186, + "rewards/margins": 2.378314256668091, + "rewards/rejected": -2.9151268005371094, + "step": 7199 + }, + { + "epoch": 0.84, + "learning_rate": 4.946261958190622e-08, + "logits/chosen": -2.4863579273223877, + "logits/rejected": -2.670398235321045, + "logps/chosen": -239.47076416015625, + "logps/rejected": -258.3829345703125, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4537231922149658, + "rewards/margins": 2.9302752017974854, + "rewards/rejected": -4.383998394012451, + "step": 7200 + }, + { + "epoch": 0.84, + "learning_rate": 4.942718790598795e-08, + "logits/chosen": -2.500920295715332, + "logits/rejected": -2.4729580879211426, + "logps/chosen": -285.5711364746094, + "logps/rejected": -343.26190185546875, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9123934507369995, + "rewards/margins": 2.7548279762268066, + "rewards/rejected": -3.6672215461730957, + "step": 7201 + }, + { + "epoch": 0.84, + "learning_rate": 4.9391756230069684e-08, + "logits/chosen": -2.4613351821899414, + "logits/rejected": -2.063239336013794, + "logps/chosen": -284.67999267578125, + "logps/rejected": -297.5213317871094, + "loss": 0.4565, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5271620750427246, + "rewards/margins": 1.4998664855957031, + "rewards/rejected": -2.0270285606384277, + "step": 7202 + }, + { + "epoch": 0.84, + "learning_rate": 4.935632455415141e-08, + "logits/chosen": -2.5499303340911865, + "logits/rejected": -2.449796676635742, + "logps/chosen": -461.87567138671875, + "logps/rejected": -303.14434814453125, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8071197271347046, + "rewards/margins": 2.634397506713867, + "rewards/rejected": -3.4415173530578613, + "step": 7203 + }, + { + "epoch": 0.84, + "learning_rate": 4.9320892878233135e-08, + "logits/chosen": -2.1091604232788086, + "logits/rejected": -2.302722454071045, + "logps/chosen": -340.1296081542969, + "logps/rejected": -294.37481689453125, + "loss": 0.3637, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0292718410491943, + "rewards/margins": 1.7118206024169922, + "rewards/rejected": -2.7410924434661865, + "step": 7204 + }, + { + "epoch": 0.84, + "learning_rate": 4.928546120231487e-08, + "logits/chosen": -2.2427682876586914, + "logits/rejected": -2.110755443572998, + "logps/chosen": -351.5255126953125, + "logps/rejected": -243.82398986816406, + "loss": 3.0156, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.638721466064453, + "rewards/margins": -1.6563842296600342, + "rewards/rejected": -2.982337474822998, + "step": 7205 + }, + { + "epoch": 0.84, + "learning_rate": 4.92500295263966e-08, + "logits/chosen": -2.331860303878784, + "logits/rejected": -2.322254180908203, + "logps/chosen": -221.8563232421875, + "logps/rejected": -266.18328857421875, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8649665117263794, + "rewards/margins": 2.330371379852295, + "rewards/rejected": -3.1953377723693848, + "step": 7206 + }, + { + "epoch": 0.84, + "learning_rate": 4.921459785047832e-08, + "logits/chosen": -2.059800386428833, + "logits/rejected": -1.9741162061691284, + "logps/chosen": -313.8770751953125, + "logps/rejected": -258.14764404296875, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04994484782218933, + "rewards/margins": 3.5578956604003906, + "rewards/rejected": -3.6078405380249023, + "step": 7207 + }, + { + "epoch": 0.84, + "learning_rate": 4.917916617456006e-08, + "logits/chosen": -3.001967430114746, + "logits/rejected": -3.098764657974243, + "logps/chosen": -283.86676025390625, + "logps/rejected": -337.62371826171875, + "loss": 0.6908, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7633185386657715, + "rewards/margins": 1.0960038900375366, + "rewards/rejected": -1.8593225479125977, + "step": 7208 + }, + { + "epoch": 0.84, + "learning_rate": 4.9143734498641786e-08, + "logits/chosen": -2.4962399005889893, + "logits/rejected": -2.485201120376587, + "logps/chosen": -194.19058227539062, + "logps/rejected": -222.79832458496094, + "loss": 0.6691, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4197556972503662, + "rewards/margins": 1.5179026126861572, + "rewards/rejected": -2.9376585483551025, + "step": 7209 + }, + { + "epoch": 0.84, + "learning_rate": 4.910830282272351e-08, + "logits/chosen": -2.0379574298858643, + "logits/rejected": -2.0484559535980225, + "logps/chosen": -446.5401916503906, + "logps/rejected": -300.671875, + "loss": 0.7822, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7434964179992676, + "rewards/margins": 0.24152596294879913, + "rewards/rejected": -1.9850224256515503, + "step": 7210 + }, + { + "epoch": 0.84, + "learning_rate": 4.907287114680524e-08, + "logits/chosen": -2.2410483360290527, + "logits/rejected": -2.274388313293457, + "logps/chosen": -277.4230651855469, + "logps/rejected": -253.0372314453125, + "loss": 0.4746, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1444891691207886, + "rewards/margins": 1.9088926315307617, + "rewards/rejected": -3.05338191986084, + "step": 7211 + }, + { + "epoch": 0.84, + "learning_rate": 4.903743947088697e-08, + "logits/chosen": -2.04531192779541, + "logits/rejected": -1.8743832111358643, + "logps/chosen": -382.3240661621094, + "logps/rejected": -522.414794921875, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5880683064460754, + "rewards/margins": 4.421146392822266, + "rewards/rejected": -5.009214401245117, + "step": 7212 + }, + { + "epoch": 0.84, + "learning_rate": 4.9002007794968694e-08, + "logits/chosen": -1.9024722576141357, + "logits/rejected": -2.066669225692749, + "logps/chosen": -169.7716827392578, + "logps/rejected": -236.12615966796875, + "loss": 0.3364, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9084323644638062, + "rewards/margins": 2.4806923866271973, + "rewards/rejected": -3.3891243934631348, + "step": 7213 + }, + { + "epoch": 0.84, + "learning_rate": 4.896657611905043e-08, + "logits/chosen": -3.0541741847991943, + "logits/rejected": -2.986274480819702, + "logps/chosen": -191.76718139648438, + "logps/rejected": -215.1719207763672, + "loss": 0.1408, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0319187641143799, + "rewards/margins": 4.092617034912109, + "rewards/rejected": -5.12453556060791, + "step": 7214 + }, + { + "epoch": 0.84, + "learning_rate": 4.893114444313216e-08, + "logits/chosen": -2.8095755577087402, + "logits/rejected": -2.8311877250671387, + "logps/chosen": -230.1826171875, + "logps/rejected": -237.23745727539062, + "loss": 0.353, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7809031009674072, + "rewards/margins": 3.18636155128479, + "rewards/rejected": -4.967264652252197, + "step": 7215 + }, + { + "epoch": 0.84, + "learning_rate": 4.889571276721389e-08, + "logits/chosen": -1.9311702251434326, + "logits/rejected": -2.0014376640319824, + "logps/chosen": -373.3750305175781, + "logps/rejected": -339.63836669921875, + "loss": 0.2122, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2309131622314453, + "rewards/margins": 2.792041301727295, + "rewards/rejected": -2.5611281394958496, + "step": 7216 + }, + { + "epoch": 0.84, + "learning_rate": 4.8860281091295616e-08, + "logits/chosen": -2.1293282508850098, + "logits/rejected": -2.2090914249420166, + "logps/chosen": -217.07582092285156, + "logps/rejected": -181.185302734375, + "loss": 0.9253, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1613630056381226, + "rewards/margins": 0.9650020599365234, + "rewards/rejected": -2.1263649463653564, + "step": 7217 + }, + { + "epoch": 0.84, + "learning_rate": 4.8824849415377344e-08, + "logits/chosen": -2.0128746032714844, + "logits/rejected": -1.991823434829712, + "logps/chosen": -162.3137969970703, + "logps/rejected": -299.34466552734375, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1872367858886719, + "rewards/margins": 4.107301235198975, + "rewards/rejected": -5.294537544250488, + "step": 7218 + }, + { + "epoch": 0.84, + "learning_rate": 4.878941773945907e-08, + "logits/chosen": -2.3487906455993652, + "logits/rejected": -2.1722235679626465, + "logps/chosen": -356.243896484375, + "logps/rejected": -291.303955078125, + "loss": 0.2431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2665559649467468, + "rewards/margins": 2.201751947402954, + "rewards/rejected": -2.4683079719543457, + "step": 7219 + }, + { + "epoch": 0.84, + "learning_rate": 4.875398606354081e-08, + "logits/chosen": -2.354848861694336, + "logits/rejected": -2.545307159423828, + "logps/chosen": -429.99188232421875, + "logps/rejected": -350.27886962890625, + "loss": 0.6112, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2555952072143555, + "rewards/margins": 3.990487575531006, + "rewards/rejected": -5.246082782745361, + "step": 7220 + }, + { + "epoch": 0.84, + "learning_rate": 4.871855438762253e-08, + "logits/chosen": -2.140761375427246, + "logits/rejected": -2.1159398555755615, + "logps/chosen": -422.00128173828125, + "logps/rejected": -318.9566650390625, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41823112964630127, + "rewards/margins": 2.5083274841308594, + "rewards/rejected": -2.926558494567871, + "step": 7221 + }, + { + "epoch": 0.84, + "learning_rate": 4.868312271170426e-08, + "logits/chosen": -2.4433751106262207, + "logits/rejected": -2.4419424533843994, + "logps/chosen": -245.48712158203125, + "logps/rejected": -257.4993896484375, + "loss": 0.3492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.036436676979065, + "rewards/margins": 2.474032163619995, + "rewards/rejected": -3.5104691982269287, + "step": 7222 + }, + { + "epoch": 0.84, + "learning_rate": 4.8647691035785995e-08, + "logits/chosen": -2.068518877029419, + "logits/rejected": -2.2550837993621826, + "logps/chosen": -293.7730712890625, + "logps/rejected": -244.70651245117188, + "loss": 0.2267, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7532916069030762, + "rewards/margins": 2.2419207096099854, + "rewards/rejected": -2.9952125549316406, + "step": 7223 + }, + { + "epoch": 0.84, + "learning_rate": 4.861225935986772e-08, + "logits/chosen": -2.7407784461975098, + "logits/rejected": -2.7072811126708984, + "logps/chosen": -195.5794677734375, + "logps/rejected": -210.6586151123047, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5897418856620789, + "rewards/margins": 2.6362717151641846, + "rewards/rejected": -3.226013660430908, + "step": 7224 + }, + { + "epoch": 0.84, + "learning_rate": 4.8576827683949445e-08, + "logits/chosen": -2.3176980018615723, + "logits/rejected": -2.624950408935547, + "logps/chosen": -218.81878662109375, + "logps/rejected": -206.48016357421875, + "loss": 0.7506, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8543405532836914, + "rewards/margins": 0.5818531513214111, + "rewards/rejected": -1.4361937046051025, + "step": 7225 + }, + { + "epoch": 0.84, + "learning_rate": 4.854139600803118e-08, + "logits/chosen": -2.2553768157958984, + "logits/rejected": -2.471886157989502, + "logps/chosen": -497.543212890625, + "logps/rejected": -376.416748046875, + "loss": 0.3897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8622477054595947, + "rewards/margins": 2.372678756713867, + "rewards/rejected": -3.234926462173462, + "step": 7226 + }, + { + "epoch": 0.84, + "learning_rate": 4.85059643321129e-08, + "logits/chosen": -1.5439844131469727, + "logits/rejected": -1.8892663717269897, + "logps/chosen": -631.123291015625, + "logps/rejected": -464.6608581542969, + "loss": 0.2333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33305123448371887, + "rewards/margins": 2.1304831504821777, + "rewards/rejected": -2.463534355163574, + "step": 7227 + }, + { + "epoch": 0.84, + "learning_rate": 4.847053265619463e-08, + "logits/chosen": -2.2936432361602783, + "logits/rejected": -2.6070973873138428, + "logps/chosen": -383.8769226074219, + "logps/rejected": -327.04669189453125, + "loss": 0.2255, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5188007354736328, + "rewards/margins": 3.098691463470459, + "rewards/rejected": -4.617491722106934, + "step": 7228 + }, + { + "epoch": 0.84, + "learning_rate": 4.843510098027637e-08, + "logits/chosen": -2.1351895332336426, + "logits/rejected": -1.9213289022445679, + "logps/chosen": -562.2785034179688, + "logps/rejected": -410.59295654296875, + "loss": 0.3004, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16033703088760376, + "rewards/margins": 2.116529703140259, + "rewards/rejected": -2.2768666744232178, + "step": 7229 + }, + { + "epoch": 0.84, + "learning_rate": 4.8399669304358096e-08, + "logits/chosen": -1.9529569149017334, + "logits/rejected": -2.557175397872925, + "logps/chosen": -573.6538696289062, + "logps/rejected": -244.96539306640625, + "loss": 0.5597, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6464874744415283, + "rewards/margins": 1.2078299522399902, + "rewards/rejected": -1.854317545890808, + "step": 7230 + }, + { + "epoch": 0.84, + "learning_rate": 4.836423762843982e-08, + "logits/chosen": -1.957033634185791, + "logits/rejected": -2.4552161693573, + "logps/chosen": -308.44842529296875, + "logps/rejected": -223.10250854492188, + "loss": 0.947, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5065529346466064, + "rewards/margins": 1.11775803565979, + "rewards/rejected": -2.6243109703063965, + "step": 7231 + }, + { + "epoch": 0.84, + "learning_rate": 4.832880595252155e-08, + "logits/chosen": -2.563549280166626, + "logits/rejected": -2.4018239974975586, + "logps/chosen": -169.57012939453125, + "logps/rejected": -288.75555419921875, + "loss": 0.2809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5492761135101318, + "rewards/margins": 1.7908350229263306, + "rewards/rejected": -2.340111017227173, + "step": 7232 + }, + { + "epoch": 0.84, + "learning_rate": 4.829337427660328e-08, + "logits/chosen": -2.506622076034546, + "logits/rejected": -2.203491687774658, + "logps/chosen": -150.4077606201172, + "logps/rejected": -209.76654052734375, + "loss": 0.2685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7883079648017883, + "rewards/margins": 1.9063177108764648, + "rewards/rejected": -2.6946258544921875, + "step": 7233 + }, + { + "epoch": 0.84, + "learning_rate": 4.825794260068502e-08, + "logits/chosen": -2.28299617767334, + "logits/rejected": -2.447516441345215, + "logps/chosen": -230.67129516601562, + "logps/rejected": -130.64410400390625, + "loss": 0.724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.035509467124939, + "rewards/margins": 0.4918633699417114, + "rewards/rejected": -1.5273728370666504, + "step": 7234 + }, + { + "epoch": 0.84, + "learning_rate": 4.822251092476674e-08, + "logits/chosen": -2.0147087574005127, + "logits/rejected": -2.3300468921661377, + "logps/chosen": -266.4061584472656, + "logps/rejected": -153.98507690429688, + "loss": 1.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.934485673904419, + "rewards/margins": 0.2829618453979492, + "rewards/rejected": -2.2174477577209473, + "step": 7235 + }, + { + "epoch": 0.84, + "learning_rate": 4.818707924884847e-08, + "logits/chosen": -2.201728343963623, + "logits/rejected": -1.6382761001586914, + "logps/chosen": -131.85482788085938, + "logps/rejected": -402.7178039550781, + "loss": 0.8423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9245245456695557, + "rewards/margins": 0.3916599750518799, + "rewards/rejected": -1.316184639930725, + "step": 7236 + }, + { + "epoch": 0.84, + "learning_rate": 4.8151647572930204e-08, + "logits/chosen": -2.775913953781128, + "logits/rejected": -2.571322441101074, + "logps/chosen": -281.6272277832031, + "logps/rejected": -358.2267150878906, + "loss": 0.2819, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.207271099090576, + "rewards/margins": 2.307739019393921, + "rewards/rejected": -4.515010356903076, + "step": 7237 + }, + { + "epoch": 0.84, + "learning_rate": 4.8116215897011926e-08, + "logits/chosen": -2.3984761238098145, + "logits/rejected": -1.753377079963684, + "logps/chosen": -185.28623962402344, + "logps/rejected": -283.1778564453125, + "loss": 0.278, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16058349609375, + "rewards/margins": 2.5232138633728027, + "rewards/rejected": -2.6837973594665527, + "step": 7238 + }, + { + "epoch": 0.84, + "learning_rate": 4.8080784221093654e-08, + "logits/chosen": -2.8229618072509766, + "logits/rejected": -2.6311488151550293, + "logps/chosen": -422.86395263671875, + "logps/rejected": -289.2400207519531, + "loss": 0.1496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35182124376296997, + "rewards/margins": 4.024928569793701, + "rewards/rejected": -3.673107147216797, + "step": 7239 + }, + { + "epoch": 0.84, + "learning_rate": 4.804535254517539e-08, + "logits/chosen": -2.2566823959350586, + "logits/rejected": -2.3186519145965576, + "logps/chosen": -301.4239196777344, + "logps/rejected": -294.5097351074219, + "loss": 0.2774, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38046687841415405, + "rewards/margins": 2.7626724243164062, + "rewards/rejected": -3.143139362335205, + "step": 7240 + }, + { + "epoch": 0.84, + "learning_rate": 4.800992086925711e-08, + "logits/chosen": -2.4321773052215576, + "logits/rejected": -2.408738136291504, + "logps/chosen": -347.9837646484375, + "logps/rejected": -256.0306701660156, + "loss": 0.2982, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4054829478263855, + "rewards/margins": 2.386599540710449, + "rewards/rejected": -2.7920823097229004, + "step": 7241 + }, + { + "epoch": 0.84, + "learning_rate": 4.797448919333884e-08, + "logits/chosen": -2.211482524871826, + "logits/rejected": -2.126861095428467, + "logps/chosen": -453.6476745605469, + "logps/rejected": -419.8216857910156, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5039078593254089, + "rewards/margins": 2.8923816680908203, + "rewards/rejected": -3.396289825439453, + "step": 7242 + }, + { + "epoch": 0.84, + "learning_rate": 4.7939057517420576e-08, + "logits/chosen": -2.2669050693511963, + "logits/rejected": -1.838313341140747, + "logps/chosen": -158.189453125, + "logps/rejected": -298.2691650390625, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.070899248123169, + "rewards/margins": 1.594675064086914, + "rewards/rejected": -2.665574312210083, + "step": 7243 + }, + { + "epoch": 0.84, + "learning_rate": 4.7903625841502305e-08, + "logits/chosen": -2.380558967590332, + "logits/rejected": -2.4341065883636475, + "logps/chosen": -338.31915283203125, + "logps/rejected": -261.54193115234375, + "loss": 0.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18781372904777527, + "rewards/margins": 3.0935065746307373, + "rewards/rejected": -3.281320333480835, + "step": 7244 + }, + { + "epoch": 0.84, + "learning_rate": 4.786819416558403e-08, + "logits/chosen": -2.6138808727264404, + "logits/rejected": -2.2409861087799072, + "logps/chosen": -231.86610412597656, + "logps/rejected": -330.18035888671875, + "loss": 0.46, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20809203386306763, + "rewards/margins": 1.9651036262512207, + "rewards/rejected": -2.1731956005096436, + "step": 7245 + }, + { + "epoch": 0.84, + "learning_rate": 4.783276248966576e-08, + "logits/chosen": -2.254080295562744, + "logits/rejected": -2.3006865978240967, + "logps/chosen": -559.119873046875, + "logps/rejected": -336.6722717285156, + "loss": 0.3987, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5094993114471436, + "rewards/margins": 1.923598051071167, + "rewards/rejected": -3.4330973625183105, + "step": 7246 + }, + { + "epoch": 0.84, + "learning_rate": 4.779733081374749e-08, + "logits/chosen": -2.194765567779541, + "logits/rejected": -2.332139730453491, + "logps/chosen": -466.2408447265625, + "logps/rejected": -268.4845886230469, + "loss": 0.3918, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0451356172561646, + "rewards/margins": 3.104945182800293, + "rewards/rejected": -4.150080680847168, + "step": 7247 + }, + { + "epoch": 0.84, + "learning_rate": 4.776189913782921e-08, + "logits/chosen": -2.734750270843506, + "logits/rejected": -2.54398775100708, + "logps/chosen": -101.3779067993164, + "logps/rejected": -181.5011749267578, + "loss": 0.6118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.892003059387207, + "rewards/margins": 1.018351435661316, + "rewards/rejected": -1.910354495048523, + "step": 7248 + }, + { + "epoch": 0.84, + "learning_rate": 4.772646746191095e-08, + "logits/chosen": -1.613930344581604, + "logits/rejected": -1.9587643146514893, + "logps/chosen": -500.2410583496094, + "logps/rejected": -404.1590576171875, + "loss": 0.4243, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6046358942985535, + "rewards/margins": 3.574763298034668, + "rewards/rejected": -4.179399013519287, + "step": 7249 + }, + { + "epoch": 0.84, + "learning_rate": 4.769103578599268e-08, + "logits/chosen": -2.1649787425994873, + "logits/rejected": -1.997015118598938, + "logps/chosen": -146.96047973632812, + "logps/rejected": -269.4560852050781, + "loss": 0.2038, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33695948123931885, + "rewards/margins": 3.4192728996276855, + "rewards/rejected": -3.756232738494873, + "step": 7250 + }, + { + "epoch": 0.84, + "learning_rate": 4.76556041100744e-08, + "logits/chosen": -2.541842460632324, + "logits/rejected": -2.564763069152832, + "logps/chosen": -260.1632385253906, + "logps/rejected": -415.3908996582031, + "loss": 0.4388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0138322114944458, + "rewards/margins": 2.340074300765991, + "rewards/rejected": -3.3539066314697266, + "step": 7251 + }, + { + "epoch": 0.84, + "learning_rate": 4.7620172434156135e-08, + "logits/chosen": -1.4108140468597412, + "logits/rejected": -1.99052894115448, + "logps/chosen": -466.72503662109375, + "logps/rejected": -331.4609375, + "loss": 1.3149, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2050328254699707, + "rewards/margins": 0.39518100023269653, + "rewards/rejected": -2.6002140045166016, + "step": 7252 + }, + { + "epoch": 0.84, + "learning_rate": 4.7584740758237863e-08, + "logits/chosen": -2.3872666358947754, + "logits/rejected": -2.4127869606018066, + "logps/chosen": -369.9120788574219, + "logps/rejected": -313.3192138671875, + "loss": 0.3907, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7399356961250305, + "rewards/margins": 1.9257956743240356, + "rewards/rejected": -2.665731191635132, + "step": 7253 + }, + { + "epoch": 0.84, + "learning_rate": 4.7549309082319586e-08, + "logits/chosen": -2.452110528945923, + "logits/rejected": -2.2820703983306885, + "logps/chosen": -234.12608337402344, + "logps/rejected": -300.25909423828125, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2767859697341919, + "rewards/margins": 1.9872243404388428, + "rewards/rejected": -2.264010190963745, + "step": 7254 + }, + { + "epoch": 0.84, + "learning_rate": 4.751387740640132e-08, + "logits/chosen": -2.1036062240600586, + "logits/rejected": -2.2075722217559814, + "logps/chosen": -331.1767272949219, + "logps/rejected": -324.660888671875, + "loss": 0.1845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6990199089050293, + "rewards/margins": 3.088290214538574, + "rewards/rejected": -3.7873098850250244, + "step": 7255 + }, + { + "epoch": 0.84, + "learning_rate": 4.747844573048305e-08, + "logits/chosen": -2.424408435821533, + "logits/rejected": -2.554405450820923, + "logps/chosen": -266.6960144042969, + "logps/rejected": -233.13768005371094, + "loss": 1.3851, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5193181037902832, + "rewards/margins": 1.4815313816070557, + "rewards/rejected": -3.000849485397339, + "step": 7256 + }, + { + "epoch": 0.84, + "learning_rate": 4.744301405456478e-08, + "logits/chosen": -2.927525758743286, + "logits/rejected": -2.8531808853149414, + "logps/chosen": -119.38567352294922, + "logps/rejected": -169.3896484375, + "loss": 0.4373, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2408033609390259, + "rewards/margins": 1.1853594779968262, + "rewards/rejected": -2.4261627197265625, + "step": 7257 + }, + { + "epoch": 0.84, + "learning_rate": 4.740758237864651e-08, + "logits/chosen": -2.3776731491088867, + "logits/rejected": -2.134315013885498, + "logps/chosen": -190.57273864746094, + "logps/rejected": -122.44780731201172, + "loss": 0.7977, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.204563856124878, + "rewards/margins": 1.5379582643508911, + "rewards/rejected": -2.7425220012664795, + "step": 7258 + }, + { + "epoch": 0.84, + "learning_rate": 4.7372150702728236e-08, + "logits/chosen": -1.765716314315796, + "logits/rejected": -1.9259085655212402, + "logps/chosen": -420.5760498046875, + "logps/rejected": -346.7299499511719, + "loss": 0.5348, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8933874368667603, + "rewards/margins": 1.587803602218628, + "rewards/rejected": -2.4811909198760986, + "step": 7259 + }, + { + "epoch": 0.84, + "learning_rate": 4.7336719026809965e-08, + "logits/chosen": -2.3003668785095215, + "logits/rejected": -2.36515474319458, + "logps/chosen": -236.02853393554688, + "logps/rejected": -263.72576904296875, + "loss": 0.2449, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1744011640548706, + "rewards/margins": 3.060971975326538, + "rewards/rejected": -4.235373020172119, + "step": 7260 + }, + { + "epoch": 0.84, + "learning_rate": 4.73012873508917e-08, + "logits/chosen": -2.149463653564453, + "logits/rejected": -2.3367481231689453, + "logps/chosen": -327.5191345214844, + "logps/rejected": -504.3203125, + "loss": 0.2905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5577700734138489, + "rewards/margins": 3.975494384765625, + "rewards/rejected": -4.533264636993408, + "step": 7261 + }, + { + "epoch": 0.84, + "learning_rate": 4.726585567497342e-08, + "logits/chosen": -2.584263324737549, + "logits/rejected": -2.2885665893554688, + "logps/chosen": -225.74264526367188, + "logps/rejected": -286.761474609375, + "loss": 0.5991, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4948036670684814, + "rewards/margins": 1.0354961156845093, + "rewards/rejected": -2.530299663543701, + "step": 7262 + }, + { + "epoch": 0.84, + "learning_rate": 4.723042399905515e-08, + "logits/chosen": -2.5677809715270996, + "logits/rejected": -2.601196527481079, + "logps/chosen": -235.41229248046875, + "logps/rejected": -330.84295654296875, + "loss": 0.5151, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8960225582122803, + "rewards/margins": 3.263166904449463, + "rewards/rejected": -4.159189701080322, + "step": 7263 + }, + { + "epoch": 0.85, + "learning_rate": 4.7194992323136886e-08, + "logits/chosen": -2.401327610015869, + "logits/rejected": -2.3511359691619873, + "logps/chosen": -154.32994079589844, + "logps/rejected": -87.17759704589844, + "loss": 0.8824, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7874640822410583, + "rewards/margins": 0.6826618909835815, + "rewards/rejected": -1.4701259136199951, + "step": 7264 + }, + { + "epoch": 0.85, + "learning_rate": 4.715956064721861e-08, + "logits/chosen": -2.4127860069274902, + "logits/rejected": -2.141484022140503, + "logps/chosen": -206.03335571289062, + "logps/rejected": -262.7857360839844, + "loss": 0.5793, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0386006832122803, + "rewards/margins": 1.45302152633667, + "rewards/rejected": -2.49162220954895, + "step": 7265 + }, + { + "epoch": 0.85, + "learning_rate": 4.712412897130034e-08, + "logits/chosen": -2.49318265914917, + "logits/rejected": -2.472932815551758, + "logps/chosen": -221.3561248779297, + "logps/rejected": -228.3270263671875, + "loss": 0.1906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.883466899394989, + "rewards/margins": 3.251511573791504, + "rewards/rejected": -4.134978294372559, + "step": 7266 + }, + { + "epoch": 0.85, + "learning_rate": 4.708869729538207e-08, + "logits/chosen": -2.6464076042175293, + "logits/rejected": -2.3515431880950928, + "logps/chosen": -178.24932861328125, + "logps/rejected": -305.29425048828125, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0822030305862427, + "rewards/margins": 1.7383692264556885, + "rewards/rejected": -2.8205723762512207, + "step": 7267 + }, + { + "epoch": 0.85, + "learning_rate": 4.7053265619463795e-08, + "logits/chosen": -2.421297550201416, + "logits/rejected": -2.26431941986084, + "logps/chosen": -286.5546875, + "logps/rejected": -313.0391540527344, + "loss": 0.702, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1246646642684937, + "rewards/margins": 0.7499726414680481, + "rewards/rejected": -1.8746371269226074, + "step": 7268 + }, + { + "epoch": 0.85, + "learning_rate": 4.701783394354552e-08, + "logits/chosen": -1.9799296855926514, + "logits/rejected": -2.006009578704834, + "logps/chosen": -393.84661865234375, + "logps/rejected": -331.9061279296875, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4568634033203125, + "rewards/margins": 3.3995790481567383, + "rewards/rejected": -4.856442451477051, + "step": 7269 + }, + { + "epoch": 0.85, + "learning_rate": 4.698240226762726e-08, + "logits/chosen": -2.0191259384155273, + "logits/rejected": -1.8194143772125244, + "logps/chosen": -286.4261169433594, + "logps/rejected": -273.0802917480469, + "loss": 0.3602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5703830122947693, + "rewards/margins": 2.8857104778289795, + "rewards/rejected": -3.4560933113098145, + "step": 7270 + }, + { + "epoch": 0.85, + "learning_rate": 4.694697059170899e-08, + "logits/chosen": -2.659043312072754, + "logits/rejected": -2.6569063663482666, + "logps/chosen": -220.573486328125, + "logps/rejected": -253.46202087402344, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.005614995956421, + "rewards/margins": 2.4477343559265137, + "rewards/rejected": -3.4533495903015137, + "step": 7271 + }, + { + "epoch": 0.85, + "learning_rate": 4.6911538915790716e-08, + "logits/chosen": -2.7096633911132812, + "logits/rejected": -2.758909225463867, + "logps/chosen": -283.8525390625, + "logps/rejected": -214.4556884765625, + "loss": 0.2851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9787612557411194, + "rewards/margins": 1.5498037338256836, + "rewards/rejected": -2.528564929962158, + "step": 7272 + }, + { + "epoch": 0.85, + "learning_rate": 4.6876107239872445e-08, + "logits/chosen": -2.6067609786987305, + "logits/rejected": -2.5524420738220215, + "logps/chosen": -257.0869140625, + "logps/rejected": -265.0462646484375, + "loss": 0.1589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8060624599456787, + "rewards/margins": 2.8491783142089844, + "rewards/rejected": -3.655240774154663, + "step": 7273 + }, + { + "epoch": 0.85, + "learning_rate": 4.6840675563954174e-08, + "logits/chosen": -2.4515979290008545, + "logits/rejected": -2.5218050479888916, + "logps/chosen": -178.76333618164062, + "logps/rejected": -246.77127075195312, + "loss": 0.2888, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46322980523109436, + "rewards/margins": 4.3714823722839355, + "rewards/rejected": -4.834712505340576, + "step": 7274 + }, + { + "epoch": 0.85, + "learning_rate": 4.680524388803591e-08, + "logits/chosen": -2.321800947189331, + "logits/rejected": -2.4098000526428223, + "logps/chosen": -79.12568664550781, + "logps/rejected": -205.3490447998047, + "loss": 0.2232, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10156463086605072, + "rewards/margins": 2.2132318019866943, + "rewards/rejected": -2.1116671562194824, + "step": 7275 + }, + { + "epoch": 0.85, + "learning_rate": 4.676981221211763e-08, + "logits/chosen": -2.6799166202545166, + "logits/rejected": -2.7415380477905273, + "logps/chosen": -219.96633911132812, + "logps/rejected": -218.77890014648438, + "loss": 0.2914, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.564933180809021, + "rewards/margins": 2.2625296115875244, + "rewards/rejected": -2.827463150024414, + "step": 7276 + }, + { + "epoch": 0.85, + "learning_rate": 4.673438053619936e-08, + "logits/chosen": -2.455326795578003, + "logits/rejected": -2.2442874908447266, + "logps/chosen": -257.2427978515625, + "logps/rejected": -343.1708984375, + "loss": 0.1499, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.259770929813385, + "rewards/margins": 3.680382490158081, + "rewards/rejected": -3.9401533603668213, + "step": 7277 + }, + { + "epoch": 0.85, + "learning_rate": 4.6698948860281095e-08, + "logits/chosen": -2.714839458465576, + "logits/rejected": -2.6816189289093018, + "logps/chosen": -271.8675537109375, + "logps/rejected": -182.2960662841797, + "loss": 0.3584, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.888410210609436, + "rewards/margins": 2.527264356613159, + "rewards/rejected": -3.4156746864318848, + "step": 7278 + }, + { + "epoch": 0.85, + "learning_rate": 4.666351718436282e-08, + "logits/chosen": -2.672159433364868, + "logits/rejected": -2.6080496311187744, + "logps/chosen": -189.35609436035156, + "logps/rejected": -346.45562744140625, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8895136117935181, + "rewards/margins": 1.8329894542694092, + "rewards/rejected": -2.7225027084350586, + "step": 7279 + }, + { + "epoch": 0.85, + "learning_rate": 4.6628085508444546e-08, + "logits/chosen": -2.141854763031006, + "logits/rejected": -2.3860459327697754, + "logps/chosen": -342.33001708984375, + "logps/rejected": -298.0542297363281, + "loss": 0.1175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10440558195114136, + "rewards/margins": 4.352990627288818, + "rewards/rejected": -4.248584747314453, + "step": 7280 + }, + { + "epoch": 0.85, + "learning_rate": 4.659265383252628e-08, + "logits/chosen": -2.0554325580596924, + "logits/rejected": -2.048275947570801, + "logps/chosen": -171.28135681152344, + "logps/rejected": -172.1506805419922, + "loss": 0.3773, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5406771898269653, + "rewards/margins": 1.4699583053588867, + "rewards/rejected": -3.0106353759765625, + "step": 7281 + }, + { + "epoch": 0.85, + "learning_rate": 4.6557222156608004e-08, + "logits/chosen": -2.335662603378296, + "logits/rejected": -2.3045449256896973, + "logps/chosen": -461.6492614746094, + "logps/rejected": -406.17431640625, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.204179048538208, + "rewards/margins": 2.9035849571228027, + "rewards/rejected": -2.6994056701660156, + "step": 7282 + }, + { + "epoch": 0.85, + "learning_rate": 4.652179048068973e-08, + "logits/chosen": -2.1742541790008545, + "logits/rejected": -1.9896347522735596, + "logps/chosen": -275.878173828125, + "logps/rejected": -334.9149169921875, + "loss": 0.3586, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.032356590032577515, + "rewards/margins": 2.391522169113159, + "rewards/rejected": -2.4238789081573486, + "step": 7283 + }, + { + "epoch": 0.85, + "learning_rate": 4.648635880477147e-08, + "logits/chosen": -2.765040397644043, + "logits/rejected": -2.660111427307129, + "logps/chosen": -404.5589904785156, + "logps/rejected": -341.6977233886719, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8929305076599121, + "rewards/margins": 2.4329757690429688, + "rewards/rejected": -3.325906276702881, + "step": 7284 + }, + { + "epoch": 0.85, + "learning_rate": 4.645092712885319e-08, + "logits/chosen": -2.5072879791259766, + "logits/rejected": -2.1639630794525146, + "logps/chosen": -191.58786010742188, + "logps/rejected": -234.97933959960938, + "loss": 0.3774, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.467726707458496, + "rewards/margins": 2.0871779918670654, + "rewards/rejected": -3.5549044609069824, + "step": 7285 + }, + { + "epoch": 0.85, + "learning_rate": 4.641549545293492e-08, + "logits/chosen": -2.447723388671875, + "logits/rejected": -2.430340051651001, + "logps/chosen": -146.14358520507812, + "logps/rejected": -198.13125610351562, + "loss": 0.4157, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6975700855255127, + "rewards/margins": 1.5042892694473267, + "rewards/rejected": -2.20185923576355, + "step": 7286 + }, + { + "epoch": 0.85, + "learning_rate": 4.6380063777016654e-08, + "logits/chosen": -1.7529547214508057, + "logits/rejected": -2.16302227973938, + "logps/chosen": -329.9844665527344, + "logps/rejected": -208.68714904785156, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9004249572753906, + "rewards/margins": 1.0736174583435059, + "rewards/rejected": -1.974042534828186, + "step": 7287 + }, + { + "epoch": 0.85, + "learning_rate": 4.634463210109838e-08, + "logits/chosen": -2.842928647994995, + "logits/rejected": -2.8077797889709473, + "logps/chosen": -331.58056640625, + "logps/rejected": -271.06689453125, + "loss": 0.3279, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0677239894866943, + "rewards/margins": 2.935922145843506, + "rewards/rejected": -4.003645896911621, + "step": 7288 + }, + { + "epoch": 0.85, + "learning_rate": 4.6309200425180105e-08, + "logits/chosen": -1.9115217924118042, + "logits/rejected": -2.0723447799682617, + "logps/chosen": -224.51515197753906, + "logps/rejected": -217.2948455810547, + "loss": 0.5154, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22290608286857605, + "rewards/margins": 1.5727462768554688, + "rewards/rejected": -1.7956522703170776, + "step": 7289 + }, + { + "epoch": 0.85, + "learning_rate": 4.627376874926184e-08, + "logits/chosen": -1.9597933292388916, + "logits/rejected": -1.6078323125839233, + "logps/chosen": -330.3284606933594, + "logps/rejected": -403.7808837890625, + "loss": 0.5954, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9570811986923218, + "rewards/margins": 1.2108606100082397, + "rewards/rejected": -2.1679418087005615, + "step": 7290 + }, + { + "epoch": 0.85, + "learning_rate": 4.623833707334357e-08, + "logits/chosen": -2.5812323093414307, + "logits/rejected": -2.6780014038085938, + "logps/chosen": -216.42526245117188, + "logps/rejected": -220.69215393066406, + "loss": 0.236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5631855726242065, + "rewards/margins": 1.8326056003570557, + "rewards/rejected": -2.3957910537719727, + "step": 7291 + }, + { + "epoch": 0.85, + "learning_rate": 4.620290539742529e-08, + "logits/chosen": -2.6994714736938477, + "logits/rejected": -2.549102783203125, + "logps/chosen": -289.257568359375, + "logps/rejected": -294.6665954589844, + "loss": 0.2565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9338828325271606, + "rewards/margins": 2.419266700744629, + "rewards/rejected": -3.3531494140625, + "step": 7292 + }, + { + "epoch": 0.85, + "learning_rate": 4.6167473721507026e-08, + "logits/chosen": -2.8870503902435303, + "logits/rejected": -2.7311618328094482, + "logps/chosen": -244.91143798828125, + "logps/rejected": -270.19293212890625, + "loss": 0.2166, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0011372566223145, + "rewards/margins": 2.6142542362213135, + "rewards/rejected": -3.615391492843628, + "step": 7293 + }, + { + "epoch": 0.85, + "learning_rate": 4.6132042045588755e-08, + "logits/chosen": -2.5575194358825684, + "logits/rejected": -2.5408082008361816, + "logps/chosen": -252.397705078125, + "logps/rejected": -307.4574890136719, + "loss": 0.2709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9205053448677063, + "rewards/margins": 2.34898042678833, + "rewards/rejected": -3.2694859504699707, + "step": 7294 + }, + { + "epoch": 0.85, + "learning_rate": 4.609661036967048e-08, + "logits/chosen": -2.4763169288635254, + "logits/rejected": -2.484689235687256, + "logps/chosen": -154.07408142089844, + "logps/rejected": -240.88601684570312, + "loss": 0.2365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05343359708786011, + "rewards/margins": 3.399695634841919, + "rewards/rejected": -3.453129291534424, + "step": 7295 + }, + { + "epoch": 0.85, + "learning_rate": 4.606117869375221e-08, + "logits/chosen": -2.9426636695861816, + "logits/rejected": -2.9807536602020264, + "logps/chosen": -281.87213134765625, + "logps/rejected": -216.0591583251953, + "loss": 0.1605, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2178903967142105, + "rewards/margins": 3.311535120010376, + "rewards/rejected": -3.093644857406616, + "step": 7296 + }, + { + "epoch": 0.85, + "learning_rate": 4.602574701783394e-08, + "logits/chosen": -2.1503169536590576, + "logits/rejected": -2.439971923828125, + "logps/chosen": -394.3768310546875, + "logps/rejected": -253.49044799804688, + "loss": 0.5404, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4857573509216309, + "rewards/margins": 1.5544641017913818, + "rewards/rejected": -3.0402212142944336, + "step": 7297 + }, + { + "epoch": 0.85, + "learning_rate": 4.599031534191567e-08, + "logits/chosen": -2.464210033416748, + "logits/rejected": -2.250335216522217, + "logps/chosen": -213.20858764648438, + "logps/rejected": -276.6716613769531, + "loss": 0.3806, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0574941635131836, + "rewards/margins": 1.8820066452026367, + "rewards/rejected": -2.9395008087158203, + "step": 7298 + }, + { + "epoch": 0.85, + "learning_rate": 4.59548836659974e-08, + "logits/chosen": -2.2917168140411377, + "logits/rejected": -2.3977363109588623, + "logps/chosen": -324.33740234375, + "logps/rejected": -220.11997985839844, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7214868664741516, + "rewards/margins": 1.813875675201416, + "rewards/rejected": -2.535362720489502, + "step": 7299 + }, + { + "epoch": 0.85, + "learning_rate": 4.591945199007913e-08, + "logits/chosen": -2.425290822982788, + "logits/rejected": -2.6377906799316406, + "logps/chosen": -297.11822509765625, + "logps/rejected": -274.6890563964844, + "loss": 0.4428, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0619198083877563, + "rewards/margins": 1.762895107269287, + "rewards/rejected": -2.824815034866333, + "step": 7300 + }, + { + "epoch": 0.85, + "learning_rate": 4.5884020314160856e-08, + "logits/chosen": -2.3076775074005127, + "logits/rejected": -2.3018112182617188, + "logps/chosen": -273.1908874511719, + "logps/rejected": -291.8925476074219, + "loss": 0.2205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05372549593448639, + "rewards/margins": 2.1939239501953125, + "rewards/rejected": -2.140198230743408, + "step": 7301 + }, + { + "epoch": 0.85, + "learning_rate": 4.584858863824259e-08, + "logits/chosen": -1.8830373287200928, + "logits/rejected": -2.193453311920166, + "logps/chosen": -314.8589782714844, + "logps/rejected": -273.13226318359375, + "loss": 0.6747, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5208985805511475, + "rewards/margins": 1.2162978649139404, + "rewards/rejected": -2.737196445465088, + "step": 7302 + }, + { + "epoch": 0.85, + "learning_rate": 4.5813156962324314e-08, + "logits/chosen": -2.23165225982666, + "logits/rejected": -2.290553331375122, + "logps/chosen": -147.7615966796875, + "logps/rejected": -226.345458984375, + "loss": 0.5152, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.449201226234436, + "rewards/margins": 1.4160616397857666, + "rewards/rejected": -2.865262985229492, + "step": 7303 + }, + { + "epoch": 0.85, + "learning_rate": 4.577772528640604e-08, + "logits/chosen": -2.585507392883301, + "logits/rejected": -2.7994539737701416, + "logps/chosen": -281.14892578125, + "logps/rejected": -213.46600341796875, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6074389219284058, + "rewards/margins": 2.2927401065826416, + "rewards/rejected": -2.900179147720337, + "step": 7304 + }, + { + "epoch": 0.85, + "learning_rate": 4.574229361048778e-08, + "logits/chosen": -2.5350453853607178, + "logits/rejected": -2.583035945892334, + "logps/chosen": -325.54400634765625, + "logps/rejected": -350.11724853515625, + "loss": 0.6793, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2575048208236694, + "rewards/margins": 2.7942183017730713, + "rewards/rejected": -4.051723003387451, + "step": 7305 + }, + { + "epoch": 0.85, + "learning_rate": 4.57068619345695e-08, + "logits/chosen": -2.1299540996551514, + "logits/rejected": -2.1954190731048584, + "logps/chosen": -240.97962951660156, + "logps/rejected": -310.7934875488281, + "loss": 0.302, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5525453686714172, + "rewards/margins": 2.854924201965332, + "rewards/rejected": -3.4074697494506836, + "step": 7306 + }, + { + "epoch": 0.85, + "learning_rate": 4.5671430258651235e-08, + "logits/chosen": -2.6079864501953125, + "logits/rejected": -2.624528169631958, + "logps/chosen": -180.7661895751953, + "logps/rejected": -216.1150360107422, + "loss": 0.5087, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0996644496917725, + "rewards/margins": 1.8805136680603027, + "rewards/rejected": -2.9801783561706543, + "step": 7307 + }, + { + "epoch": 0.85, + "learning_rate": 4.5635998582732964e-08, + "logits/chosen": -2.121796131134033, + "logits/rejected": -2.2555434703826904, + "logps/chosen": -244.13775634765625, + "logps/rejected": -216.88279724121094, + "loss": 0.5468, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8599817752838135, + "rewards/margins": 1.8738982677459717, + "rewards/rejected": -2.733880043029785, + "step": 7308 + }, + { + "epoch": 0.85, + "learning_rate": 4.5600566906814686e-08, + "logits/chosen": -2.39072585105896, + "logits/rejected": -2.057681083679199, + "logps/chosen": -199.20980834960938, + "logps/rejected": -280.323486328125, + "loss": 0.6579, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8685728311538696, + "rewards/margins": 1.7132220268249512, + "rewards/rejected": -3.5817947387695312, + "step": 7309 + }, + { + "epoch": 0.85, + "learning_rate": 4.556513523089642e-08, + "logits/chosen": -1.8494014739990234, + "logits/rejected": -1.8063623905181885, + "logps/chosen": -99.88765716552734, + "logps/rejected": -185.90750122070312, + "loss": 0.3857, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5681368112564087, + "rewards/margins": 1.4627563953399658, + "rewards/rejected": -3.030893087387085, + "step": 7310 + }, + { + "epoch": 0.85, + "learning_rate": 4.552970355497815e-08, + "logits/chosen": -2.0807783603668213, + "logits/rejected": -1.9514029026031494, + "logps/chosen": -192.13504028320312, + "logps/rejected": -187.4327850341797, + "loss": 0.1708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7305519580841064, + "rewards/margins": 2.657336473464966, + "rewards/rejected": -3.3878884315490723, + "step": 7311 + }, + { + "epoch": 0.85, + "learning_rate": 4.549427187905987e-08, + "logits/chosen": -2.2088825702667236, + "logits/rejected": -2.146001100540161, + "logps/chosen": -362.84393310546875, + "logps/rejected": -361.1010437011719, + "loss": 0.4923, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9589045643806458, + "rewards/margins": 2.5212488174438477, + "rewards/rejected": -3.4801535606384277, + "step": 7312 + }, + { + "epoch": 0.85, + "learning_rate": 4.545884020314161e-08, + "logits/chosen": -2.6832966804504395, + "logits/rejected": -2.7223803997039795, + "logps/chosen": -230.97933959960938, + "logps/rejected": -193.51158142089844, + "loss": 0.4517, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0606417655944824, + "rewards/margins": 2.6232733726501465, + "rewards/rejected": -4.683915615081787, + "step": 7313 + }, + { + "epoch": 0.85, + "learning_rate": 4.5423408527223337e-08, + "logits/chosen": -3.075793981552124, + "logits/rejected": -3.0130958557128906, + "logps/chosen": -248.43783569335938, + "logps/rejected": -259.7244873046875, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4550700783729553, + "rewards/margins": 4.59153938293457, + "rewards/rejected": -5.046609878540039, + "step": 7314 + }, + { + "epoch": 0.85, + "learning_rate": 4.5387976851305065e-08, + "logits/chosen": -2.730975866317749, + "logits/rejected": -2.5815534591674805, + "logps/chosen": -144.22885131835938, + "logps/rejected": -174.00917053222656, + "loss": 0.5294, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5326787233352661, + "rewards/margins": 2.6661484241485596, + "rewards/rejected": -4.198827266693115, + "step": 7315 + }, + { + "epoch": 0.85, + "learning_rate": 4.5352545175386794e-08, + "logits/chosen": -2.2427690029144287, + "logits/rejected": -2.2293930053710938, + "logps/chosen": -261.3278503417969, + "logps/rejected": -307.0343322753906, + "loss": 0.3902, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0258545875549316, + "rewards/margins": 1.7701010704040527, + "rewards/rejected": -2.7959556579589844, + "step": 7316 + }, + { + "epoch": 0.85, + "learning_rate": 4.531711349946852e-08, + "logits/chosen": -2.2118663787841797, + "logits/rejected": -2.3856520652770996, + "logps/chosen": -280.12738037109375, + "logps/rejected": -235.937255859375, + "loss": 0.2233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5190629959106445, + "rewards/margins": 2.63476300239563, + "rewards/rejected": -3.1538262367248535, + "step": 7317 + }, + { + "epoch": 0.85, + "learning_rate": 4.528168182355025e-08, + "logits/chosen": -1.7535436153411865, + "logits/rejected": -1.4196557998657227, + "logps/chosen": -322.5681457519531, + "logps/rejected": -377.7336730957031, + "loss": 0.3204, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3288496732711792, + "rewards/margins": 2.1085805892944336, + "rewards/rejected": -2.437429904937744, + "step": 7318 + }, + { + "epoch": 0.85, + "learning_rate": 4.524625014763199e-08, + "logits/chosen": -2.6656863689422607, + "logits/rejected": -2.6345558166503906, + "logps/chosen": -250.86964416503906, + "logps/rejected": -147.78956604003906, + "loss": 0.424, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1686431169509888, + "rewards/margins": 1.6843843460083008, + "rewards/rejected": -2.85302734375, + "step": 7319 + }, + { + "epoch": 0.85, + "learning_rate": 4.521081847171371e-08, + "logits/chosen": -2.2911293506622314, + "logits/rejected": -2.187514305114746, + "logps/chosen": -408.85400390625, + "logps/rejected": -565.2117309570312, + "loss": 0.2077, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0001400709152222, + "rewards/margins": 2.9090380668640137, + "rewards/rejected": -3.9091782569885254, + "step": 7320 + }, + { + "epoch": 0.85, + "learning_rate": 4.517538679579544e-08, + "logits/chosen": -2.46730375289917, + "logits/rejected": -2.466951608657837, + "logps/chosen": -344.14239501953125, + "logps/rejected": -327.0296936035156, + "loss": 0.2758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5047395825386047, + "rewards/margins": 2.55302095413208, + "rewards/rejected": -3.05776047706604, + "step": 7321 + }, + { + "epoch": 0.85, + "learning_rate": 4.513995511987717e-08, + "logits/chosen": -2.267716407775879, + "logits/rejected": -2.31528377532959, + "logps/chosen": -311.2690734863281, + "logps/rejected": -450.06195068359375, + "loss": 0.1099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9901121854782104, + "rewards/margins": 3.8002209663391113, + "rewards/rejected": -4.790333271026611, + "step": 7322 + }, + { + "epoch": 0.85, + "learning_rate": 4.5104523443958895e-08, + "logits/chosen": -2.4814443588256836, + "logits/rejected": -2.648103713989258, + "logps/chosen": -124.02961730957031, + "logps/rejected": -206.2607879638672, + "loss": 0.303, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5065128803253174, + "rewards/margins": 2.5960605144500732, + "rewards/rejected": -4.102573394775391, + "step": 7323 + }, + { + "epoch": 0.85, + "learning_rate": 4.5069091768040624e-08, + "logits/chosen": -2.1511263847351074, + "logits/rejected": -2.410322904586792, + "logps/chosen": -431.5431823730469, + "logps/rejected": -219.86117553710938, + "loss": 1.4617, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.4425389766693115, + "rewards/margins": -0.10781902074813843, + "rewards/rejected": -1.3347200155258179, + "step": 7324 + }, + { + "epoch": 0.85, + "learning_rate": 4.503366009212236e-08, + "logits/chosen": -1.5254638195037842, + "logits/rejected": -1.9191210269927979, + "logps/chosen": -321.00286865234375, + "logps/rejected": -243.042724609375, + "loss": 0.5345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7982950806617737, + "rewards/margins": 1.0435179471969604, + "rewards/rejected": -1.841813087463379, + "step": 7325 + }, + { + "epoch": 0.85, + "learning_rate": 4.499822841620408e-08, + "logits/chosen": -2.350990056991577, + "logits/rejected": -2.269261598587036, + "logps/chosen": -389.68414306640625, + "logps/rejected": -455.2854919433594, + "loss": 0.0919, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2596868574619293, + "rewards/margins": 3.7737104892730713, + "rewards/rejected": -4.033397197723389, + "step": 7326 + }, + { + "epoch": 0.85, + "learning_rate": 4.496279674028581e-08, + "logits/chosen": -2.333441734313965, + "logits/rejected": -2.5623414516448975, + "logps/chosen": -411.7857666015625, + "logps/rejected": -279.7125244140625, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4713912010192871, + "rewards/margins": 2.7497403621673584, + "rewards/rejected": -3.2211318016052246, + "step": 7327 + }, + { + "epoch": 0.85, + "learning_rate": 4.4927365064367546e-08, + "logits/chosen": -2.7420661449432373, + "logits/rejected": -2.5432658195495605, + "logps/chosen": -220.08737182617188, + "logps/rejected": -378.4952392578125, + "loss": 0.7052, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2540576457977295, + "rewards/margins": 0.8891851305961609, + "rewards/rejected": -2.143242835998535, + "step": 7328 + }, + { + "epoch": 0.85, + "learning_rate": 4.4891933388449274e-08, + "logits/chosen": -2.457857608795166, + "logits/rejected": -2.5072457790374756, + "logps/chosen": -228.43380737304688, + "logps/rejected": -201.6914825439453, + "loss": 0.2936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0318962335586548, + "rewards/margins": 1.7700797319412231, + "rewards/rejected": -2.801976203918457, + "step": 7329 + }, + { + "epoch": 0.85, + "learning_rate": 4.4856501712530996e-08, + "logits/chosen": -2.224651336669922, + "logits/rejected": -2.359795331954956, + "logps/chosen": -306.9964599609375, + "logps/rejected": -282.6907958984375, + "loss": 1.2043, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0140880346298218, + "rewards/margins": 0.547762930393219, + "rewards/rejected": -1.5618510246276855, + "step": 7330 + }, + { + "epoch": 0.85, + "learning_rate": 4.482107003661273e-08, + "logits/chosen": -2.0483202934265137, + "logits/rejected": -2.1713271141052246, + "logps/chosen": -315.455322265625, + "logps/rejected": -288.97686767578125, + "loss": 0.1838, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1263471841812134, + "rewards/margins": 2.8268942832946777, + "rewards/rejected": -3.9532415866851807, + "step": 7331 + }, + { + "epoch": 0.85, + "learning_rate": 4.478563836069446e-08, + "logits/chosen": -2.7036354541778564, + "logits/rejected": -2.9165408611297607, + "logps/chosen": -398.4561462402344, + "logps/rejected": -275.6676025390625, + "loss": 0.487, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.958478569984436, + "rewards/margins": 1.7154631614685059, + "rewards/rejected": -2.6739416122436523, + "step": 7332 + }, + { + "epoch": 0.85, + "learning_rate": 4.475020668477618e-08, + "logits/chosen": -2.4982547760009766, + "logits/rejected": -2.56457781791687, + "logps/chosen": -294.2204895019531, + "logps/rejected": -303.8901062011719, + "loss": 0.345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7577487230300903, + "rewards/margins": 2.3451154232025146, + "rewards/rejected": -3.1028642654418945, + "step": 7333 + }, + { + "epoch": 0.85, + "learning_rate": 4.471477500885792e-08, + "logits/chosen": -2.7136337757110596, + "logits/rejected": -2.5595827102661133, + "logps/chosen": -301.575439453125, + "logps/rejected": -250.81031799316406, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2759387493133545, + "rewards/margins": 1.0416202545166016, + "rewards/rejected": -2.317559003829956, + "step": 7334 + }, + { + "epoch": 0.85, + "learning_rate": 4.467934333293965e-08, + "logits/chosen": -2.797588586807251, + "logits/rejected": -2.792172431945801, + "logps/chosen": -247.49037170410156, + "logps/rejected": -268.3847351074219, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.210490107536316, + "rewards/margins": 3.2887914180755615, + "rewards/rejected": -4.499281406402588, + "step": 7335 + }, + { + "epoch": 0.85, + "learning_rate": 4.464391165702137e-08, + "logits/chosen": -2.456329822540283, + "logits/rejected": -2.4930102825164795, + "logps/chosen": -219.3330535888672, + "logps/rejected": -214.12879943847656, + "loss": 0.4986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49085357785224915, + "rewards/margins": 1.2563579082489014, + "rewards/rejected": -1.7472115755081177, + "step": 7336 + }, + { + "epoch": 0.85, + "learning_rate": 4.4608479981103104e-08, + "logits/chosen": -2.347580671310425, + "logits/rejected": -2.3174405097961426, + "logps/chosen": -207.07760620117188, + "logps/rejected": -273.7772521972656, + "loss": 0.3723, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3522975444793701, + "rewards/margins": 1.8547861576080322, + "rewards/rejected": -3.2070837020874023, + "step": 7337 + }, + { + "epoch": 0.85, + "learning_rate": 4.457304830518483e-08, + "logits/chosen": -2.2284743785858154, + "logits/rejected": -2.3651833534240723, + "logps/chosen": -382.514892578125, + "logps/rejected": -313.3924865722656, + "loss": 0.4812, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2831036150455475, + "rewards/margins": 3.44006609916687, + "rewards/rejected": -3.7231693267822266, + "step": 7338 + }, + { + "epoch": 0.85, + "learning_rate": 4.453761662926656e-08, + "logits/chosen": -2.4326493740081787, + "logits/rejected": -2.604255199432373, + "logps/chosen": -325.2414245605469, + "logps/rejected": -308.76422119140625, + "loss": 0.198, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08794661611318588, + "rewards/margins": 2.2480509281158447, + "rewards/rejected": -2.160104274749756, + "step": 7339 + }, + { + "epoch": 0.85, + "learning_rate": 4.450218495334829e-08, + "logits/chosen": -3.033639669418335, + "logits/rejected": -3.0171189308166504, + "logps/chosen": -381.73809814453125, + "logps/rejected": -268.6075744628906, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39719754457473755, + "rewards/margins": 3.028110980987549, + "rewards/rejected": -3.4253087043762207, + "step": 7340 + }, + { + "epoch": 0.85, + "learning_rate": 4.446675327743002e-08, + "logits/chosen": -1.9905246496200562, + "logits/rejected": -2.3176980018615723, + "logps/chosen": -300.2572021484375, + "logps/rejected": -182.72653198242188, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40734192728996277, + "rewards/margins": 2.7294883728027344, + "rewards/rejected": -3.1368300914764404, + "step": 7341 + }, + { + "epoch": 0.85, + "learning_rate": 4.443132160151175e-08, + "logits/chosen": -2.6938867568969727, + "logits/rejected": -2.5876338481903076, + "logps/chosen": -126.16220092773438, + "logps/rejected": -421.3466491699219, + "loss": 0.245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16083520650863647, + "rewards/margins": 3.228377342224121, + "rewards/rejected": -3.3892123699188232, + "step": 7342 + }, + { + "epoch": 0.85, + "learning_rate": 4.4395889925593477e-08, + "logits/chosen": -2.5300486087799072, + "logits/rejected": -2.609264850616455, + "logps/chosen": -176.7534637451172, + "logps/rejected": -211.013916015625, + "loss": 0.4029, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7653611898422241, + "rewards/margins": 2.3819620609283447, + "rewards/rejected": -3.1473228931427, + "step": 7343 + }, + { + "epoch": 0.85, + "learning_rate": 4.4360458249675205e-08, + "logits/chosen": -2.1275603771209717, + "logits/rejected": -2.098900556564331, + "logps/chosen": -501.35125732421875, + "logps/rejected": -475.7704162597656, + "loss": 0.2273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17431814968585968, + "rewards/margins": 2.5970346927642822, + "rewards/rejected": -2.771352767944336, + "step": 7344 + }, + { + "epoch": 0.85, + "learning_rate": 4.432502657375694e-08, + "logits/chosen": -1.5648515224456787, + "logits/rejected": -1.6907507181167603, + "logps/chosen": -258.5078125, + "logps/rejected": -240.20907592773438, + "loss": 0.3443, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04035671427845955, + "rewards/margins": 1.763710379600525, + "rewards/rejected": -1.8040671348571777, + "step": 7345 + }, + { + "epoch": 0.85, + "learning_rate": 4.428959489783867e-08, + "logits/chosen": -2.6907835006713867, + "logits/rejected": -2.3726606369018555, + "logps/chosen": -246.93577575683594, + "logps/rejected": -347.5126647949219, + "loss": 0.4622, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7086324691772461, + "rewards/margins": 1.4070476293563843, + "rewards/rejected": -2.115679979324341, + "step": 7346 + }, + { + "epoch": 0.85, + "learning_rate": 4.425416322192039e-08, + "logits/chosen": -2.6631593704223633, + "logits/rejected": -2.3611233234405518, + "logps/chosen": -269.1463623046875, + "logps/rejected": -288.017333984375, + "loss": 0.5056, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4231998920440674, + "rewards/margins": 1.5668487548828125, + "rewards/rejected": -2.990048408508301, + "step": 7347 + }, + { + "epoch": 0.85, + "learning_rate": 4.421873154600213e-08, + "logits/chosen": -2.6333627700805664, + "logits/rejected": -2.8049678802490234, + "logps/chosen": -323.4835205078125, + "logps/rejected": -298.0621337890625, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5537779331207275, + "rewards/margins": 3.883803367614746, + "rewards/rejected": -5.4375810623168945, + "step": 7348 + }, + { + "epoch": 0.85, + "learning_rate": 4.4183299870083856e-08, + "logits/chosen": -2.355320930480957, + "logits/rejected": -1.959317684173584, + "logps/chosen": -192.6806640625, + "logps/rejected": -252.55422973632812, + "loss": 0.6499, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5370771884918213, + "rewards/margins": 1.104249119758606, + "rewards/rejected": -2.641326427459717, + "step": 7349 + }, + { + "epoch": 0.86, + "learning_rate": 4.414786819416558e-08, + "logits/chosen": -2.4984817504882812, + "logits/rejected": -2.256387948989868, + "logps/chosen": -152.04742431640625, + "logps/rejected": -222.860595703125, + "loss": 0.3841, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.243741750717163, + "rewards/margins": 1.2439498901367188, + "rewards/rejected": -2.487691640853882, + "step": 7350 + }, + { + "epoch": 0.86, + "learning_rate": 4.411243651824731e-08, + "logits/chosen": -2.295140504837036, + "logits/rejected": -2.4356613159179688, + "logps/chosen": -264.34686279296875, + "logps/rejected": -224.52957153320312, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0905935764312744, + "rewards/margins": 2.6236824989318848, + "rewards/rejected": -3.7142763137817383, + "step": 7351 + }, + { + "epoch": 0.86, + "learning_rate": 4.407700484232904e-08, + "logits/chosen": -2.3862545490264893, + "logits/rejected": -2.2282204627990723, + "logps/chosen": -249.524169921875, + "logps/rejected": -374.1593322753906, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9359601736068726, + "rewards/margins": 2.4585745334625244, + "rewards/rejected": -3.3945350646972656, + "step": 7352 + }, + { + "epoch": 0.86, + "learning_rate": 4.4041573166410764e-08, + "logits/chosen": -2.181784152984619, + "logits/rejected": -2.1110732555389404, + "logps/chosen": -349.128173828125, + "logps/rejected": -245.64462280273438, + "loss": 0.263, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7415443658828735, + "rewards/margins": 1.6274608373641968, + "rewards/rejected": -3.3690052032470703, + "step": 7353 + }, + { + "epoch": 0.86, + "learning_rate": 4.40061414904925e-08, + "logits/chosen": -2.189936399459839, + "logits/rejected": -2.192434310913086, + "logps/chosen": -251.35302734375, + "logps/rejected": -261.8049011230469, + "loss": 0.3262, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1037545204162598, + "rewards/margins": 1.503386378288269, + "rewards/rejected": -2.6071410179138184, + "step": 7354 + }, + { + "epoch": 0.86, + "learning_rate": 4.397070981457423e-08, + "logits/chosen": -2.8448739051818848, + "logits/rejected": -2.682577610015869, + "logps/chosen": -254.49685668945312, + "logps/rejected": -261.059326171875, + "loss": 0.0923, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7456955313682556, + "rewards/margins": 3.2433173656463623, + "rewards/rejected": -3.9890127182006836, + "step": 7355 + }, + { + "epoch": 0.86, + "learning_rate": 4.393527813865596e-08, + "logits/chosen": -2.3786914348602295, + "logits/rejected": -2.1240782737731934, + "logps/chosen": -261.5246887207031, + "logps/rejected": -341.02191162109375, + "loss": 0.3705, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5833624005317688, + "rewards/margins": 1.6898002624511719, + "rewards/rejected": -2.273162603378296, + "step": 7356 + }, + { + "epoch": 0.86, + "learning_rate": 4.3899846462737686e-08, + "logits/chosen": -2.348440408706665, + "logits/rejected": -2.585839033126831, + "logps/chosen": -321.4947204589844, + "logps/rejected": -248.88595581054688, + "loss": 0.2493, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5428101420402527, + "rewards/margins": 3.4239184856414795, + "rewards/rejected": -3.966728687286377, + "step": 7357 + }, + { + "epoch": 0.86, + "learning_rate": 4.3864414786819414e-08, + "logits/chosen": -2.483617067337036, + "logits/rejected": -2.618009090423584, + "logps/chosen": -346.47344970703125, + "logps/rejected": -286.251220703125, + "loss": 0.3223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8473241329193115, + "rewards/margins": 2.324265718460083, + "rewards/rejected": -3.1715898513793945, + "step": 7358 + }, + { + "epoch": 0.86, + "learning_rate": 4.382898311090114e-08, + "logits/chosen": -1.977848768234253, + "logits/rejected": -2.2220499515533447, + "logps/chosen": -397.8751220703125, + "logps/rejected": -344.96240234375, + "loss": 0.2382, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46844589710235596, + "rewards/margins": 2.8950376510620117, + "rewards/rejected": -2.4265918731689453, + "step": 7359 + }, + { + "epoch": 0.86, + "learning_rate": 4.379355143498288e-08, + "logits/chosen": -2.09094500541687, + "logits/rejected": -2.161003351211548, + "logps/chosen": -109.50414276123047, + "logps/rejected": -196.18084716796875, + "loss": 0.325, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6929395198822021, + "rewards/margins": 3.1939101219177246, + "rewards/rejected": -3.886849880218506, + "step": 7360 + }, + { + "epoch": 0.86, + "learning_rate": 4.37581197590646e-08, + "logits/chosen": -2.589751720428467, + "logits/rejected": -2.550672769546509, + "logps/chosen": -425.80865478515625, + "logps/rejected": -446.1686706542969, + "loss": 0.2548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9233182072639465, + "rewards/margins": 2.5910022258758545, + "rewards/rejected": -3.5143206119537354, + "step": 7361 + }, + { + "epoch": 0.86, + "learning_rate": 4.372268808314633e-08, + "logits/chosen": -2.833526134490967, + "logits/rejected": -2.9369349479675293, + "logps/chosen": -283.0848083496094, + "logps/rejected": -203.12779235839844, + "loss": 0.2802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6595021486282349, + "rewards/margins": 1.9044163227081299, + "rewards/rejected": -2.5639185905456543, + "step": 7362 + }, + { + "epoch": 0.86, + "learning_rate": 4.3687256407228065e-08, + "logits/chosen": -2.37874436378479, + "logits/rejected": -2.519209384918213, + "logps/chosen": -292.4700927734375, + "logps/rejected": -348.56402587890625, + "loss": 0.3207, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30318683385849, + "rewards/margins": 2.921664237976074, + "rewards/rejected": -3.22485089302063, + "step": 7363 + }, + { + "epoch": 0.86, + "learning_rate": 4.365182473130979e-08, + "logits/chosen": -2.8587770462036133, + "logits/rejected": -2.9571266174316406, + "logps/chosen": -315.1448974609375, + "logps/rejected": -237.55828857421875, + "loss": 0.1719, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13579526543617249, + "rewards/margins": 3.241835832595825, + "rewards/rejected": -3.1060407161712646, + "step": 7364 + }, + { + "epoch": 0.86, + "learning_rate": 4.3616393055391516e-08, + "logits/chosen": -1.664933204650879, + "logits/rejected": -1.9434778690338135, + "logps/chosen": -414.9933166503906, + "logps/rejected": -304.25433349609375, + "loss": 0.4602, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0309959650039673, + "rewards/margins": 1.5149590969085693, + "rewards/rejected": -2.545955181121826, + "step": 7365 + }, + { + "epoch": 0.86, + "learning_rate": 4.358096137947325e-08, + "logits/chosen": -1.8133764266967773, + "logits/rejected": -1.7225836515426636, + "logps/chosen": -512.2757568359375, + "logps/rejected": -453.98162841796875, + "loss": 0.2106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12128639221191406, + "rewards/margins": 2.3654098510742188, + "rewards/rejected": -2.486696243286133, + "step": 7366 + }, + { + "epoch": 0.86, + "learning_rate": 4.354552970355497e-08, + "logits/chosen": -2.7024364471435547, + "logits/rejected": -2.636679172515869, + "logps/chosen": -381.48486328125, + "logps/rejected": -271.9089050292969, + "loss": 0.3597, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3332599401474, + "rewards/margins": 2.525358200073242, + "rewards/rejected": -3.8586180210113525, + "step": 7367 + }, + { + "epoch": 0.86, + "learning_rate": 4.35100980276367e-08, + "logits/chosen": -1.972661018371582, + "logits/rejected": -2.104764699935913, + "logps/chosen": -202.8380126953125, + "logps/rejected": -198.59889221191406, + "loss": 0.5995, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1534758806228638, + "rewards/margins": 0.9941418170928955, + "rewards/rejected": -2.147617816925049, + "step": 7368 + }, + { + "epoch": 0.86, + "learning_rate": 4.347466635171844e-08, + "logits/chosen": -2.427387237548828, + "logits/rejected": -2.156757116317749, + "logps/chosen": -130.0189208984375, + "logps/rejected": -277.1494140625, + "loss": 0.2506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8823142647743225, + "rewards/margins": 1.7950465679168701, + "rewards/rejected": -2.6773605346679688, + "step": 7369 + }, + { + "epoch": 0.86, + "learning_rate": 4.3439234675800166e-08, + "logits/chosen": -2.401526927947998, + "logits/rejected": -2.3885245323181152, + "logps/chosen": -157.12550354003906, + "logps/rejected": -234.8175811767578, + "loss": 0.2258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8497658371925354, + "rewards/margins": 2.542341709136963, + "rewards/rejected": -3.3921074867248535, + "step": 7370 + }, + { + "epoch": 0.86, + "learning_rate": 4.340380299988189e-08, + "logits/chosen": -1.8605473041534424, + "logits/rejected": -1.9628580808639526, + "logps/chosen": -528.0529174804688, + "logps/rejected": -404.09149169921875, + "loss": 0.6271, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7081144452095032, + "rewards/margins": 0.9320942163467407, + "rewards/rejected": -1.6402087211608887, + "step": 7371 + }, + { + "epoch": 0.86, + "learning_rate": 4.3368371323963623e-08, + "logits/chosen": -2.4336495399475098, + "logits/rejected": -2.492370843887329, + "logps/chosen": -329.91912841796875, + "logps/rejected": -265.0862731933594, + "loss": 0.2661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4976869225502014, + "rewards/margins": 2.27584171295166, + "rewards/rejected": -2.7735283374786377, + "step": 7372 + }, + { + "epoch": 0.86, + "learning_rate": 4.333293964804535e-08, + "logits/chosen": -2.6293628215789795, + "logits/rejected": -2.544898509979248, + "logps/chosen": -162.8737030029297, + "logps/rejected": -245.86972045898438, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9415404796600342, + "rewards/margins": 3.002837657928467, + "rewards/rejected": -3.944378137588501, + "step": 7373 + }, + { + "epoch": 0.86, + "learning_rate": 4.3297507972127074e-08, + "logits/chosen": -2.3336076736450195, + "logits/rejected": -2.4770853519439697, + "logps/chosen": -174.3780975341797, + "logps/rejected": -186.2784423828125, + "loss": 0.3702, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3997822105884552, + "rewards/margins": 1.1106431484222412, + "rewards/rejected": -1.5104254484176636, + "step": 7374 + }, + { + "epoch": 0.86, + "learning_rate": 4.326207629620881e-08, + "logits/chosen": -2.3757927417755127, + "logits/rejected": -2.2179641723632812, + "logps/chosen": -265.916748046875, + "logps/rejected": -217.0623016357422, + "loss": 0.3156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.582455039024353, + "rewards/margins": 1.695911169052124, + "rewards/rejected": -2.2783660888671875, + "step": 7375 + }, + { + "epoch": 0.86, + "learning_rate": 4.322664462029054e-08, + "logits/chosen": -2.112644672393799, + "logits/rejected": -2.2404441833496094, + "logps/chosen": -345.8306884765625, + "logps/rejected": -262.53302001953125, + "loss": 0.6556, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7384858131408691, + "rewards/margins": 2.3227713108062744, + "rewards/rejected": -3.0612571239471436, + "step": 7376 + }, + { + "epoch": 0.86, + "learning_rate": 4.319121294437226e-08, + "logits/chosen": -2.4567487239837646, + "logits/rejected": -2.778498411178589, + "logps/chosen": -377.16558837890625, + "logps/rejected": -191.76596069335938, + "loss": 0.8374, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.351951003074646, + "rewards/margins": 0.6583741903305054, + "rewards/rejected": -2.0103251934051514, + "step": 7377 + }, + { + "epoch": 0.86, + "learning_rate": 4.3155781268453996e-08, + "logits/chosen": -2.286167621612549, + "logits/rejected": -2.3983213901519775, + "logps/chosen": -326.8046875, + "logps/rejected": -351.3918151855469, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3703670501708984, + "rewards/margins": 1.6795284748077393, + "rewards/rejected": -3.0498955249786377, + "step": 7378 + }, + { + "epoch": 0.86, + "learning_rate": 4.3120349592535725e-08, + "logits/chosen": -2.304551124572754, + "logits/rejected": -2.578907012939453, + "logps/chosen": -310.7557067871094, + "logps/rejected": -248.43692016601562, + "loss": 0.3673, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4836699962615967, + "rewards/margins": 1.8275984525680542, + "rewards/rejected": -3.3112685680389404, + "step": 7379 + }, + { + "epoch": 0.86, + "learning_rate": 4.308491791661746e-08, + "logits/chosen": -2.6922130584716797, + "logits/rejected": -2.6473400592803955, + "logps/chosen": -257.68267822265625, + "logps/rejected": -368.7127990722656, + "loss": 0.4121, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6306822896003723, + "rewards/margins": 4.126639366149902, + "rewards/rejected": -4.757321834564209, + "step": 7380 + }, + { + "epoch": 0.86, + "learning_rate": 4.304948624069918e-08, + "logits/chosen": -2.354098320007324, + "logits/rejected": -2.554706573486328, + "logps/chosen": -192.09414672851562, + "logps/rejected": -282.3305358886719, + "loss": 0.4369, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2429633140563965, + "rewards/margins": 3.739074945449829, + "rewards/rejected": -4.982038497924805, + "step": 7381 + }, + { + "epoch": 0.86, + "learning_rate": 4.301405456478091e-08, + "logits/chosen": -1.6650643348693848, + "logits/rejected": -1.769336462020874, + "logps/chosen": -279.493408203125, + "logps/rejected": -268.69842529296875, + "loss": 0.3475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5399657487869263, + "rewards/margins": 2.4125709533691406, + "rewards/rejected": -2.9525370597839355, + "step": 7382 + }, + { + "epoch": 0.86, + "learning_rate": 4.2978622888862646e-08, + "logits/chosen": -2.931657075881958, + "logits/rejected": -2.8996098041534424, + "logps/chosen": -239.61407470703125, + "logps/rejected": -352.2484130859375, + "loss": 0.7259, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4589930772781372, + "rewards/margins": 1.556034803390503, + "rewards/rejected": -3.0150279998779297, + "step": 7383 + }, + { + "epoch": 0.86, + "learning_rate": 4.294319121294437e-08, + "logits/chosen": -2.546023368835449, + "logits/rejected": -2.750946521759033, + "logps/chosen": -433.675537109375, + "logps/rejected": -293.0964050292969, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7835550308227539, + "rewards/margins": 1.8370479345321655, + "rewards/rejected": -2.620603084564209, + "step": 7384 + }, + { + "epoch": 0.86, + "learning_rate": 4.29077595370261e-08, + "logits/chosen": -2.4985239505767822, + "logits/rejected": -2.5038416385650635, + "logps/chosen": -300.75, + "logps/rejected": -313.0320129394531, + "loss": 0.3923, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.234737753868103, + "rewards/margins": 1.8056812286376953, + "rewards/rejected": -3.0404186248779297, + "step": 7385 + }, + { + "epoch": 0.86, + "learning_rate": 4.287232786110783e-08, + "logits/chosen": -2.685487747192383, + "logits/rejected": -2.625850200653076, + "logps/chosen": -191.33912658691406, + "logps/rejected": -215.94918823242188, + "loss": 0.1168, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.698525071144104, + "rewards/margins": 2.3235015869140625, + "rewards/rejected": -3.022026538848877, + "step": 7386 + }, + { + "epoch": 0.86, + "learning_rate": 4.283689618518956e-08, + "logits/chosen": -2.461545467376709, + "logits/rejected": -2.7273592948913574, + "logps/chosen": -293.3512268066406, + "logps/rejected": -207.5708465576172, + "loss": 0.5294, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9508684873580933, + "rewards/margins": 1.2214903831481934, + "rewards/rejected": -2.172358989715576, + "step": 7387 + }, + { + "epoch": 0.86, + "learning_rate": 4.280146450927128e-08, + "logits/chosen": -2.522770404815674, + "logits/rejected": -2.8090028762817383, + "logps/chosen": -428.70001220703125, + "logps/rejected": -192.25653076171875, + "loss": 0.477, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8552604913711548, + "rewards/margins": 1.2574939727783203, + "rewards/rejected": -2.1127543449401855, + "step": 7388 + }, + { + "epoch": 0.86, + "learning_rate": 4.276603283335302e-08, + "logits/chosen": -2.806605815887451, + "logits/rejected": -2.7857701778411865, + "logps/chosen": -157.57241821289062, + "logps/rejected": -265.8868408203125, + "loss": 0.2303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9828293323516846, + "rewards/margins": 1.7199809551239014, + "rewards/rejected": -2.702810287475586, + "step": 7389 + }, + { + "epoch": 0.86, + "learning_rate": 4.273060115743475e-08, + "logits/chosen": -2.4018545150756836, + "logits/rejected": -2.34497332572937, + "logps/chosen": -256.3289489746094, + "logps/rejected": -242.50669860839844, + "loss": 0.4223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6920467019081116, + "rewards/margins": 1.7317196130752563, + "rewards/rejected": -2.4237663745880127, + "step": 7390 + }, + { + "epoch": 0.86, + "learning_rate": 4.269516948151647e-08, + "logits/chosen": -2.208813190460205, + "logits/rejected": -2.35867977142334, + "logps/chosen": -454.2183532714844, + "logps/rejected": -380.28094482421875, + "loss": 0.4718, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8198089599609375, + "rewards/margins": 1.8115113973617554, + "rewards/rejected": -3.6313204765319824, + "step": 7391 + }, + { + "epoch": 0.86, + "learning_rate": 4.2659737805598205e-08, + "logits/chosen": -2.3672101497650146, + "logits/rejected": -2.246225595474243, + "logps/chosen": -310.89276123046875, + "logps/rejected": -323.53106689453125, + "loss": 0.2991, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6137959361076355, + "rewards/margins": 1.4820036888122559, + "rewards/rejected": -2.095799684524536, + "step": 7392 + }, + { + "epoch": 0.86, + "learning_rate": 4.2624306129679934e-08, + "logits/chosen": -2.124868869781494, + "logits/rejected": -2.3782637119293213, + "logps/chosen": -381.2189025878906, + "logps/rejected": -233.09320068359375, + "loss": 0.3013, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4157032072544098, + "rewards/margins": 1.3839783668518066, + "rewards/rejected": -1.799681544303894, + "step": 7393 + }, + { + "epoch": 0.86, + "learning_rate": 4.2588874453761656e-08, + "logits/chosen": -1.9648553133010864, + "logits/rejected": -1.8516488075256348, + "logps/chosen": -217.54640197753906, + "logps/rejected": -269.01708984375, + "loss": 0.4474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.46976763010025024, + "rewards/margins": 1.8575575351715088, + "rewards/rejected": -2.3273251056671143, + "step": 7394 + }, + { + "epoch": 0.86, + "learning_rate": 4.255344277784339e-08, + "logits/chosen": -2.5727288722991943, + "logits/rejected": -2.2998297214508057, + "logps/chosen": -634.0182495117188, + "logps/rejected": -318.913330078125, + "loss": 0.7269, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8111302852630615, + "rewards/margins": 1.3398747444152832, + "rewards/rejected": -3.1510050296783447, + "step": 7395 + }, + { + "epoch": 0.86, + "learning_rate": 4.251801110192512e-08, + "logits/chosen": -2.254666328430176, + "logits/rejected": -2.427865743637085, + "logps/chosen": -382.54217529296875, + "logps/rejected": -413.5176696777344, + "loss": 0.4112, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1472088247537613, + "rewards/margins": 2.9594085216522217, + "rewards/rejected": -3.1066174507141113, + "step": 7396 + }, + { + "epoch": 0.86, + "learning_rate": 4.248257942600685e-08, + "logits/chosen": -2.7846522331237793, + "logits/rejected": -2.6029982566833496, + "logps/chosen": -128.48672485351562, + "logps/rejected": -262.2070007324219, + "loss": 0.0662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.903617799282074, + "rewards/margins": 4.666691303253174, + "rewards/rejected": -5.570309162139893, + "step": 7397 + }, + { + "epoch": 0.86, + "learning_rate": 4.244714775008858e-08, + "logits/chosen": -2.501453161239624, + "logits/rejected": -2.4899227619171143, + "logps/chosen": -329.1939697265625, + "logps/rejected": -219.50958251953125, + "loss": 0.2049, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.044801950454712, + "rewards/margins": 3.2693843841552734, + "rewards/rejected": -4.314186096191406, + "step": 7398 + }, + { + "epoch": 0.86, + "learning_rate": 4.2411716074170306e-08, + "logits/chosen": -2.5210464000701904, + "logits/rejected": -2.7877557277679443, + "logps/chosen": -246.3433837890625, + "logps/rejected": -250.97911071777344, + "loss": 0.1806, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49204790592193604, + "rewards/margins": 3.2195796966552734, + "rewards/rejected": -3.711627721786499, + "step": 7399 + }, + { + "epoch": 0.86, + "learning_rate": 4.2376284398252035e-08, + "logits/chosen": -2.650407552719116, + "logits/rejected": -2.4828267097473145, + "logps/chosen": -294.6566162109375, + "logps/rejected": -377.3023376464844, + "loss": 0.1316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5528160929679871, + "rewards/margins": 3.740328550338745, + "rewards/rejected": -4.293144702911377, + "step": 7400 + }, + { + "epoch": 0.86, + "learning_rate": 4.234085272233377e-08, + "logits/chosen": -2.2268126010894775, + "logits/rejected": -2.099517345428467, + "logps/chosen": -432.6499328613281, + "logps/rejected": -392.90692138671875, + "loss": 0.2718, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2720386981964111, + "rewards/margins": 2.1240715980529785, + "rewards/rejected": -3.3961100578308105, + "step": 7401 + }, + { + "epoch": 0.86, + "learning_rate": 4.230542104641549e-08, + "logits/chosen": -2.3600640296936035, + "logits/rejected": -2.3782622814178467, + "logps/chosen": -236.00759887695312, + "logps/rejected": -184.910400390625, + "loss": 0.3231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5416573882102966, + "rewards/margins": 1.570248007774353, + "rewards/rejected": -2.111905336380005, + "step": 7402 + }, + { + "epoch": 0.86, + "learning_rate": 4.226998937049722e-08, + "logits/chosen": -2.7298436164855957, + "logits/rejected": -2.5093538761138916, + "logps/chosen": -283.2736511230469, + "logps/rejected": -431.697509765625, + "loss": 0.1879, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4580395817756653, + "rewards/margins": 3.1673338413238525, + "rewards/rejected": -3.625373601913452, + "step": 7403 + }, + { + "epoch": 0.86, + "learning_rate": 4.2234557694578956e-08, + "logits/chosen": -2.519705295562744, + "logits/rejected": -2.308548927307129, + "logps/chosen": -239.23609924316406, + "logps/rejected": -305.8492736816406, + "loss": 0.3443, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6551697850227356, + "rewards/margins": 1.9842864274978638, + "rewards/rejected": -2.639456272125244, + "step": 7404 + }, + { + "epoch": 0.86, + "learning_rate": 4.219912601866068e-08, + "logits/chosen": -2.256939649581909, + "logits/rejected": -2.2368788719177246, + "logps/chosen": -480.8223876953125, + "logps/rejected": -411.6327819824219, + "loss": 0.1857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7915116548538208, + "rewards/margins": 2.8935739994049072, + "rewards/rejected": -3.6850857734680176, + "step": 7405 + }, + { + "epoch": 0.86, + "learning_rate": 4.216369434274241e-08, + "logits/chosen": -2.414038896560669, + "logits/rejected": -2.380126714706421, + "logps/chosen": -192.4914093017578, + "logps/rejected": -223.1443328857422, + "loss": 0.313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6677817106246948, + "rewards/margins": 2.699429988861084, + "rewards/rejected": -3.3672118186950684, + "step": 7406 + }, + { + "epoch": 0.86, + "learning_rate": 4.212826266682414e-08, + "logits/chosen": -2.2251546382904053, + "logits/rejected": -1.926259994506836, + "logps/chosen": -162.0848388671875, + "logps/rejected": -258.6600036621094, + "loss": 0.4616, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5949878096580505, + "rewards/margins": 1.5750833749771118, + "rewards/rejected": -2.1700711250305176, + "step": 7407 + }, + { + "epoch": 0.86, + "learning_rate": 4.2092830990905865e-08, + "logits/chosen": -2.2294363975524902, + "logits/rejected": -2.3565242290496826, + "logps/chosen": -283.98040771484375, + "logps/rejected": -242.69631958007812, + "loss": 0.3315, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5434924960136414, + "rewards/margins": 2.5101566314697266, + "rewards/rejected": -3.053649425506592, + "step": 7408 + }, + { + "epoch": 0.86, + "learning_rate": 4.2057399314987593e-08, + "logits/chosen": -2.0891835689544678, + "logits/rejected": -2.0022828578948975, + "logps/chosen": -181.45394897460938, + "logps/rejected": -255.66970825195312, + "loss": 0.8165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9042434692382812, + "rewards/margins": 3.483339309692383, + "rewards/rejected": -4.387583255767822, + "step": 7409 + }, + { + "epoch": 0.86, + "learning_rate": 4.202196763906933e-08, + "logits/chosen": -2.2725718021392822, + "logits/rejected": -2.4005653858184814, + "logps/chosen": -424.0126953125, + "logps/rejected": -235.87477111816406, + "loss": 0.3122, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45461320877075195, + "rewards/margins": 1.5001500844955444, + "rewards/rejected": -1.954763412475586, + "step": 7410 + }, + { + "epoch": 0.86, + "learning_rate": 4.198653596315105e-08, + "logits/chosen": -2.4087111949920654, + "logits/rejected": -2.3705334663391113, + "logps/chosen": -220.966064453125, + "logps/rejected": -242.06295776367188, + "loss": 0.4211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3674432635307312, + "rewards/margins": 2.2516703605651855, + "rewards/rejected": -2.6191139221191406, + "step": 7411 + }, + { + "epoch": 0.86, + "learning_rate": 4.195110428723278e-08, + "logits/chosen": -2.6623263359069824, + "logits/rejected": -2.8734216690063477, + "logps/chosen": -348.59185791015625, + "logps/rejected": -299.2456970214844, + "loss": 0.3761, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8946435451507568, + "rewards/margins": 2.402216911315918, + "rewards/rejected": -3.296860694885254, + "step": 7412 + }, + { + "epoch": 0.86, + "learning_rate": 4.1915672611314515e-08, + "logits/chosen": -2.646946668624878, + "logits/rejected": -2.519709348678589, + "logps/chosen": -197.58558654785156, + "logps/rejected": -237.58331298828125, + "loss": 0.4294, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8241658210754395, + "rewards/margins": 1.91841721534729, + "rewards/rejected": -3.7425830364227295, + "step": 7413 + }, + { + "epoch": 0.86, + "learning_rate": 4.1880240935396244e-08, + "logits/chosen": -2.493227005004883, + "logits/rejected": -2.519759178161621, + "logps/chosen": -253.68472290039062, + "logps/rejected": -211.34213256835938, + "loss": 0.3613, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.036268711090088, + "rewards/margins": 2.2553019523620605, + "rewards/rejected": -3.2915706634521484, + "step": 7414 + }, + { + "epoch": 0.86, + "learning_rate": 4.184480925947797e-08, + "logits/chosen": -2.224849224090576, + "logits/rejected": -2.3085567951202393, + "logps/chosen": -401.8448181152344, + "logps/rejected": -241.9927978515625, + "loss": 0.6435, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6740058660507202, + "rewards/margins": 0.5749037861824036, + "rewards/rejected": -2.2489094734191895, + "step": 7415 + }, + { + "epoch": 0.86, + "learning_rate": 4.18093775835597e-08, + "logits/chosen": -2.768008232116699, + "logits/rejected": -2.6141791343688965, + "logps/chosen": -285.63409423828125, + "logps/rejected": -297.1236267089844, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3284810185432434, + "rewards/margins": 2.3039073944091797, + "rewards/rejected": -2.632388114929199, + "step": 7416 + }, + { + "epoch": 0.86, + "learning_rate": 4.177394590764143e-08, + "logits/chosen": -2.1684062480926514, + "logits/rejected": -2.398832082748413, + "logps/chosen": -279.0743408203125, + "logps/rejected": -250.26022338867188, + "loss": 0.3322, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5370237827301025, + "rewards/margins": 1.8731234073638916, + "rewards/rejected": -3.410147190093994, + "step": 7417 + }, + { + "epoch": 0.86, + "learning_rate": 4.1738514231723165e-08, + "logits/chosen": -2.397645950317383, + "logits/rejected": -2.6282565593719482, + "logps/chosen": -388.2731018066406, + "logps/rejected": -303.35137939453125, + "loss": 0.3917, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6384521722793579, + "rewards/margins": 1.7636640071868896, + "rewards/rejected": -2.402116298675537, + "step": 7418 + }, + { + "epoch": 0.86, + "learning_rate": 4.170308255580489e-08, + "logits/chosen": -2.3994252681732178, + "logits/rejected": -2.2704200744628906, + "logps/chosen": -389.8521728515625, + "logps/rejected": -470.2279052734375, + "loss": 0.2325, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5709764957427979, + "rewards/margins": 2.6057043075561523, + "rewards/rejected": -4.176681041717529, + "step": 7419 + }, + { + "epoch": 0.86, + "learning_rate": 4.1667650879886616e-08, + "logits/chosen": -2.4409396648406982, + "logits/rejected": -2.5146567821502686, + "logps/chosen": -457.4732971191406, + "logps/rejected": -453.3367614746094, + "loss": 0.1461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4311331808567047, + "rewards/margins": 2.905954360961914, + "rewards/rejected": -3.337087631225586, + "step": 7420 + }, + { + "epoch": 0.86, + "learning_rate": 4.163221920396835e-08, + "logits/chosen": -2.0087342262268066, + "logits/rejected": -2.2692270278930664, + "logps/chosen": -269.1477355957031, + "logps/rejected": -286.487060546875, + "loss": 0.2986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37997350096702576, + "rewards/margins": 2.428813934326172, + "rewards/rejected": -2.8087873458862305, + "step": 7421 + }, + { + "epoch": 0.86, + "learning_rate": 4.1596787528050074e-08, + "logits/chosen": -2.09857177734375, + "logits/rejected": -2.303708553314209, + "logps/chosen": -234.509765625, + "logps/rejected": -210.10980224609375, + "loss": 0.27, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7102940082550049, + "rewards/margins": 1.6775791645050049, + "rewards/rejected": -2.3878731727600098, + "step": 7422 + }, + { + "epoch": 0.86, + "learning_rate": 4.15613558521318e-08, + "logits/chosen": -2.372453451156616, + "logits/rejected": -2.395747661590576, + "logps/chosen": -391.77069091796875, + "logps/rejected": -271.1944580078125, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19812005758285522, + "rewards/margins": 3.433659315109253, + "rewards/rejected": -3.631779432296753, + "step": 7423 + }, + { + "epoch": 0.86, + "learning_rate": 4.152592417621354e-08, + "logits/chosen": -2.1211724281311035, + "logits/rejected": -2.1210222244262695, + "logps/chosen": -418.2323303222656, + "logps/rejected": -191.0109100341797, + "loss": 1.3796, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.94867742061615, + "rewards/margins": 0.22346678376197815, + "rewards/rejected": -2.1721441745758057, + "step": 7424 + }, + { + "epoch": 0.86, + "learning_rate": 4.149049250029526e-08, + "logits/chosen": -2.2247109413146973, + "logits/rejected": -2.513376235961914, + "logps/chosen": -297.91082763671875, + "logps/rejected": -340.728271484375, + "loss": 0.6787, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2377592325210571, + "rewards/margins": 1.4793713092803955, + "rewards/rejected": -2.717130661010742, + "step": 7425 + }, + { + "epoch": 0.86, + "learning_rate": 4.145506082437699e-08, + "logits/chosen": -2.4015464782714844, + "logits/rejected": -2.0406785011291504, + "logps/chosen": -239.76141357421875, + "logps/rejected": -416.66021728515625, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9676856994628906, + "rewards/margins": 3.6570210456848145, + "rewards/rejected": -4.624706745147705, + "step": 7426 + }, + { + "epoch": 0.86, + "learning_rate": 4.1419629148458724e-08, + "logits/chosen": -2.867131233215332, + "logits/rejected": -2.766174077987671, + "logps/chosen": -192.43174743652344, + "logps/rejected": -236.23245239257812, + "loss": 0.3242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6829854846000671, + "rewards/margins": 2.034066915512085, + "rewards/rejected": -2.7170522212982178, + "step": 7427 + }, + { + "epoch": 0.86, + "learning_rate": 4.138419747254045e-08, + "logits/chosen": -2.6047725677490234, + "logits/rejected": -2.6888692378997803, + "logps/chosen": -317.57965087890625, + "logps/rejected": -244.2706756591797, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48612481355667114, + "rewards/margins": 3.283125638961792, + "rewards/rejected": -3.7692503929138184, + "step": 7428 + }, + { + "epoch": 0.86, + "learning_rate": 4.1348765796622175e-08, + "logits/chosen": -2.479473114013672, + "logits/rejected": -2.6523637771606445, + "logps/chosen": -356.88067626953125, + "logps/rejected": -258.7164306640625, + "loss": 0.35, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.726493239402771, + "rewards/margins": 1.5704002380371094, + "rewards/rejected": -2.29689359664917, + "step": 7429 + }, + { + "epoch": 0.86, + "learning_rate": 4.131333412070391e-08, + "logits/chosen": -2.513615131378174, + "logits/rejected": -2.773898124694824, + "logps/chosen": -343.3598937988281, + "logps/rejected": -222.90814208984375, + "loss": 0.6663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5149953365325928, + "rewards/margins": 1.195157527923584, + "rewards/rejected": -1.7101528644561768, + "step": 7430 + }, + { + "epoch": 0.86, + "learning_rate": 4.127790244478564e-08, + "logits/chosen": -2.2825732231140137, + "logits/rejected": -2.440701961517334, + "logps/chosen": -279.2482604980469, + "logps/rejected": -314.2821044921875, + "loss": 0.1656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6107611060142517, + "rewards/margins": 2.887808084487915, + "rewards/rejected": -3.4985692501068115, + "step": 7431 + }, + { + "epoch": 0.86, + "learning_rate": 4.124247076886736e-08, + "logits/chosen": -2.2256932258605957, + "logits/rejected": -2.217245101928711, + "logps/chosen": -259.23126220703125, + "logps/rejected": -258.11553955078125, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1330019235610962, + "rewards/margins": 1.0320656299591064, + "rewards/rejected": -2.165067672729492, + "step": 7432 + }, + { + "epoch": 0.86, + "learning_rate": 4.1207039092949096e-08, + "logits/chosen": -2.2419238090515137, + "logits/rejected": -2.0835747718811035, + "logps/chosen": -242.33419799804688, + "logps/rejected": -227.84742736816406, + "loss": 0.317, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32714152336120605, + "rewards/margins": 1.961592674255371, + "rewards/rejected": -2.288734197616577, + "step": 7433 + }, + { + "epoch": 0.86, + "learning_rate": 4.1171607417030825e-08, + "logits/chosen": -2.481661081314087, + "logits/rejected": -2.5921473503112793, + "logps/chosen": -270.3702392578125, + "logps/rejected": -215.79481506347656, + "loss": 0.4485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9665314555168152, + "rewards/margins": 1.0891231298446655, + "rewards/rejected": -2.055654525756836, + "step": 7434 + }, + { + "epoch": 0.86, + "learning_rate": 4.113617574111255e-08, + "logits/chosen": -2.785043954849243, + "logits/rejected": -2.5490341186523438, + "logps/chosen": -206.52810668945312, + "logps/rejected": -326.9478759765625, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5069221258163452, + "rewards/margins": 3.893282890319824, + "rewards/rejected": -4.400204658508301, + "step": 7435 + }, + { + "epoch": 0.87, + "learning_rate": 4.110074406519428e-08, + "logits/chosen": -2.2484395503997803, + "logits/rejected": -2.2394204139709473, + "logps/chosen": -255.63900756835938, + "logps/rejected": -217.6568603515625, + "loss": 0.3821, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5845859050750732, + "rewards/margins": 1.6633158922195435, + "rewards/rejected": -2.2479019165039062, + "step": 7436 + }, + { + "epoch": 0.87, + "learning_rate": 4.106531238927601e-08, + "logits/chosen": -2.6651151180267334, + "logits/rejected": -2.5136616230010986, + "logps/chosen": -331.2873229980469, + "logps/rejected": -325.8858947753906, + "loss": 0.3087, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8874939680099487, + "rewards/margins": 1.9885061979293823, + "rewards/rejected": -3.87600040435791, + "step": 7437 + }, + { + "epoch": 0.87, + "learning_rate": 4.1029880713357734e-08, + "logits/chosen": -2.2892918586730957, + "logits/rejected": -2.489786148071289, + "logps/chosen": -252.24765014648438, + "logps/rejected": -188.4664764404297, + "loss": 0.341, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5855600833892822, + "rewards/margins": 2.245267391204834, + "rewards/rejected": -3.830827236175537, + "step": 7438 + }, + { + "epoch": 0.87, + "learning_rate": 4.099444903743947e-08, + "logits/chosen": -2.988538980484009, + "logits/rejected": -3.0016558170318604, + "logps/chosen": -102.7610855102539, + "logps/rejected": -118.61161804199219, + "loss": 0.3392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30525365471839905, + "rewards/margins": 1.9465928077697754, + "rewards/rejected": -2.2518465518951416, + "step": 7439 + }, + { + "epoch": 0.87, + "learning_rate": 4.09590173615212e-08, + "logits/chosen": -2.575385808944702, + "logits/rejected": -2.343461751937866, + "logps/chosen": -205.0209503173828, + "logps/rejected": -247.54193115234375, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5323732495307922, + "rewards/margins": 2.7233405113220215, + "rewards/rejected": -3.255713701248169, + "step": 7440 + }, + { + "epoch": 0.87, + "learning_rate": 4.0923585685602926e-08, + "logits/chosen": -2.4557299613952637, + "logits/rejected": -2.499826192855835, + "logps/chosen": -297.71160888671875, + "logps/rejected": -259.4007873535156, + "loss": 0.3988, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.394883632659912, + "rewards/margins": 1.990040898323059, + "rewards/rejected": -3.3849244117736816, + "step": 7441 + }, + { + "epoch": 0.87, + "learning_rate": 4.0888154009684655e-08, + "logits/chosen": -1.9132922887802124, + "logits/rejected": -2.0855712890625, + "logps/chosen": -337.0099182128906, + "logps/rejected": -223.9114532470703, + "loss": 0.6675, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4333034753799438, + "rewards/margins": 1.4482204914093018, + "rewards/rejected": -2.881524085998535, + "step": 7442 + }, + { + "epoch": 0.87, + "learning_rate": 4.0852722333766384e-08, + "logits/chosen": -2.5627241134643555, + "logits/rejected": -2.5413014888763428, + "logps/chosen": -179.08676147460938, + "logps/rejected": -232.53305053710938, + "loss": 0.2979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9355536103248596, + "rewards/margins": 3.214989423751831, + "rewards/rejected": -4.150543212890625, + "step": 7443 + }, + { + "epoch": 0.87, + "learning_rate": 4.081729065784811e-08, + "logits/chosen": -2.332305431365967, + "logits/rejected": -2.422426700592041, + "logps/chosen": -259.3702392578125, + "logps/rejected": -222.8070068359375, + "loss": 0.5868, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.332239031791687, + "rewards/margins": 0.5492417812347412, + "rewards/rejected": -1.8814808130264282, + "step": 7444 + }, + { + "epoch": 0.87, + "learning_rate": 4.078185898192985e-08, + "logits/chosen": -2.1690094470977783, + "logits/rejected": -2.0823795795440674, + "logps/chosen": -240.04525756835938, + "logps/rejected": -392.4588317871094, + "loss": 0.1616, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7538583278656006, + "rewards/margins": 3.3708839416503906, + "rewards/rejected": -4.12474250793457, + "step": 7445 + }, + { + "epoch": 0.87, + "learning_rate": 4.074642730601157e-08, + "logits/chosen": -2.7771639823913574, + "logits/rejected": -2.7408947944641113, + "logps/chosen": -180.1182861328125, + "logps/rejected": -241.030517578125, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6159970164299011, + "rewards/margins": 2.0382094383239746, + "rewards/rejected": -2.6542065143585205, + "step": 7446 + }, + { + "epoch": 0.87, + "learning_rate": 4.07109956300933e-08, + "logits/chosen": -2.605668067932129, + "logits/rejected": -2.653718948364258, + "logps/chosen": -108.45834350585938, + "logps/rejected": -187.49371337890625, + "loss": 0.2094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08610232174396515, + "rewards/margins": 2.9188454151153564, + "rewards/rejected": -2.8327431678771973, + "step": 7447 + }, + { + "epoch": 0.87, + "learning_rate": 4.0675563954175034e-08, + "logits/chosen": -2.494749069213867, + "logits/rejected": -2.637892723083496, + "logps/chosen": -302.77001953125, + "logps/rejected": -230.0926513671875, + "loss": 0.4721, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8562884330749512, + "rewards/margins": 1.5954973697662354, + "rewards/rejected": -2.4517858028411865, + "step": 7448 + }, + { + "epoch": 0.87, + "learning_rate": 4.0640132278256756e-08, + "logits/chosen": -2.6143791675567627, + "logits/rejected": -2.767848014831543, + "logps/chosen": -180.4899444580078, + "logps/rejected": -351.9959411621094, + "loss": 0.7745, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8119560480117798, + "rewards/margins": 2.1861751079559326, + "rewards/rejected": -3.998131036758423, + "step": 7449 + }, + { + "epoch": 0.87, + "learning_rate": 4.0604700602338485e-08, + "logits/chosen": -1.9816347360610962, + "logits/rejected": -2.039487600326538, + "logps/chosen": -176.08734130859375, + "logps/rejected": -205.96249389648438, + "loss": 0.5977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5721927881240845, + "rewards/margins": 0.626162052154541, + "rewards/rejected": -1.1983548402786255, + "step": 7450 + }, + { + "epoch": 0.87, + "learning_rate": 4.056926892642022e-08, + "logits/chosen": -2.4256186485290527, + "logits/rejected": -2.2852325439453125, + "logps/chosen": -384.98583984375, + "logps/rejected": -359.12774658203125, + "loss": 0.7605, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3783447742462158, + "rewards/margins": 1.1083929538726807, + "rewards/rejected": -2.4867374897003174, + "step": 7451 + }, + { + "epoch": 0.87, + "learning_rate": 4.053383725050194e-08, + "logits/chosen": -2.4501595497131348, + "logits/rejected": -2.458890914916992, + "logps/chosen": -318.7148132324219, + "logps/rejected": -303.5361022949219, + "loss": 0.2608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1532467007637024, + "rewards/margins": 2.1351633071899414, + "rewards/rejected": -1.981916904449463, + "step": 7452 + }, + { + "epoch": 0.87, + "learning_rate": 4.049840557458368e-08, + "logits/chosen": -2.3681423664093018, + "logits/rejected": -2.525272846221924, + "logps/chosen": -280.0669860839844, + "logps/rejected": -328.72100830078125, + "loss": 0.8727, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0631685256958008, + "rewards/margins": 2.181605339050293, + "rewards/rejected": -3.2447738647460938, + "step": 7453 + }, + { + "epoch": 0.87, + "learning_rate": 4.0462973898665407e-08, + "logits/chosen": -2.556973457336426, + "logits/rejected": -2.326115131378174, + "logps/chosen": -194.3543243408203, + "logps/rejected": -228.249267578125, + "loss": 0.3991, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.557931125164032, + "rewards/margins": 1.0232553482055664, + "rewards/rejected": -1.5811865329742432, + "step": 7454 + }, + { + "epoch": 0.87, + "learning_rate": 4.0427542222747135e-08, + "logits/chosen": -2.96331524848938, + "logits/rejected": -2.9399170875549316, + "logps/chosen": -224.3788604736328, + "logps/rejected": -241.95213317871094, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9193310737609863, + "rewards/margins": 1.6471266746520996, + "rewards/rejected": -2.566457748413086, + "step": 7455 + }, + { + "epoch": 0.87, + "learning_rate": 4.0392110546828864e-08, + "logits/chosen": -2.2597365379333496, + "logits/rejected": -2.4630632400512695, + "logps/chosen": -350.9752197265625, + "logps/rejected": -269.10638427734375, + "loss": 0.8947, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2796251773834229, + "rewards/margins": 0.23490393161773682, + "rewards/rejected": -1.5145289897918701, + "step": 7456 + }, + { + "epoch": 0.87, + "learning_rate": 4.035667887091059e-08, + "logits/chosen": -2.0981454849243164, + "logits/rejected": -2.082630157470703, + "logps/chosen": -276.1609802246094, + "logps/rejected": -361.2107238769531, + "loss": 0.6434, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9780843257904053, + "rewards/margins": 1.9016947746276855, + "rewards/rejected": -2.87977933883667, + "step": 7457 + }, + { + "epoch": 0.87, + "learning_rate": 4.032124719499232e-08, + "logits/chosen": -2.6170413494110107, + "logits/rejected": -2.6239166259765625, + "logps/chosen": -246.8736114501953, + "logps/rejected": -205.16737365722656, + "loss": 0.3147, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3066801428794861, + "rewards/margins": 2.096379518508911, + "rewards/rejected": -2.403059720993042, + "step": 7458 + }, + { + "epoch": 0.87, + "learning_rate": 4.028581551907406e-08, + "logits/chosen": -1.5424563884735107, + "logits/rejected": -1.8053616285324097, + "logps/chosen": -390.86456298828125, + "logps/rejected": -276.9825134277344, + "loss": 0.5768, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.375307559967041, + "rewards/margins": 0.49594545364379883, + "rewards/rejected": -0.8712530136108398, + "step": 7459 + }, + { + "epoch": 0.87, + "learning_rate": 4.025038384315578e-08, + "logits/chosen": -2.8637747764587402, + "logits/rejected": -2.323227643966675, + "logps/chosen": -320.23779296875, + "logps/rejected": -496.8822937011719, + "loss": 0.335, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3381191492080688, + "rewards/margins": 2.290496349334717, + "rewards/rejected": -3.628615617752075, + "step": 7460 + }, + { + "epoch": 0.87, + "learning_rate": 4.021495216723751e-08, + "logits/chosen": -1.740652084350586, + "logits/rejected": -2.071352958679199, + "logps/chosen": -308.84796142578125, + "logps/rejected": -310.53656005859375, + "loss": 0.4248, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.891151487827301, + "rewards/margins": 1.589906930923462, + "rewards/rejected": -2.481058359146118, + "step": 7461 + }, + { + "epoch": 0.87, + "learning_rate": 4.017952049131924e-08, + "logits/chosen": -2.5051496028900146, + "logits/rejected": -2.1887893676757812, + "logps/chosen": -95.53691101074219, + "logps/rejected": -263.22601318359375, + "loss": 0.1041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028713539242744446, + "rewards/margins": 3.911935806274414, + "rewards/rejected": -3.9406492710113525, + "step": 7462 + }, + { + "epoch": 0.87, + "learning_rate": 4.0144088815400965e-08, + "logits/chosen": -2.415891170501709, + "logits/rejected": -2.3629393577575684, + "logps/chosen": -139.18809509277344, + "logps/rejected": -211.275146484375, + "loss": 0.4573, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7354158163070679, + "rewards/margins": 2.1032328605651855, + "rewards/rejected": -2.838648796081543, + "step": 7463 + }, + { + "epoch": 0.87, + "learning_rate": 4.0108657139482694e-08, + "logits/chosen": -2.936413288116455, + "logits/rejected": -2.853189706802368, + "logps/chosen": -225.50418090820312, + "logps/rejected": -270.8065185546875, + "loss": 0.5244, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6243809461593628, + "rewards/margins": 2.3480803966522217, + "rewards/rejected": -2.972461223602295, + "step": 7464 + }, + { + "epoch": 0.87, + "learning_rate": 4.007322546356443e-08, + "logits/chosen": -2.2530019283294678, + "logits/rejected": -2.1273109912872314, + "logps/chosen": -317.8463134765625, + "logps/rejected": -385.5718994140625, + "loss": 0.2454, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08119721710681915, + "rewards/margins": 2.7913320064544678, + "rewards/rejected": -2.8725290298461914, + "step": 7465 + }, + { + "epoch": 0.87, + "learning_rate": 4.003779378764615e-08, + "logits/chosen": -2.287867546081543, + "logits/rejected": -2.304603338241577, + "logps/chosen": -407.41033935546875, + "logps/rejected": -308.77142333984375, + "loss": 0.5155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9147146940231323, + "rewards/margins": 2.568711757659912, + "rewards/rejected": -3.483426570892334, + "step": 7466 + }, + { + "epoch": 0.87, + "learning_rate": 4.000236211172788e-08, + "logits/chosen": -2.7455592155456543, + "logits/rejected": -2.8652217388153076, + "logps/chosen": -320.1406555175781, + "logps/rejected": -308.87091064453125, + "loss": 0.2871, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0852218866348267, + "rewards/margins": 1.8617819547653198, + "rewards/rejected": -2.9470038414001465, + "step": 7467 + }, + { + "epoch": 0.87, + "learning_rate": 3.9966930435809616e-08, + "logits/chosen": -2.32810640335083, + "logits/rejected": -2.231466293334961, + "logps/chosen": -223.48406982421875, + "logps/rejected": -231.22523498535156, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2737163305282593, + "rewards/margins": 2.5546059608459473, + "rewards/rejected": -2.2808897495269775, + "step": 7468 + }, + { + "epoch": 0.87, + "learning_rate": 3.9931498759891344e-08, + "logits/chosen": -2.125253915786743, + "logits/rejected": -2.104539394378662, + "logps/chosen": -244.7877655029297, + "logps/rejected": -296.8888854980469, + "loss": 0.3128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07826540619134903, + "rewards/margins": 2.2179198265075684, + "rewards/rejected": -2.29618501663208, + "step": 7469 + }, + { + "epoch": 0.87, + "learning_rate": 3.9896067083973067e-08, + "logits/chosen": -2.8640546798706055, + "logits/rejected": -2.7171785831451416, + "logps/chosen": -224.15684509277344, + "logps/rejected": -163.0520782470703, + "loss": 0.8724, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7173874378204346, + "rewards/margins": 0.7800913453102112, + "rewards/rejected": -2.497478723526001, + "step": 7470 + }, + { + "epoch": 0.87, + "learning_rate": 3.98606354080548e-08, + "logits/chosen": -1.9381287097930908, + "logits/rejected": -1.7445000410079956, + "logps/chosen": -239.2338409423828, + "logps/rejected": -344.2511291503906, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6621671915054321, + "rewards/margins": 3.6297380924224854, + "rewards/rejected": -4.291904926300049, + "step": 7471 + }, + { + "epoch": 0.87, + "learning_rate": 3.982520373213653e-08, + "logits/chosen": -1.789376139640808, + "logits/rejected": -2.160247802734375, + "logps/chosen": -219.6719512939453, + "logps/rejected": -203.71624755859375, + "loss": 0.4051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8259466290473938, + "rewards/margins": 1.8520512580871582, + "rewards/rejected": -2.6779978275299072, + "step": 7472 + }, + { + "epoch": 0.87, + "learning_rate": 3.978977205621825e-08, + "logits/chosen": -2.5186033248901367, + "logits/rejected": -2.097498893737793, + "logps/chosen": -227.91026306152344, + "logps/rejected": -407.81878662109375, + "loss": 0.4157, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2582558393478394, + "rewards/margins": 2.272338390350342, + "rewards/rejected": -3.5305943489074707, + "step": 7473 + }, + { + "epoch": 0.87, + "learning_rate": 3.975434038029999e-08, + "logits/chosen": -2.7007932662963867, + "logits/rejected": -2.828233242034912, + "logps/chosen": -253.18551635742188, + "logps/rejected": -275.609375, + "loss": 0.6157, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1898021697998047, + "rewards/margins": 3.3642122745513916, + "rewards/rejected": -5.554014682769775, + "step": 7474 + }, + { + "epoch": 0.87, + "learning_rate": 3.971890870438172e-08, + "logits/chosen": -2.5894033908843994, + "logits/rejected": -2.453242063522339, + "logps/chosen": -195.01309204101562, + "logps/rejected": -212.51077270507812, + "loss": 0.152, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25076568126678467, + "rewards/margins": 3.4053378105163574, + "rewards/rejected": -3.6561033725738525, + "step": 7475 + }, + { + "epoch": 0.87, + "learning_rate": 3.968347702846344e-08, + "logits/chosen": -2.5519039630889893, + "logits/rejected": -2.709306478500366, + "logps/chosen": -198.63784790039062, + "logps/rejected": -184.9821319580078, + "loss": 0.2355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2773759961128235, + "rewards/margins": 2.7246203422546387, + "rewards/rejected": -3.0019962787628174, + "step": 7476 + }, + { + "epoch": 0.87, + "learning_rate": 3.9648045352545174e-08, + "logits/chosen": -2.318545341491699, + "logits/rejected": -2.2535712718963623, + "logps/chosen": -350.3158874511719, + "logps/rejected": -280.2073974609375, + "loss": 0.3428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7480247616767883, + "rewards/margins": 1.790666103363037, + "rewards/rejected": -2.5386910438537598, + "step": 7477 + }, + { + "epoch": 0.87, + "learning_rate": 3.96126136766269e-08, + "logits/chosen": -2.9069015979766846, + "logits/rejected": -2.9593658447265625, + "logps/chosen": -151.27932739257812, + "logps/rejected": -201.32086181640625, + "loss": 0.1605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6439796090126038, + "rewards/margins": 3.0447351932525635, + "rewards/rejected": -3.6887147426605225, + "step": 7478 + }, + { + "epoch": 0.87, + "learning_rate": 3.9577182000708625e-08, + "logits/chosen": -2.4626615047454834, + "logits/rejected": -2.563568115234375, + "logps/chosen": -311.5268859863281, + "logps/rejected": -332.6734619140625, + "loss": 0.3943, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3352310359477997, + "rewards/margins": 1.796903133392334, + "rewards/rejected": -2.132134199142456, + "step": 7479 + }, + { + "epoch": 0.87, + "learning_rate": 3.954175032479036e-08, + "logits/chosen": -2.5440256595611572, + "logits/rejected": -2.673510789871216, + "logps/chosen": -326.3738098144531, + "logps/rejected": -276.53955078125, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7997738122940063, + "rewards/margins": 2.1180038452148438, + "rewards/rejected": -2.9177775382995605, + "step": 7480 + }, + { + "epoch": 0.87, + "learning_rate": 3.950631864887209e-08, + "logits/chosen": -2.7066192626953125, + "logits/rejected": -2.224942207336426, + "logps/chosen": -185.92559814453125, + "logps/rejected": -361.118408203125, + "loss": 0.6105, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.250961422920227, + "rewards/margins": 0.9742235541343689, + "rewards/rejected": -2.225184917449951, + "step": 7481 + }, + { + "epoch": 0.87, + "learning_rate": 3.947088697295382e-08, + "logits/chosen": -1.6574145555496216, + "logits/rejected": -1.8611578941345215, + "logps/chosen": -296.454345703125, + "logps/rejected": -180.52996826171875, + "loss": 0.9579, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4499104619026184, + "rewards/margins": 0.7439854741096497, + "rewards/rejected": -1.1938960552215576, + "step": 7482 + }, + { + "epoch": 0.87, + "learning_rate": 3.943545529703555e-08, + "logits/chosen": -2.6334919929504395, + "logits/rejected": -2.868267059326172, + "logps/chosen": -187.14480590820312, + "logps/rejected": -236.28677368164062, + "loss": 0.1351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6801818013191223, + "rewards/margins": 4.1657257080078125, + "rewards/rejected": -4.845907688140869, + "step": 7483 + }, + { + "epoch": 0.87, + "learning_rate": 3.9400023621117276e-08, + "logits/chosen": -2.9997878074645996, + "logits/rejected": -2.8900153636932373, + "logps/chosen": -358.3124084472656, + "logps/rejected": -283.5014343261719, + "loss": 0.4944, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3210680484771729, + "rewards/margins": 2.0989174842834473, + "rewards/rejected": -3.419985294342041, + "step": 7484 + }, + { + "epoch": 0.87, + "learning_rate": 3.9364591945199004e-08, + "logits/chosen": -2.3700881004333496, + "logits/rejected": -2.283473491668701, + "logps/chosen": -315.7730712890625, + "logps/rejected": -262.1810607910156, + "loss": 0.631, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6994385123252869, + "rewards/margins": 2.479795217514038, + "rewards/rejected": -3.179234027862549, + "step": 7485 + }, + { + "epoch": 0.87, + "learning_rate": 3.932916026928074e-08, + "logits/chosen": -2.5424065589904785, + "logits/rejected": -2.4882302284240723, + "logps/chosen": -377.7289733886719, + "logps/rejected": -258.8668212890625, + "loss": 0.8413, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.19884991645813, + "rewards/margins": 1.2343378067016602, + "rewards/rejected": -3.43318772315979, + "step": 7486 + }, + { + "epoch": 0.87, + "learning_rate": 3.929372859336246e-08, + "logits/chosen": -1.4403400421142578, + "logits/rejected": -1.6577595472335815, + "logps/chosen": -326.2098693847656, + "logps/rejected": -275.27532958984375, + "loss": 0.1855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05754198133945465, + "rewards/margins": 2.917477607727051, + "rewards/rejected": -2.975019693374634, + "step": 7487 + }, + { + "epoch": 0.87, + "learning_rate": 3.92582969174442e-08, + "logits/chosen": -2.45052433013916, + "logits/rejected": -2.244040012359619, + "logps/chosen": -121.68570709228516, + "logps/rejected": -561.2068481445312, + "loss": 0.2863, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07030247151851654, + "rewards/margins": 5.309096336364746, + "rewards/rejected": -5.379398822784424, + "step": 7488 + }, + { + "epoch": 0.87, + "learning_rate": 3.9222865241525926e-08, + "logits/chosen": -2.4226434230804443, + "logits/rejected": -2.1578667163848877, + "logps/chosen": -253.4890594482422, + "logps/rejected": -228.8557891845703, + "loss": 0.4049, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8983390927314758, + "rewards/margins": 1.1459429264068604, + "rewards/rejected": -2.0442819595336914, + "step": 7489 + }, + { + "epoch": 0.87, + "learning_rate": 3.918743356560765e-08, + "logits/chosen": -2.5436697006225586, + "logits/rejected": -2.6294407844543457, + "logps/chosen": -241.81517028808594, + "logps/rejected": -249.85643005371094, + "loss": 0.2968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9220157861709595, + "rewards/margins": 3.2853140830993652, + "rewards/rejected": -4.207329750061035, + "step": 7490 + }, + { + "epoch": 0.87, + "learning_rate": 3.9152001889689383e-08, + "logits/chosen": -2.932400941848755, + "logits/rejected": -2.9976460933685303, + "logps/chosen": -214.0198516845703, + "logps/rejected": -259.37060546875, + "loss": 0.2845, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46973851323127747, + "rewards/margins": 1.8081988096237183, + "rewards/rejected": -2.277937412261963, + "step": 7491 + }, + { + "epoch": 0.87, + "learning_rate": 3.911657021377111e-08, + "logits/chosen": -2.162764310836792, + "logits/rejected": -2.498610496520996, + "logps/chosen": -371.01885986328125, + "logps/rejected": -223.04290771484375, + "loss": 0.6572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4022696018218994, + "rewards/margins": 0.9825252294540405, + "rewards/rejected": -1.38479483127594, + "step": 7492 + }, + { + "epoch": 0.87, + "learning_rate": 3.9081138537852834e-08, + "logits/chosen": -1.9126758575439453, + "logits/rejected": -2.314441680908203, + "logps/chosen": -354.7716064453125, + "logps/rejected": -196.4732666015625, + "loss": 0.6203, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6445648670196533, + "rewards/margins": 0.7784310579299927, + "rewards/rejected": -2.4229960441589355, + "step": 7493 + }, + { + "epoch": 0.87, + "learning_rate": 3.904570686193457e-08, + "logits/chosen": -2.5145556926727295, + "logits/rejected": -2.6349141597747803, + "logps/chosen": -351.16943359375, + "logps/rejected": -349.3561096191406, + "loss": 0.3624, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.577573299407959, + "rewards/margins": 2.049403667449951, + "rewards/rejected": -2.626976728439331, + "step": 7494 + }, + { + "epoch": 0.87, + "learning_rate": 3.90102751860163e-08, + "logits/chosen": -2.711668014526367, + "logits/rejected": -2.632505416870117, + "logps/chosen": -207.67723083496094, + "logps/rejected": -287.71002197265625, + "loss": 0.1646, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.117865800857544, + "rewards/margins": 3.1179416179656982, + "rewards/rejected": -4.235807418823242, + "step": 7495 + }, + { + "epoch": 0.87, + "learning_rate": 3.897484351009803e-08, + "logits/chosen": -2.065885543823242, + "logits/rejected": -2.1394834518432617, + "logps/chosen": -481.0562744140625, + "logps/rejected": -407.96160888671875, + "loss": 0.2684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45765888690948486, + "rewards/margins": 2.4948365688323975, + "rewards/rejected": -2.9524953365325928, + "step": 7496 + }, + { + "epoch": 0.87, + "learning_rate": 3.8939411834179756e-08, + "logits/chosen": -2.140235662460327, + "logits/rejected": -2.3484044075012207, + "logps/chosen": -263.39129638671875, + "logps/rejected": -260.3359375, + "loss": 0.527, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9175694584846497, + "rewards/margins": 1.723464012145996, + "rewards/rejected": -2.64103364944458, + "step": 7497 + }, + { + "epoch": 0.87, + "learning_rate": 3.8903980158261485e-08, + "logits/chosen": -2.143843412399292, + "logits/rejected": -2.0756313800811768, + "logps/chosen": -325.08392333984375, + "logps/rejected": -263.04931640625, + "loss": 0.5948, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.572506308555603, + "rewards/margins": 1.7050936222076416, + "rewards/rejected": -3.277599811553955, + "step": 7498 + }, + { + "epoch": 0.87, + "learning_rate": 3.886854848234321e-08, + "logits/chosen": -2.536574363708496, + "logits/rejected": -2.7295899391174316, + "logps/chosen": -248.046875, + "logps/rejected": -233.45281982421875, + "loss": 0.9397, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1325939893722534, + "rewards/margins": 1.6091258525848389, + "rewards/rejected": -2.741719961166382, + "step": 7499 + }, + { + "epoch": 0.87, + "learning_rate": 3.883311680642495e-08, + "logits/chosen": -2.360440731048584, + "logits/rejected": -2.264335870742798, + "logps/chosen": -87.21952819824219, + "logps/rejected": -192.10665893554688, + "loss": 0.3563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8892935514450073, + "rewards/margins": 2.650287628173828, + "rewards/rejected": -3.539581060409546, + "step": 7500 + }, + { + "epoch": 0.87, + "learning_rate": 3.879768513050667e-08, + "logits/chosen": -1.781163215637207, + "logits/rejected": -1.6736397743225098, + "logps/chosen": -593.2574462890625, + "logps/rejected": -504.3365783691406, + "loss": 0.4747, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8683615922927856, + "rewards/margins": 1.302520513534546, + "rewards/rejected": -2.170881986618042, + "step": 7501 + }, + { + "epoch": 0.87, + "learning_rate": 3.87622534545884e-08, + "logits/chosen": -1.7868337631225586, + "logits/rejected": -1.839977741241455, + "logps/chosen": -341.46307373046875, + "logps/rejected": -356.9876708984375, + "loss": 0.4488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6643365025520325, + "rewards/margins": 1.3283683061599731, + "rewards/rejected": -1.9927046298980713, + "step": 7502 + }, + { + "epoch": 0.87, + "learning_rate": 3.8726821778670135e-08, + "logits/chosen": -2.029974937438965, + "logits/rejected": -2.1934258937835693, + "logps/chosen": -240.7349853515625, + "logps/rejected": -261.1914367675781, + "loss": 0.3721, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7872371077537537, + "rewards/margins": 1.9438073635101318, + "rewards/rejected": -2.7310445308685303, + "step": 7503 + }, + { + "epoch": 0.87, + "learning_rate": 3.869139010275186e-08, + "logits/chosen": -2.39571213722229, + "logits/rejected": -2.447470188140869, + "logps/chosen": -289.93951416015625, + "logps/rejected": -287.8323059082031, + "loss": 0.4244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3291493058204651, + "rewards/margins": 2.530674695968628, + "rewards/rejected": -2.8598239421844482, + "step": 7504 + }, + { + "epoch": 0.87, + "learning_rate": 3.8655958426833586e-08, + "logits/chosen": -2.5821428298950195, + "logits/rejected": -2.5792949199676514, + "logps/chosen": -200.5162353515625, + "logps/rejected": -254.36978149414062, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1960863471031189, + "rewards/margins": 3.751725196838379, + "rewards/rejected": -3.9478116035461426, + "step": 7505 + }, + { + "epoch": 0.87, + "learning_rate": 3.862052675091532e-08, + "logits/chosen": -1.7868914604187012, + "logits/rejected": -1.7113152742385864, + "logps/chosen": -293.33636474609375, + "logps/rejected": -294.551025390625, + "loss": 0.3694, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6705390214920044, + "rewards/margins": 1.1477543115615845, + "rewards/rejected": -1.8182933330535889, + "step": 7506 + }, + { + "epoch": 0.87, + "learning_rate": 3.858509507499704e-08, + "logits/chosen": -2.4955410957336426, + "logits/rejected": -2.376322031021118, + "logps/chosen": -379.24676513671875, + "logps/rejected": -340.36785888671875, + "loss": 0.2339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5571887493133545, + "rewards/margins": 2.565523147583008, + "rewards/rejected": -3.1227118968963623, + "step": 7507 + }, + { + "epoch": 0.87, + "learning_rate": 3.854966339907877e-08, + "logits/chosen": -2.2967946529388428, + "logits/rejected": -2.4550697803497314, + "logps/chosen": -197.96615600585938, + "logps/rejected": -145.23403930664062, + "loss": 0.331, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41671648621559143, + "rewards/margins": 1.574009656906128, + "rewards/rejected": -1.9907262325286865, + "step": 7508 + }, + { + "epoch": 0.87, + "learning_rate": 3.851423172316051e-08, + "logits/chosen": -2.4118752479553223, + "logits/rejected": -2.331817626953125, + "logps/chosen": -206.0049285888672, + "logps/rejected": -254.46414184570312, + "loss": 0.4497, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1649662256240845, + "rewards/margins": 0.9729798436164856, + "rewards/rejected": -2.1379458904266357, + "step": 7509 + }, + { + "epoch": 0.87, + "learning_rate": 3.847880004724223e-08, + "logits/chosen": -2.642547607421875, + "logits/rejected": -2.707130193710327, + "logps/chosen": -161.63909912109375, + "logps/rejected": -128.05995178222656, + "loss": 0.5507, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0916132926940918, + "rewards/margins": 1.0359361171722412, + "rewards/rejected": -2.127549648284912, + "step": 7510 + }, + { + "epoch": 0.87, + "learning_rate": 3.844336837132396e-08, + "logits/chosen": -1.9174813032150269, + "logits/rejected": -2.637908935546875, + "logps/chosen": -495.57928466796875, + "logps/rejected": -238.81614685058594, + "loss": 0.4253, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1786824464797974, + "rewards/margins": 2.4145259857177734, + "rewards/rejected": -3.5932087898254395, + "step": 7511 + }, + { + "epoch": 0.87, + "learning_rate": 3.8407936695405694e-08, + "logits/chosen": -1.9930419921875, + "logits/rejected": -1.9971733093261719, + "logps/chosen": -396.9998474121094, + "logps/rejected": -342.5757751464844, + "loss": 0.1572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7120683193206787, + "rewards/margins": 2.3684170246124268, + "rewards/rejected": -3.0804853439331055, + "step": 7512 + }, + { + "epoch": 0.87, + "learning_rate": 3.837250501948742e-08, + "logits/chosen": -1.799892544746399, + "logits/rejected": -1.6939420700073242, + "logps/chosen": -184.44906616210938, + "logps/rejected": -239.61451721191406, + "loss": 0.2921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4995110034942627, + "rewards/margins": 2.0172793865203857, + "rewards/rejected": -3.5167906284332275, + "step": 7513 + }, + { + "epoch": 0.87, + "learning_rate": 3.8337073343569144e-08, + "logits/chosen": -2.480130910873413, + "logits/rejected": -2.416158676147461, + "logps/chosen": -148.5886993408203, + "logps/rejected": -214.14231872558594, + "loss": 0.1266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5984637141227722, + "rewards/margins": 3.172476291656494, + "rewards/rejected": -3.770939826965332, + "step": 7514 + }, + { + "epoch": 0.87, + "learning_rate": 3.830164166765088e-08, + "logits/chosen": -1.9780246019363403, + "logits/rejected": -2.409618377685547, + "logps/chosen": -324.08984375, + "logps/rejected": -293.193359375, + "loss": 1.4577, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1684296131134033, + "rewards/margins": -0.2145644724369049, + "rewards/rejected": -1.9538650512695312, + "step": 7515 + }, + { + "epoch": 0.87, + "learning_rate": 3.826620999173261e-08, + "logits/chosen": -2.796067237854004, + "logits/rejected": -2.900615692138672, + "logps/chosen": -229.98208618164062, + "logps/rejected": -262.71490478515625, + "loss": 0.2802, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9960126876831055, + "rewards/margins": 2.8545782566070557, + "rewards/rejected": -3.8505911827087402, + "step": 7516 + }, + { + "epoch": 0.87, + "learning_rate": 3.823077831581433e-08, + "logits/chosen": -2.012794256210327, + "logits/rejected": -2.086329460144043, + "logps/chosen": -441.78961181640625, + "logps/rejected": -431.6699523925781, + "loss": 0.5211, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.509674310684204, + "rewards/margins": 1.555641531944275, + "rewards/rejected": -3.0653159618377686, + "step": 7517 + }, + { + "epoch": 0.87, + "learning_rate": 3.8195346639896066e-08, + "logits/chosen": -1.8747005462646484, + "logits/rejected": -1.8375592231750488, + "logps/chosen": -233.31080627441406, + "logps/rejected": -180.36962890625, + "loss": 0.427, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48130056262016296, + "rewards/margins": 1.6039750576019287, + "rewards/rejected": -2.085275650024414, + "step": 7518 + }, + { + "epoch": 0.87, + "learning_rate": 3.8159914963977795e-08, + "logits/chosen": -2.4557831287384033, + "logits/rejected": -2.3003106117248535, + "logps/chosen": -227.46347045898438, + "logps/rejected": -281.5614318847656, + "loss": 0.5987, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2590041160583496, + "rewards/margins": 1.5345250368118286, + "rewards/rejected": -2.7935290336608887, + "step": 7519 + }, + { + "epoch": 0.87, + "learning_rate": 3.812448328805952e-08, + "logits/chosen": -2.1570701599121094, + "logits/rejected": -2.3173041343688965, + "logps/chosen": -256.78521728515625, + "logps/rejected": -213.810302734375, + "loss": 0.3263, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4149093627929688, + "rewards/margins": 2.4394869804382324, + "rewards/rejected": -3.8543965816497803, + "step": 7520 + }, + { + "epoch": 0.87, + "learning_rate": 3.808905161214125e-08, + "logits/chosen": -2.488544464111328, + "logits/rejected": -2.931534767150879, + "logps/chosen": -257.25811767578125, + "logps/rejected": -226.4765167236328, + "loss": 0.3114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0324139595031738, + "rewards/margins": 2.1716015338897705, + "rewards/rejected": -3.2040152549743652, + "step": 7521 + }, + { + "epoch": 0.88, + "learning_rate": 3.805361993622298e-08, + "logits/chosen": -1.7614893913269043, + "logits/rejected": -1.8610666990280151, + "logps/chosen": -346.7874755859375, + "logps/rejected": -350.2413330078125, + "loss": 1.2667, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.604437232017517, + "rewards/margins": 0.3535185158252716, + "rewards/rejected": -1.9579558372497559, + "step": 7522 + }, + { + "epoch": 0.88, + "learning_rate": 3.801818826030471e-08, + "logits/chosen": -2.0857529640197754, + "logits/rejected": -2.0895373821258545, + "logps/chosen": -260.63653564453125, + "logps/rejected": -203.45745849609375, + "loss": 0.3452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35135677456855774, + "rewards/margins": 1.9207004308700562, + "rewards/rejected": -2.272057294845581, + "step": 7523 + }, + { + "epoch": 0.88, + "learning_rate": 3.798275658438644e-08, + "logits/chosen": -1.895129680633545, + "logits/rejected": -2.3861989974975586, + "logps/chosen": -354.649169921875, + "logps/rejected": -246.88027954101562, + "loss": 0.2748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7551931142807007, + "rewards/margins": 2.075368881225586, + "rewards/rejected": -2.830562114715576, + "step": 7524 + }, + { + "epoch": 0.88, + "learning_rate": 3.794732490846817e-08, + "logits/chosen": -2.000823736190796, + "logits/rejected": -2.061523199081421, + "logps/chosen": -447.3293151855469, + "logps/rejected": -404.1339416503906, + "loss": 0.2944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7396426796913147, + "rewards/margins": 1.9754657745361328, + "rewards/rejected": -2.7151083946228027, + "step": 7525 + }, + { + "epoch": 0.88, + "learning_rate": 3.79118932325499e-08, + "logits/chosen": -2.6365180015563965, + "logits/rejected": -2.478926658630371, + "logps/chosen": -365.75994873046875, + "logps/rejected": -444.59271240234375, + "loss": 0.6105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7812925577163696, + "rewards/margins": 1.3631021976470947, + "rewards/rejected": -2.144394874572754, + "step": 7526 + }, + { + "epoch": 0.88, + "learning_rate": 3.787646155663163e-08, + "logits/chosen": -2.2099249362945557, + "logits/rejected": -2.382570505142212, + "logps/chosen": -307.29351806640625, + "logps/rejected": -295.52734375, + "loss": 0.4052, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5256860852241516, + "rewards/margins": 2.1640634536743164, + "rewards/rejected": -2.6897494792938232, + "step": 7527 + }, + { + "epoch": 0.88, + "learning_rate": 3.7841029880713353e-08, + "logits/chosen": -3.0187735557556152, + "logits/rejected": -2.9514951705932617, + "logps/chosen": -185.81106567382812, + "logps/rejected": -186.22023010253906, + "loss": 0.3511, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6503515243530273, + "rewards/margins": 1.89277982711792, + "rewards/rejected": -3.5431313514709473, + "step": 7528 + }, + { + "epoch": 0.88, + "learning_rate": 3.780559820479509e-08, + "logits/chosen": -1.9912121295928955, + "logits/rejected": -1.6452127695083618, + "logps/chosen": -461.4638977050781, + "logps/rejected": -453.073486328125, + "loss": 0.3192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8563157916069031, + "rewards/margins": 2.419893741607666, + "rewards/rejected": -3.276209592819214, + "step": 7529 + }, + { + "epoch": 0.88, + "learning_rate": 3.777016652887682e-08, + "logits/chosen": -2.511629343032837, + "logits/rejected": -2.468735456466675, + "logps/chosen": -133.1564178466797, + "logps/rejected": -117.62103271484375, + "loss": 0.5329, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7593053579330444, + "rewards/margins": 1.00485098361969, + "rewards/rejected": -1.7641563415527344, + "step": 7530 + }, + { + "epoch": 0.88, + "learning_rate": 3.773473485295854e-08, + "logits/chosen": -2.2442924976348877, + "logits/rejected": -2.1935691833496094, + "logps/chosen": -118.59750366210938, + "logps/rejected": -151.1815185546875, + "loss": 0.3724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6515747308731079, + "rewards/margins": 1.6301442384719849, + "rewards/rejected": -2.2817189693450928, + "step": 7531 + }, + { + "epoch": 0.88, + "learning_rate": 3.7699303177040275e-08, + "logits/chosen": -1.5957058668136597, + "logits/rejected": -1.949406623840332, + "logps/chosen": -374.2304382324219, + "logps/rejected": -253.68788146972656, + "loss": 0.5035, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.19357430934906, + "rewards/margins": 0.7374681234359741, + "rewards/rejected": -1.9310424327850342, + "step": 7532 + }, + { + "epoch": 0.88, + "learning_rate": 3.7663871501122004e-08, + "logits/chosen": -2.339268207550049, + "logits/rejected": -2.2177529335021973, + "logps/chosen": -409.3575439453125, + "logps/rejected": -386.9306335449219, + "loss": 0.0944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20894679427146912, + "rewards/margins": 2.8393497467041016, + "rewards/rejected": -3.0482964515686035, + "step": 7533 + }, + { + "epoch": 0.88, + "learning_rate": 3.7628439825203726e-08, + "logits/chosen": -2.3660786151885986, + "logits/rejected": -2.3146395683288574, + "logps/chosen": -217.5308380126953, + "logps/rejected": -325.6949768066406, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1429389715194702, + "rewards/margins": 2.1752686500549316, + "rewards/rejected": -3.3182077407836914, + "step": 7534 + }, + { + "epoch": 0.88, + "learning_rate": 3.759300814928546e-08, + "logits/chosen": -2.212500810623169, + "logits/rejected": -1.9916319847106934, + "logps/chosen": -440.3708190917969, + "logps/rejected": -464.888427734375, + "loss": 0.4659, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9485597610473633, + "rewards/margins": 1.9101868867874146, + "rewards/rejected": -2.8587465286254883, + "step": 7535 + }, + { + "epoch": 0.88, + "learning_rate": 3.755757647336719e-08, + "logits/chosen": -2.6308751106262207, + "logits/rejected": -2.7481284141540527, + "logps/chosen": -121.22234344482422, + "logps/rejected": -160.33273315429688, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08295218646526337, + "rewards/margins": 3.8101868629455566, + "rewards/rejected": -3.893138885498047, + "step": 7536 + }, + { + "epoch": 0.88, + "learning_rate": 3.752214479744891e-08, + "logits/chosen": -1.9461264610290527, + "logits/rejected": -1.9267064332962036, + "logps/chosen": -231.03726196289062, + "logps/rejected": -217.8470001220703, + "loss": 2.2683, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.976490020751953, + "rewards/margins": -0.2544025182723999, + "rewards/rejected": -2.7220873832702637, + "step": 7537 + }, + { + "epoch": 0.88, + "learning_rate": 3.748671312153065e-08, + "logits/chosen": -2.706608772277832, + "logits/rejected": -2.6846141815185547, + "logps/chosen": -253.7475128173828, + "logps/rejected": -314.66497802734375, + "loss": 0.117, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42624354362487793, + "rewards/margins": 3.4176528453826904, + "rewards/rejected": -3.8438968658447266, + "step": 7538 + }, + { + "epoch": 0.88, + "learning_rate": 3.7451281445612376e-08, + "logits/chosen": -2.2262656688690186, + "logits/rejected": -2.009671688079834, + "logps/chosen": -240.5338897705078, + "logps/rejected": -257.73211669921875, + "loss": 0.3927, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5873706340789795, + "rewards/margins": 2.2126827239990234, + "rewards/rejected": -2.800053358078003, + "step": 7539 + }, + { + "epoch": 0.88, + "learning_rate": 3.7415849769694105e-08, + "logits/chosen": -2.8615362644195557, + "logits/rejected": -2.7722954750061035, + "logps/chosen": -126.22683715820312, + "logps/rejected": -138.80999755859375, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6637499332427979, + "rewards/margins": 1.7134788036346436, + "rewards/rejected": -2.3772284984588623, + "step": 7540 + }, + { + "epoch": 0.88, + "learning_rate": 3.7380418093775834e-08, + "logits/chosen": -2.6344406604766846, + "logits/rejected": -2.629892349243164, + "logps/chosen": -173.30101013183594, + "logps/rejected": -232.96792602539062, + "loss": 0.3082, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41418784856796265, + "rewards/margins": 2.68011736869812, + "rewards/rejected": -3.0943052768707275, + "step": 7541 + }, + { + "epoch": 0.88, + "learning_rate": 3.734498641785756e-08, + "logits/chosen": -1.9984859228134155, + "logits/rejected": -2.176063299179077, + "logps/chosen": -494.263671875, + "logps/rejected": -398.6833801269531, + "loss": 0.1217, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4157119989395142, + "rewards/margins": 3.6512207984924316, + "rewards/rejected": -5.066932678222656, + "step": 7542 + }, + { + "epoch": 0.88, + "learning_rate": 3.730955474193929e-08, + "logits/chosen": -2.8124639987945557, + "logits/rejected": -2.702547550201416, + "logps/chosen": -209.90408325195312, + "logps/rejected": -297.9097900390625, + "loss": 0.4203, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8535338640213013, + "rewards/margins": 1.7778887748718262, + "rewards/rejected": -3.631422519683838, + "step": 7543 + }, + { + "epoch": 0.88, + "learning_rate": 3.727412306602102e-08, + "logits/chosen": -2.7457571029663086, + "logits/rejected": -2.5293898582458496, + "logps/chosen": -192.8824920654297, + "logps/rejected": -274.78045654296875, + "loss": 0.4939, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5886745452880859, + "rewards/margins": 0.9240310788154602, + "rewards/rejected": -1.5127055644989014, + "step": 7544 + }, + { + "epoch": 0.88, + "learning_rate": 3.723869139010275e-08, + "logits/chosen": -2.653395652770996, + "logits/rejected": -2.1893367767333984, + "logps/chosen": -156.72598266601562, + "logps/rejected": -333.017822265625, + "loss": 0.6192, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6938670873641968, + "rewards/margins": 1.1947100162506104, + "rewards/rejected": -1.8885771036148071, + "step": 7545 + }, + { + "epoch": 0.88, + "learning_rate": 3.720325971418448e-08, + "logits/chosen": -2.5845787525177, + "logits/rejected": -2.4068562984466553, + "logps/chosen": -236.43666076660156, + "logps/rejected": -309.6436767578125, + "loss": 0.1506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.584252655506134, + "rewards/margins": 3.7644872665405273, + "rewards/rejected": -4.348740100860596, + "step": 7546 + }, + { + "epoch": 0.88, + "learning_rate": 3.7167828038266206e-08, + "logits/chosen": -2.0665879249572754, + "logits/rejected": -2.3806889057159424, + "logps/chosen": -283.1510009765625, + "logps/rejected": -279.64727783203125, + "loss": 0.0756, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9882591962814331, + "rewards/margins": 4.01683235168457, + "rewards/rejected": -5.005091667175293, + "step": 7547 + }, + { + "epoch": 0.88, + "learning_rate": 3.7132396362347935e-08, + "logits/chosen": -1.398958444595337, + "logits/rejected": -1.6636059284210205, + "logps/chosen": -300.9684143066406, + "logps/rejected": -244.02828979492188, + "loss": 0.4365, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4553240537643433, + "rewards/margins": 2.054354190826416, + "rewards/rejected": -3.509678363800049, + "step": 7548 + }, + { + "epoch": 0.88, + "learning_rate": 3.709696468642967e-08, + "logits/chosen": -2.731372117996216, + "logits/rejected": -2.7232906818389893, + "logps/chosen": -169.9280548095703, + "logps/rejected": -244.35594177246094, + "loss": 0.1856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.682743489742279, + "rewards/margins": 4.3151021003723145, + "rewards/rejected": -4.997845649719238, + "step": 7549 + }, + { + "epoch": 0.88, + "learning_rate": 3.706153301051139e-08, + "logits/chosen": -2.6550824642181396, + "logits/rejected": -2.7113287448883057, + "logps/chosen": -288.89190673828125, + "logps/rejected": -326.2815856933594, + "loss": 0.2165, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6875851154327393, + "rewards/margins": 2.7046422958374023, + "rewards/rejected": -3.3922271728515625, + "step": 7550 + }, + { + "epoch": 0.88, + "learning_rate": 3.702610133459312e-08, + "logits/chosen": -2.6675806045532227, + "logits/rejected": -2.5462334156036377, + "logps/chosen": -294.23748779296875, + "logps/rejected": -455.6847229003906, + "loss": 0.5107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7384076714515686, + "rewards/margins": 1.27349853515625, + "rewards/rejected": -2.011906147003174, + "step": 7551 + }, + { + "epoch": 0.88, + "learning_rate": 3.6990669658674856e-08, + "logits/chosen": -1.9464614391326904, + "logits/rejected": -2.2415525913238525, + "logps/chosen": -295.48223876953125, + "logps/rejected": -210.3455810546875, + "loss": 0.2409, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28938746452331543, + "rewards/margins": 2.6272265911102295, + "rewards/rejected": -2.916614055633545, + "step": 7552 + }, + { + "epoch": 0.88, + "learning_rate": 3.6955237982756585e-08, + "logits/chosen": -2.2368502616882324, + "logits/rejected": -2.7476260662078857, + "logps/chosen": -360.89678955078125, + "logps/rejected": -284.3489990234375, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7855456471443176, + "rewards/margins": 2.8281712532043457, + "rewards/rejected": -3.6137170791625977, + "step": 7553 + }, + { + "epoch": 0.88, + "learning_rate": 3.6919806306838314e-08, + "logits/chosen": -1.5108752250671387, + "logits/rejected": -2.260213613510132, + "logps/chosen": -427.42242431640625, + "logps/rejected": -192.83346557617188, + "loss": 0.6278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8214280009269714, + "rewards/margins": 1.2722249031066895, + "rewards/rejected": -2.0936529636383057, + "step": 7554 + }, + { + "epoch": 0.88, + "learning_rate": 3.688437463092004e-08, + "logits/chosen": -1.9015896320343018, + "logits/rejected": -2.081286907196045, + "logps/chosen": -245.6549530029297, + "logps/rejected": -276.97149658203125, + "loss": 0.2642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9643962979316711, + "rewards/margins": 2.71993350982666, + "rewards/rejected": -3.6843295097351074, + "step": 7555 + }, + { + "epoch": 0.88, + "learning_rate": 3.684894295500177e-08, + "logits/chosen": -2.4587509632110596, + "logits/rejected": -2.4521541595458984, + "logps/chosen": -370.4828186035156, + "logps/rejected": -288.0376281738281, + "loss": 0.1323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7593858242034912, + "rewards/margins": 2.8072123527526855, + "rewards/rejected": -3.5665981769561768, + "step": 7556 + }, + { + "epoch": 0.88, + "learning_rate": 3.68135112790835e-08, + "logits/chosen": -2.9223873615264893, + "logits/rejected": -2.907885789871216, + "logps/chosen": -193.06895446777344, + "logps/rejected": -188.84902954101562, + "loss": 0.187, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.933140754699707, + "rewards/margins": 3.5564839839935303, + "rewards/rejected": -5.489624500274658, + "step": 7557 + }, + { + "epoch": 0.88, + "learning_rate": 3.677807960316523e-08, + "logits/chosen": -1.7162044048309326, + "logits/rejected": -1.5704145431518555, + "logps/chosen": -356.20086669921875, + "logps/rejected": -504.51806640625, + "loss": 0.5148, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.217374324798584, + "rewards/margins": 1.3701777458190918, + "rewards/rejected": -2.5875518321990967, + "step": 7558 + }, + { + "epoch": 0.88, + "learning_rate": 3.674264792724696e-08, + "logits/chosen": -2.0464694499969482, + "logits/rejected": -2.1825578212738037, + "logps/chosen": -155.54287719726562, + "logps/rejected": -217.27505493164062, + "loss": 0.4368, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10394778102636337, + "rewards/margins": 2.086359977722168, + "rewards/rejected": -2.190307855606079, + "step": 7559 + }, + { + "epoch": 0.88, + "learning_rate": 3.6707216251328686e-08, + "logits/chosen": -2.4749503135681152, + "logits/rejected": -2.412766695022583, + "logps/chosen": -260.7174377441406, + "logps/rejected": -277.419677734375, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2621191442012787, + "rewards/margins": 2.817535877227783, + "rewards/rejected": -3.0796549320220947, + "step": 7560 + }, + { + "epoch": 0.88, + "learning_rate": 3.6671784575410415e-08, + "logits/chosen": -2.410918951034546, + "logits/rejected": -2.418250560760498, + "logps/chosen": -284.4427185058594, + "logps/rejected": -727.6679077148438, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9451342225074768, + "rewards/margins": 4.7319440841674805, + "rewards/rejected": -5.6770782470703125, + "step": 7561 + }, + { + "epoch": 0.88, + "learning_rate": 3.6636352899492144e-08, + "logits/chosen": -2.5907928943634033, + "logits/rejected": -2.835151433944702, + "logps/chosen": -214.52401733398438, + "logps/rejected": -297.724609375, + "loss": 0.0882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.486217200756073, + "rewards/margins": 4.765803813934326, + "rewards/rejected": -5.252020835876465, + "step": 7562 + }, + { + "epoch": 0.88, + "learning_rate": 3.660092122357387e-08, + "logits/chosen": -2.9356203079223633, + "logits/rejected": -2.782942533493042, + "logps/chosen": -301.09466552734375, + "logps/rejected": -263.81231689453125, + "loss": 0.5595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.475816249847412, + "rewards/margins": 1.0224311351776123, + "rewards/rejected": -2.4982473850250244, + "step": 7563 + }, + { + "epoch": 0.88, + "learning_rate": 3.65654895476556e-08, + "logits/chosen": -2.9658052921295166, + "logits/rejected": -2.9371490478515625, + "logps/chosen": -185.08900451660156, + "logps/rejected": -171.83450317382812, + "loss": 0.4744, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.490623950958252, + "rewards/margins": 2.3582165241241455, + "rewards/rejected": -3.8488409519195557, + "step": 7564 + }, + { + "epoch": 0.88, + "learning_rate": 3.653005787173733e-08, + "logits/chosen": -1.7554082870483398, + "logits/rejected": -1.6040486097335815, + "logps/chosen": -227.764404296875, + "logps/rejected": -377.36126708984375, + "loss": 0.4086, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46963217854499817, + "rewards/margins": 1.7083430290222168, + "rewards/rejected": -2.1779751777648926, + "step": 7565 + }, + { + "epoch": 0.88, + "learning_rate": 3.649462619581906e-08, + "logits/chosen": -2.342804431915283, + "logits/rejected": -2.1793622970581055, + "logps/chosen": -141.48915100097656, + "logps/rejected": -390.7406311035156, + "loss": 0.2946, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.483340322971344, + "rewards/margins": 3.904831886291504, + "rewards/rejected": -4.388172149658203, + "step": 7566 + }, + { + "epoch": 0.88, + "learning_rate": 3.645919451990079e-08, + "logits/chosen": -1.843104362487793, + "logits/rejected": -1.9669241905212402, + "logps/chosen": -230.51632690429688, + "logps/rejected": -247.6981201171875, + "loss": 0.2617, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07780300080776215, + "rewards/margins": 2.553384304046631, + "rewards/rejected": -2.6311872005462646, + "step": 7567 + }, + { + "epoch": 0.88, + "learning_rate": 3.6423762843982516e-08, + "logits/chosen": -2.62774658203125, + "logits/rejected": -2.801915168762207, + "logps/chosen": -376.99847412109375, + "logps/rejected": -397.4684753417969, + "loss": 0.1932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.730708122253418, + "rewards/margins": 3.3445348739624023, + "rewards/rejected": -4.07524299621582, + "step": 7568 + }, + { + "epoch": 0.88, + "learning_rate": 3.6388331168064245e-08, + "logits/chosen": -2.498012065887451, + "logits/rejected": -2.518059253692627, + "logps/chosen": -337.848876953125, + "logps/rejected": -283.1419677734375, + "loss": 1.2054, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.217557191848755, + "rewards/margins": 0.8900099992752075, + "rewards/rejected": -4.107567310333252, + "step": 7569 + }, + { + "epoch": 0.88, + "learning_rate": 3.6352899492145974e-08, + "logits/chosen": -2.181763172149658, + "logits/rejected": -2.2834460735321045, + "logps/chosen": -276.3883056640625, + "logps/rejected": -353.9615783691406, + "loss": 0.4061, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3265082836151123, + "rewards/margins": 2.4751083850860596, + "rewards/rejected": -3.801616668701172, + "step": 7570 + }, + { + "epoch": 0.88, + "learning_rate": 3.631746781622771e-08, + "logits/chosen": -2.9201650619506836, + "logits/rejected": -2.906942367553711, + "logps/chosen": -349.6849670410156, + "logps/rejected": -167.28265380859375, + "loss": 0.3313, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.166731834411621, + "rewards/margins": 1.7206690311431885, + "rewards/rejected": -2.8874006271362305, + "step": 7571 + }, + { + "epoch": 0.88, + "learning_rate": 3.628203614030944e-08, + "logits/chosen": -2.510052442550659, + "logits/rejected": -2.589045524597168, + "logps/chosen": -364.6685485839844, + "logps/rejected": -309.8553161621094, + "loss": 0.4724, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7644312381744385, + "rewards/margins": 2.0742383003234863, + "rewards/rejected": -2.838669776916504, + "step": 7572 + }, + { + "epoch": 0.88, + "learning_rate": 3.624660446439116e-08, + "logits/chosen": -2.1135199069976807, + "logits/rejected": -1.8663151264190674, + "logps/chosen": -298.73944091796875, + "logps/rejected": -364.9178161621094, + "loss": 0.5358, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2922887802124023, + "rewards/margins": 0.8579879403114319, + "rewards/rejected": -2.1502766609191895, + "step": 7573 + }, + { + "epoch": 0.88, + "learning_rate": 3.6211172788472895e-08, + "logits/chosen": -1.9965763092041016, + "logits/rejected": -2.320180892944336, + "logps/chosen": -162.87847900390625, + "logps/rejected": -137.52011108398438, + "loss": 0.9638, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.119978666305542, + "rewards/margins": 0.9730182886123657, + "rewards/rejected": -2.0929970741271973, + "step": 7574 + }, + { + "epoch": 0.88, + "learning_rate": 3.6175741112554624e-08, + "logits/chosen": -2.217125415802002, + "logits/rejected": -2.205868721008301, + "logps/chosen": -350.5334777832031, + "logps/rejected": -456.9824523925781, + "loss": 0.1571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33849671483039856, + "rewards/margins": 3.5267257690429688, + "rewards/rejected": -3.865222454071045, + "step": 7575 + }, + { + "epoch": 0.88, + "learning_rate": 3.614030943663635e-08, + "logits/chosen": -2.2908670902252197, + "logits/rejected": -2.229459285736084, + "logps/chosen": -311.8523254394531, + "logps/rejected": -311.24896240234375, + "loss": 0.172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3701716661453247, + "rewards/margins": 3.251307487487793, + "rewards/rejected": -3.621479034423828, + "step": 7576 + }, + { + "epoch": 0.88, + "learning_rate": 3.610487776071808e-08, + "logits/chosen": -2.483886957168579, + "logits/rejected": -2.2705259323120117, + "logps/chosen": -119.80787658691406, + "logps/rejected": -273.5419921875, + "loss": 0.3254, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0740145444869995, + "rewards/margins": 3.8268091678619385, + "rewards/rejected": -4.90082311630249, + "step": 7577 + }, + { + "epoch": 0.88, + "learning_rate": 3.606944608479981e-08, + "logits/chosen": -1.8000823259353638, + "logits/rejected": -1.940086841583252, + "logps/chosen": -386.8724365234375, + "logps/rejected": -383.9178771972656, + "loss": 0.8209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7157626152038574, + "rewards/margins": 1.1616417169570923, + "rewards/rejected": -1.8774042129516602, + "step": 7578 + }, + { + "epoch": 0.88, + "learning_rate": 3.603401440888154e-08, + "logits/chosen": -2.113367795944214, + "logits/rejected": -2.2957262992858887, + "logps/chosen": -366.28277587890625, + "logps/rejected": -245.8885040283203, + "loss": 0.4257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23845605552196503, + "rewards/margins": 2.4931743144989014, + "rewards/rejected": -2.7316300868988037, + "step": 7579 + }, + { + "epoch": 0.88, + "learning_rate": 3.599858273296327e-08, + "logits/chosen": -2.050365924835205, + "logits/rejected": -1.860506534576416, + "logps/chosen": -277.6342468261719, + "logps/rejected": -310.39129638671875, + "loss": 0.3807, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3855675458908081, + "rewards/margins": 2.080390691757202, + "rewards/rejected": -2.465958595275879, + "step": 7580 + }, + { + "epoch": 0.88, + "learning_rate": 3.5963151057044997e-08, + "logits/chosen": -2.786585807800293, + "logits/rejected": -2.834717273712158, + "logps/chosen": -234.1034393310547, + "logps/rejected": -235.03048706054688, + "loss": 0.4382, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2417691946029663, + "rewards/margins": 2.191765308380127, + "rewards/rejected": -3.433534622192383, + "step": 7581 + }, + { + "epoch": 0.88, + "learning_rate": 3.5927719381126725e-08, + "logits/chosen": -2.3223533630371094, + "logits/rejected": -2.3397417068481445, + "logps/chosen": -149.02243041992188, + "logps/rejected": -223.93235778808594, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3775911033153534, + "rewards/margins": 2.3933608531951904, + "rewards/rejected": -2.770951986312866, + "step": 7582 + }, + { + "epoch": 0.88, + "learning_rate": 3.5892287705208454e-08, + "logits/chosen": -2.2260210514068604, + "logits/rejected": -2.2951440811157227, + "logps/chosen": -231.37539672851562, + "logps/rejected": -210.52330017089844, + "loss": 0.8598, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2619588375091553, + "rewards/margins": 0.3023863434791565, + "rewards/rejected": -1.5643452405929565, + "step": 7583 + }, + { + "epoch": 0.88, + "learning_rate": 3.585685602929018e-08, + "logits/chosen": -1.7904101610183716, + "logits/rejected": -1.9910513162612915, + "logps/chosen": -395.53851318359375, + "logps/rejected": -151.00314331054688, + "loss": 0.6489, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.422161728143692, + "rewards/margins": 1.0912331342697144, + "rewards/rejected": -1.513394832611084, + "step": 7584 + }, + { + "epoch": 0.88, + "learning_rate": 3.582142435337191e-08, + "logits/chosen": -2.518705368041992, + "logits/rejected": -2.763664722442627, + "logps/chosen": -233.6903839111328, + "logps/rejected": -138.95498657226562, + "loss": 0.5345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3573871850967407, + "rewards/margins": 2.077782154083252, + "rewards/rejected": -2.4351694583892822, + "step": 7585 + }, + { + "epoch": 0.88, + "learning_rate": 3.578599267745364e-08, + "logits/chosen": -2.1547813415527344, + "logits/rejected": -2.277367115020752, + "logps/chosen": -356.24462890625, + "logps/rejected": -457.767822265625, + "loss": 0.093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0266247987747192, + "rewards/margins": 4.047144889831543, + "rewards/rejected": -5.073769569396973, + "step": 7586 + }, + { + "epoch": 0.88, + "learning_rate": 3.575056100153537e-08, + "logits/chosen": -2.745624542236328, + "logits/rejected": -2.767343759536743, + "logps/chosen": -283.20001220703125, + "logps/rejected": -211.80340576171875, + "loss": 0.3888, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0527455806732178, + "rewards/margins": 1.9179232120513916, + "rewards/rejected": -3.9706687927246094, + "step": 7587 + }, + { + "epoch": 0.88, + "learning_rate": 3.5715129325617104e-08, + "logits/chosen": -2.451864719390869, + "logits/rejected": -2.5254292488098145, + "logps/chosen": -161.39308166503906, + "logps/rejected": -211.47601318359375, + "loss": 0.9244, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.5005884170532227, + "rewards/margins": 0.14090299606323242, + "rewards/rejected": -2.641491413116455, + "step": 7588 + }, + { + "epoch": 0.88, + "learning_rate": 3.5679697649698826e-08, + "logits/chosen": -2.5783376693725586, + "logits/rejected": -2.6541740894317627, + "logps/chosen": -230.893798828125, + "logps/rejected": -344.1298522949219, + "loss": 0.3349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4742083251476288, + "rewards/margins": 2.6254560947418213, + "rewards/rejected": -3.0996642112731934, + "step": 7589 + }, + { + "epoch": 0.88, + "learning_rate": 3.564426597378056e-08, + "logits/chosen": -1.756777048110962, + "logits/rejected": -2.2079498767852783, + "logps/chosen": -363.1264343261719, + "logps/rejected": -319.36163330078125, + "loss": 0.8029, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3066222667694092, + "rewards/margins": 2.1720027923583984, + "rewards/rejected": -3.4786250591278076, + "step": 7590 + }, + { + "epoch": 0.88, + "learning_rate": 3.560883429786229e-08, + "logits/chosen": -2.6978700160980225, + "logits/rejected": -2.6855592727661133, + "logps/chosen": -214.37139892578125, + "logps/rejected": -264.273193359375, + "loss": 0.2169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8438678979873657, + "rewards/margins": 2.559060573577881, + "rewards/rejected": -3.402928352355957, + "step": 7591 + }, + { + "epoch": 0.88, + "learning_rate": 3.557340262194401e-08, + "logits/chosen": -2.2046477794647217, + "logits/rejected": -2.0360231399536133, + "logps/chosen": -173.59393310546875, + "logps/rejected": -277.59503173828125, + "loss": 0.4252, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9191750288009644, + "rewards/margins": 2.049398899078369, + "rewards/rejected": -2.968574047088623, + "step": 7592 + }, + { + "epoch": 0.88, + "learning_rate": 3.553797094602575e-08, + "logits/chosen": -2.395211935043335, + "logits/rejected": -2.4712424278259277, + "logps/chosen": -265.9989013671875, + "logps/rejected": -219.70201110839844, + "loss": 0.5676, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3602744340896606, + "rewards/margins": 1.043287754058838, + "rewards/rejected": -2.403562068939209, + "step": 7593 + }, + { + "epoch": 0.88, + "learning_rate": 3.550253927010748e-08, + "logits/chosen": -3.0294747352600098, + "logits/rejected": -3.063089609146118, + "logps/chosen": -407.8807067871094, + "logps/rejected": -256.14556884765625, + "loss": 0.2501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.805570662021637, + "rewards/margins": 2.104994535446167, + "rewards/rejected": -2.91056489944458, + "step": 7594 + }, + { + "epoch": 0.88, + "learning_rate": 3.5467107594189206e-08, + "logits/chosen": -3.047361373901367, + "logits/rejected": -3.1675684452056885, + "logps/chosen": -264.67791748046875, + "logps/rejected": -309.4627990722656, + "loss": 0.4385, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0145460367202759, + "rewards/margins": 1.821189045906067, + "rewards/rejected": -2.835735321044922, + "step": 7595 + }, + { + "epoch": 0.88, + "learning_rate": 3.5431675918270934e-08, + "logits/chosen": -1.6422865390777588, + "logits/rejected": -1.4409786462783813, + "logps/chosen": -362.65472412109375, + "logps/rejected": -316.5869140625, + "loss": 0.9686, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.574093222618103, + "rewards/margins": 0.1106400191783905, + "rewards/rejected": -1.6847333908081055, + "step": 7596 + }, + { + "epoch": 0.88, + "learning_rate": 3.539624424235266e-08, + "logits/chosen": -2.286604404449463, + "logits/rejected": -2.3062031269073486, + "logps/chosen": -312.30718994140625, + "logps/rejected": -303.1934509277344, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6884717345237732, + "rewards/margins": 4.185576438903809, + "rewards/rejected": -4.874048233032227, + "step": 7597 + }, + { + "epoch": 0.88, + "learning_rate": 3.536081256643439e-08, + "logits/chosen": -1.8365209102630615, + "logits/rejected": -2.321051836013794, + "logps/chosen": -558.3326416015625, + "logps/rejected": -354.05810546875, + "loss": 0.1751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8631792664527893, + "rewards/margins": 2.9283766746520996, + "rewards/rejected": -3.791555881500244, + "step": 7598 + }, + { + "epoch": 0.88, + "learning_rate": 3.532538089051612e-08, + "logits/chosen": -2.241608142852783, + "logits/rejected": -2.3403408527374268, + "logps/chosen": -201.49447631835938, + "logps/rejected": -358.9652404785156, + "loss": 0.4347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6704071760177612, + "rewards/margins": 2.6585044860839844, + "rewards/rejected": -3.328911781311035, + "step": 7599 + }, + { + "epoch": 0.88, + "learning_rate": 3.528994921459785e-08, + "logits/chosen": -2.118255138397217, + "logits/rejected": -2.2543983459472656, + "logps/chosen": -136.1275634765625, + "logps/rejected": -206.1763916015625, + "loss": 0.4207, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0280225276947021, + "rewards/margins": 1.806870460510254, + "rewards/rejected": -2.834892749786377, + "step": 7600 + }, + { + "epoch": 0.88, + "learning_rate": 3.525451753867958e-08, + "logits/chosen": -1.9841358661651611, + "logits/rejected": -1.904372215270996, + "logps/chosen": -213.83447265625, + "logps/rejected": -309.14056396484375, + "loss": 0.1811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4439397156238556, + "rewards/margins": 2.533085823059082, + "rewards/rejected": -2.9770255088806152, + "step": 7601 + }, + { + "epoch": 0.88, + "learning_rate": 3.521908586276131e-08, + "logits/chosen": -2.0665950775146484, + "logits/rejected": -2.1095385551452637, + "logps/chosen": -377.0731201171875, + "logps/rejected": -314.3312072753906, + "loss": 0.3043, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5919076800346375, + "rewards/margins": 1.8448699712753296, + "rewards/rejected": -2.4367775917053223, + "step": 7602 + }, + { + "epoch": 0.88, + "learning_rate": 3.5183654186843035e-08, + "logits/chosen": -2.322821617126465, + "logits/rejected": -2.4681508541107178, + "logps/chosen": -178.25363159179688, + "logps/rejected": -198.576416015625, + "loss": 0.3016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8365961313247681, + "rewards/margins": 2.7911252975463867, + "rewards/rejected": -3.6277213096618652, + "step": 7603 + }, + { + "epoch": 0.88, + "learning_rate": 3.5148222510924764e-08, + "logits/chosen": -2.3464293479919434, + "logits/rejected": -2.009361743927002, + "logps/chosen": -146.68829345703125, + "logps/rejected": -317.11572265625, + "loss": 0.379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7165323495864868, + "rewards/margins": 3.0223255157470703, + "rewards/rejected": -3.7388575077056885, + "step": 7604 + }, + { + "epoch": 0.88, + "learning_rate": 3.511279083500649e-08, + "logits/chosen": -1.4003658294677734, + "logits/rejected": -1.9235602617263794, + "logps/chosen": -456.52252197265625, + "logps/rejected": -366.36334228515625, + "loss": 0.4976, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.918946385383606, + "rewards/margins": 0.8321403861045837, + "rewards/rejected": -1.751086950302124, + "step": 7605 + }, + { + "epoch": 0.88, + "learning_rate": 3.507735915908822e-08, + "logits/chosen": -1.736219882965088, + "logits/rejected": -2.1226677894592285, + "logps/chosen": -527.1156005859375, + "logps/rejected": -361.7975158691406, + "loss": 0.3609, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.68024080991745, + "rewards/margins": 1.9000225067138672, + "rewards/rejected": -2.580263376235962, + "step": 7606 + }, + { + "epoch": 0.88, + "learning_rate": 3.504192748316996e-08, + "logits/chosen": -2.381582498550415, + "logits/rejected": -2.3617539405822754, + "logps/chosen": -237.31295776367188, + "logps/rejected": -264.87200927734375, + "loss": 0.1824, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5171858072280884, + "rewards/margins": 2.9330527782440186, + "rewards/rejected": -3.4502387046813965, + "step": 7607 + }, + { + "epoch": 0.89, + "learning_rate": 3.500649580725168e-08, + "logits/chosen": -3.048222064971924, + "logits/rejected": -2.9880573749542236, + "logps/chosen": -165.4708251953125, + "logps/rejected": -139.1703643798828, + "loss": 0.7049, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7128046751022339, + "rewards/margins": 1.683670997619629, + "rewards/rejected": -2.3964757919311523, + "step": 7608 + }, + { + "epoch": 0.89, + "learning_rate": 3.497106413133341e-08, + "logits/chosen": -2.034186840057373, + "logits/rejected": -2.1612954139709473, + "logps/chosen": -348.9342041015625, + "logps/rejected": -322.7240295410156, + "loss": 0.1834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.562360405921936, + "rewards/margins": 3.153618097305298, + "rewards/rejected": -3.7159783840179443, + "step": 7609 + }, + { + "epoch": 0.89, + "learning_rate": 3.493563245541514e-08, + "logits/chosen": -2.5055510997772217, + "logits/rejected": -2.513185501098633, + "logps/chosen": -235.10865783691406, + "logps/rejected": -258.5614318847656, + "loss": 1.1369, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2964661121368408, + "rewards/margins": 0.9200330972671509, + "rewards/rejected": -2.216499090194702, + "step": 7610 + }, + { + "epoch": 0.89, + "learning_rate": 3.4900200779496865e-08, + "logits/chosen": -2.122264862060547, + "logits/rejected": -2.0888943672180176, + "logps/chosen": -189.71197509765625, + "logps/rejected": -268.19244384765625, + "loss": 0.8387, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6480445861816406, + "rewards/margins": 6.19952392578125, + "rewards/rejected": -7.847568511962891, + "step": 7611 + }, + { + "epoch": 0.89, + "learning_rate": 3.48647691035786e-08, + "logits/chosen": -2.5462162494659424, + "logits/rejected": -2.386984348297119, + "logps/chosen": -199.71084594726562, + "logps/rejected": -323.50555419921875, + "loss": 0.3425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18095055222511292, + "rewards/margins": 2.1839346885681152, + "rewards/rejected": -2.364885091781616, + "step": 7612 + }, + { + "epoch": 0.89, + "learning_rate": 3.482933742766033e-08, + "logits/chosen": -2.3475048542022705, + "logits/rejected": -2.3897554874420166, + "logps/chosen": -293.8746032714844, + "logps/rejected": -291.1658020019531, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9285383820533752, + "rewards/margins": 2.894887685775757, + "rewards/rejected": -3.8234262466430664, + "step": 7613 + }, + { + "epoch": 0.89, + "learning_rate": 3.479390575174205e-08, + "logits/chosen": -2.568373203277588, + "logits/rejected": -2.2302021980285645, + "logps/chosen": -199.36839294433594, + "logps/rejected": -287.2409973144531, + "loss": 0.1293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19286921620368958, + "rewards/margins": 4.228273391723633, + "rewards/rejected": -4.421142101287842, + "step": 7614 + }, + { + "epoch": 0.89, + "learning_rate": 3.475847407582379e-08, + "logits/chosen": -2.0124330520629883, + "logits/rejected": -1.9705026149749756, + "logps/chosen": -115.16596984863281, + "logps/rejected": -141.6118927001953, + "loss": 0.2313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4662202000617981, + "rewards/margins": 2.2779035568237305, + "rewards/rejected": -2.744123697280884, + "step": 7615 + }, + { + "epoch": 0.89, + "learning_rate": 3.4723042399905516e-08, + "logits/chosen": -2.308062791824341, + "logits/rejected": -2.5017995834350586, + "logps/chosen": -192.04298400878906, + "logps/rejected": -219.70993041992188, + "loss": 0.2161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8961784243583679, + "rewards/margins": 2.8987491130828857, + "rewards/rejected": -3.7949275970458984, + "step": 7616 + }, + { + "epoch": 0.89, + "learning_rate": 3.4687610723987244e-08, + "logits/chosen": -1.5001823902130127, + "logits/rejected": -2.0855705738067627, + "logps/chosen": -357.8998107910156, + "logps/rejected": -208.5225830078125, + "loss": 0.7362, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0668060779571533, + "rewards/margins": 0.7136304974555969, + "rewards/rejected": -1.7804367542266846, + "step": 7617 + }, + { + "epoch": 0.89, + "learning_rate": 3.465217904806897e-08, + "logits/chosen": -2.5674753189086914, + "logits/rejected": -2.3603885173797607, + "logps/chosen": -234.48492431640625, + "logps/rejected": -260.2656555175781, + "loss": 0.8783, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0242176055908203, + "rewards/margins": -0.004185348749160767, + "rewards/rejected": -2.0200324058532715, + "step": 7618 + }, + { + "epoch": 0.89, + "learning_rate": 3.46167473721507e-08, + "logits/chosen": -2.3366308212280273, + "logits/rejected": -2.2640366554260254, + "logps/chosen": -320.86041259765625, + "logps/rejected": -262.58154296875, + "loss": 0.2894, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1759322881698608, + "rewards/margins": 2.8029353618621826, + "rewards/rejected": -3.978867530822754, + "step": 7619 + }, + { + "epoch": 0.89, + "learning_rate": 3.458131569623243e-08, + "logits/chosen": -2.2131106853485107, + "logits/rejected": -2.2648611068725586, + "logps/chosen": -176.7727508544922, + "logps/rejected": -194.8362579345703, + "loss": 0.4867, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37289994955062866, + "rewards/margins": 1.27445387840271, + "rewards/rejected": -1.6473538875579834, + "step": 7620 + }, + { + "epoch": 0.89, + "learning_rate": 3.454588402031416e-08, + "logits/chosen": -1.6854534149169922, + "logits/rejected": -1.9961775541305542, + "logps/chosen": -392.02093505859375, + "logps/rejected": -389.37213134765625, + "loss": 0.3928, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5890664458274841, + "rewards/margins": 2.6415939331054688, + "rewards/rejected": -3.2306604385375977, + "step": 7621 + }, + { + "epoch": 0.89, + "learning_rate": 3.451045234439589e-08, + "logits/chosen": -2.200377941131592, + "logits/rejected": -1.970081090927124, + "logps/chosen": -230.9412841796875, + "logps/rejected": -295.73077392578125, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0034360885620117188, + "rewards/margins": 4.186427116394043, + "rewards/rejected": -4.182991027832031, + "step": 7622 + }, + { + "epoch": 0.89, + "learning_rate": 3.447502066847762e-08, + "logits/chosen": -2.366344690322876, + "logits/rejected": -2.604785442352295, + "logps/chosen": -218.9154510498047, + "logps/rejected": -192.0758056640625, + "loss": 0.583, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2585200071334839, + "rewards/margins": 1.7580198049545288, + "rewards/rejected": -3.016540050506592, + "step": 7623 + }, + { + "epoch": 0.89, + "learning_rate": 3.4439588992559346e-08, + "logits/chosen": -2.9526591300964355, + "logits/rejected": -2.9665260314941406, + "logps/chosen": -200.92849731445312, + "logps/rejected": -195.41363525390625, + "loss": 0.4016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9716732501983643, + "rewards/margins": 2.096975326538086, + "rewards/rejected": -3.06864857673645, + "step": 7624 + }, + { + "epoch": 0.89, + "learning_rate": 3.4404157316641074e-08, + "logits/chosen": -2.771857261657715, + "logits/rejected": -2.6223251819610596, + "logps/chosen": -221.04901123046875, + "logps/rejected": -151.61904907226562, + "loss": 0.2839, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9981942176818848, + "rewards/margins": 1.567647099494934, + "rewards/rejected": -2.5658414363861084, + "step": 7625 + }, + { + "epoch": 0.89, + "learning_rate": 3.436872564072281e-08, + "logits/chosen": -2.5276870727539062, + "logits/rejected": -2.2914459705352783, + "logps/chosen": -387.9427185058594, + "logps/rejected": -424.26922607421875, + "loss": 0.7229, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3404730558395386, + "rewards/margins": 2.36922550201416, + "rewards/rejected": -3.709698438644409, + "step": 7626 + }, + { + "epoch": 0.89, + "learning_rate": 3.433329396480453e-08, + "logits/chosen": -2.2207117080688477, + "logits/rejected": -1.9295684099197388, + "logps/chosen": -196.75418090820312, + "logps/rejected": -330.66156005859375, + "loss": 0.3161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8626158237457275, + "rewards/margins": 1.8759870529174805, + "rewards/rejected": -2.738602638244629, + "step": 7627 + }, + { + "epoch": 0.89, + "learning_rate": 3.429786228888626e-08, + "logits/chosen": -2.4348373413085938, + "logits/rejected": -2.4366884231567383, + "logps/chosen": -288.69110107421875, + "logps/rejected": -261.41705322265625, + "loss": 0.3186, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20772197842597961, + "rewards/margins": 2.694310426712036, + "rewards/rejected": -2.9020323753356934, + "step": 7628 + }, + { + "epoch": 0.89, + "learning_rate": 3.4262430612967996e-08, + "logits/chosen": -2.7229814529418945, + "logits/rejected": -2.923460006713867, + "logps/chosen": -272.1068115234375, + "logps/rejected": -200.5044403076172, + "loss": 1.771, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.1128547191619873, + "rewards/margins": 0.5876425504684448, + "rewards/rejected": -3.700497627258301, + "step": 7629 + }, + { + "epoch": 0.89, + "learning_rate": 3.422699893704972e-08, + "logits/chosen": -2.331104278564453, + "logits/rejected": -2.621427536010742, + "logps/chosen": -313.9878234863281, + "logps/rejected": -305.54461669921875, + "loss": 0.0964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8274447917938232, + "rewards/margins": 3.3709306716918945, + "rewards/rejected": -4.198375701904297, + "step": 7630 + }, + { + "epoch": 0.89, + "learning_rate": 3.419156726113145e-08, + "logits/chosen": -2.6498172283172607, + "logits/rejected": -2.653878688812256, + "logps/chosen": -291.9067077636719, + "logps/rejected": -310.1039733886719, + "loss": 0.3604, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3400808572769165, + "rewards/margins": 1.6173481941223145, + "rewards/rejected": -1.957429051399231, + "step": 7631 + }, + { + "epoch": 0.89, + "learning_rate": 3.415613558521318e-08, + "logits/chosen": -2.3912501335144043, + "logits/rejected": -2.332576274871826, + "logps/chosen": -220.55227661132812, + "logps/rejected": -265.6580810546875, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9988751411437988, + "rewards/margins": 2.561654567718506, + "rewards/rejected": -3.560529947280884, + "step": 7632 + }, + { + "epoch": 0.89, + "learning_rate": 3.4120703909294904e-08, + "logits/chosen": -2.8664469718933105, + "logits/rejected": -2.9578754901885986, + "logps/chosen": -147.25344848632812, + "logps/rejected": -273.88983154296875, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0096616744995117, + "rewards/margins": 3.7188076972961426, + "rewards/rejected": -4.728469371795654, + "step": 7633 + }, + { + "epoch": 0.89, + "learning_rate": 3.408527223337664e-08, + "logits/chosen": -2.6539101600646973, + "logits/rejected": -2.710662841796875, + "logps/chosen": -206.93312072753906, + "logps/rejected": -214.2005615234375, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6466584801673889, + "rewards/margins": 1.8962739706039429, + "rewards/rejected": -2.5429325103759766, + "step": 7634 + }, + { + "epoch": 0.89, + "learning_rate": 3.404984055745837e-08, + "logits/chosen": -2.765563726425171, + "logits/rejected": -2.613551378250122, + "logps/chosen": -160.9385528564453, + "logps/rejected": -278.3681335449219, + "loss": 0.2139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3642592132091522, + "rewards/margins": 2.570542097091675, + "rewards/rejected": -2.9348013401031494, + "step": 7635 + }, + { + "epoch": 0.89, + "learning_rate": 3.401440888154009e-08, + "logits/chosen": -2.3822414875030518, + "logits/rejected": -2.4237587451934814, + "logps/chosen": -554.42138671875, + "logps/rejected": -357.5065002441406, + "loss": 0.6391, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4683160781860352, + "rewards/margins": 1.0344301462173462, + "rewards/rejected": -2.502746343612671, + "step": 7636 + }, + { + "epoch": 0.89, + "learning_rate": 3.3978977205621826e-08, + "logits/chosen": -2.5185210704803467, + "logits/rejected": -2.4584767818450928, + "logps/chosen": -310.05035400390625, + "logps/rejected": -249.13772583007812, + "loss": 0.7898, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7323188185691833, + "rewards/margins": 0.8298559188842773, + "rewards/rejected": -1.562174916267395, + "step": 7637 + }, + { + "epoch": 0.89, + "learning_rate": 3.3943545529703555e-08, + "logits/chosen": -2.3858444690704346, + "logits/rejected": -2.4281742572784424, + "logps/chosen": -254.00807189941406, + "logps/rejected": -235.343017578125, + "loss": 0.3872, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6720791459083557, + "rewards/margins": 1.157099723815918, + "rewards/rejected": -1.829178810119629, + "step": 7638 + }, + { + "epoch": 0.89, + "learning_rate": 3.3908113853785283e-08, + "logits/chosen": -2.29545259475708, + "logits/rejected": -2.562448740005493, + "logps/chosen": -412.33258056640625, + "logps/rejected": -338.9202880859375, + "loss": 0.2462, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6225334405899048, + "rewards/margins": 1.9694113731384277, + "rewards/rejected": -3.591944932937622, + "step": 7639 + }, + { + "epoch": 0.89, + "learning_rate": 3.387268217786701e-08, + "logits/chosen": -2.3682260513305664, + "logits/rejected": -2.658961296081543, + "logps/chosen": -357.8442687988281, + "logps/rejected": -259.3177795410156, + "loss": 0.4341, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2375680208206177, + "rewards/margins": 1.7236268520355225, + "rewards/rejected": -2.9611947536468506, + "step": 7640 + }, + { + "epoch": 0.89, + "learning_rate": 3.383725050194874e-08, + "logits/chosen": -1.8723139762878418, + "logits/rejected": -1.9047759771347046, + "logps/chosen": -339.5200500488281, + "logps/rejected": -341.146240234375, + "loss": 0.3818, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.987215518951416, + "rewards/margins": 2.448420524597168, + "rewards/rejected": -3.435636043548584, + "step": 7641 + }, + { + "epoch": 0.89, + "learning_rate": 3.380181882603047e-08, + "logits/chosen": -1.908645749092102, + "logits/rejected": -2.1410751342773438, + "logps/chosen": -211.60110473632812, + "logps/rejected": -223.4034881591797, + "loss": 0.56, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1155683994293213, + "rewards/margins": 2.709200143814087, + "rewards/rejected": -3.824768543243408, + "step": 7642 + }, + { + "epoch": 0.89, + "learning_rate": 3.37663871501122e-08, + "logits/chosen": -2.039562702178955, + "logits/rejected": -1.8530409336090088, + "logps/chosen": -235.6786651611328, + "logps/rejected": -280.2018127441406, + "loss": 0.6977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5308732986450195, + "rewards/margins": 1.4828643798828125, + "rewards/rejected": -2.013737678527832, + "step": 7643 + }, + { + "epoch": 0.89, + "learning_rate": 3.373095547419393e-08, + "logits/chosen": -2.4679765701293945, + "logits/rejected": -2.1235852241516113, + "logps/chosen": -240.7939453125, + "logps/rejected": -321.9285888671875, + "loss": 0.2128, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3819037675857544, + "rewards/margins": 4.185276031494141, + "rewards/rejected": -4.5671796798706055, + "step": 7644 + }, + { + "epoch": 0.89, + "learning_rate": 3.3695523798275656e-08, + "logits/chosen": -2.2757949829101562, + "logits/rejected": -2.1363863945007324, + "logps/chosen": -324.9024353027344, + "logps/rejected": -299.1056823730469, + "loss": 0.0799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5528488159179688, + "rewards/margins": 3.4979281425476074, + "rewards/rejected": -4.050776481628418, + "step": 7645 + }, + { + "epoch": 0.89, + "learning_rate": 3.3660092122357385e-08, + "logits/chosen": -2.264565944671631, + "logits/rejected": -2.454514265060425, + "logps/chosen": -208.4196014404297, + "logps/rejected": -221.25445556640625, + "loss": 0.6829, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3173577785491943, + "rewards/margins": 0.96364426612854, + "rewards/rejected": -2.2810020446777344, + "step": 7646 + }, + { + "epoch": 0.89, + "learning_rate": 3.3624660446439113e-08, + "logits/chosen": -2.3445000648498535, + "logits/rejected": -2.757844924926758, + "logps/chosen": -414.34124755859375, + "logps/rejected": -213.8111572265625, + "loss": 0.4728, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5289493799209595, + "rewards/margins": 1.5690455436706543, + "rewards/rejected": -2.097994804382324, + "step": 7647 + }, + { + "epoch": 0.89, + "learning_rate": 3.358922877052085e-08, + "logits/chosen": -2.443530321121216, + "logits/rejected": -2.2405943870544434, + "logps/chosen": -247.65237426757812, + "logps/rejected": -348.1358947753906, + "loss": 0.3274, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7776471376419067, + "rewards/margins": 1.996127963066101, + "rewards/rejected": -3.773775100708008, + "step": 7648 + }, + { + "epoch": 0.89, + "learning_rate": 3.355379709460257e-08, + "logits/chosen": -2.3859331607818604, + "logits/rejected": -2.3480231761932373, + "logps/chosen": -199.93727111816406, + "logps/rejected": -227.84422302246094, + "loss": 0.2332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.715477466583252, + "rewards/margins": 2.982395887374878, + "rewards/rejected": -3.697873115539551, + "step": 7649 + }, + { + "epoch": 0.89, + "learning_rate": 3.35183654186843e-08, + "logits/chosen": -2.385646104812622, + "logits/rejected": -2.2885100841522217, + "logps/chosen": -353.9916687011719, + "logps/rejected": -238.59259033203125, + "loss": 0.7451, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0070385932922363, + "rewards/margins": 1.0721392631530762, + "rewards/rejected": -2.0791778564453125, + "step": 7650 + }, + { + "epoch": 0.89, + "learning_rate": 3.3482933742766035e-08, + "logits/chosen": -2.3410398960113525, + "logits/rejected": -2.3802735805511475, + "logps/chosen": -162.75714111328125, + "logps/rejected": -195.3233642578125, + "loss": 0.4383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14393427968025208, + "rewards/margins": 1.8067277669906616, + "rewards/rejected": -1.9506620168685913, + "step": 7651 + }, + { + "epoch": 0.89, + "learning_rate": 3.344750206684776e-08, + "logits/chosen": -2.1280322074890137, + "logits/rejected": -2.2760677337646484, + "logps/chosen": -327.3746032714844, + "logps/rejected": -245.4957733154297, + "loss": 0.2927, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5527987480163574, + "rewards/margins": 2.6305887699127197, + "rewards/rejected": -3.183387517929077, + "step": 7652 + }, + { + "epoch": 0.89, + "learning_rate": 3.341207039092949e-08, + "logits/chosen": -2.41930890083313, + "logits/rejected": -2.651841402053833, + "logps/chosen": -324.8143310546875, + "logps/rejected": -168.6737823486328, + "loss": 0.2135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7462116479873657, + "rewards/margins": 1.764349341392517, + "rewards/rejected": -2.510560989379883, + "step": 7653 + }, + { + "epoch": 0.89, + "learning_rate": 3.337663871501122e-08, + "logits/chosen": -2.5644845962524414, + "logits/rejected": -2.6529788970947266, + "logps/chosen": -306.45941162109375, + "logps/rejected": -308.3450012207031, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6012537479400635, + "rewards/margins": 3.1158299446105957, + "rewards/rejected": -3.71708345413208, + "step": 7654 + }, + { + "epoch": 0.89, + "learning_rate": 3.334120703909294e-08, + "logits/chosen": -1.8516416549682617, + "logits/rejected": -2.103665828704834, + "logps/chosen": -351.6147766113281, + "logps/rejected": -343.86761474609375, + "loss": 1.0257, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.269489049911499, + "rewards/margins": 1.1537859439849854, + "rewards/rejected": -2.4232749938964844, + "step": 7655 + }, + { + "epoch": 0.89, + "learning_rate": 3.330577536317468e-08, + "logits/chosen": -2.7640790939331055, + "logits/rejected": -2.690114736557007, + "logps/chosen": -187.6517333984375, + "logps/rejected": -223.4256134033203, + "loss": 0.4132, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8334189653396606, + "rewards/margins": 1.1216970682144165, + "rewards/rejected": -1.9551160335540771, + "step": 7656 + }, + { + "epoch": 0.89, + "learning_rate": 3.327034368725641e-08, + "logits/chosen": -2.6894359588623047, + "logits/rejected": -2.854762315750122, + "logps/chosen": -347.1199951171875, + "logps/rejected": -192.97671508789062, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6202476024627686, + "rewards/margins": 2.4951579570770264, + "rewards/rejected": -3.115406036376953, + "step": 7657 + }, + { + "epoch": 0.89, + "learning_rate": 3.3234912011338136e-08, + "logits/chosen": -2.6689212322235107, + "logits/rejected": -2.6596908569335938, + "logps/chosen": -271.7841796875, + "logps/rejected": -242.7332000732422, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7787849307060242, + "rewards/margins": 1.9692171812057495, + "rewards/rejected": -2.748002290725708, + "step": 7658 + }, + { + "epoch": 0.89, + "learning_rate": 3.3199480335419865e-08, + "logits/chosen": -2.9938547611236572, + "logits/rejected": -2.9006423950195312, + "logps/chosen": -232.12716674804688, + "logps/rejected": -178.9857177734375, + "loss": 0.2902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.483713299036026, + "rewards/margins": 1.7413876056671143, + "rewards/rejected": -2.2251009941101074, + "step": 7659 + }, + { + "epoch": 0.89, + "learning_rate": 3.3164048659501594e-08, + "logits/chosen": -2.8198580741882324, + "logits/rejected": -2.8341641426086426, + "logps/chosen": -236.49810791015625, + "logps/rejected": -251.93162536621094, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47168272733688354, + "rewards/margins": 3.270084857940674, + "rewards/rejected": -3.7417678833007812, + "step": 7660 + }, + { + "epoch": 0.89, + "learning_rate": 3.312861698358332e-08, + "logits/chosen": -2.931985378265381, + "logits/rejected": -2.9639294147491455, + "logps/chosen": -149.93771362304688, + "logps/rejected": -187.45413208007812, + "loss": 0.4082, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4805744886398315, + "rewards/margins": 1.3743395805358887, + "rewards/rejected": -2.8549141883850098, + "step": 7661 + }, + { + "epoch": 0.89, + "learning_rate": 3.309318530766505e-08, + "logits/chosen": -2.3765816688537598, + "logits/rejected": -2.2938594818115234, + "logps/chosen": -224.10992431640625, + "logps/rejected": -243.92904663085938, + "loss": 0.3319, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5071866512298584, + "rewards/margins": 2.3063862323760986, + "rewards/rejected": -2.813572883605957, + "step": 7662 + }, + { + "epoch": 0.89, + "learning_rate": 3.305775363174678e-08, + "logits/chosen": -2.4725093841552734, + "logits/rejected": -2.458439588546753, + "logps/chosen": -297.870361328125, + "logps/rejected": -369.881103515625, + "loss": 0.2813, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.297526478767395, + "rewards/margins": 2.308138370513916, + "rewards/rejected": -3.6056647300720215, + "step": 7663 + }, + { + "epoch": 0.89, + "learning_rate": 3.302232195582851e-08, + "logits/chosen": -2.761014223098755, + "logits/rejected": -2.7023820877075195, + "logps/chosen": -258.4473571777344, + "logps/rejected": -246.70248413085938, + "loss": 0.3099, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1458522081375122, + "rewards/margins": 3.363274097442627, + "rewards/rejected": -4.50912618637085, + "step": 7664 + }, + { + "epoch": 0.89, + "learning_rate": 3.298689027991024e-08, + "logits/chosen": -2.1015801429748535, + "logits/rejected": -1.864659070968628, + "logps/chosen": -246.90020751953125, + "logps/rejected": -285.5361022949219, + "loss": 0.5375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7182380557060242, + "rewards/margins": 1.512763500213623, + "rewards/rejected": -2.231001615524292, + "step": 7665 + }, + { + "epoch": 0.89, + "learning_rate": 3.2951458603991966e-08, + "logits/chosen": -1.7852157354354858, + "logits/rejected": -2.249724864959717, + "logps/chosen": -287.9789123535156, + "logps/rejected": -183.77792358398438, + "loss": 0.3572, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9650783538818359, + "rewards/margins": 1.6263879537582397, + "rewards/rejected": -2.591466188430786, + "step": 7666 + }, + { + "epoch": 0.89, + "learning_rate": 3.2916026928073695e-08, + "logits/chosen": -2.0630855560302734, + "logits/rejected": -1.758200764656067, + "logps/chosen": -277.74810791015625, + "logps/rejected": -362.7735290527344, + "loss": 0.2917, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1279546022415161, + "rewards/margins": 3.341498851776123, + "rewards/rejected": -3.2135443687438965, + "step": 7667 + }, + { + "epoch": 0.89, + "learning_rate": 3.2880595252155424e-08, + "logits/chosen": -2.110138177871704, + "logits/rejected": -2.3599557876586914, + "logps/chosen": -260.71832275390625, + "logps/rejected": -253.713623046875, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4029574394226074, + "rewards/margins": 2.2544403076171875, + "rewards/rejected": -3.657397508621216, + "step": 7668 + }, + { + "epoch": 0.89, + "learning_rate": 3.284516357623715e-08, + "logits/chosen": -2.3457024097442627, + "logits/rejected": -2.5956506729125977, + "logps/chosen": -520.1580810546875, + "logps/rejected": -455.50872802734375, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7137840390205383, + "rewards/margins": 3.9460339546203613, + "rewards/rejected": -4.659817695617676, + "step": 7669 + }, + { + "epoch": 0.89, + "learning_rate": 3.280973190031889e-08, + "logits/chosen": -1.896113634109497, + "logits/rejected": -2.3443267345428467, + "logps/chosen": -435.766357421875, + "logps/rejected": -247.95126342773438, + "loss": 0.8535, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4853274822235107, + "rewards/margins": 0.5584214329719543, + "rewards/rejected": -2.0437488555908203, + "step": 7670 + }, + { + "epoch": 0.89, + "learning_rate": 3.277430022440061e-08, + "logits/chosen": -2.1985597610473633, + "logits/rejected": -2.538966178894043, + "logps/chosen": -475.57501220703125, + "logps/rejected": -330.271484375, + "loss": 0.2831, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17180538177490234, + "rewards/margins": 2.6048200130462646, + "rewards/rejected": -2.776625394821167, + "step": 7671 + }, + { + "epoch": 0.89, + "learning_rate": 3.273886854848234e-08, + "logits/chosen": -2.060203790664673, + "logits/rejected": -1.928792953491211, + "logps/chosen": -351.61859130859375, + "logps/rejected": -290.73272705078125, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14719811081886292, + "rewards/margins": 1.8245810270309448, + "rewards/rejected": -1.9717791080474854, + "step": 7672 + }, + { + "epoch": 0.89, + "learning_rate": 3.2703436872564074e-08, + "logits/chosen": -2.4832000732421875, + "logits/rejected": -2.5773651599884033, + "logps/chosen": -233.0186767578125, + "logps/rejected": -383.2530517578125, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08504168689250946, + "rewards/margins": 2.4097607135772705, + "rewards/rejected": -2.494802474975586, + "step": 7673 + }, + { + "epoch": 0.89, + "learning_rate": 3.2668005196645796e-08, + "logits/chosen": -1.7958588600158691, + "logits/rejected": -1.9948673248291016, + "logps/chosen": -523.9320068359375, + "logps/rejected": -426.4649658203125, + "loss": 0.24, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3226063847541809, + "rewards/margins": 2.351445198059082, + "rewards/rejected": -2.6740517616271973, + "step": 7674 + }, + { + "epoch": 0.89, + "learning_rate": 3.263257352072753e-08, + "logits/chosen": -2.020905017852783, + "logits/rejected": -2.3196091651916504, + "logps/chosen": -468.1844177246094, + "logps/rejected": -355.77301025390625, + "loss": 0.4032, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5732598304748535, + "rewards/margins": 1.1472797393798828, + "rewards/rejected": -2.7205395698547363, + "step": 7675 + }, + { + "epoch": 0.89, + "learning_rate": 3.259714184480926e-08, + "logits/chosen": -1.8211593627929688, + "logits/rejected": -1.6197009086608887, + "logps/chosen": -354.3990783691406, + "logps/rejected": -478.8633728027344, + "loss": 0.2456, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16968747973442078, + "rewards/margins": 2.9078164100646973, + "rewards/rejected": -2.738128662109375, + "step": 7676 + }, + { + "epoch": 0.89, + "learning_rate": 3.256171016889098e-08, + "logits/chosen": -2.373667001724243, + "logits/rejected": -2.11265230178833, + "logps/chosen": -183.045654296875, + "logps/rejected": -152.43719482421875, + "loss": 0.7171, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5732691287994385, + "rewards/margins": 0.27703166007995605, + "rewards/rejected": -1.8503007888793945, + "step": 7677 + }, + { + "epoch": 0.89, + "learning_rate": 3.252627849297272e-08, + "logits/chosen": -2.786393642425537, + "logits/rejected": -2.772739887237549, + "logps/chosen": -399.50152587890625, + "logps/rejected": -282.0231628417969, + "loss": 0.2339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5837604999542236, + "rewards/margins": 2.7103981971740723, + "rewards/rejected": -3.294158458709717, + "step": 7678 + }, + { + "epoch": 0.89, + "learning_rate": 3.2490846817054446e-08, + "logits/chosen": -2.1490118503570557, + "logits/rejected": -2.0671441555023193, + "logps/chosen": -396.83148193359375, + "logps/rejected": -245.38018798828125, + "loss": 0.3938, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.594802975654602, + "rewards/margins": 2.2088875770568848, + "rewards/rejected": -3.8036904335021973, + "step": 7679 + }, + { + "epoch": 0.89, + "learning_rate": 3.2455415141136175e-08, + "logits/chosen": -2.4950942993164062, + "logits/rejected": -2.6559665203094482, + "logps/chosen": -253.08169555664062, + "logps/rejected": -123.19444274902344, + "loss": 0.4677, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3046741485595703, + "rewards/margins": 0.8503532409667969, + "rewards/rejected": -2.155027389526367, + "step": 7680 + }, + { + "epoch": 0.89, + "learning_rate": 3.2419983465217904e-08, + "logits/chosen": -1.454131841659546, + "logits/rejected": -1.5898804664611816, + "logps/chosen": -299.0245361328125, + "logps/rejected": -333.68988037109375, + "loss": 0.3098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7924443483352661, + "rewards/margins": 1.945871353149414, + "rewards/rejected": -2.7383158206939697, + "step": 7681 + }, + { + "epoch": 0.89, + "learning_rate": 3.238455178929963e-08, + "logits/chosen": -2.1918280124664307, + "logits/rejected": -2.203430652618408, + "logps/chosen": -546.2197265625, + "logps/rejected": -328.8348388671875, + "loss": 0.2269, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4717222452163696, + "rewards/margins": 2.10371732711792, + "rewards/rejected": -3.575439453125, + "step": 7682 + }, + { + "epoch": 0.89, + "learning_rate": 3.234912011338136e-08, + "logits/chosen": -2.4558475017547607, + "logits/rejected": -2.487470865249634, + "logps/chosen": -479.5130310058594, + "logps/rejected": -312.52593994140625, + "loss": 0.3714, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6165217161178589, + "rewards/margins": 2.8955581188201904, + "rewards/rejected": -3.512079954147339, + "step": 7683 + }, + { + "epoch": 0.89, + "learning_rate": 3.231368843746309e-08, + "logits/chosen": -2.5036206245422363, + "logits/rejected": -2.8502919673919678, + "logps/chosen": -248.83132934570312, + "logps/rejected": -274.6932678222656, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9348282217979431, + "rewards/margins": 2.7907192707061768, + "rewards/rejected": -3.7255475521087646, + "step": 7684 + }, + { + "epoch": 0.89, + "learning_rate": 3.227825676154482e-08, + "logits/chosen": -2.4815752506256104, + "logits/rejected": -2.6117172241210938, + "logps/chosen": -410.2247314453125, + "logps/rejected": -391.9031677246094, + "loss": 0.5225, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.161960482597351, + "rewards/margins": 1.9557812213897705, + "rewards/rejected": -3.117741823196411, + "step": 7685 + }, + { + "epoch": 0.89, + "learning_rate": 3.224282508562655e-08, + "logits/chosen": -2.4186229705810547, + "logits/rejected": -2.4158034324645996, + "logps/chosen": -85.47789001464844, + "logps/rejected": -159.77943420410156, + "loss": 0.6325, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.376399278640747, + "rewards/margins": 2.5285327434539795, + "rewards/rejected": -3.9049320220947266, + "step": 7686 + }, + { + "epoch": 0.89, + "learning_rate": 3.2207393409708276e-08, + "logits/chosen": -2.766472578048706, + "logits/rejected": -2.4907023906707764, + "logps/chosen": -260.208740234375, + "logps/rejected": -223.1270294189453, + "loss": 0.3246, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7621030807495117, + "rewards/margins": 2.6078195571899414, + "rewards/rejected": -3.369922637939453, + "step": 7687 + }, + { + "epoch": 0.89, + "learning_rate": 3.2171961733790005e-08, + "logits/chosen": -2.0546655654907227, + "logits/rejected": -1.9740486145019531, + "logps/chosen": -143.8533477783203, + "logps/rejected": -192.94357299804688, + "loss": 0.4871, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3174545764923096, + "rewards/margins": 0.7432514429092407, + "rewards/rejected": -2.06070613861084, + "step": 7688 + }, + { + "epoch": 0.89, + "learning_rate": 3.213653005787174e-08, + "logits/chosen": -2.282803773880005, + "logits/rejected": -2.2564902305603027, + "logps/chosen": -403.13470458984375, + "logps/rejected": -364.0293884277344, + "loss": 0.4573, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6409841775894165, + "rewards/margins": 2.1511499881744385, + "rewards/rejected": -2.7921340465545654, + "step": 7689 + }, + { + "epoch": 0.89, + "learning_rate": 3.210109838195346e-08, + "logits/chosen": -2.4727511405944824, + "logits/rejected": -2.472590923309326, + "logps/chosen": -175.98504638671875, + "logps/rejected": -165.74166870117188, + "loss": 0.3011, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8442423343658447, + "rewards/margins": 1.7452383041381836, + "rewards/rejected": -2.5894806385040283, + "step": 7690 + }, + { + "epoch": 0.89, + "learning_rate": 3.206566670603519e-08, + "logits/chosen": -2.5262985229492188, + "logits/rejected": -2.464149236679077, + "logps/chosen": -256.99908447265625, + "logps/rejected": -251.1205596923828, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9181716442108154, + "rewards/margins": 2.593554973602295, + "rewards/rejected": -3.5117263793945312, + "step": 7691 + }, + { + "epoch": 0.89, + "learning_rate": 3.2030235030116927e-08, + "logits/chosen": -1.8887157440185547, + "logits/rejected": -2.0584282875061035, + "logps/chosen": -230.34088134765625, + "logps/rejected": -233.03871154785156, + "loss": 0.768, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3511770963668823, + "rewards/margins": 1.249011754989624, + "rewards/rejected": -2.600188732147217, + "step": 7692 + }, + { + "epoch": 0.89, + "learning_rate": 3.199480335419865e-08, + "logits/chosen": -2.304762840270996, + "logits/rejected": -2.536038398742676, + "logps/chosen": -204.12928771972656, + "logps/rejected": -313.2115478515625, + "loss": 0.2626, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3411281108856201, + "rewards/margins": 2.7864460945129395, + "rewards/rejected": -3.1275739669799805, + "step": 7693 + }, + { + "epoch": 0.9, + "learning_rate": 3.1959371678280384e-08, + "logits/chosen": -2.753767728805542, + "logits/rejected": -2.6344051361083984, + "logps/chosen": -320.63800048828125, + "logps/rejected": -294.47784423828125, + "loss": 0.6327, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23476015031337738, + "rewards/margins": 0.9915107488632202, + "rewards/rejected": -1.2262707948684692, + "step": 7694 + }, + { + "epoch": 0.9, + "learning_rate": 3.192394000236211e-08, + "logits/chosen": -2.4077565670013428, + "logits/rejected": -2.5633111000061035, + "logps/chosen": -356.4228210449219, + "logps/rejected": -191.31298828125, + "loss": 0.2176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17679202556610107, + "rewards/margins": 1.930654764175415, + "rewards/rejected": -1.753862738609314, + "step": 7695 + }, + { + "epoch": 0.9, + "learning_rate": 3.1888508326443835e-08, + "logits/chosen": -2.474569797515869, + "logits/rejected": -2.5959839820861816, + "logps/chosen": -273.33428955078125, + "logps/rejected": -185.90911865234375, + "loss": 0.1758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.325956255197525, + "rewards/margins": 2.4216866493225098, + "rewards/rejected": -2.747642993927002, + "step": 7696 + }, + { + "epoch": 0.9, + "learning_rate": 3.185307665052557e-08, + "logits/chosen": -2.06085205078125, + "logits/rejected": -2.012091875076294, + "logps/chosen": -230.58041381835938, + "logps/rejected": -289.5764465332031, + "loss": 1.3405, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.52175772190094, + "rewards/margins": -0.4128458499908447, + "rewards/rejected": -1.1089118719100952, + "step": 7697 + }, + { + "epoch": 0.9, + "learning_rate": 3.18176449746073e-08, + "logits/chosen": -2.3191304206848145, + "logits/rejected": -2.2596113681793213, + "logps/chosen": -238.52813720703125, + "logps/rejected": -247.52743530273438, + "loss": 0.2378, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25012269616127014, + "rewards/margins": 2.1630523204803467, + "rewards/rejected": -2.413175106048584, + "step": 7698 + }, + { + "epoch": 0.9, + "learning_rate": 3.178221329868903e-08, + "logits/chosen": -1.945847988128662, + "logits/rejected": -2.3941879272460938, + "logps/chosen": -423.27508544921875, + "logps/rejected": -291.5628662109375, + "loss": 0.4376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9221624135971069, + "rewards/margins": 1.828317403793335, + "rewards/rejected": -2.7504796981811523, + "step": 7699 + }, + { + "epoch": 0.9, + "learning_rate": 3.1746781622770756e-08, + "logits/chosen": -2.4278039932250977, + "logits/rejected": -2.5137217044830322, + "logps/chosen": -329.341064453125, + "logps/rejected": -266.0711364746094, + "loss": 0.3671, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9025762677192688, + "rewards/margins": 1.5573179721832275, + "rewards/rejected": -2.4598941802978516, + "step": 7700 + }, + { + "epoch": 0.9, + "learning_rate": 3.1711349946852485e-08, + "logits/chosen": -2.5545101165771484, + "logits/rejected": -2.583695411682129, + "logps/chosen": -150.54376220703125, + "logps/rejected": -205.59860229492188, + "loss": 0.5287, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8792843222618103, + "rewards/margins": 1.3776190280914307, + "rewards/rejected": -2.256903648376465, + "step": 7701 + }, + { + "epoch": 0.9, + "learning_rate": 3.1675918270934214e-08, + "logits/chosen": -2.8962819576263428, + "logits/rejected": -2.9400529861450195, + "logps/chosen": -336.44317626953125, + "logps/rejected": -392.6435852050781, + "loss": 0.5336, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.761339008808136, + "rewards/margins": 1.4799129962921143, + "rewards/rejected": -2.2412519454956055, + "step": 7702 + }, + { + "epoch": 0.9, + "learning_rate": 3.164048659501594e-08, + "logits/chosen": -2.3159239292144775, + "logits/rejected": -2.376765012741089, + "logps/chosen": -409.47955322265625, + "logps/rejected": -289.60595703125, + "loss": 0.6077, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5045260190963745, + "rewards/margins": 1.6920639276504517, + "rewards/rejected": -3.196589946746826, + "step": 7703 + }, + { + "epoch": 0.9, + "learning_rate": 3.160505491909767e-08, + "logits/chosen": -2.2325756549835205, + "logits/rejected": -2.2492318153381348, + "logps/chosen": -226.4979705810547, + "logps/rejected": -271.08416748046875, + "loss": 0.124, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0885065346956253, + "rewards/margins": 3.9135594367980957, + "rewards/rejected": -3.8250532150268555, + "step": 7704 + }, + { + "epoch": 0.9, + "learning_rate": 3.15696232431794e-08, + "logits/chosen": -2.6708874702453613, + "logits/rejected": -2.686033248901367, + "logps/chosen": -308.9173583984375, + "logps/rejected": -272.8843078613281, + "loss": 0.6082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9487732648849487, + "rewards/margins": 0.7681126594543457, + "rewards/rejected": -1.7168858051300049, + "step": 7705 + }, + { + "epoch": 0.9, + "learning_rate": 3.153419156726113e-08, + "logits/chosen": -1.3865489959716797, + "logits/rejected": -1.4530807733535767, + "logps/chosen": -285.681396484375, + "logps/rejected": -307.33258056640625, + "loss": 0.2531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.252589225769043, + "rewards/margins": 3.6025233268737793, + "rewards/rejected": -4.855112075805664, + "step": 7706 + }, + { + "epoch": 0.9, + "learning_rate": 3.149875989134286e-08, + "logits/chosen": -2.062410593032837, + "logits/rejected": -2.1403391361236572, + "logps/chosen": -439.2113342285156, + "logps/rejected": -522.3545532226562, + "loss": 0.8571, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5995011329650879, + "rewards/margins": 0.6146401166915894, + "rewards/rejected": -1.2141412496566772, + "step": 7707 + }, + { + "epoch": 0.9, + "learning_rate": 3.1463328215424586e-08, + "logits/chosen": -1.9725115299224854, + "logits/rejected": -1.919339656829834, + "logps/chosen": -295.0091552734375, + "logps/rejected": -266.1778564453125, + "loss": 0.2097, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.180082306265831, + "rewards/margins": 3.3249170780181885, + "rewards/rejected": -3.5049996376037598, + "step": 7708 + }, + { + "epoch": 0.9, + "learning_rate": 3.1427896539506315e-08, + "logits/chosen": -1.8328361511230469, + "logits/rejected": -1.9955811500549316, + "logps/chosen": -277.7185363769531, + "logps/rejected": -328.88580322265625, + "loss": 0.4601, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.825038492679596, + "rewards/margins": 1.0241550207138062, + "rewards/rejected": -1.8491935729980469, + "step": 7709 + }, + { + "epoch": 0.9, + "learning_rate": 3.1392464863588044e-08, + "logits/chosen": -2.7796478271484375, + "logits/rejected": -2.4764463901519775, + "logps/chosen": -289.9639892578125, + "logps/rejected": -395.9324035644531, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.604250431060791, + "rewards/margins": 2.898554563522339, + "rewards/rejected": -3.502805233001709, + "step": 7710 + }, + { + "epoch": 0.9, + "learning_rate": 3.135703318766978e-08, + "logits/chosen": -2.3143067359924316, + "logits/rejected": -2.2599239349365234, + "logps/chosen": -328.349853515625, + "logps/rejected": -383.67962646484375, + "loss": 0.3402, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3592385053634644, + "rewards/margins": 2.367685556411743, + "rewards/rejected": -3.726924419403076, + "step": 7711 + }, + { + "epoch": 0.9, + "learning_rate": 3.13216015117515e-08, + "logits/chosen": -2.427824020385742, + "logits/rejected": -2.3860466480255127, + "logps/chosen": -246.85569763183594, + "logps/rejected": -212.95767211914062, + "loss": 0.5264, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9636602997779846, + "rewards/margins": 1.4756860733032227, + "rewards/rejected": -2.4393463134765625, + "step": 7712 + }, + { + "epoch": 0.9, + "learning_rate": 3.128616983583323e-08, + "logits/chosen": -2.8860116004943848, + "logits/rejected": -2.9068150520324707, + "logps/chosen": -237.21530151367188, + "logps/rejected": -248.81024169921875, + "loss": 0.4057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5813058018684387, + "rewards/margins": 1.8793487548828125, + "rewards/rejected": -2.4606542587280273, + "step": 7713 + }, + { + "epoch": 0.9, + "learning_rate": 3.1250738159914965e-08, + "logits/chosen": -2.8352668285369873, + "logits/rejected": -2.840061902999878, + "logps/chosen": -385.05279541015625, + "logps/rejected": -292.4405822753906, + "loss": 0.2905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9941176772117615, + "rewards/margins": 2.370908737182617, + "rewards/rejected": -3.3650264739990234, + "step": 7714 + }, + { + "epoch": 0.9, + "learning_rate": 3.1215306483996694e-08, + "logits/chosen": -2.630293607711792, + "logits/rejected": -2.560131072998047, + "logps/chosen": -455.25555419921875, + "logps/rejected": -193.710205078125, + "loss": 0.2912, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7224465012550354, + "rewards/margins": 1.6022238731384277, + "rewards/rejected": -2.3246703147888184, + "step": 7715 + }, + { + "epoch": 0.9, + "learning_rate": 3.117987480807842e-08, + "logits/chosen": -2.578735113143921, + "logits/rejected": -2.6199350357055664, + "logps/chosen": -216.69082641601562, + "logps/rejected": -169.00830078125, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47047513723373413, + "rewards/margins": 2.0335986614227295, + "rewards/rejected": -2.5040738582611084, + "step": 7716 + }, + { + "epoch": 0.9, + "learning_rate": 3.114444313216015e-08, + "logits/chosen": -2.3303771018981934, + "logits/rejected": -2.497947931289673, + "logps/chosen": -324.0134582519531, + "logps/rejected": -263.43499755859375, + "loss": 0.6015, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3786022663116455, + "rewards/margins": 0.44372329115867615, + "rewards/rejected": -1.822325587272644, + "step": 7717 + }, + { + "epoch": 0.9, + "learning_rate": 3.110901145624188e-08, + "logits/chosen": -1.850367784500122, + "logits/rejected": -2.1428844928741455, + "logps/chosen": -224.75975036621094, + "logps/rejected": -193.00807189941406, + "loss": 0.2547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9381294846534729, + "rewards/margins": 1.828981637954712, + "rewards/rejected": -2.76711106300354, + "step": 7718 + }, + { + "epoch": 0.9, + "learning_rate": 3.107357978032361e-08, + "logits/chosen": -2.212958574295044, + "logits/rejected": -2.294132947921753, + "logps/chosen": -216.19674682617188, + "logps/rejected": -282.4936218261719, + "loss": 0.3646, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8140599727630615, + "rewards/margins": 1.5400123596191406, + "rewards/rejected": -2.354072093963623, + "step": 7719 + }, + { + "epoch": 0.9, + "learning_rate": 3.103814810440534e-08, + "logits/chosen": -2.7201569080352783, + "logits/rejected": -2.369631767272949, + "logps/chosen": -214.79458618164062, + "logps/rejected": -394.34307861328125, + "loss": 0.5078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9077961444854736, + "rewards/margins": 2.902552843093872, + "rewards/rejected": -3.810349225997925, + "step": 7720 + }, + { + "epoch": 0.9, + "learning_rate": 3.1002716428487067e-08, + "logits/chosen": -1.8970357179641724, + "logits/rejected": -1.9875106811523438, + "logps/chosen": -178.24368286132812, + "logps/rejected": -179.4664764404297, + "loss": 0.6482, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2592839002609253, + "rewards/margins": 0.49247926473617554, + "rewards/rejected": -0.751763105392456, + "step": 7721 + }, + { + "epoch": 0.9, + "learning_rate": 3.0967284752568795e-08, + "logits/chosen": -2.3443636894226074, + "logits/rejected": -2.330768585205078, + "logps/chosen": -313.458740234375, + "logps/rejected": -256.6027526855469, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4268263876438141, + "rewards/margins": 1.8772671222686768, + "rewards/rejected": -2.304093360900879, + "step": 7722 + }, + { + "epoch": 0.9, + "learning_rate": 3.0931853076650524e-08, + "logits/chosen": -1.7862917184829712, + "logits/rejected": -1.865889549255371, + "logps/chosen": -299.85302734375, + "logps/rejected": -416.4881286621094, + "loss": 0.5282, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8260456323623657, + "rewards/margins": 2.555398464202881, + "rewards/rejected": -4.381443977355957, + "step": 7723 + }, + { + "epoch": 0.9, + "learning_rate": 3.089642140073225e-08, + "logits/chosen": -1.8699270486831665, + "logits/rejected": -1.8416907787322998, + "logps/chosen": -393.93292236328125, + "logps/rejected": -305.73565673828125, + "loss": 0.7024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33216023445129395, + "rewards/margins": 1.9714853763580322, + "rewards/rejected": -2.3036458492279053, + "step": 7724 + }, + { + "epoch": 0.9, + "learning_rate": 3.086098972481398e-08, + "logits/chosen": -1.9424307346343994, + "logits/rejected": -1.8300082683563232, + "logps/chosen": -245.59762573242188, + "logps/rejected": -316.1534118652344, + "loss": 0.8619, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7061697244644165, + "rewards/margins": 0.6689471006393433, + "rewards/rejected": -2.3751168251037598, + "step": 7725 + }, + { + "epoch": 0.9, + "learning_rate": 3.082555804889571e-08, + "logits/chosen": -1.8793165683746338, + "logits/rejected": -2.098295211791992, + "logps/chosen": -516.914794921875, + "logps/rejected": -354.364990234375, + "loss": 0.2848, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4344823360443115, + "rewards/margins": 2.1833534240722656, + "rewards/rejected": -3.6178359985351562, + "step": 7726 + }, + { + "epoch": 0.9, + "learning_rate": 3.079012637297744e-08, + "logits/chosen": -2.2876880168914795, + "logits/rejected": -2.039128065109253, + "logps/chosen": -324.7662353515625, + "logps/rejected": -390.26214599609375, + "loss": 0.6458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6979549527168274, + "rewards/margins": 1.1160484552383423, + "rewards/rejected": -1.8140032291412354, + "step": 7727 + }, + { + "epoch": 0.9, + "learning_rate": 3.075469469705917e-08, + "logits/chosen": -2.5558853149414062, + "logits/rejected": -2.8291101455688477, + "logps/chosen": -300.22039794921875, + "logps/rejected": -271.5733947753906, + "loss": 0.1984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8689141869544983, + "rewards/margins": 4.188506126403809, + "rewards/rejected": -5.057420253753662, + "step": 7728 + }, + { + "epoch": 0.9, + "learning_rate": 3.0719263021140897e-08, + "logits/chosen": -1.8045103549957275, + "logits/rejected": -1.456221580505371, + "logps/chosen": -349.3807373046875, + "logps/rejected": -455.7493896484375, + "loss": 0.5373, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.446503221988678, + "rewards/margins": 2.0919976234436035, + "rewards/rejected": -2.538501024246216, + "step": 7729 + }, + { + "epoch": 0.9, + "learning_rate": 3.0683831345222625e-08, + "logits/chosen": -2.5353922843933105, + "logits/rejected": -2.7000842094421387, + "logps/chosen": -321.3793029785156, + "logps/rejected": -289.55181884765625, + "loss": 0.1452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19543391466140747, + "rewards/margins": 2.2365317344665527, + "rewards/rejected": -2.041097640991211, + "step": 7730 + }, + { + "epoch": 0.9, + "learning_rate": 3.0648399669304354e-08, + "logits/chosen": -2.8129920959472656, + "logits/rejected": -2.851372480392456, + "logps/chosen": -244.62733459472656, + "logps/rejected": -324.7100830078125, + "loss": 0.1196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.676973819732666, + "rewards/margins": 3.6947643756866455, + "rewards/rejected": -4.371737957000732, + "step": 7731 + }, + { + "epoch": 0.9, + "learning_rate": 3.061296799338608e-08, + "logits/chosen": -2.2964160442352295, + "logits/rejected": -2.0522241592407227, + "logps/chosen": -204.27105712890625, + "logps/rejected": -198.008544921875, + "loss": 0.6387, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0785739421844482, + "rewards/margins": 0.6751055717468262, + "rewards/rejected": -1.7536795139312744, + "step": 7732 + }, + { + "epoch": 0.9, + "learning_rate": 3.057753631746782e-08, + "logits/chosen": -2.0741875171661377, + "logits/rejected": -2.0862209796905518, + "logps/chosen": -216.65496826171875, + "logps/rejected": -300.555419921875, + "loss": 0.6198, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0175130367279053, + "rewards/margins": 0.8269424438476562, + "rewards/rejected": -1.8444554805755615, + "step": 7733 + }, + { + "epoch": 0.9, + "learning_rate": 3.054210464154955e-08, + "logits/chosen": -2.228144884109497, + "logits/rejected": -2.152557611465454, + "logps/chosen": -286.3065185546875, + "logps/rejected": -376.6913146972656, + "loss": 1.2972, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2046055793762207, + "rewards/margins": -0.22007551789283752, + "rewards/rejected": -1.9845302104949951, + "step": 7734 + }, + { + "epoch": 0.9, + "learning_rate": 3.050667296563127e-08, + "logits/chosen": -2.1313629150390625, + "logits/rejected": -2.0979037284851074, + "logps/chosen": -216.67672729492188, + "logps/rejected": -340.4228210449219, + "loss": 0.5765, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1394686698913574, + "rewards/margins": 2.801581859588623, + "rewards/rejected": -4.9410505294799805, + "step": 7735 + }, + { + "epoch": 0.9, + "learning_rate": 3.0471241289713004e-08, + "logits/chosen": -2.3868799209594727, + "logits/rejected": -2.6906204223632812, + "logps/chosen": -418.4458923339844, + "logps/rejected": -235.22140502929688, + "loss": 0.337, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1503205299377441, + "rewards/margins": 1.3690664768218994, + "rewards/rejected": -2.5193872451782227, + "step": 7736 + }, + { + "epoch": 0.9, + "learning_rate": 3.043580961379473e-08, + "logits/chosen": -2.108027458190918, + "logits/rejected": -2.2079005241394043, + "logps/chosen": -311.6959533691406, + "logps/rejected": -285.4506530761719, + "loss": 4.7369, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.372298240661621, + "rewards/margins": -1.764843225479126, + "rewards/rejected": -3.607455015182495, + "step": 7737 + }, + { + "epoch": 0.9, + "learning_rate": 3.040037793787646e-08, + "logits/chosen": -2.4244465827941895, + "logits/rejected": -2.312498092651367, + "logps/chosen": -234.98977661132812, + "logps/rejected": -252.34422302246094, + "loss": 0.4269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6346054077148438, + "rewards/margins": 2.9261088371276855, + "rewards/rejected": -3.5607142448425293, + "step": 7738 + }, + { + "epoch": 0.9, + "learning_rate": 3.036494626195819e-08, + "logits/chosen": -2.064203977584839, + "logits/rejected": -2.0394821166992188, + "logps/chosen": -229.3427276611328, + "logps/rejected": -254.47433471679688, + "loss": 0.3944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.996541440486908, + "rewards/margins": 3.8487119674682617, + "rewards/rejected": -4.8452534675598145, + "step": 7739 + }, + { + "epoch": 0.9, + "learning_rate": 3.032951458603992e-08, + "logits/chosen": -2.5769548416137695, + "logits/rejected": -2.9213035106658936, + "logps/chosen": -251.8530731201172, + "logps/rejected": -202.98532104492188, + "loss": 0.3098, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.129786729812622, + "rewards/margins": 2.6989612579345703, + "rewards/rejected": -3.8287482261657715, + "step": 7740 + }, + { + "epoch": 0.9, + "learning_rate": 3.029408291012165e-08, + "logits/chosen": -1.5221531391143799, + "logits/rejected": -2.0079238414764404, + "logps/chosen": -469.9354248046875, + "logps/rejected": -372.0657958984375, + "loss": 0.3998, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45569857954978943, + "rewards/margins": 2.3955304622650146, + "rewards/rejected": -2.851228952407837, + "step": 7741 + }, + { + "epoch": 0.9, + "learning_rate": 3.025865123420338e-08, + "logits/chosen": -1.8487194776535034, + "logits/rejected": -2.0471534729003906, + "logps/chosen": -383.13214111328125, + "logps/rejected": -249.89935302734375, + "loss": 0.5481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4702525734901428, + "rewards/margins": 1.6733365058898926, + "rewards/rejected": -2.1435890197753906, + "step": 7742 + }, + { + "epoch": 0.9, + "learning_rate": 3.0223219558285106e-08, + "logits/chosen": -2.3251304626464844, + "logits/rejected": -2.3829345703125, + "logps/chosen": -403.6244201660156, + "logps/rejected": -433.0500793457031, + "loss": 0.736, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9047062397003174, + "rewards/margins": 1.41823410987854, + "rewards/rejected": -2.3229403495788574, + "step": 7743 + }, + { + "epoch": 0.9, + "learning_rate": 3.0187787882366834e-08, + "logits/chosen": -1.660660982131958, + "logits/rejected": -1.8386932611465454, + "logps/chosen": -512.203369140625, + "logps/rejected": -333.9020690917969, + "loss": 0.4088, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.765757143497467, + "rewards/margins": 2.746391773223877, + "rewards/rejected": -3.5121490955352783, + "step": 7744 + }, + { + "epoch": 0.9, + "learning_rate": 3.015235620644856e-08, + "logits/chosen": -2.025972366333008, + "logits/rejected": -2.0235674381256104, + "logps/chosen": -301.65557861328125, + "logps/rejected": -271.4912109375, + "loss": 0.2154, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0503520965576172, + "rewards/margins": 1.9604213237762451, + "rewards/rejected": -3.0107734203338623, + "step": 7745 + }, + { + "epoch": 0.9, + "learning_rate": 3.011692453053029e-08, + "logits/chosen": -2.3795154094696045, + "logits/rejected": -2.265852928161621, + "logps/chosen": -345.14227294921875, + "logps/rejected": -384.7630615234375, + "loss": 0.3236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4827134609222412, + "rewards/margins": 1.868208885192871, + "rewards/rejected": -3.3509223461151123, + "step": 7746 + }, + { + "epoch": 0.9, + "learning_rate": 3.008149285461202e-08, + "logits/chosen": -2.6126670837402344, + "logits/rejected": -2.4469966888427734, + "logps/chosen": -152.08448791503906, + "logps/rejected": -189.40301513671875, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03326214849948883, + "rewards/margins": 3.6540136337280273, + "rewards/rejected": -3.62075138092041, + "step": 7747 + }, + { + "epoch": 0.9, + "learning_rate": 3.004606117869375e-08, + "logits/chosen": -2.138805389404297, + "logits/rejected": -2.268827199935913, + "logps/chosen": -235.92210388183594, + "logps/rejected": -217.8069305419922, + "loss": 0.5862, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9416705369949341, + "rewards/margins": 0.8405090570449829, + "rewards/rejected": -1.782179594039917, + "step": 7748 + }, + { + "epoch": 0.9, + "learning_rate": 3.001062950277548e-08, + "logits/chosen": -2.421022415161133, + "logits/rejected": -2.1818885803222656, + "logps/chosen": -350.6033935546875, + "logps/rejected": -368.2622375488281, + "loss": 0.1399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8658220171928406, + "rewards/margins": 3.4829742908477783, + "rewards/rejected": -4.348796367645264, + "step": 7749 + }, + { + "epoch": 0.9, + "learning_rate": 2.997519782685721e-08, + "logits/chosen": -2.6439476013183594, + "logits/rejected": -2.6982502937316895, + "logps/chosen": -362.3180236816406, + "logps/rejected": -352.5337219238281, + "loss": 0.1865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8907387256622314, + "rewards/margins": 3.9270272254943848, + "rewards/rejected": -4.817766189575195, + "step": 7750 + }, + { + "epoch": 0.9, + "learning_rate": 2.9939766150938936e-08, + "logits/chosen": -2.28498911857605, + "logits/rejected": -2.1237874031066895, + "logps/chosen": -319.8314514160156, + "logps/rejected": -432.989501953125, + "loss": 0.4647, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7099806070327759, + "rewards/margins": 4.497669696807861, + "rewards/rejected": -5.207650184631348, + "step": 7751 + }, + { + "epoch": 0.9, + "learning_rate": 2.990433447502067e-08, + "logits/chosen": -2.669466257095337, + "logits/rejected": -2.5746665000915527, + "logps/chosen": -123.31913757324219, + "logps/rejected": -186.71994018554688, + "loss": 0.3553, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.840905487537384, + "rewards/margins": 1.9236035346984863, + "rewards/rejected": -2.7645092010498047, + "step": 7752 + }, + { + "epoch": 0.9, + "learning_rate": 2.98689027991024e-08, + "logits/chosen": -2.7125349044799805, + "logits/rejected": -2.657027244567871, + "logps/chosen": -301.6799011230469, + "logps/rejected": -322.27642822265625, + "loss": 0.3429, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2537753582000732, + "rewards/margins": 2.953031063079834, + "rewards/rejected": -4.206806182861328, + "step": 7753 + }, + { + "epoch": 0.9, + "learning_rate": 2.983347112318412e-08, + "logits/chosen": -2.0172929763793945, + "logits/rejected": -2.154832363128662, + "logps/chosen": -234.68484497070312, + "logps/rejected": -270.33251953125, + "loss": 0.357, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5685906410217285, + "rewards/margins": 2.0278334617614746, + "rewards/rejected": -2.596424102783203, + "step": 7754 + }, + { + "epoch": 0.9, + "learning_rate": 2.9798039447265854e-08, + "logits/chosen": -2.300719738006592, + "logits/rejected": -2.577850580215454, + "logps/chosen": -244.39242553710938, + "logps/rejected": -161.05502319335938, + "loss": 0.3363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9006334543228149, + "rewards/margins": 1.58696711063385, + "rewards/rejected": -2.487600564956665, + "step": 7755 + }, + { + "epoch": 0.9, + "learning_rate": 2.9762607771347586e-08, + "logits/chosen": -1.8488723039627075, + "logits/rejected": -2.1296777725219727, + "logps/chosen": -415.63336181640625, + "logps/rejected": -164.8297119140625, + "loss": 0.8275, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.390108346939087, + "rewards/margins": -0.09355072677135468, + "rewards/rejected": -1.2965576648712158, + "step": 7756 + }, + { + "epoch": 0.9, + "learning_rate": 2.972717609542931e-08, + "logits/chosen": -2.0294995307922363, + "logits/rejected": -2.3420538902282715, + "logps/chosen": -545.653076171875, + "logps/rejected": -336.63189697265625, + "loss": 0.2423, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5434222221374512, + "rewards/margins": 2.448622703552246, + "rewards/rejected": -2.9920451641082764, + "step": 7757 + }, + { + "epoch": 0.9, + "learning_rate": 2.9691744419511043e-08, + "logits/chosen": -2.561631441116333, + "logits/rejected": -2.2508046627044678, + "logps/chosen": -315.4842529296875, + "logps/rejected": -324.127685546875, + "loss": 0.3966, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9244118332862854, + "rewards/margins": 1.3542728424072266, + "rewards/rejected": -2.278684616088867, + "step": 7758 + }, + { + "epoch": 0.9, + "learning_rate": 2.9656312743592772e-08, + "logits/chosen": -1.807753086090088, + "logits/rejected": -1.873401165008545, + "logps/chosen": -293.9611511230469, + "logps/rejected": -246.61709594726562, + "loss": 1.0598, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3612035512924194, + "rewards/margins": 0.1128699779510498, + "rewards/rejected": -1.4740734100341797, + "step": 7759 + }, + { + "epoch": 0.9, + "learning_rate": 2.9620881067674497e-08, + "logits/chosen": -2.2371530532836914, + "logits/rejected": -1.870023488998413, + "logps/chosen": -368.02447509765625, + "logps/rejected": -430.2591552734375, + "loss": 0.7097, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.268432378768921, + "rewards/margins": 1.4887992143630981, + "rewards/rejected": -2.7572314739227295, + "step": 7760 + }, + { + "epoch": 0.9, + "learning_rate": 2.958544939175623e-08, + "logits/chosen": -2.780423641204834, + "logits/rejected": -2.7480783462524414, + "logps/chosen": -344.16632080078125, + "logps/rejected": -272.30126953125, + "loss": 0.0893, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5013972520828247, + "rewards/margins": 3.364753246307373, + "rewards/rejected": -3.866150379180908, + "step": 7761 + }, + { + "epoch": 0.9, + "learning_rate": 2.9550017715837958e-08, + "logits/chosen": -2.386359691619873, + "logits/rejected": -2.3342761993408203, + "logps/chosen": -335.94549560546875, + "logps/rejected": -344.0096130371094, + "loss": 0.5507, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6650227308273315, + "rewards/margins": 1.2286245822906494, + "rewards/rejected": -1.8936471939086914, + "step": 7762 + }, + { + "epoch": 0.9, + "learning_rate": 2.9514586039919684e-08, + "logits/chosen": -2.614259958267212, + "logits/rejected": -2.9620201587677, + "logps/chosen": -285.33575439453125, + "logps/rejected": -276.64337158203125, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7389540076255798, + "rewards/margins": 2.490063428878784, + "rewards/rejected": -3.2290172576904297, + "step": 7763 + }, + { + "epoch": 0.9, + "learning_rate": 2.9479154364001416e-08, + "logits/chosen": -2.133617877960205, + "logits/rejected": -2.2232937812805176, + "logps/chosen": -207.13864135742188, + "logps/rejected": -392.1153259277344, + "loss": 0.523, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2496898174285889, + "rewards/margins": 2.291471004486084, + "rewards/rejected": -3.541160821914673, + "step": 7764 + }, + { + "epoch": 0.9, + "learning_rate": 2.9443722688083145e-08, + "logits/chosen": -2.2252912521362305, + "logits/rejected": -2.2977800369262695, + "logps/chosen": -353.73895263671875, + "logps/rejected": -478.194091796875, + "loss": 0.2439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5854321718215942, + "rewards/margins": 2.993300437927246, + "rewards/rejected": -3.578732490539551, + "step": 7765 + }, + { + "epoch": 0.9, + "learning_rate": 2.9408291012164873e-08, + "logits/chosen": -2.7660973072052, + "logits/rejected": -2.6385977268218994, + "logps/chosen": -239.15451049804688, + "logps/rejected": -171.27122497558594, + "loss": 0.5053, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8716668486595154, + "rewards/margins": 2.2358438968658447, + "rewards/rejected": -3.107510566711426, + "step": 7766 + }, + { + "epoch": 0.9, + "learning_rate": 2.9372859336246602e-08, + "logits/chosen": -2.3386037349700928, + "logits/rejected": -2.5810070037841797, + "logps/chosen": -291.3907470703125, + "logps/rejected": -208.01927185058594, + "loss": 0.5792, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6010730862617493, + "rewards/margins": 2.2854278087615967, + "rewards/rejected": -2.8865013122558594, + "step": 7767 + }, + { + "epoch": 0.9, + "learning_rate": 2.9337427660328334e-08, + "logits/chosen": -1.5870779752731323, + "logits/rejected": -1.8302547931671143, + "logps/chosen": -293.4154357910156, + "logps/rejected": -280.04345703125, + "loss": 0.8785, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4791429042816162, + "rewards/margins": 0.401305228471756, + "rewards/rejected": -1.8804481029510498, + "step": 7768 + }, + { + "epoch": 0.9, + "learning_rate": 2.930199598441006e-08, + "logits/chosen": -2.1764297485351562, + "logits/rejected": -2.1327829360961914, + "logps/chosen": -216.12704467773438, + "logps/rejected": -270.95184326171875, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5784616470336914, + "rewards/margins": 3.237293004989624, + "rewards/rejected": -4.815754413604736, + "step": 7769 + }, + { + "epoch": 0.9, + "learning_rate": 2.9266564308491788e-08, + "logits/chosen": -2.0971078872680664, + "logits/rejected": -2.1232049465179443, + "logps/chosen": -257.675537109375, + "logps/rejected": -260.11083984375, + "loss": 0.8103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5552384257316589, + "rewards/margins": 1.4537792205810547, + "rewards/rejected": -2.0090174674987793, + "step": 7770 + }, + { + "epoch": 0.9, + "learning_rate": 2.923113263257352e-08, + "logits/chosen": -2.531778335571289, + "logits/rejected": -2.390392303466797, + "logps/chosen": -352.658447265625, + "logps/rejected": -302.624267578125, + "loss": 0.1147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3945104479789734, + "rewards/margins": 2.8807358741760254, + "rewards/rejected": -3.2752463817596436, + "step": 7771 + }, + { + "epoch": 0.9, + "learning_rate": 2.919570095665525e-08, + "logits/chosen": -1.87582266330719, + "logits/rejected": -2.227600574493408, + "logps/chosen": -402.901611328125, + "logps/rejected": -227.29421997070312, + "loss": 0.3967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29113534092903137, + "rewards/margins": 1.3775665760040283, + "rewards/rejected": -1.6687018871307373, + "step": 7772 + }, + { + "epoch": 0.9, + "learning_rate": 2.9160269280736978e-08, + "logits/chosen": -2.8728246688842773, + "logits/rejected": -2.831409454345703, + "logps/chosen": -220.58148193359375, + "logps/rejected": -300.328857421875, + "loss": 0.1328, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29534661769866943, + "rewards/margins": 3.4064457416534424, + "rewards/rejected": -3.7017922401428223, + "step": 7773 + }, + { + "epoch": 0.9, + "learning_rate": 2.9124837604818706e-08, + "logits/chosen": -1.8029316663742065, + "logits/rejected": -1.9336904287338257, + "logps/chosen": -368.7296142578125, + "logps/rejected": -290.61444091796875, + "loss": 0.3123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6182440519332886, + "rewards/margins": 2.2695202827453613, + "rewards/rejected": -2.8877644538879395, + "step": 7774 + }, + { + "epoch": 0.9, + "learning_rate": 2.908940592890044e-08, + "logits/chosen": -2.259770631790161, + "logits/rejected": -2.1659631729125977, + "logps/chosen": -307.2257385253906, + "logps/rejected": -448.21099853515625, + "loss": 0.6255, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28634482622146606, + "rewards/margins": 2.053135395050049, + "rewards/rejected": -2.339480400085449, + "step": 7775 + }, + { + "epoch": 0.9, + "learning_rate": 2.9053974252982164e-08, + "logits/chosen": -2.494269609451294, + "logits/rejected": -2.530651092529297, + "logps/chosen": -217.77879333496094, + "logps/rejected": -213.6360626220703, + "loss": 0.244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5549649596214294, + "rewards/margins": 1.6481423377990723, + "rewards/rejected": -2.2031073570251465, + "step": 7776 + }, + { + "epoch": 0.9, + "learning_rate": 2.9018542577063893e-08, + "logits/chosen": -2.4585254192352295, + "logits/rejected": -2.6803500652313232, + "logps/chosen": -244.40150451660156, + "logps/rejected": -199.9104461669922, + "loss": 0.5719, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9621416926383972, + "rewards/margins": 1.8430845737457275, + "rewards/rejected": -2.8052260875701904, + "step": 7777 + }, + { + "epoch": 0.9, + "learning_rate": 2.8983110901145625e-08, + "logits/chosen": -1.8665355443954468, + "logits/rejected": -2.422661781311035, + "logps/chosen": -506.8324279785156, + "logps/rejected": -259.5646057128906, + "loss": 0.6008, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3148493766784668, + "rewards/margins": 0.5736511945724487, + "rewards/rejected": -1.888500452041626, + "step": 7778 + }, + { + "epoch": 0.9, + "learning_rate": 2.894767922522735e-08, + "logits/chosen": -2.539205551147461, + "logits/rejected": -2.4175198078155518, + "logps/chosen": -73.31095123291016, + "logps/rejected": -305.56060791015625, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9111602306365967, + "rewards/margins": 4.629692077636719, + "rewards/rejected": -5.5408525466918945, + "step": 7779 + }, + { + "epoch": 0.91, + "learning_rate": 2.8912247549309082e-08, + "logits/chosen": -2.1128346920013428, + "logits/rejected": -1.9557201862335205, + "logps/chosen": -291.38629150390625, + "logps/rejected": -399.42572021484375, + "loss": 0.4571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6897319555282593, + "rewards/margins": 2.0726051330566406, + "rewards/rejected": -2.7623372077941895, + "step": 7780 + }, + { + "epoch": 0.91, + "learning_rate": 2.887681587339081e-08, + "logits/chosen": -2.19891619682312, + "logits/rejected": -2.337376117706299, + "logps/chosen": -119.53851318359375, + "logps/rejected": -129.3673095703125, + "loss": 0.5481, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47664231061935425, + "rewards/margins": 1.310901403427124, + "rewards/rejected": -1.787543773651123, + "step": 7781 + }, + { + "epoch": 0.91, + "learning_rate": 2.8841384197472536e-08, + "logits/chosen": -2.43851375579834, + "logits/rejected": -2.235294818878174, + "logps/chosen": -183.95376586914062, + "logps/rejected": -350.33294677734375, + "loss": 0.3787, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.797174334526062, + "rewards/margins": 1.8446927070617676, + "rewards/rejected": -2.641867160797119, + "step": 7782 + }, + { + "epoch": 0.91, + "learning_rate": 2.880595252155427e-08, + "logits/chosen": -2.655919075012207, + "logits/rejected": -2.634281873703003, + "logps/chosen": -158.1014404296875, + "logps/rejected": -176.4591827392578, + "loss": 0.5624, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9254481792449951, + "rewards/margins": 1.2227340936660767, + "rewards/rejected": -2.1481821537017822, + "step": 7783 + }, + { + "epoch": 0.91, + "learning_rate": 2.8770520845635997e-08, + "logits/chosen": -2.5218958854675293, + "logits/rejected": -2.730910062789917, + "logps/chosen": -244.1710662841797, + "logps/rejected": -288.0486145019531, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0073072910308838, + "rewards/margins": 2.3031139373779297, + "rewards/rejected": -3.3104209899902344, + "step": 7784 + }, + { + "epoch": 0.91, + "learning_rate": 2.8735089169717726e-08, + "logits/chosen": -2.4748146533966064, + "logits/rejected": -2.5149314403533936, + "logps/chosen": -85.9519271850586, + "logps/rejected": -91.47857666015625, + "loss": 0.3517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20330707728862762, + "rewards/margins": 1.4892146587371826, + "rewards/rejected": -1.6925216913223267, + "step": 7785 + }, + { + "epoch": 0.91, + "learning_rate": 2.8699657493799455e-08, + "logits/chosen": -2.204655885696411, + "logits/rejected": -2.133025884628296, + "logps/chosen": -415.74774169921875, + "logps/rejected": -327.9106140136719, + "loss": 0.4433, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1045482158660889, + "rewards/margins": 1.266464114189148, + "rewards/rejected": -2.3710122108459473, + "step": 7786 + }, + { + "epoch": 0.91, + "learning_rate": 2.8664225817881187e-08, + "logits/chosen": -2.647646427154541, + "logits/rejected": -2.8007779121398926, + "logps/chosen": -227.7813720703125, + "logps/rejected": -200.08966064453125, + "loss": 0.2226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08488786220550537, + "rewards/margins": 3.060976505279541, + "rewards/rejected": -3.145864486694336, + "step": 7787 + }, + { + "epoch": 0.91, + "learning_rate": 2.8628794141962915e-08, + "logits/chosen": -2.281754732131958, + "logits/rejected": -2.268299102783203, + "logps/chosen": -341.0829772949219, + "logps/rejected": -452.7987060546875, + "loss": 0.4725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.570919394493103, + "rewards/margins": 3.05780029296875, + "rewards/rejected": -3.6287195682525635, + "step": 7788 + }, + { + "epoch": 0.91, + "learning_rate": 2.859336246604464e-08, + "logits/chosen": -2.0189661979675293, + "logits/rejected": -2.2825465202331543, + "logps/chosen": -385.54180908203125, + "logps/rejected": -407.10821533203125, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3544990122318268, + "rewards/margins": 2.3201181888580322, + "rewards/rejected": -2.674617290496826, + "step": 7789 + }, + { + "epoch": 0.91, + "learning_rate": 2.8557930790126373e-08, + "logits/chosen": -2.467367649078369, + "logits/rejected": -2.6049346923828125, + "logps/chosen": -353.00689697265625, + "logps/rejected": -299.5634765625, + "loss": 0.5764, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.204040765762329, + "rewards/margins": 1.365154504776001, + "rewards/rejected": -2.569195508956909, + "step": 7790 + }, + { + "epoch": 0.91, + "learning_rate": 2.8522499114208102e-08, + "logits/chosen": -2.240598440170288, + "logits/rejected": -2.5918936729431152, + "logps/chosen": -230.36732482910156, + "logps/rejected": -149.77911376953125, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.002949148416519165, + "rewards/margins": 2.9211301803588867, + "rewards/rejected": -2.9181809425354004, + "step": 7791 + }, + { + "epoch": 0.91, + "learning_rate": 2.8487067438289827e-08, + "logits/chosen": -2.7178173065185547, + "logits/rejected": -2.6147775650024414, + "logps/chosen": -265.32403564453125, + "logps/rejected": -289.42584228515625, + "loss": 0.3437, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9472178220748901, + "rewards/margins": 2.345597267150879, + "rewards/rejected": -3.2928152084350586, + "step": 7792 + }, + { + "epoch": 0.91, + "learning_rate": 2.845163576237156e-08, + "logits/chosen": -2.603745460510254, + "logits/rejected": -2.514835834503174, + "logps/chosen": -236.71751403808594, + "logps/rejected": -327.80712890625, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38996630907058716, + "rewards/margins": 3.835508108139038, + "rewards/rejected": -4.2254743576049805, + "step": 7793 + }, + { + "epoch": 0.91, + "learning_rate": 2.8416204086453288e-08, + "logits/chosen": -2.099008560180664, + "logits/rejected": -2.232409954071045, + "logps/chosen": -384.0172119140625, + "logps/rejected": -348.1450500488281, + "loss": 0.2377, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6556740403175354, + "rewards/margins": 2.87734317779541, + "rewards/rejected": -3.533017158508301, + "step": 7794 + }, + { + "epoch": 0.91, + "learning_rate": 2.8380772410535017e-08, + "logits/chosen": -2.308295965194702, + "logits/rejected": -2.5822501182556152, + "logps/chosen": -192.2874755859375, + "logps/rejected": -144.19613647460938, + "loss": 0.4977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7349642515182495, + "rewards/margins": 0.8757230043411255, + "rewards/rejected": -1.6106871366500854, + "step": 7795 + }, + { + "epoch": 0.91, + "learning_rate": 2.8345340734616745e-08, + "logits/chosen": -2.3906497955322266, + "logits/rejected": -2.23116135597229, + "logps/chosen": -225.14337158203125, + "logps/rejected": -189.8595733642578, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.194859266281128, + "rewards/margins": 2.6437125205993652, + "rewards/rejected": -4.838572025299072, + "step": 7796 + }, + { + "epoch": 0.91, + "learning_rate": 2.8309909058698477e-08, + "logits/chosen": -1.7749189138412476, + "logits/rejected": -1.9616481065750122, + "logps/chosen": -560.9602661132812, + "logps/rejected": -380.57916259765625, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7658616304397583, + "rewards/margins": 2.6699302196502686, + "rewards/rejected": -3.4357919692993164, + "step": 7797 + }, + { + "epoch": 0.91, + "learning_rate": 2.8274477382780203e-08, + "logits/chosen": -2.183626413345337, + "logits/rejected": -2.3917107582092285, + "logps/chosen": -231.58775329589844, + "logps/rejected": -259.0075378417969, + "loss": 0.2835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7388246059417725, + "rewards/margins": 3.359987258911133, + "rewards/rejected": -4.098812103271484, + "step": 7798 + }, + { + "epoch": 0.91, + "learning_rate": 2.823904570686193e-08, + "logits/chosen": -1.8685747385025024, + "logits/rejected": -1.4300687313079834, + "logps/chosen": -198.68832397460938, + "logps/rejected": -508.007080078125, + "loss": 0.3467, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2606270909309387, + "rewards/margins": 2.025501012802124, + "rewards/rejected": -2.286128044128418, + "step": 7799 + }, + { + "epoch": 0.91, + "learning_rate": 2.8203614030943664e-08, + "logits/chosen": -2.143021583557129, + "logits/rejected": -2.158989906311035, + "logps/chosen": -361.3835754394531, + "logps/rejected": -290.6054382324219, + "loss": 0.4174, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5084774494171143, + "rewards/margins": 1.304731011390686, + "rewards/rejected": -2.81320858001709, + "step": 7800 + }, + { + "epoch": 0.91, + "learning_rate": 2.816818235502539e-08, + "logits/chosen": -3.1506359577178955, + "logits/rejected": -3.0102901458740234, + "logps/chosen": -263.4444580078125, + "logps/rejected": -231.13949584960938, + "loss": 0.3016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5410915017127991, + "rewards/margins": 2.0273280143737793, + "rewards/rejected": -2.5684194564819336, + "step": 7801 + }, + { + "epoch": 0.91, + "learning_rate": 2.813275067910712e-08, + "logits/chosen": -2.123265266418457, + "logits/rejected": -2.0646462440490723, + "logps/chosen": -213.71926879882812, + "logps/rejected": -321.90972900390625, + "loss": 0.6255, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8886911273002625, + "rewards/margins": 1.2406991720199585, + "rewards/rejected": -2.129390239715576, + "step": 7802 + }, + { + "epoch": 0.91, + "learning_rate": 2.809731900318885e-08, + "logits/chosen": -2.1068053245544434, + "logits/rejected": -2.2231075763702393, + "logps/chosen": -296.60302734375, + "logps/rejected": -285.5090026855469, + "loss": 0.1711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26097890734672546, + "rewards/margins": 3.4135022163391113, + "rewards/rejected": -3.67448091506958, + "step": 7803 + }, + { + "epoch": 0.91, + "learning_rate": 2.8061887327270575e-08, + "logits/chosen": -1.8939952850341797, + "logits/rejected": -1.742956280708313, + "logps/chosen": -294.158935546875, + "logps/rejected": -364.29571533203125, + "loss": 0.3505, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3094886541366577, + "rewards/margins": 10.736759185791016, + "rewards/rejected": -12.046247482299805, + "step": 7804 + }, + { + "epoch": 0.91, + "learning_rate": 2.8026455651352307e-08, + "logits/chosen": -2.3410377502441406, + "logits/rejected": -2.3977479934692383, + "logps/chosen": -359.3341064453125, + "logps/rejected": -425.458251953125, + "loss": 0.1039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8780902624130249, + "rewards/margins": 3.4271817207336426, + "rewards/rejected": -4.305272102355957, + "step": 7805 + }, + { + "epoch": 0.91, + "learning_rate": 2.7991023975434036e-08, + "logits/chosen": -2.5017528533935547, + "logits/rejected": -2.3562936782836914, + "logps/chosen": -188.63217163085938, + "logps/rejected": -227.7791748046875, + "loss": 0.4809, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7357326745986938, + "rewards/margins": 1.9596054553985596, + "rewards/rejected": -2.695338010787964, + "step": 7806 + }, + { + "epoch": 0.91, + "learning_rate": 2.7955592299515768e-08, + "logits/chosen": -2.271571397781372, + "logits/rejected": -2.444981098175049, + "logps/chosen": -329.451171875, + "logps/rejected": -252.57118225097656, + "loss": 0.3333, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7090495824813843, + "rewards/margins": 1.9694814682006836, + "rewards/rejected": -3.6785311698913574, + "step": 7807 + }, + { + "epoch": 0.91, + "learning_rate": 2.7920160623597494e-08, + "logits/chosen": -1.804121732711792, + "logits/rejected": -2.0224833488464355, + "logps/chosen": -267.73162841796875, + "logps/rejected": -237.7820587158203, + "loss": 0.561, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5574276447296143, + "rewards/margins": 1.002381682395935, + "rewards/rejected": -1.5598094463348389, + "step": 7808 + }, + { + "epoch": 0.91, + "learning_rate": 2.7884728947679226e-08, + "logits/chosen": -1.908738613128662, + "logits/rejected": -1.9852681159973145, + "logps/chosen": -236.71371459960938, + "logps/rejected": -253.9727020263672, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.982132077217102, + "rewards/margins": 2.483746290206909, + "rewards/rejected": -3.465878486633301, + "step": 7809 + }, + { + "epoch": 0.91, + "learning_rate": 2.7849297271760954e-08, + "logits/chosen": -2.469327688217163, + "logits/rejected": -2.714245080947876, + "logps/chosen": -205.62155151367188, + "logps/rejected": -204.2027130126953, + "loss": 0.2266, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.034002043306827545, + "rewards/margins": 3.2153639793395996, + "rewards/rejected": -3.249366283416748, + "step": 7810 + }, + { + "epoch": 0.91, + "learning_rate": 2.781386559584268e-08, + "logits/chosen": -2.682725191116333, + "logits/rejected": -2.7206904888153076, + "logps/chosen": -422.62945556640625, + "logps/rejected": -330.67657470703125, + "loss": 0.1546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2972520589828491, + "rewards/margins": 3.714400053024292, + "rewards/rejected": -4.011651992797852, + "step": 7811 + }, + { + "epoch": 0.91, + "learning_rate": 2.7778433919924412e-08, + "logits/chosen": -2.5647411346435547, + "logits/rejected": -2.5124623775482178, + "logps/chosen": -298.2282409667969, + "logps/rejected": -232.6904296875, + "loss": 0.2368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6699087023735046, + "rewards/margins": 2.296438694000244, + "rewards/rejected": -2.9663472175598145, + "step": 7812 + }, + { + "epoch": 0.91, + "learning_rate": 2.774300224400614e-08, + "logits/chosen": -2.1758363246917725, + "logits/rejected": -2.3720457553863525, + "logps/chosen": -153.42904663085938, + "logps/rejected": -202.2509765625, + "loss": 1.4641, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2635390758514404, + "rewards/margins": 0.6524296998977661, + "rewards/rejected": -2.915968894958496, + "step": 7813 + }, + { + "epoch": 0.91, + "learning_rate": 2.770757056808787e-08, + "logits/chosen": -1.4636287689208984, + "logits/rejected": -1.659960389137268, + "logps/chosen": -359.4472961425781, + "logps/rejected": -326.4058532714844, + "loss": 0.5519, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7890291810035706, + "rewards/margins": 1.3984565734863281, + "rewards/rejected": -2.187485694885254, + "step": 7814 + }, + { + "epoch": 0.91, + "learning_rate": 2.7672138892169598e-08, + "logits/chosen": -2.0322749614715576, + "logits/rejected": -2.0300729274749756, + "logps/chosen": -274.7593994140625, + "logps/rejected": -356.2828369140625, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7962191104888916, + "rewards/margins": 2.67726469039917, + "rewards/rejected": -3.4734840393066406, + "step": 7815 + }, + { + "epoch": 0.91, + "learning_rate": 2.763670721625133e-08, + "logits/chosen": -3.074483871459961, + "logits/rejected": -3.001450300216675, + "logps/chosen": -201.23008728027344, + "logps/rejected": -190.70919799804688, + "loss": 0.3628, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2573307156562805, + "rewards/margins": 2.712768793106079, + "rewards/rejected": -2.970099449157715, + "step": 7816 + }, + { + "epoch": 0.91, + "learning_rate": 2.7601275540333056e-08, + "logits/chosen": -1.6934857368469238, + "logits/rejected": -1.8971480131149292, + "logps/chosen": -397.91375732421875, + "logps/rejected": -314.5960388183594, + "loss": 0.2804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.541378378868103, + "rewards/margins": 2.3073079586029053, + "rewards/rejected": -2.8486862182617188, + "step": 7817 + }, + { + "epoch": 0.91, + "learning_rate": 2.7565843864414784e-08, + "logits/chosen": -2.966054916381836, + "logits/rejected": -2.8723702430725098, + "logps/chosen": -251.3617706298828, + "logps/rejected": -263.105224609375, + "loss": 0.4065, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9083907008171082, + "rewards/margins": 2.338012456893921, + "rewards/rejected": -3.246403217315674, + "step": 7818 + }, + { + "epoch": 0.91, + "learning_rate": 2.7530412188496516e-08, + "logits/chosen": -1.48484206199646, + "logits/rejected": -2.0893795490264893, + "logps/chosen": -265.0536804199219, + "logps/rejected": -168.73045349121094, + "loss": 0.3536, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.069885015487671, + "rewards/margins": 1.3715636730194092, + "rewards/rejected": -2.44144868850708, + "step": 7819 + }, + { + "epoch": 0.91, + "learning_rate": 2.7494980512578242e-08, + "logits/chosen": -2.124945640563965, + "logits/rejected": -2.0588135719299316, + "logps/chosen": -311.458251953125, + "logps/rejected": -314.96575927734375, + "loss": 0.3767, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9676119089126587, + "rewards/margins": 2.650390863418579, + "rewards/rejected": -3.6180028915405273, + "step": 7820 + }, + { + "epoch": 0.91, + "learning_rate": 2.7459548836659974e-08, + "logits/chosen": -2.119208574295044, + "logits/rejected": -2.069493532180786, + "logps/chosen": -168.24563598632812, + "logps/rejected": -172.00186157226562, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6223419904708862, + "rewards/margins": 1.8465046882629395, + "rewards/rejected": -2.4688467979431152, + "step": 7821 + }, + { + "epoch": 0.91, + "learning_rate": 2.7424117160741703e-08, + "logits/chosen": -2.313366413116455, + "logits/rejected": -2.3397927284240723, + "logps/chosen": -122.8309326171875, + "logps/rejected": -242.44654846191406, + "loss": 0.4431, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4786389172077179, + "rewards/margins": 2.045037031173706, + "rewards/rejected": -2.5236759185791016, + "step": 7822 + }, + { + "epoch": 0.91, + "learning_rate": 2.7388685484823428e-08, + "logits/chosen": -2.571547508239746, + "logits/rejected": -2.645779609680176, + "logps/chosen": -97.63256072998047, + "logps/rejected": -190.81137084960938, + "loss": 0.4926, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.092734694480896, + "rewards/margins": 1.6755324602127075, + "rewards/rejected": -2.7682671546936035, + "step": 7823 + }, + { + "epoch": 0.91, + "learning_rate": 2.735325380890516e-08, + "logits/chosen": -2.3497262001037598, + "logits/rejected": -1.9311044216156006, + "logps/chosen": -192.95046997070312, + "logps/rejected": -405.9814453125, + "loss": 0.5422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6494795083999634, + "rewards/margins": 1.6202259063720703, + "rewards/rejected": -2.269705295562744, + "step": 7824 + }, + { + "epoch": 0.91, + "learning_rate": 2.731782213298689e-08, + "logits/chosen": -2.5598957538604736, + "logits/rejected": -2.765096664428711, + "logps/chosen": -259.1525573730469, + "logps/rejected": -137.61874389648438, + "loss": 0.6124, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3501899242401123, + "rewards/margins": 0.5727995038032532, + "rewards/rejected": -1.9229896068572998, + "step": 7825 + }, + { + "epoch": 0.91, + "learning_rate": 2.728239045706862e-08, + "logits/chosen": -1.441624641418457, + "logits/rejected": -1.8219115734100342, + "logps/chosen": -413.56219482421875, + "logps/rejected": -298.4718017578125, + "loss": 0.6591, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6945997476577759, + "rewards/margins": 1.2711715698242188, + "rewards/rejected": -2.965771436691284, + "step": 7826 + }, + { + "epoch": 0.91, + "learning_rate": 2.7246958781150346e-08, + "logits/chosen": -2.6693060398101807, + "logits/rejected": -2.8600101470947266, + "logps/chosen": -208.62977600097656, + "logps/rejected": -190.83123779296875, + "loss": 0.3446, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3883078396320343, + "rewards/margins": 2.26218318939209, + "rewards/rejected": -2.6504909992218018, + "step": 7827 + }, + { + "epoch": 0.91, + "learning_rate": 2.7211527105232075e-08, + "logits/chosen": -2.2302088737487793, + "logits/rejected": -2.34234619140625, + "logps/chosen": -305.230224609375, + "logps/rejected": -328.2203369140625, + "loss": 0.3125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7381885051727295, + "rewards/margins": 3.1803858280181885, + "rewards/rejected": -3.918574571609497, + "step": 7828 + }, + { + "epoch": 0.91, + "learning_rate": 2.7176095429313807e-08, + "logits/chosen": -2.4452567100524902, + "logits/rejected": -2.346367835998535, + "logps/chosen": -229.52728271484375, + "logps/rejected": -211.13671875, + "loss": 0.3219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17147789895534515, + "rewards/margins": 1.7991358041763306, + "rewards/rejected": -1.970613718032837, + "step": 7829 + }, + { + "epoch": 0.91, + "learning_rate": 2.7140663753395533e-08, + "logits/chosen": -1.804085612297058, + "logits/rejected": -2.008859395980835, + "logps/chosen": -206.87290954589844, + "logps/rejected": -311.7795104980469, + "loss": 0.1602, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6475632190704346, + "rewards/margins": 5.3357720375061035, + "rewards/rejected": -5.983335494995117, + "step": 7830 + }, + { + "epoch": 0.91, + "learning_rate": 2.7105232077477265e-08, + "logits/chosen": -2.578428268432617, + "logits/rejected": -2.469276189804077, + "logps/chosen": -291.09326171875, + "logps/rejected": -292.8094482421875, + "loss": 0.6713, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1974966526031494, + "rewards/margins": 0.9021276831626892, + "rewards/rejected": -2.0996243953704834, + "step": 7831 + }, + { + "epoch": 0.91, + "learning_rate": 2.7069800401558993e-08, + "logits/chosen": -2.534238815307617, + "logits/rejected": -2.2838048934936523, + "logps/chosen": -308.0565490722656, + "logps/rejected": -224.52523803710938, + "loss": 0.4064, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4627076387405396, + "rewards/margins": 1.1398911476135254, + "rewards/rejected": -2.6025989055633545, + "step": 7832 + }, + { + "epoch": 0.91, + "learning_rate": 2.703436872564072e-08, + "logits/chosen": -2.111743927001953, + "logits/rejected": -2.2112739086151123, + "logps/chosen": -243.88580322265625, + "logps/rejected": -284.9048767089844, + "loss": 0.5281, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2588226795196533, + "rewards/margins": 1.6194127798080444, + "rewards/rejected": -2.878235340118408, + "step": 7833 + }, + { + "epoch": 0.91, + "learning_rate": 2.699893704972245e-08, + "logits/chosen": -2.6216118335723877, + "logits/rejected": -2.5046045780181885, + "logps/chosen": -143.48959350585938, + "logps/rejected": -276.5705871582031, + "loss": 0.3587, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7581494450569153, + "rewards/margins": 2.0551400184631348, + "rewards/rejected": -2.8132896423339844, + "step": 7834 + }, + { + "epoch": 0.91, + "learning_rate": 2.696350537380418e-08, + "logits/chosen": -2.6369690895080566, + "logits/rejected": -2.858222007751465, + "logps/chosen": -203.49349975585938, + "logps/rejected": -157.95584106445312, + "loss": 0.3212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42449846863746643, + "rewards/margins": 1.6568524837493896, + "rewards/rejected": -2.0813510417938232, + "step": 7835 + }, + { + "epoch": 0.91, + "learning_rate": 2.6928073697885908e-08, + "logits/chosen": -2.608896017074585, + "logits/rejected": -2.657409191131592, + "logps/chosen": -280.3772888183594, + "logps/rejected": -285.59381103515625, + "loss": 0.3285, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0597807168960571, + "rewards/margins": 2.1328701972961426, + "rewards/rejected": -3.1926510334014893, + "step": 7836 + }, + { + "epoch": 0.91, + "learning_rate": 2.6892642021967637e-08, + "logits/chosen": -2.037472724914551, + "logits/rejected": -2.4206342697143555, + "logps/chosen": -298.85736083984375, + "logps/rejected": -268.5363464355469, + "loss": 0.1023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5951290726661682, + "rewards/margins": 3.737485408782959, + "rewards/rejected": -4.332614898681641, + "step": 7837 + }, + { + "epoch": 0.91, + "learning_rate": 2.685721034604937e-08, + "logits/chosen": -2.461611032485962, + "logits/rejected": -2.3447670936584473, + "logps/chosen": -422.7827453613281, + "logps/rejected": -311.2020263671875, + "loss": 0.8573, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2099802494049072, + "rewards/margins": 0.30790793895721436, + "rewards/rejected": -1.5178881883621216, + "step": 7838 + }, + { + "epoch": 0.91, + "learning_rate": 2.6821778670131095e-08, + "logits/chosen": -2.4233460426330566, + "logits/rejected": -2.3633270263671875, + "logps/chosen": -226.41748046875, + "logps/rejected": -234.62802124023438, + "loss": 0.2493, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2887444496154785, + "rewards/margins": 2.8372323513031006, + "rewards/rejected": -3.125976800918579, + "step": 7839 + }, + { + "epoch": 0.91, + "learning_rate": 2.6786346994212823e-08, + "logits/chosen": -2.2658069133758545, + "logits/rejected": -2.4389095306396484, + "logps/chosen": -357.0491027832031, + "logps/rejected": -256.5008850097656, + "loss": 0.2984, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5574584007263184, + "rewards/margins": 1.8538297414779663, + "rewards/rejected": -2.411288261413574, + "step": 7840 + }, + { + "epoch": 0.91, + "learning_rate": 2.6750915318294555e-08, + "logits/chosen": -1.9805827140808105, + "logits/rejected": -2.27040433883667, + "logps/chosen": -297.7172546386719, + "logps/rejected": -224.51019287109375, + "loss": 0.4636, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1512980461120605, + "rewards/margins": 2.592268705368042, + "rewards/rejected": -3.7435667514801025, + "step": 7841 + }, + { + "epoch": 0.91, + "learning_rate": 2.6715483642376284e-08, + "logits/chosen": -2.438206195831299, + "logits/rejected": -2.7582108974456787, + "logps/chosen": -293.32330322265625, + "logps/rejected": -204.92874145507812, + "loss": 0.6156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8719755411148071, + "rewards/margins": 2.208967447280884, + "rewards/rejected": -4.0809431076049805, + "step": 7842 + }, + { + "epoch": 0.91, + "learning_rate": 2.6680051966458013e-08, + "logits/chosen": -2.54610538482666, + "logits/rejected": -2.6854209899902344, + "logps/chosen": -183.94552612304688, + "logps/rejected": -186.38099670410156, + "loss": 1.1229, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2942882776260376, + "rewards/margins": 0.5130681991577148, + "rewards/rejected": -1.807356595993042, + "step": 7843 + }, + { + "epoch": 0.91, + "learning_rate": 2.664462029053974e-08, + "logits/chosen": -2.4463138580322266, + "logits/rejected": -2.250988245010376, + "logps/chosen": -248.43740844726562, + "logps/rejected": -282.69677734375, + "loss": 0.5184, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1109504699707031, + "rewards/margins": 2.3870322704315186, + "rewards/rejected": -3.4979827404022217, + "step": 7844 + }, + { + "epoch": 0.91, + "learning_rate": 2.6609188614621474e-08, + "logits/chosen": -2.391791582107544, + "logits/rejected": -2.3228251934051514, + "logps/chosen": -309.38067626953125, + "logps/rejected": -321.7048645019531, + "loss": 0.3552, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7677709460258484, + "rewards/margins": 1.2337822914123535, + "rewards/rejected": -2.0015532970428467, + "step": 7845 + }, + { + "epoch": 0.91, + "learning_rate": 2.65737569387032e-08, + "logits/chosen": -2.736665725708008, + "logits/rejected": -2.5029428005218506, + "logps/chosen": -149.85308837890625, + "logps/rejected": -221.61337280273438, + "loss": 0.365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8474173545837402, + "rewards/margins": 2.9559707641601562, + "rewards/rejected": -3.8033883571624756, + "step": 7846 + }, + { + "epoch": 0.91, + "learning_rate": 2.6538325262784928e-08, + "logits/chosen": -2.3299365043640137, + "logits/rejected": -2.3816258907318115, + "logps/chosen": -193.51112365722656, + "logps/rejected": -329.27728271484375, + "loss": 0.3366, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2804360389709473, + "rewards/margins": 1.8022446632385254, + "rewards/rejected": -3.0826807022094727, + "step": 7847 + }, + { + "epoch": 0.91, + "learning_rate": 2.650289358686666e-08, + "logits/chosen": -2.2955245971679688, + "logits/rejected": -2.5322039127349854, + "logps/chosen": -542.624755859375, + "logps/rejected": -341.49090576171875, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07893425226211548, + "rewards/margins": 3.1659882068634033, + "rewards/rejected": -3.244922399520874, + "step": 7848 + }, + { + "epoch": 0.91, + "learning_rate": 2.6467461910948385e-08, + "logits/chosen": -2.2466304302215576, + "logits/rejected": -2.341965675354004, + "logps/chosen": -291.38641357421875, + "logps/rejected": -277.0706787109375, + "loss": 0.4509, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7700836658477783, + "rewards/margins": 1.8395309448242188, + "rewards/rejected": -2.609614372253418, + "step": 7849 + }, + { + "epoch": 0.91, + "learning_rate": 2.6432030235030117e-08, + "logits/chosen": -1.677561640739441, + "logits/rejected": -2.24678373336792, + "logps/chosen": -489.99658203125, + "logps/rejected": -224.9002227783203, + "loss": 0.2801, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0613090991973877, + "rewards/margins": 2.1837997436523438, + "rewards/rejected": -3.2451090812683105, + "step": 7850 + }, + { + "epoch": 0.91, + "learning_rate": 2.6396598559111846e-08, + "logits/chosen": -2.2706356048583984, + "logits/rejected": -2.271763324737549, + "logps/chosen": -182.10598754882812, + "logps/rejected": -219.88475036621094, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9170463681221008, + "rewards/margins": 0.2941240072250366, + "rewards/rejected": -1.2111704349517822, + "step": 7851 + }, + { + "epoch": 0.91, + "learning_rate": 2.636116688319357e-08, + "logits/chosen": -2.800891876220703, + "logits/rejected": -2.5645909309387207, + "logps/chosen": -84.55953216552734, + "logps/rejected": -135.32501220703125, + "loss": 0.2098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2609913349151611, + "rewards/margins": 2.2511143684387207, + "rewards/rejected": -3.5121054649353027, + "step": 7852 + }, + { + "epoch": 0.91, + "learning_rate": 2.6325735207275304e-08, + "logits/chosen": -1.9907047748565674, + "logits/rejected": -2.37007737159729, + "logps/chosen": -369.38580322265625, + "logps/rejected": -335.4866943359375, + "loss": 0.3763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7452350854873657, + "rewards/margins": 3.2299132347106934, + "rewards/rejected": -3.9751482009887695, + "step": 7853 + }, + { + "epoch": 0.91, + "learning_rate": 2.6290303531357032e-08, + "logits/chosen": -2.104916572570801, + "logits/rejected": -2.4474425315856934, + "logps/chosen": -472.1258239746094, + "logps/rejected": -225.3874969482422, + "loss": 0.2113, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9740886688232422, + "rewards/margins": 2.0205676555633545, + "rewards/rejected": -2.9946563243865967, + "step": 7854 + }, + { + "epoch": 0.91, + "learning_rate": 2.625487185543876e-08, + "logits/chosen": -2.9697654247283936, + "logits/rejected": -2.6111884117126465, + "logps/chosen": -334.649169921875, + "logps/rejected": -291.5124816894531, + "loss": 0.319, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1983401775360107, + "rewards/margins": 2.1195785999298096, + "rewards/rejected": -3.3179187774658203, + "step": 7855 + }, + { + "epoch": 0.91, + "learning_rate": 2.621944017952049e-08, + "logits/chosen": -2.5467238426208496, + "logits/rejected": -2.605901002883911, + "logps/chosen": -269.2330017089844, + "logps/rejected": -252.74472045898438, + "loss": 0.6619, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3258498907089233, + "rewards/margins": 0.25637009739875793, + "rewards/rejected": -1.5822198390960693, + "step": 7856 + }, + { + "epoch": 0.91, + "learning_rate": 2.618400850360222e-08, + "logits/chosen": -2.4434361457824707, + "logits/rejected": -2.2004780769348145, + "logps/chosen": -305.7569580078125, + "logps/rejected": -313.63214111328125, + "loss": 0.4669, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1815699338912964, + "rewards/margins": 1.5986425876617432, + "rewards/rejected": -2.780212640762329, + "step": 7857 + }, + { + "epoch": 0.91, + "learning_rate": 2.6148576827683947e-08, + "logits/chosen": -2.390625, + "logits/rejected": -2.212160110473633, + "logps/chosen": -269.5479736328125, + "logps/rejected": -392.1124267578125, + "loss": 0.398, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.238297700881958, + "rewards/margins": 3.0376007556915283, + "rewards/rejected": -4.2758989334106445, + "step": 7858 + }, + { + "epoch": 0.91, + "learning_rate": 2.6113145151765676e-08, + "logits/chosen": -2.5776987075805664, + "logits/rejected": -2.9130423069000244, + "logps/chosen": -379.9630432128906, + "logps/rejected": -325.0814514160156, + "loss": 0.5876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7534393668174744, + "rewards/margins": 2.1143686771392822, + "rewards/rejected": -2.8678081035614014, + "step": 7859 + }, + { + "epoch": 0.91, + "learning_rate": 2.6077713475847408e-08, + "logits/chosen": -1.7909144163131714, + "logits/rejected": -1.3509516716003418, + "logps/chosen": -344.3521728515625, + "logps/rejected": -504.03961181640625, + "loss": 0.4573, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7292894721031189, + "rewards/margins": 1.9575880765914917, + "rewards/rejected": -2.686877727508545, + "step": 7860 + }, + { + "epoch": 0.91, + "learning_rate": 2.6042281799929137e-08, + "logits/chosen": -1.8095626831054688, + "logits/rejected": -2.0864272117614746, + "logps/chosen": -289.32952880859375, + "logps/rejected": -219.05686950683594, + "loss": 0.2058, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4356086850166321, + "rewards/margins": 2.2327427864074707, + "rewards/rejected": -2.668351411819458, + "step": 7861 + }, + { + "epoch": 0.91, + "learning_rate": 2.6006850124010862e-08, + "logits/chosen": -2.6336028575897217, + "logits/rejected": -2.7273569107055664, + "logps/chosen": -341.30694580078125, + "logps/rejected": -314.98779296875, + "loss": 0.4345, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.703446388244629, + "rewards/margins": 2.04669189453125, + "rewards/rejected": -3.750138282775879, + "step": 7862 + }, + { + "epoch": 0.91, + "learning_rate": 2.5971418448092594e-08, + "logits/chosen": -2.6207571029663086, + "logits/rejected": -2.545320510864258, + "logps/chosen": -238.58065795898438, + "logps/rejected": -284.02691650390625, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5596529245376587, + "rewards/margins": 2.6254658699035645, + "rewards/rejected": -4.185118675231934, + "step": 7863 + }, + { + "epoch": 0.91, + "learning_rate": 2.5935986772174323e-08, + "logits/chosen": -1.9117369651794434, + "logits/rejected": -2.1039280891418457, + "logps/chosen": -294.133544921875, + "logps/rejected": -210.80856323242188, + "loss": 0.4226, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6247460842132568, + "rewards/margins": 1.4508183002471924, + "rewards/rejected": -2.075564384460449, + "step": 7864 + }, + { + "epoch": 0.91, + "learning_rate": 2.5900555096256052e-08, + "logits/chosen": -2.1804206371307373, + "logits/rejected": -2.276050329208374, + "logps/chosen": -292.1705627441406, + "logps/rejected": -344.1279296875, + "loss": 0.481, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.157456636428833, + "rewards/margins": 2.2988193035125732, + "rewards/rejected": -4.456275939941406, + "step": 7865 + }, + { + "epoch": 0.92, + "learning_rate": 2.586512342033778e-08, + "logits/chosen": -2.4031779766082764, + "logits/rejected": -2.333453893661499, + "logps/chosen": -198.59976196289062, + "logps/rejected": -227.4092559814453, + "loss": 0.6541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8001508712768555, + "rewards/margins": 4.353246688842773, + "rewards/rejected": -5.153397560119629, + "step": 7866 + }, + { + "epoch": 0.92, + "learning_rate": 2.5829691744419513e-08, + "logits/chosen": -2.1582913398742676, + "logits/rejected": -2.2558181285858154, + "logps/chosen": -268.95159912109375, + "logps/rejected": -213.8064422607422, + "loss": 0.638, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7250502109527588, + "rewards/margins": 0.8525638580322266, + "rewards/rejected": -1.5776140689849854, + "step": 7867 + }, + { + "epoch": 0.92, + "learning_rate": 2.5794260068501238e-08, + "logits/chosen": -1.8762321472167969, + "logits/rejected": -2.1637144088745117, + "logps/chosen": -347.3463134765625, + "logps/rejected": -271.4024658203125, + "loss": 1.0038, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1601426601409912, + "rewards/margins": 0.7902706861495972, + "rewards/rejected": -1.950413465499878, + "step": 7868 + }, + { + "epoch": 0.92, + "learning_rate": 2.5758828392582967e-08, + "logits/chosen": -2.405323028564453, + "logits/rejected": -2.5115509033203125, + "logps/chosen": -179.38900756835938, + "logps/rejected": -165.77456665039062, + "loss": 0.3839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7001006603240967, + "rewards/margins": 2.566908121109009, + "rewards/rejected": -3.2670090198516846, + "step": 7869 + }, + { + "epoch": 0.92, + "learning_rate": 2.57233967166647e-08, + "logits/chosen": -2.1263785362243652, + "logits/rejected": -2.1821258068084717, + "logps/chosen": -332.02496337890625, + "logps/rejected": -378.09130859375, + "loss": 0.7063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8021727800369263, + "rewards/margins": 1.4246606826782227, + "rewards/rejected": -2.2268335819244385, + "step": 7870 + }, + { + "epoch": 0.92, + "learning_rate": 2.5687965040746424e-08, + "logits/chosen": -2.0861408710479736, + "logits/rejected": -2.3406178951263428, + "logps/chosen": -199.94827270507812, + "logps/rejected": -175.6078643798828, + "loss": 0.9285, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.753055214881897, + "rewards/margins": 0.40921223163604736, + "rewards/rejected": -2.1622674465179443, + "step": 7871 + }, + { + "epoch": 0.92, + "learning_rate": 2.5652533364828156e-08, + "logits/chosen": -2.54176926612854, + "logits/rejected": -2.354806423187256, + "logps/chosen": -313.56414794921875, + "logps/rejected": -309.02227783203125, + "loss": 0.5369, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0835771560668945, + "rewards/margins": 2.6893181800842285, + "rewards/rejected": -3.772895336151123, + "step": 7872 + }, + { + "epoch": 0.92, + "learning_rate": 2.5617101688909885e-08, + "logits/chosen": -2.352356433868408, + "logits/rejected": -2.162715196609497, + "logps/chosen": -140.863037109375, + "logps/rejected": -267.7840270996094, + "loss": 0.5833, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4211351871490479, + "rewards/margins": 1.231209635734558, + "rewards/rejected": -2.6523447036743164, + "step": 7873 + }, + { + "epoch": 0.92, + "learning_rate": 2.558167001299161e-08, + "logits/chosen": -2.0563182830810547, + "logits/rejected": -2.365208148956299, + "logps/chosen": -537.2515869140625, + "logps/rejected": -236.13711547851562, + "loss": 0.4751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9500283002853394, + "rewards/margins": 0.7956160306930542, + "rewards/rejected": -1.7456443309783936, + "step": 7874 + }, + { + "epoch": 0.92, + "learning_rate": 2.5546238337073342e-08, + "logits/chosen": -2.562055826187134, + "logits/rejected": -2.339573860168457, + "logps/chosen": -254.1559600830078, + "logps/rejected": -185.94851684570312, + "loss": 0.1555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19967924058437347, + "rewards/margins": 2.334592342376709, + "rewards/rejected": -2.534271717071533, + "step": 7875 + }, + { + "epoch": 0.92, + "learning_rate": 2.551080666115507e-08, + "logits/chosen": -2.6980106830596924, + "logits/rejected": -2.565164804458618, + "logps/chosen": -253.7923583984375, + "logps/rejected": -178.51881408691406, + "loss": 0.3211, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48479902744293213, + "rewards/margins": 1.976266860961914, + "rewards/rejected": -2.4610657691955566, + "step": 7876 + }, + { + "epoch": 0.92, + "learning_rate": 2.54753749852368e-08, + "logits/chosen": -2.269768714904785, + "logits/rejected": -2.257021903991699, + "logps/chosen": -307.2211608886719, + "logps/rejected": -270.6490173339844, + "loss": 0.2759, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08433239907026291, + "rewards/margins": 2.1422829627990723, + "rewards/rejected": -2.2266151905059814, + "step": 7877 + }, + { + "epoch": 0.92, + "learning_rate": 2.543994330931853e-08, + "logits/chosen": -2.3993473052978516, + "logits/rejected": -2.399404287338257, + "logps/chosen": -241.12767028808594, + "logps/rejected": -369.8497314453125, + "loss": 0.8587, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1262191534042358, + "rewards/margins": 1.065895915031433, + "rewards/rejected": -2.192115068435669, + "step": 7878 + }, + { + "epoch": 0.92, + "learning_rate": 2.540451163340026e-08, + "logits/chosen": -1.9255352020263672, + "logits/rejected": -2.0879273414611816, + "logps/chosen": -240.8593292236328, + "logps/rejected": -211.7167205810547, + "loss": 0.5102, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0562353134155273, + "rewards/margins": 0.9928839206695557, + "rewards/rejected": -3.049118995666504, + "step": 7879 + }, + { + "epoch": 0.92, + "learning_rate": 2.536907995748199e-08, + "logits/chosen": -1.9334609508514404, + "logits/rejected": -2.1450767517089844, + "logps/chosen": -185.27565002441406, + "logps/rejected": -209.24728393554688, + "loss": 0.2414, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05446035414934158, + "rewards/margins": 3.0589725971221924, + "rewards/rejected": -3.1134328842163086, + "step": 7880 + }, + { + "epoch": 0.92, + "learning_rate": 2.5333648281563715e-08, + "logits/chosen": -1.8030915260314941, + "logits/rejected": -2.298370361328125, + "logps/chosen": -220.80929565429688, + "logps/rejected": -179.35247802734375, + "loss": 1.1568, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9548395872116089, + "rewards/margins": 0.5012566447257996, + "rewards/rejected": -2.4560964107513428, + "step": 7881 + }, + { + "epoch": 0.92, + "learning_rate": 2.5298216605645447e-08, + "logits/chosen": -2.3822197914123535, + "logits/rejected": -2.631829023361206, + "logps/chosen": -305.43963623046875, + "logps/rejected": -293.9334716796875, + "loss": 0.2053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.16841459274292, + "rewards/margins": 3.2405471801757812, + "rewards/rejected": -4.408962249755859, + "step": 7882 + }, + { + "epoch": 0.92, + "learning_rate": 2.5262784929727176e-08, + "logits/chosen": -2.955157995223999, + "logits/rejected": -2.7164571285247803, + "logps/chosen": -72.7317123413086, + "logps/rejected": -238.9005126953125, + "loss": 0.2788, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6832977533340454, + "rewards/margins": 2.0410208702087402, + "rewards/rejected": -2.724318504333496, + "step": 7883 + }, + { + "epoch": 0.92, + "learning_rate": 2.5227353253808904e-08, + "logits/chosen": -2.248410940170288, + "logits/rejected": -2.110419750213623, + "logps/chosen": -192.31283569335938, + "logps/rejected": -270.1246337890625, + "loss": 0.2419, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0254149436950684, + "rewards/margins": 2.2936766147613525, + "rewards/rejected": -3.319091796875, + "step": 7884 + }, + { + "epoch": 0.92, + "learning_rate": 2.5191921577890633e-08, + "logits/chosen": -2.5464251041412354, + "logits/rejected": -2.5784854888916016, + "logps/chosen": -209.05642700195312, + "logps/rejected": -309.631591796875, + "loss": 0.1645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49593672156333923, + "rewards/margins": 3.1813225746154785, + "rewards/rejected": -3.6772589683532715, + "step": 7885 + }, + { + "epoch": 0.92, + "learning_rate": 2.5156489901972365e-08, + "logits/chosen": -2.0244247913360596, + "logits/rejected": -2.161618232727051, + "logps/chosen": -538.9692993164062, + "logps/rejected": -466.417236328125, + "loss": 0.1398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4800707697868347, + "rewards/margins": 3.5814552307128906, + "rewards/rejected": -4.061526298522949, + "step": 7886 + }, + { + "epoch": 0.92, + "learning_rate": 2.512105822605409e-08, + "logits/chosen": -2.6560323238372803, + "logits/rejected": -2.659830093383789, + "logps/chosen": -123.50048065185547, + "logps/rejected": -148.53062438964844, + "loss": 0.3963, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9771144986152649, + "rewards/margins": 1.6141901016235352, + "rewards/rejected": -2.5913047790527344, + "step": 7887 + }, + { + "epoch": 0.92, + "learning_rate": 2.508562655013582e-08, + "logits/chosen": -2.6703877449035645, + "logits/rejected": -2.612598419189453, + "logps/chosen": -182.13963317871094, + "logps/rejected": -213.39883422851562, + "loss": 0.2974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6449593305587769, + "rewards/margins": 1.4322328567504883, + "rewards/rejected": -2.0771920680999756, + "step": 7888 + }, + { + "epoch": 0.92, + "learning_rate": 2.505019487421755e-08, + "logits/chosen": -1.859920859336853, + "logits/rejected": -2.127315044403076, + "logps/chosen": -255.97686767578125, + "logps/rejected": -285.4281005859375, + "loss": 0.3438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7993826866149902, + "rewards/margins": 1.4470783472061157, + "rewards/rejected": -2.2464611530303955, + "step": 7889 + }, + { + "epoch": 0.92, + "learning_rate": 2.5014763198299277e-08, + "logits/chosen": -2.2400808334350586, + "logits/rejected": -2.503457546234131, + "logps/chosen": -351.7080993652344, + "logps/rejected": -190.5353240966797, + "loss": 0.684, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4540064334869385, + "rewards/margins": 1.0180299282073975, + "rewards/rejected": -1.4720362424850464, + "step": 7890 + }, + { + "epoch": 0.92, + "learning_rate": 2.4979331522381006e-08, + "logits/chosen": -2.276614189147949, + "logits/rejected": -2.057448387145996, + "logps/chosen": -318.0711364746094, + "logps/rejected": -394.2580871582031, + "loss": 0.52, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.638512372970581, + "rewards/margins": 1.0929274559020996, + "rewards/rejected": -1.7314398288726807, + "step": 7891 + }, + { + "epoch": 0.92, + "learning_rate": 2.4943899846462738e-08, + "logits/chosen": -2.2769975662231445, + "logits/rejected": -2.121746063232422, + "logps/chosen": -138.58917236328125, + "logps/rejected": -267.1032409667969, + "loss": 0.2992, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40122705698013306, + "rewards/margins": 1.990156888961792, + "rewards/rejected": -2.3913841247558594, + "step": 7892 + }, + { + "epoch": 0.92, + "learning_rate": 2.4908468170544463e-08, + "logits/chosen": -2.5578696727752686, + "logits/rejected": -2.373919725418091, + "logps/chosen": -105.67323303222656, + "logps/rejected": -178.70294189453125, + "loss": 0.5176, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7515531778335571, + "rewards/margins": 1.3891359567642212, + "rewards/rejected": -2.1406891345977783, + "step": 7893 + }, + { + "epoch": 0.92, + "learning_rate": 2.4873036494626195e-08, + "logits/chosen": -2.0119104385375977, + "logits/rejected": -2.369147539138794, + "logps/chosen": -380.342529296875, + "logps/rejected": -281.191650390625, + "loss": 0.7559, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8536868095397949, + "rewards/margins": 1.5256433486938477, + "rewards/rejected": -2.3793303966522217, + "step": 7894 + }, + { + "epoch": 0.92, + "learning_rate": 2.4837604818707924e-08, + "logits/chosen": -2.3726658821105957, + "logits/rejected": -2.3749172687530518, + "logps/chosen": -451.7352294921875, + "logps/rejected": -594.7254638671875, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9132817387580872, + "rewards/margins": 2.7010536193847656, + "rewards/rejected": -3.614335536956787, + "step": 7895 + }, + { + "epoch": 0.92, + "learning_rate": 2.480217314278965e-08, + "logits/chosen": -2.298600196838379, + "logits/rejected": -2.5209572315216064, + "logps/chosen": -305.44720458984375, + "logps/rejected": -177.74185180664062, + "loss": 0.4003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6948826313018799, + "rewards/margins": 1.4177252054214478, + "rewards/rejected": -2.112607955932617, + "step": 7896 + }, + { + "epoch": 0.92, + "learning_rate": 2.476674146687138e-08, + "logits/chosen": -2.823870897293091, + "logits/rejected": -2.8776023387908936, + "logps/chosen": -261.9993896484375, + "logps/rejected": -267.5581359863281, + "loss": 0.4103, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8336429595947266, + "rewards/margins": 3.1213622093200684, + "rewards/rejected": -3.955005168914795, + "step": 7897 + }, + { + "epoch": 0.92, + "learning_rate": 2.473130979095311e-08, + "logits/chosen": -2.432009696960449, + "logits/rejected": -2.1000757217407227, + "logps/chosen": -309.5743103027344, + "logps/rejected": -447.2745361328125, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2706286311149597, + "rewards/margins": 2.2744650840759277, + "rewards/rejected": -2.5450940132141113, + "step": 7898 + }, + { + "epoch": 0.92, + "learning_rate": 2.4695878115034842e-08, + "logits/chosen": -2.00443696975708, + "logits/rejected": -1.992392659187317, + "logps/chosen": -358.02337646484375, + "logps/rejected": -275.7724914550781, + "loss": 0.4547, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3304870128631592, + "rewards/margins": 1.0905416011810303, + "rewards/rejected": -2.4210288524627686, + "step": 7899 + }, + { + "epoch": 0.92, + "learning_rate": 2.4660446439116568e-08, + "logits/chosen": -2.655320167541504, + "logits/rejected": -2.6873185634613037, + "logps/chosen": -206.15830993652344, + "logps/rejected": -223.20849609375, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9752264022827148, + "rewards/margins": 1.9635382890701294, + "rewards/rejected": -2.9387645721435547, + "step": 7900 + }, + { + "epoch": 0.92, + "learning_rate": 2.46250147631983e-08, + "logits/chosen": -2.717519760131836, + "logits/rejected": -2.700474262237549, + "logps/chosen": -127.8447265625, + "logps/rejected": -188.705078125, + "loss": 0.1321, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5006088018417358, + "rewards/margins": 3.2391958236694336, + "rewards/rejected": -3.73980450630188, + "step": 7901 + }, + { + "epoch": 0.92, + "learning_rate": 2.458958308728003e-08, + "logits/chosen": -1.9116932153701782, + "logits/rejected": -2.222883701324463, + "logps/chosen": -348.9131774902344, + "logps/rejected": -233.498046875, + "loss": 0.2679, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9342940449714661, + "rewards/margins": 2.780888795852661, + "rewards/rejected": -3.7151830196380615, + "step": 7902 + }, + { + "epoch": 0.92, + "learning_rate": 2.4554151411361754e-08, + "logits/chosen": -2.5011672973632812, + "logits/rejected": -2.6458537578582764, + "logps/chosen": -302.64996337890625, + "logps/rejected": -250.1312713623047, + "loss": 0.4905, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5306076407432556, + "rewards/margins": 1.8706095218658447, + "rewards/rejected": -2.401216983795166, + "step": 7903 + }, + { + "epoch": 0.92, + "learning_rate": 2.4518719735443486e-08, + "logits/chosen": -2.443427085876465, + "logits/rejected": -2.36769700050354, + "logps/chosen": -309.0700988769531, + "logps/rejected": -181.25990295410156, + "loss": 0.5333, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.124724268913269, + "rewards/margins": 0.9298417568206787, + "rewards/rejected": -2.0545661449432373, + "step": 7904 + }, + { + "epoch": 0.92, + "learning_rate": 2.4483288059525215e-08, + "logits/chosen": -2.0128378868103027, + "logits/rejected": -2.084937810897827, + "logps/chosen": -332.10955810546875, + "logps/rejected": -246.52999877929688, + "loss": 0.7447, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.134190797805786, + "rewards/margins": 1.2977468967437744, + "rewards/rejected": -3.4319374561309814, + "step": 7905 + }, + { + "epoch": 0.92, + "learning_rate": 2.4447856383606943e-08, + "logits/chosen": -2.3353376388549805, + "logits/rejected": -2.271012306213379, + "logps/chosen": -326.5997314453125, + "logps/rejected": -344.6171569824219, + "loss": 0.1638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29515403509140015, + "rewards/margins": 3.186164140701294, + "rewards/rejected": -3.481318235397339, + "step": 7906 + }, + { + "epoch": 0.92, + "learning_rate": 2.4412424707688672e-08, + "logits/chosen": -2.6031789779663086, + "logits/rejected": -2.447949171066284, + "logps/chosen": -258.5350341796875, + "logps/rejected": -226.427978515625, + "loss": 0.706, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4374229907989502, + "rewards/margins": 1.0430896282196045, + "rewards/rejected": -2.4805126190185547, + "step": 7907 + }, + { + "epoch": 0.92, + "learning_rate": 2.4376993031770404e-08, + "logits/chosen": -2.8087270259857178, + "logits/rejected": -2.790100574493408, + "logps/chosen": -141.4102783203125, + "logps/rejected": -155.72772216796875, + "loss": 0.2193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35797542333602905, + "rewards/margins": 3.4364938735961914, + "rewards/rejected": -3.7944693565368652, + "step": 7908 + }, + { + "epoch": 0.92, + "learning_rate": 2.434156135585213e-08, + "logits/chosen": -2.16719651222229, + "logits/rejected": -2.4255967140197754, + "logps/chosen": -244.84304809570312, + "logps/rejected": -204.42742919921875, + "loss": 0.172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14899897575378418, + "rewards/margins": 2.7660322189331055, + "rewards/rejected": -2.9150309562683105, + "step": 7909 + }, + { + "epoch": 0.92, + "learning_rate": 2.430612967993386e-08, + "logits/chosen": -2.1378636360168457, + "logits/rejected": -2.058483839035034, + "logps/chosen": -251.0749053955078, + "logps/rejected": -247.39463806152344, + "loss": 0.2482, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4721890687942505, + "rewards/margins": 2.135995864868164, + "rewards/rejected": -2.608185052871704, + "step": 7910 + }, + { + "epoch": 0.92, + "learning_rate": 2.427069800401559e-08, + "logits/chosen": -2.499842643737793, + "logits/rejected": -2.5841121673583984, + "logps/chosen": -185.98577880859375, + "logps/rejected": -337.7648010253906, + "loss": 0.2465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5679540038108826, + "rewards/margins": 1.837892770767212, + "rewards/rejected": -2.4058468341827393, + "step": 7911 + }, + { + "epoch": 0.92, + "learning_rate": 2.4235266328097316e-08, + "logits/chosen": -2.2288529872894287, + "logits/rejected": -2.246342658996582, + "logps/chosen": -360.1132507324219, + "logps/rejected": -184.73468017578125, + "loss": 0.8862, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4019582271575928, + "rewards/margins": 1.603406548500061, + "rewards/rejected": -3.0053648948669434, + "step": 7912 + }, + { + "epoch": 0.92, + "learning_rate": 2.4199834652179048e-08, + "logits/chosen": -2.1826891899108887, + "logits/rejected": -2.414909601211548, + "logps/chosen": -496.55963134765625, + "logps/rejected": -357.4420471191406, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9701114892959595, + "rewards/margins": 2.3873233795166016, + "rewards/rejected": -3.3574347496032715, + "step": 7913 + }, + { + "epoch": 0.92, + "learning_rate": 2.4164402976260777e-08, + "logits/chosen": -2.1301865577697754, + "logits/rejected": -2.2222049236297607, + "logps/chosen": -283.1365966796875, + "logps/rejected": -266.12677001953125, + "loss": 0.3776, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9253539443016052, + "rewards/margins": 1.5733819007873535, + "rewards/rejected": -2.4987356662750244, + "step": 7914 + }, + { + "epoch": 0.92, + "learning_rate": 2.412897130034251e-08, + "logits/chosen": -2.443904399871826, + "logits/rejected": -2.4630465507507324, + "logps/chosen": -258.90826416015625, + "logps/rejected": -293.7914123535156, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7137433290481567, + "rewards/margins": 3.5591225624084473, + "rewards/rejected": -4.2728657722473145, + "step": 7915 + }, + { + "epoch": 0.92, + "learning_rate": 2.4093539624424234e-08, + "logits/chosen": -2.2282724380493164, + "logits/rejected": -2.0345242023468018, + "logps/chosen": -399.265869140625, + "logps/rejected": -387.71661376953125, + "loss": 0.2353, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0928014516830444, + "rewards/margins": 3.081101179122925, + "rewards/rejected": -4.17390251159668, + "step": 7916 + }, + { + "epoch": 0.92, + "learning_rate": 2.4058107948505963e-08, + "logits/chosen": -3.0315189361572266, + "logits/rejected": -2.989769697189331, + "logps/chosen": -339.30767822265625, + "logps/rejected": -260.9018859863281, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3835554122924805, + "rewards/margins": 3.3323888778686523, + "rewards/rejected": -4.715943813323975, + "step": 7917 + }, + { + "epoch": 0.92, + "learning_rate": 2.4022676272587695e-08, + "logits/chosen": -2.321711778640747, + "logits/rejected": -2.415259599685669, + "logps/chosen": -225.35968017578125, + "logps/rejected": -277.0367431640625, + "loss": 0.3222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5182996988296509, + "rewards/margins": 2.3043363094329834, + "rewards/rejected": -2.8226358890533447, + "step": 7918 + }, + { + "epoch": 0.92, + "learning_rate": 2.398724459666942e-08, + "logits/chosen": -2.5577704906463623, + "logits/rejected": -2.5920004844665527, + "logps/chosen": -168.31614685058594, + "logps/rejected": -161.8942108154297, + "loss": 0.6798, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5825552940368652, + "rewards/margins": 0.847343385219574, + "rewards/rejected": -2.429898500442505, + "step": 7919 + }, + { + "epoch": 0.92, + "learning_rate": 2.3951812920751152e-08, + "logits/chosen": -1.8999004364013672, + "logits/rejected": -1.6054641008377075, + "logps/chosen": -272.4313049316406, + "logps/rejected": -343.7594909667969, + "loss": 0.3802, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9420961141586304, + "rewards/margins": 1.3364864587783813, + "rewards/rejected": -2.2785825729370117, + "step": 7920 + }, + { + "epoch": 0.92, + "learning_rate": 2.391638124483288e-08, + "logits/chosen": -2.4054789543151855, + "logits/rejected": -2.3733601570129395, + "logps/chosen": -291.7012023925781, + "logps/rejected": -268.06207275390625, + "loss": 0.4918, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37824904918670654, + "rewards/margins": 1.5260368585586548, + "rewards/rejected": -1.9042860269546509, + "step": 7921 + }, + { + "epoch": 0.92, + "learning_rate": 2.3880949568914607e-08, + "logits/chosen": -2.3120837211608887, + "logits/rejected": -2.5042855739593506, + "logps/chosen": -174.71795654296875, + "logps/rejected": -267.41485595703125, + "loss": 0.2148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22401051223278046, + "rewards/margins": 2.9427590370178223, + "rewards/rejected": -3.1667697429656982, + "step": 7922 + }, + { + "epoch": 0.92, + "learning_rate": 2.384551789299634e-08, + "logits/chosen": -2.602632761001587, + "logits/rejected": -2.6612229347229004, + "logps/chosen": -156.10350036621094, + "logps/rejected": -236.39727783203125, + "loss": 0.1849, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2153866291046143, + "rewards/margins": 3.019911766052246, + "rewards/rejected": -4.235298156738281, + "step": 7923 + }, + { + "epoch": 0.92, + "learning_rate": 2.3810086217078067e-08, + "logits/chosen": -1.9638186693191528, + "logits/rejected": -2.304720878601074, + "logps/chosen": -260.29376220703125, + "logps/rejected": -242.98907470703125, + "loss": 0.3161, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22403855621814728, + "rewards/margins": 2.5980265140533447, + "rewards/rejected": -2.8220651149749756, + "step": 7924 + }, + { + "epoch": 0.92, + "learning_rate": 2.3774654541159793e-08, + "logits/chosen": -2.0644359588623047, + "logits/rejected": -2.0033154487609863, + "logps/chosen": -395.11846923828125, + "logps/rejected": -372.88330078125, + "loss": 0.4911, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0338728427886963, + "rewards/margins": 1.3245233297348022, + "rewards/rejected": -2.358396053314209, + "step": 7925 + }, + { + "epoch": 0.92, + "learning_rate": 2.3739222865241525e-08, + "logits/chosen": -2.635441303253174, + "logits/rejected": -2.605717420578003, + "logps/chosen": -284.25518798828125, + "logps/rejected": -260.8153991699219, + "loss": 0.265, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.831153929233551, + "rewards/margins": 2.0157456398010254, + "rewards/rejected": -2.8468995094299316, + "step": 7926 + }, + { + "epoch": 0.92, + "learning_rate": 2.3703791189323254e-08, + "logits/chosen": -2.5409960746765137, + "logits/rejected": -2.225231409072876, + "logps/chosen": -269.84112548828125, + "logps/rejected": -351.6633605957031, + "loss": 0.4643, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4332435131072998, + "rewards/margins": 2.1261606216430664, + "rewards/rejected": -3.5594043731689453, + "step": 7927 + }, + { + "epoch": 0.92, + "learning_rate": 2.3668359513404982e-08, + "logits/chosen": -2.5507864952087402, + "logits/rejected": -2.5258522033691406, + "logps/chosen": -159.28123474121094, + "logps/rejected": -188.44107055664062, + "loss": 0.506, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8579781651496887, + "rewards/margins": 2.154188394546509, + "rewards/rejected": -3.0121665000915527, + "step": 7928 + }, + { + "epoch": 0.92, + "learning_rate": 2.363292783748671e-08, + "logits/chosen": -1.9448347091674805, + "logits/rejected": -2.4302451610565186, + "logps/chosen": -391.88494873046875, + "logps/rejected": -270.1730041503906, + "loss": 0.7114, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1585097312927246, + "rewards/margins": 0.6377213597297668, + "rewards/rejected": -1.7962310314178467, + "step": 7929 + }, + { + "epoch": 0.92, + "learning_rate": 2.3597496161568443e-08, + "logits/chosen": -2.4405508041381836, + "logits/rejected": -2.615535259246826, + "logps/chosen": -274.6668395996094, + "logps/rejected": -146.52352905273438, + "loss": 0.3987, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.44402971863746643, + "rewards/margins": 1.398289442062378, + "rewards/rejected": -1.8423190116882324, + "step": 7930 + }, + { + "epoch": 0.92, + "learning_rate": 2.356206448565017e-08, + "logits/chosen": -2.7612464427948, + "logits/rejected": -2.4905612468719482, + "logps/chosen": -186.17674255371094, + "logps/rejected": -233.54598999023438, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22131508588790894, + "rewards/margins": 2.2624282836914062, + "rewards/rejected": -2.483743667602539, + "step": 7931 + }, + { + "epoch": 0.92, + "learning_rate": 2.3526632809731897e-08, + "logits/chosen": -2.012943744659424, + "logits/rejected": -2.272374391555786, + "logps/chosen": -290.6260070800781, + "logps/rejected": -288.4266662597656, + "loss": 0.7443, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.996061384677887, + "rewards/margins": 1.260544776916504, + "rewards/rejected": -2.256605863571167, + "step": 7932 + }, + { + "epoch": 0.92, + "learning_rate": 2.349120113381363e-08, + "logits/chosen": -2.3637595176696777, + "logits/rejected": -2.1447930335998535, + "logps/chosen": -299.8051452636719, + "logps/rejected": -333.3448486328125, + "loss": 0.8299, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.295386552810669, + "rewards/margins": 2.759638786315918, + "rewards/rejected": -4.055025100708008, + "step": 7933 + }, + { + "epoch": 0.92, + "learning_rate": 2.3455769457895358e-08, + "logits/chosen": -2.504493474960327, + "logits/rejected": -2.4831273555755615, + "logps/chosen": -111.75584411621094, + "logps/rejected": -254.7050323486328, + "loss": 0.6642, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7831162214279175, + "rewards/margins": 1.4520812034606934, + "rewards/rejected": -2.2351975440979004, + "step": 7934 + }, + { + "epoch": 0.92, + "learning_rate": 2.3420337781977087e-08, + "logits/chosen": -2.5030336380004883, + "logits/rejected": -2.653775215148926, + "logps/chosen": -237.0608367919922, + "logps/rejected": -316.8910827636719, + "loss": 0.2737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9311028122901917, + "rewards/margins": 2.081559419631958, + "rewards/rejected": -3.012662410736084, + "step": 7935 + }, + { + "epoch": 0.92, + "learning_rate": 2.3384906106058816e-08, + "logits/chosen": -2.766071081161499, + "logits/rejected": -2.440854787826538, + "logps/chosen": -128.44578552246094, + "logps/rejected": -265.63031005859375, + "loss": 0.5464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9430932998657227, + "rewards/margins": 0.9608340859413147, + "rewards/rejected": -1.9039273262023926, + "step": 7936 + }, + { + "epoch": 0.92, + "learning_rate": 2.3349474430140548e-08, + "logits/chosen": -2.6547579765319824, + "logits/rejected": -2.2129507064819336, + "logps/chosen": -126.23860168457031, + "logps/rejected": -276.2158203125, + "loss": 0.329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.75138258934021, + "rewards/margins": 1.5121071338653564, + "rewards/rejected": -2.2634897232055664, + "step": 7937 + }, + { + "epoch": 0.92, + "learning_rate": 2.3314042754222273e-08, + "logits/chosen": -2.5367753505706787, + "logits/rejected": -2.7656497955322266, + "logps/chosen": -443.9340515136719, + "logps/rejected": -247.61654663085938, + "loss": 0.2357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3303983509540558, + "rewards/margins": 2.048977851867676, + "rewards/rejected": -2.37937593460083, + "step": 7938 + }, + { + "epoch": 0.92, + "learning_rate": 2.3278611078304002e-08, + "logits/chosen": -2.0574212074279785, + "logits/rejected": -2.099177837371826, + "logps/chosen": -415.122314453125, + "logps/rejected": -198.55160522460938, + "loss": 0.592, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0559746026992798, + "rewards/margins": 0.7258423566818237, + "rewards/rejected": -1.7818169593811035, + "step": 7939 + }, + { + "epoch": 0.92, + "learning_rate": 2.3243179402385734e-08, + "logits/chosen": -2.9010469913482666, + "logits/rejected": -2.8756604194641113, + "logps/chosen": -332.96893310546875, + "logps/rejected": -233.5495147705078, + "loss": 0.1476, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3168620765209198, + "rewards/margins": 4.073558330535889, + "rewards/rejected": -4.390420436859131, + "step": 7940 + }, + { + "epoch": 0.92, + "learning_rate": 2.320774772646746e-08, + "logits/chosen": -2.3103156089782715, + "logits/rejected": -2.54202938079834, + "logps/chosen": -171.20233154296875, + "logps/rejected": -238.95062255859375, + "loss": 0.152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24492357671260834, + "rewards/margins": 2.239535331726074, + "rewards/rejected": -2.4844589233398438, + "step": 7941 + }, + { + "epoch": 0.92, + "learning_rate": 2.317231605054919e-08, + "logits/chosen": -2.677290201187134, + "logits/rejected": -2.575899839401245, + "logps/chosen": -290.17620849609375, + "logps/rejected": -363.729736328125, + "loss": 0.1541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9728565812110901, + "rewards/margins": 2.401655673980713, + "rewards/rejected": -3.374512195587158, + "step": 7942 + }, + { + "epoch": 0.92, + "learning_rate": 2.313688437463092e-08, + "logits/chosen": -2.1409707069396973, + "logits/rejected": -2.2890095710754395, + "logps/chosen": -201.75604248046875, + "logps/rejected": -269.7251281738281, + "loss": 1.8355, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.644479751586914, + "rewards/margins": 0.465520977973938, + "rewards/rejected": -3.1100008487701416, + "step": 7943 + }, + { + "epoch": 0.92, + "learning_rate": 2.3101452698712645e-08, + "logits/chosen": -2.4376142024993896, + "logits/rejected": -2.4085497856140137, + "logps/chosen": -314.70452880859375, + "logps/rejected": -279.63714599609375, + "loss": 0.2087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9749330282211304, + "rewards/margins": 2.119748830795288, + "rewards/rejected": -3.094681739807129, + "step": 7944 + }, + { + "epoch": 0.92, + "learning_rate": 2.3066021022794378e-08, + "logits/chosen": -2.178804874420166, + "logits/rejected": -2.14040207862854, + "logps/chosen": -337.7563781738281, + "logps/rejected": -306.2362060546875, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30014145374298096, + "rewards/margins": 3.6921420097351074, + "rewards/rejected": -3.992283344268799, + "step": 7945 + }, + { + "epoch": 0.92, + "learning_rate": 2.3030589346876106e-08, + "logits/chosen": -1.9577772617340088, + "logits/rejected": -2.1096670627593994, + "logps/chosen": -212.10662841796875, + "logps/rejected": -185.49472045898438, + "loss": 0.4694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5436193943023682, + "rewards/margins": 1.1928845643997192, + "rewards/rejected": -1.7365039587020874, + "step": 7946 + }, + { + "epoch": 0.92, + "learning_rate": 2.2995157670957835e-08, + "logits/chosen": -2.908970355987549, + "logits/rejected": -2.829209566116333, + "logps/chosen": -260.43890380859375, + "logps/rejected": -295.95294189453125, + "loss": 0.2402, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0091475248336792, + "rewards/margins": 3.479346752166748, + "rewards/rejected": -4.488494873046875, + "step": 7947 + }, + { + "epoch": 0.92, + "learning_rate": 2.2959725995039564e-08, + "logits/chosen": -2.686594009399414, + "logits/rejected": -2.343799352645874, + "logps/chosen": -158.93704223632812, + "logps/rejected": -220.22451782226562, + "loss": 0.3775, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3530195951461792, + "rewards/margins": 1.897355079650879, + "rewards/rejected": -3.2503745555877686, + "step": 7948 + }, + { + "epoch": 0.92, + "learning_rate": 2.2924294319121296e-08, + "logits/chosen": -2.818746328353882, + "logits/rejected": -2.828338623046875, + "logps/chosen": -259.7038269042969, + "logps/rejected": -171.31246948242188, + "loss": 0.5046, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.615162193775177, + "rewards/margins": 1.5230515003204346, + "rewards/rejected": -2.138213634490967, + "step": 7949 + }, + { + "epoch": 0.92, + "learning_rate": 2.288886264320302e-08, + "logits/chosen": -1.772756576538086, + "logits/rejected": -1.8191776275634766, + "logps/chosen": -311.5915832519531, + "logps/rejected": -282.1527404785156, + "loss": 0.2818, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6765326857566833, + "rewards/margins": 1.7645983695983887, + "rewards/rejected": -2.441131114959717, + "step": 7950 + }, + { + "epoch": 0.92, + "learning_rate": 2.285343096728475e-08, + "logits/chosen": -1.9832357168197632, + "logits/rejected": -2.0177032947540283, + "logps/chosen": -334.6487731933594, + "logps/rejected": -279.83563232421875, + "loss": 0.3283, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7178694009780884, + "rewards/margins": 1.4255225658416748, + "rewards/rejected": -3.1433918476104736, + "step": 7951 + }, + { + "epoch": 0.93, + "learning_rate": 2.2817999291366482e-08, + "logits/chosen": -2.804170608520508, + "logits/rejected": -2.8846681118011475, + "logps/chosen": -261.3087463378906, + "logps/rejected": -304.3049011230469, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6728501915931702, + "rewards/margins": 2.2403061389923096, + "rewards/rejected": -2.913156270980835, + "step": 7952 + }, + { + "epoch": 0.93, + "learning_rate": 2.278256761544821e-08, + "logits/chosen": -2.4191031455993652, + "logits/rejected": -2.369968891143799, + "logps/chosen": -200.87057495117188, + "logps/rejected": -198.14102172851562, + "loss": 0.615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38278234004974365, + "rewards/margins": 1.8026577234268188, + "rewards/rejected": -2.1854400634765625, + "step": 7953 + }, + { + "epoch": 0.93, + "learning_rate": 2.2747135939529936e-08, + "logits/chosen": -2.8239660263061523, + "logits/rejected": -2.689587354660034, + "logps/chosen": -159.35874938964844, + "logps/rejected": -267.359130859375, + "loss": 0.2505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4760451018810272, + "rewards/margins": 3.2247085571289062, + "rewards/rejected": -3.7007534503936768, + "step": 7954 + }, + { + "epoch": 0.93, + "learning_rate": 2.2711704263611668e-08, + "logits/chosen": -2.252699613571167, + "logits/rejected": -2.1995182037353516, + "logps/chosen": -131.16091918945312, + "logps/rejected": -185.00538635253906, + "loss": 0.414, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0721017122268677, + "rewards/margins": 1.4404927492141724, + "rewards/rejected": -2.51259446144104, + "step": 7955 + }, + { + "epoch": 0.93, + "learning_rate": 2.2676272587693397e-08, + "logits/chosen": -2.2172956466674805, + "logits/rejected": -2.5874757766723633, + "logps/chosen": -517.50537109375, + "logps/rejected": -310.1287841796875, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18548892438411713, + "rewards/margins": 2.702253818511963, + "rewards/rejected": -2.887742757797241, + "step": 7956 + }, + { + "epoch": 0.93, + "learning_rate": 2.2640840911775126e-08, + "logits/chosen": -2.8744606971740723, + "logits/rejected": -2.926292896270752, + "logps/chosen": -83.27838134765625, + "logps/rejected": -150.31118774414062, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9619206786155701, + "rewards/margins": 1.2763993740081787, + "rewards/rejected": -2.2383198738098145, + "step": 7957 + }, + { + "epoch": 0.93, + "learning_rate": 2.2605409235856854e-08, + "logits/chosen": -2.554727554321289, + "logits/rejected": -2.803117036819458, + "logps/chosen": -243.24822998046875, + "logps/rejected": -207.26861572265625, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6724598407745361, + "rewards/margins": 1.8769890069961548, + "rewards/rejected": -2.5494489669799805, + "step": 7958 + }, + { + "epoch": 0.93, + "learning_rate": 2.2569977559938587e-08, + "logits/chosen": -1.783987283706665, + "logits/rejected": -1.9412462711334229, + "logps/chosen": -454.9590759277344, + "logps/rejected": -298.49041748046875, + "loss": 0.4846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5409785509109497, + "rewards/margins": 1.830592155456543, + "rewards/rejected": -2.3715708255767822, + "step": 7959 + }, + { + "epoch": 0.93, + "learning_rate": 2.2534545884020312e-08, + "logits/chosen": -1.743109941482544, + "logits/rejected": -1.922055959701538, + "logps/chosen": -283.1545104980469, + "logps/rejected": -294.14691162109375, + "loss": 0.8222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8095879554748535, + "rewards/margins": 2.9660778045654297, + "rewards/rejected": -3.775665760040283, + "step": 7960 + }, + { + "epoch": 0.93, + "learning_rate": 2.249911420810204e-08, + "logits/chosen": -2.4052834510803223, + "logits/rejected": -2.3537933826446533, + "logps/chosen": -350.47259521484375, + "logps/rejected": -266.0898742675781, + "loss": 0.4271, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1431539058685303, + "rewards/margins": 1.0197665691375732, + "rewards/rejected": -2.1629204750061035, + "step": 7961 + }, + { + "epoch": 0.93, + "learning_rate": 2.2463682532183773e-08, + "logits/chosen": -2.596705675125122, + "logits/rejected": -2.5372259616851807, + "logps/chosen": -348.93829345703125, + "logps/rejected": -418.7188415527344, + "loss": 0.2252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8061137795448303, + "rewards/margins": 1.731471061706543, + "rewards/rejected": -2.5375847816467285, + "step": 7962 + }, + { + "epoch": 0.93, + "learning_rate": 2.2428250856265498e-08, + "logits/chosen": -2.0659141540527344, + "logits/rejected": -2.0213091373443604, + "logps/chosen": -249.2177276611328, + "logps/rejected": -263.25885009765625, + "loss": 0.4614, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7510859966278076, + "rewards/margins": 2.186274290084839, + "rewards/rejected": -3.9373602867126465, + "step": 7963 + }, + { + "epoch": 0.93, + "learning_rate": 2.239281918034723e-08, + "logits/chosen": -2.4914402961730957, + "logits/rejected": -2.4141383171081543, + "logps/chosen": -208.51651000976562, + "logps/rejected": -238.66949462890625, + "loss": 0.3218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8023133277893066, + "rewards/margins": 2.325784206390381, + "rewards/rejected": -3.1280975341796875, + "step": 7964 + }, + { + "epoch": 0.93, + "learning_rate": 2.235738750442896e-08, + "logits/chosen": -2.380258560180664, + "logits/rejected": -2.4839227199554443, + "logps/chosen": -253.82901000976562, + "logps/rejected": -200.03713989257812, + "loss": 0.4398, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15005724132061005, + "rewards/margins": 2.073590040206909, + "rewards/rejected": -2.223647117614746, + "step": 7965 + }, + { + "epoch": 0.93, + "learning_rate": 2.2321955828510684e-08, + "logits/chosen": -2.410712957382202, + "logits/rejected": -2.053783416748047, + "logps/chosen": -146.37596130371094, + "logps/rejected": -260.8443298339844, + "loss": 0.8249, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6185486316680908, + "rewards/margins": 1.801001787185669, + "rewards/rejected": -2.419550657272339, + "step": 7966 + }, + { + "epoch": 0.93, + "learning_rate": 2.2286524152592416e-08, + "logits/chosen": -2.784095287322998, + "logits/rejected": -2.690244674682617, + "logps/chosen": -246.8460693359375, + "logps/rejected": -288.37200927734375, + "loss": 0.3269, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0475585460662842, + "rewards/margins": 1.824329137802124, + "rewards/rejected": -2.871887683868408, + "step": 7967 + }, + { + "epoch": 0.93, + "learning_rate": 2.2251092476674145e-08, + "logits/chosen": -2.003535509109497, + "logits/rejected": -1.7436556816101074, + "logps/chosen": -365.09381103515625, + "logps/rejected": -479.1763916015625, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37955570220947266, + "rewards/margins": 4.792245388031006, + "rewards/rejected": -5.17180061340332, + "step": 7968 + }, + { + "epoch": 0.93, + "learning_rate": 2.2215660800755874e-08, + "logits/chosen": -2.5739240646362305, + "logits/rejected": -2.5221571922302246, + "logps/chosen": -180.19772338867188, + "logps/rejected": -221.1044921875, + "loss": 0.338, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0150916576385498, + "rewards/margins": 2.4401440620422363, + "rewards/rejected": -3.455235481262207, + "step": 7969 + }, + { + "epoch": 0.93, + "learning_rate": 2.2180229124837603e-08, + "logits/chosen": -2.3289341926574707, + "logits/rejected": -2.7526071071624756, + "logps/chosen": -307.6087646484375, + "logps/rejected": -287.57415771484375, + "loss": 0.4436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8485410809516907, + "rewards/margins": 1.780888557434082, + "rewards/rejected": -2.629429578781128, + "step": 7970 + }, + { + "epoch": 0.93, + "learning_rate": 2.2144797448919335e-08, + "logits/chosen": -2.5578601360321045, + "logits/rejected": -2.614640951156616, + "logps/chosen": -315.7478332519531, + "logps/rejected": -285.97369384765625, + "loss": 0.8171, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0711640119552612, + "rewards/margins": 0.7397758960723877, + "rewards/rejected": -1.810939908027649, + "step": 7971 + }, + { + "epoch": 0.93, + "learning_rate": 2.2109365773001063e-08, + "logits/chosen": -2.289844512939453, + "logits/rejected": -2.2932121753692627, + "logps/chosen": -341.7578125, + "logps/rejected": -297.2193603515625, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.33973026275634766, + "rewards/margins": 3.5787386894226074, + "rewards/rejected": -3.918468952178955, + "step": 7972 + }, + { + "epoch": 0.93, + "learning_rate": 2.207393409708279e-08, + "logits/chosen": -2.584620952606201, + "logits/rejected": -2.5750203132629395, + "logps/chosen": -237.507080078125, + "logps/rejected": -319.0863037109375, + "loss": 0.1713, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39611050486564636, + "rewards/margins": 3.1721997261047363, + "rewards/rejected": -3.568310022354126, + "step": 7973 + }, + { + "epoch": 0.93, + "learning_rate": 2.203850242116452e-08, + "logits/chosen": -2.3300206661224365, + "logits/rejected": -2.408979654312134, + "logps/chosen": -359.6336669921875, + "logps/rejected": -347.79632568359375, + "loss": 0.2009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.211295023560524, + "rewards/margins": 3.5260958671569824, + "rewards/rejected": -3.7373909950256348, + "step": 7974 + }, + { + "epoch": 0.93, + "learning_rate": 2.200307074524625e-08, + "logits/chosen": -2.739744186401367, + "logits/rejected": -2.729660987854004, + "logps/chosen": -265.20550537109375, + "logps/rejected": -289.67852783203125, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16416095197200775, + "rewards/margins": 4.302430152893066, + "rewards/rejected": -4.466590881347656, + "step": 7975 + }, + { + "epoch": 0.93, + "learning_rate": 2.196763906932798e-08, + "logits/chosen": -2.1292455196380615, + "logits/rejected": -2.473879337310791, + "logps/chosen": -342.6792907714844, + "logps/rejected": -270.7359313964844, + "loss": 0.27, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1868382692337036, + "rewards/margins": 2.839244842529297, + "rewards/rejected": -4.026083469390869, + "step": 7976 + }, + { + "epoch": 0.93, + "learning_rate": 2.1932207393409707e-08, + "logits/chosen": -2.653928279876709, + "logits/rejected": -2.7606658935546875, + "logps/chosen": -372.07000732421875, + "logps/rejected": -214.67076110839844, + "loss": 0.1586, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.451236367225647, + "rewards/margins": 2.9641218185424805, + "rewards/rejected": -3.415358543395996, + "step": 7977 + }, + { + "epoch": 0.93, + "learning_rate": 2.189677571749144e-08, + "logits/chosen": -2.7326440811157227, + "logits/rejected": -2.905953884124756, + "logps/chosen": -403.1810302734375, + "logps/rejected": -224.84967041015625, + "loss": 0.375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2886511087417603, + "rewards/margins": 1.4819252490997314, + "rewards/rejected": -2.770576238632202, + "step": 7978 + }, + { + "epoch": 0.93, + "learning_rate": 2.1861344041573165e-08, + "logits/chosen": -2.035466194152832, + "logits/rejected": -2.008164644241333, + "logps/chosen": -304.0318908691406, + "logps/rejected": -288.59332275390625, + "loss": 0.7338, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.893661618232727, + "rewards/margins": 1.662966012954712, + "rewards/rejected": -3.5566277503967285, + "step": 7979 + }, + { + "epoch": 0.93, + "learning_rate": 2.1825912365654893e-08, + "logits/chosen": -2.045381546020508, + "logits/rejected": -2.4216904640197754, + "logps/chosen": -414.32611083984375, + "logps/rejected": -316.7781066894531, + "loss": 0.3726, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05281589925289154, + "rewards/margins": 1.7778620719909668, + "rewards/rejected": -1.83067786693573, + "step": 7980 + }, + { + "epoch": 0.93, + "learning_rate": 2.1790480689736625e-08, + "logits/chosen": -2.1544790267944336, + "logits/rejected": -2.2158472537994385, + "logps/chosen": -194.3736114501953, + "logps/rejected": -240.66116333007812, + "loss": 1.3612, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.400655746459961, + "rewards/margins": -0.1296863555908203, + "rewards/rejected": -2.2709696292877197, + "step": 7981 + }, + { + "epoch": 0.93, + "learning_rate": 2.175504901381835e-08, + "logits/chosen": -2.287816286087036, + "logits/rejected": -2.358912706375122, + "logps/chosen": -128.73123168945312, + "logps/rejected": -283.630615234375, + "loss": 0.7669, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3471447229385376, + "rewards/margins": 1.949524164199829, + "rewards/rejected": -3.296668767929077, + "step": 7982 + }, + { + "epoch": 0.93, + "learning_rate": 2.1719617337900083e-08, + "logits/chosen": -1.987858533859253, + "logits/rejected": -2.1607489585876465, + "logps/chosen": -418.91943359375, + "logps/rejected": -358.042236328125, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08607292175292969, + "rewards/margins": 3.7297887802124023, + "rewards/rejected": -3.815861940383911, + "step": 7983 + }, + { + "epoch": 0.93, + "learning_rate": 2.1684185661981812e-08, + "logits/chosen": -2.2343132495880127, + "logits/rejected": -2.245279550552368, + "logps/chosen": -209.7129669189453, + "logps/rejected": -276.41668701171875, + "loss": 0.342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8181359767913818, + "rewards/margins": 2.3515584468841553, + "rewards/rejected": -3.169694423675537, + "step": 7984 + }, + { + "epoch": 0.93, + "learning_rate": 2.1648753986063537e-08, + "logits/chosen": -2.4495882987976074, + "logits/rejected": -2.008030652999878, + "logps/chosen": -117.72089385986328, + "logps/rejected": -338.3353576660156, + "loss": 0.226, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9642826914787292, + "rewards/margins": 3.2623400688171387, + "rewards/rejected": -4.226623058319092, + "step": 7985 + }, + { + "epoch": 0.93, + "learning_rate": 2.161332231014527e-08, + "logits/chosen": -2.6385602951049805, + "logits/rejected": -2.756490707397461, + "logps/chosen": -356.8431091308594, + "logps/rejected": -355.6638488769531, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7990865707397461, + "rewards/margins": 2.8925833702087402, + "rewards/rejected": -3.6916699409484863, + "step": 7986 + }, + { + "epoch": 0.93, + "learning_rate": 2.1577890634226998e-08, + "logits/chosen": -2.134584426879883, + "logits/rejected": -2.2066569328308105, + "logps/chosen": -280.8466796875, + "logps/rejected": -295.53448486328125, + "loss": 0.4146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4595000743865967, + "rewards/margins": 1.5076870918273926, + "rewards/rejected": -1.9671871662139893, + "step": 7987 + }, + { + "epoch": 0.93, + "learning_rate": 2.154245895830873e-08, + "logits/chosen": -2.2359235286712646, + "logits/rejected": -2.0676114559173584, + "logps/chosen": -253.96841430664062, + "logps/rejected": -346.63800048828125, + "loss": 0.7587, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.229308843612671, + "rewards/margins": 1.6873306035995483, + "rewards/rejected": -2.9166393280029297, + "step": 7988 + }, + { + "epoch": 0.93, + "learning_rate": 2.1507027282390455e-08, + "logits/chosen": -2.272461414337158, + "logits/rejected": -2.136066198348999, + "logps/chosen": -338.4999084472656, + "logps/rejected": -309.7479248046875, + "loss": 0.3871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1676366627216339, + "rewards/margins": 1.760820746421814, + "rewards/rejected": -1.928457498550415, + "step": 7989 + }, + { + "epoch": 0.93, + "learning_rate": 2.1471595606472184e-08, + "logits/chosen": -2.375124216079712, + "logits/rejected": -2.458726406097412, + "logps/chosen": -241.36647033691406, + "logps/rejected": -171.52743530273438, + "loss": 0.4824, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.031467944383621216, + "rewards/margins": 1.833339810371399, + "rewards/rejected": -1.8018720149993896, + "step": 7990 + }, + { + "epoch": 0.93, + "learning_rate": 2.1436163930553916e-08, + "logits/chosen": -2.494670867919922, + "logits/rejected": -2.6911003589630127, + "logps/chosen": -283.97003173828125, + "logps/rejected": -270.91180419921875, + "loss": 0.32, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.31934550404548645, + "rewards/margins": 1.9394376277923584, + "rewards/rejected": -2.2587833404541016, + "step": 7991 + }, + { + "epoch": 0.93, + "learning_rate": 2.140073225463564e-08, + "logits/chosen": -2.3459160327911377, + "logits/rejected": -2.311495304107666, + "logps/chosen": -181.8314208984375, + "logps/rejected": -226.3282470703125, + "loss": 0.2661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8728684782981873, + "rewards/margins": 2.0685901641845703, + "rewards/rejected": -2.9414589405059814, + "step": 7992 + }, + { + "epoch": 0.93, + "learning_rate": 2.1365300578717374e-08, + "logits/chosen": -2.6651415824890137, + "logits/rejected": -2.580374002456665, + "logps/chosen": -150.93251037597656, + "logps/rejected": -222.8038330078125, + "loss": 0.2834, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.236695408821106, + "rewards/margins": 2.008108139038086, + "rewards/rejected": -3.2448036670684814, + "step": 7993 + }, + { + "epoch": 0.93, + "learning_rate": 2.1329868902799102e-08, + "logits/chosen": -2.3615214824676514, + "logits/rejected": -2.3639745712280273, + "logps/chosen": -149.2554931640625, + "logps/rejected": -181.45912170410156, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7607493996620178, + "rewards/margins": 3.8407835960388184, + "rewards/rejected": -4.601532936096191, + "step": 7994 + }, + { + "epoch": 0.93, + "learning_rate": 2.1294437226880828e-08, + "logits/chosen": -2.4990766048431396, + "logits/rejected": -2.559098958969116, + "logps/chosen": -272.220947265625, + "logps/rejected": -212.87466430664062, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9893308877944946, + "rewards/margins": 1.2760288715362549, + "rewards/rejected": -2.26535964012146, + "step": 7995 + }, + { + "epoch": 0.93, + "learning_rate": 2.125900555096256e-08, + "logits/chosen": -2.244805335998535, + "logits/rejected": -1.9662052392959595, + "logps/chosen": -243.71119689941406, + "logps/rejected": -303.9477844238281, + "loss": 0.3036, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2401723861694336, + "rewards/margins": 2.5939159393310547, + "rewards/rejected": -3.8340883255004883, + "step": 7996 + }, + { + "epoch": 0.93, + "learning_rate": 2.122357387504429e-08, + "logits/chosen": -2.760526180267334, + "logits/rejected": -2.869185447692871, + "logps/chosen": -172.59080505371094, + "logps/rejected": -319.48590087890625, + "loss": 0.2396, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2546253204345703, + "rewards/margins": 4.089526176452637, + "rewards/rejected": -5.344151496887207, + "step": 7997 + }, + { + "epoch": 0.93, + "learning_rate": 2.1188142199126017e-08, + "logits/chosen": -2.648529529571533, + "logits/rejected": -2.6099541187286377, + "logps/chosen": -201.4505615234375, + "logps/rejected": -146.0668182373047, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1993567943572998, + "rewards/margins": 1.903824806213379, + "rewards/rejected": -3.1031813621520996, + "step": 7998 + }, + { + "epoch": 0.93, + "learning_rate": 2.1152710523207746e-08, + "logits/chosen": -2.3633294105529785, + "logits/rejected": -2.564870834350586, + "logps/chosen": -248.1110076904297, + "logps/rejected": -243.41659545898438, + "loss": 0.7302, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6178865432739258, + "rewards/margins": 1.0867955684661865, + "rewards/rejected": -2.7046823501586914, + "step": 7999 + }, + { + "epoch": 0.93, + "learning_rate": 2.1117278847289478e-08, + "logits/chosen": -1.9721966981887817, + "logits/rejected": -2.3795511722564697, + "logps/chosen": -238.2112579345703, + "logps/rejected": -158.1881103515625, + "loss": 0.3897, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.844830334186554, + "rewards/margins": 2.080979824066162, + "rewards/rejected": -2.9258103370666504, + "step": 8000 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -1.7372090816497803, + "eval_logits/rejected": -1.7367677688598633, + "eval_logps/chosen": -278.76385498046875, + "eval_logps/rejected": -279.25750732421875, + "eval_loss": 0.3668655753135681, + "eval_rewards/accuracies": 0.8491379022598267, + "eval_rewards/chosen": -0.6598379015922546, + "eval_rewards/margins": 2.1652562618255615, + "eval_rewards/rejected": -2.825094223022461, + "eval_runtime": 237.2001, + "eval_samples_per_second": 2.93, + "eval_steps_per_second": 1.467, + "step": 8000 + } + ], + "logging_steps": 1, + "max_steps": 8596, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}