{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.235602094240838e-09, "logits/chosen": -1.3201165199279785, "logits/rejected": -1.2275193929672241, "logps/chosen": -2993.4990234375, "logps/rejected": -2222.55078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.2356020942408376e-08, "logits/chosen": -1.2813271284103394, "logits/rejected": -1.2465020418167114, "logps/chosen": -3047.636474609375, "logps/rejected": -2742.105712890625, "loss": 0.6973, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.00026022063684649765, "rewards/margins": 0.0008929346804507077, "rewards/rejected": -0.0006327141309157014, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.0471204188481675e-07, "logits/chosen": -1.2586185932159424, "logits/rejected": -1.1957629919052124, "logps/chosen": -2689.84716796875, "logps/rejected": -2126.1083984375, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.014919064939022064, "rewards/margins": 0.006186266429722309, "rewards/rejected": 0.008732798509299755, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.5706806282722514e-07, "logits/chosen": -1.175875186920166, "logits/rejected": -1.1656105518341064, "logps/chosen": -2198.431640625, "logps/rejected": -2021.9176025390625, "loss": 0.7049, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0053156702779233456, "rewards/margins": -0.05735307186841965, "rewards/rejected": 0.05203740671277046, "step": 30 }, { "epoch": 0.02, "learning_rate": 2.094240837696335e-07, "logits/chosen": -1.1858023405075073, "logits/rejected": -1.1230406761169434, "logps/chosen": -2056.973388671875, "logps/rejected": -2170.3056640625, "loss": 0.6906, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.1324843466281891, "rewards/margins": -0.016001610085368156, "rewards/rejected": 0.1484859436750412, "step": 40 }, { "epoch": 0.03, "learning_rate": 2.6178010471204185e-07, "logits/chosen": -1.2066991329193115, "logits/rejected": -1.15940260887146, "logps/chosen": -2678.28515625, "logps/rejected": -2157.86376953125, "loss": 0.6707, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.32956749200820923, "rewards/margins": 0.08421512693166733, "rewards/rejected": 0.2453523427248001, "step": 50 }, { "epoch": 0.03, "learning_rate": 3.1413612565445027e-07, "logits/chosen": -1.2342027425765991, "logits/rejected": -1.1995573043823242, "logps/chosen": -2410.271484375, "logps/rejected": -2036.266845703125, "loss": 0.6833, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.37566477060317993, "rewards/margins": 0.07754239439964294, "rewards/rejected": 0.2981223464012146, "step": 60 }, { "epoch": 0.04, "learning_rate": 3.6649214659685864e-07, "logits/chosen": -1.1794008016586304, "logits/rejected": -1.1591062545776367, "logps/chosen": -2638.678955078125, "logps/rejected": -2372.677001953125, "loss": 0.6778, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.43634381890296936, "rewards/margins": 0.0520954504609108, "rewards/rejected": 0.38424837589263916, "step": 70 }, { "epoch": 0.04, "learning_rate": 4.18848167539267e-07, "logits/chosen": -1.2023160457611084, "logits/rejected": -1.1861956119537354, "logps/chosen": -2399.763671875, "logps/rejected": -2263.85888671875, "loss": 0.6818, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.452880322933197, "rewards/margins": 0.04662833362817764, "rewards/rejected": 0.4062519967556, "step": 80 }, { "epoch": 0.05, "learning_rate": 4.712041884816754e-07, "logits/chosen": -1.2319462299346924, "logits/rejected": -1.2353641986846924, "logps/chosen": -2180.666259765625, "logps/rejected": -2063.204345703125, "loss": 0.6665, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.36227527260780334, "rewards/margins": 0.02720705047249794, "rewards/rejected": 0.3350681960582733, "step": 90 }, { "epoch": 0.05, "learning_rate": 5.235602094240837e-07, "logits/chosen": -1.2101176977157593, "logits/rejected": -1.1575647592544556, "logps/chosen": -2522.456298828125, "logps/rejected": -2253.9931640625, "loss": 0.6558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5727291703224182, "rewards/margins": 0.10190355777740479, "rewards/rejected": 0.47082558274269104, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": -1.2241016626358032, "eval_logits/rejected": -1.182218313217163, "eval_logps/chosen": -2595.654296875, "eval_logps/rejected": -2172.529052734375, "eval_loss": 0.6526807546615601, "eval_rewards/accuracies": 0.5740000009536743, "eval_rewards/chosen": 0.7712106108665466, "eval_rewards/margins": 0.1913326531648636, "eval_rewards/rejected": 0.5798779726028442, "eval_runtime": 302.6088, "eval_samples_per_second": 6.609, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.06, "learning_rate": 5.759162303664922e-07, "logits/chosen": -1.162023901939392, "logits/rejected": -1.1786675453186035, "logps/chosen": -2315.97216796875, "logps/rejected": -2253.127685546875, "loss": 0.6732, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7014600038528442, "rewards/margins": 0.1181831955909729, "rewards/rejected": 0.5832767486572266, "step": 110 }, { "epoch": 0.06, "learning_rate": 6.282722513089005e-07, "logits/chosen": -1.2144238948822021, "logits/rejected": -1.1650540828704834, "logps/chosen": -2668.5830078125, "logps/rejected": -1998.516845703125, "loss": 0.6723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6677217483520508, "rewards/margins": 0.20832547545433044, "rewards/rejected": 0.45939627289772034, "step": 120 }, { "epoch": 0.07, "learning_rate": 6.806282722513089e-07, "logits/chosen": -1.220961332321167, "logits/rejected": -1.1595335006713867, "logps/chosen": -2847.095458984375, "logps/rejected": -2245.98828125, "loss": 0.6455, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8143318891525269, "rewards/margins": 0.25173696875572205, "rewards/rejected": 0.5625948905944824, "step": 130 }, { "epoch": 0.07, "learning_rate": 7.329842931937173e-07, "logits/chosen": -1.1750261783599854, "logits/rejected": -1.1362488269805908, "logps/chosen": -2556.08349609375, "logps/rejected": -2165.498779296875, "loss": 0.6639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6861199140548706, "rewards/margins": 0.11765004694461823, "rewards/rejected": 0.5684698820114136, "step": 140 }, { "epoch": 0.08, "learning_rate": 7.853403141361256e-07, "logits/chosen": -1.213008165359497, "logits/rejected": -1.1688684225082397, "logps/chosen": -2662.8193359375, "logps/rejected": -2211.24072265625, "loss": 0.6339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.870284914970398, "rewards/margins": 0.22113271057605743, "rewards/rejected": 0.6491522192955017, "step": 150 }, { "epoch": 0.08, "learning_rate": 8.37696335078534e-07, "logits/chosen": -1.1444575786590576, "logits/rejected": -1.091567039489746, "logps/chosen": -2689.31298828125, "logps/rejected": -2391.873291015625, "loss": 0.6469, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5730727314949036, "rewards/margins": 0.2371658980846405, "rewards/rejected": 0.33590689301490784, "step": 160 }, { "epoch": 0.09, "learning_rate": 8.900523560209424e-07, "logits/chosen": -1.1294758319854736, "logits/rejected": -1.178647756576538, "logps/chosen": -2683.22509765625, "logps/rejected": -2484.3818359375, "loss": 0.6628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4668382704257965, "rewards/margins": 0.08485493808984756, "rewards/rejected": 0.38198333978652954, "step": 170 }, { "epoch": 0.09, "learning_rate": 9.424083769633508e-07, "logits/chosen": -1.2192734479904175, "logits/rejected": -1.1568591594696045, "logps/chosen": -2561.9091796875, "logps/rejected": -2213.013916015625, "loss": 0.6581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9690437316894531, "rewards/margins": 0.3352271616458893, "rewards/rejected": 0.6338165998458862, "step": 180 }, { "epoch": 0.1, "learning_rate": 9.947643979057591e-07, "logits/chosen": -1.184699535369873, "logits/rejected": -1.1766315698623657, "logps/chosen": -2123.99072265625, "logps/rejected": -2111.645751953125, "loss": 0.6809, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6798163652420044, "rewards/margins": 0.07367928326129913, "rewards/rejected": 0.6061369776725769, "step": 190 }, { "epoch": 0.1, "learning_rate": 9.999323662872996e-07, "logits/chosen": -1.2072479724884033, "logits/rejected": -1.1839154958724976, "logps/chosen": -2698.072998046875, "logps/rejected": -2592.82861328125, "loss": 0.6404, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6077369451522827, "rewards/margins": 0.17234833538532257, "rewards/rejected": 0.4353886544704437, "step": 200 }, { "epoch": 0.1, "eval_logits/chosen": -1.2423152923583984, "eval_logits/rejected": -1.201860785484314, "eval_logps/chosen": -2626.8759765625, "eval_logps/rejected": -2203.748291015625, "eval_loss": 0.6911113858222961, "eval_rewards/accuracies": 0.5860000252723694, "eval_rewards/chosen": 0.45899277925491333, "eval_rewards/margins": 0.19130723178386688, "eval_rewards/rejected": 0.26768550276756287, "eval_runtime": 302.3649, "eval_samples_per_second": 6.615, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.11, "learning_rate": 9.996985942280678e-07, "logits/chosen": -1.2993234395980835, "logits/rejected": -1.2211077213287354, "logps/chosen": -2626.205810546875, "logps/rejected": -1850.9456787109375, "loss": 0.6556, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6171352863311768, "rewards/margins": 0.32769179344177246, "rewards/rejected": 0.2894434928894043, "step": 210 }, { "epoch": 0.12, "learning_rate": 9.99297926897573e-07, "logits/chosen": -1.249463438987732, "logits/rejected": -1.2620993852615356, "logps/chosen": -2312.38427734375, "logps/rejected": -2108.46826171875, "loss": 0.6647, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7376146912574768, "rewards/margins": 0.25427359342575073, "rewards/rejected": 0.48334112763404846, "step": 220 }, { "epoch": 0.12, "learning_rate": 9.987304981154493e-07, "logits/chosen": -1.2905672788619995, "logits/rejected": -1.2782526016235352, "logps/chosen": -2793.2978515625, "logps/rejected": -2365.16552734375, "loss": 0.7268, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6005491018295288, "rewards/margins": 0.08131317794322968, "rewards/rejected": 0.5192359685897827, "step": 230 }, { "epoch": 0.13, "learning_rate": 9.979964973983e-07, "logits/chosen": -1.402222752571106, "logits/rejected": -1.3204929828643799, "logps/chosen": -2332.16650390625, "logps/rejected": -1890.1295166015625, "loss": 0.6892, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3606724441051483, "rewards/margins": 0.234793022274971, "rewards/rejected": 0.1258794367313385, "step": 240 }, { "epoch": 0.13, "learning_rate": 9.970961698964024e-07, "logits/chosen": -1.399332046508789, "logits/rejected": -1.3611127138137817, "logps/chosen": -2618.633056640625, "logps/rejected": -2216.18505859375, "loss": 0.7038, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7694709897041321, "rewards/margins": 0.19202515482902527, "rewards/rejected": 0.577445924282074, "step": 250 }, { "epoch": 0.14, "learning_rate": 9.960298163118284e-07, "logits/chosen": -1.4756546020507812, "logits/rejected": -1.3830201625823975, "logps/chosen": -2662.10986328125, "logps/rejected": -2112.115478515625, "loss": 0.6914, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.9105646014213562, "rewards/margins": 0.19633980095386505, "rewards/rejected": 0.7142248749732971, "step": 260 }, { "epoch": 0.14, "learning_rate": 9.94797792798013e-07, "logits/chosen": -1.4841511249542236, "logits/rejected": -1.4767415523529053, "logps/chosen": -2305.857177734375, "logps/rejected": -2128.56396484375, "loss": 0.6626, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2341788113117218, "rewards/margins": 0.13304655253887177, "rewards/rejected": 0.10113225132226944, "step": 270 }, { "epoch": 0.15, "learning_rate": 9.934005108408016e-07, "logits/chosen": -1.4331722259521484, "logits/rejected": -1.3947049379348755, "logps/chosen": -2292.278564453125, "logps/rejected": -1913.346435546875, "loss": 0.661, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.8132773637771606, "rewards/margins": 0.22855396568775177, "rewards/rejected": 0.5847233533859253, "step": 280 }, { "epoch": 0.15, "learning_rate": 9.918384371210175e-07, "logits/chosen": -1.4025981426239014, "logits/rejected": -1.3736456632614136, "logps/chosen": -2201.71044921875, "logps/rejected": -2091.62255859375, "loss": 0.6766, "rewards/accuracies": 0.625, "rewards/chosen": 0.7386767268180847, "rewards/margins": 0.228462815284729, "rewards/rejected": 0.5102138519287109, "step": 290 }, { "epoch": 0.16, "learning_rate": 9.901120933585937e-07, "logits/chosen": -1.3154966831207275, "logits/rejected": -1.326516032218933, "logps/chosen": -2670.81201171875, "logps/rejected": -2235.08349609375, "loss": 0.6725, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.7127049565315247, "rewards/margins": 0.18496084213256836, "rewards/rejected": 0.5277441143989563, "step": 300 }, { "epoch": 0.16, "eval_logits/chosen": -1.3645591735839844, "eval_logits/rejected": -1.314851999282837, "eval_logps/chosen": -2591.692138671875, "eval_logps/rejected": -2178.205810546875, "eval_loss": 0.6602776050567627, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": 0.8108287453651428, "eval_rewards/margins": 0.28771865367889404, "eval_rewards/rejected": 0.5231101512908936, "eval_runtime": 302.3737, "eval_samples_per_second": 6.614, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.16, "learning_rate": 9.882220561383237e-07, "logits/chosen": -1.3421976566314697, "logits/rejected": -1.2967360019683838, "logps/chosen": -2590.6484375, "logps/rejected": -2214.814208984375, "loss": 0.6749, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7196224331855774, "rewards/margins": 0.18787309527397156, "rewards/rejected": 0.5317493081092834, "step": 310 }, { "epoch": 0.17, "learning_rate": 9.861689567172849e-07, "logits/chosen": -1.3033558130264282, "logits/rejected": -1.2557708024978638, "logps/chosen": -2364.27587890625, "logps/rejected": -2370.61865234375, "loss": 0.7144, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6515111923217773, "rewards/margins": 0.11765609681606293, "rewards/rejected": 0.5338551998138428, "step": 320 }, { "epoch": 0.17, "learning_rate": 9.839534808140065e-07, "logits/chosen": -1.2571797370910645, "logits/rejected": -1.2486730813980103, "logps/chosen": -2348.859130859375, "logps/rejected": -1969.1402587890625, "loss": 0.7502, "rewards/accuracies": 0.5, "rewards/chosen": 0.6785815954208374, "rewards/margins": 0.024524565786123276, "rewards/rejected": 0.6540570259094238, "step": 330 }, { "epoch": 0.18, "learning_rate": 9.815763683794431e-07, "logits/chosen": -1.2969481945037842, "logits/rejected": -1.2044627666473389, "logps/chosen": -2964.642578125, "logps/rejected": -2117.79150390625, "loss": 0.689, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3430386185646057, "rewards/margins": 0.15022581815719604, "rewards/rejected": 0.19281277060508728, "step": 340 }, { "epoch": 0.18, "learning_rate": 9.790384133498377e-07, "logits/chosen": -1.3875682353973389, "logits/rejected": -1.3528212308883667, "logps/chosen": -2609.759765625, "logps/rejected": -2217.990234375, "loss": 0.65, "rewards/accuracies": 0.625, "rewards/chosen": 1.2748018503189087, "rewards/margins": 0.3255355954170227, "rewards/rejected": 0.9492664337158203, "step": 350 }, { "epoch": 0.19, "learning_rate": 9.763404633815536e-07, "logits/chosen": -1.4445443153381348, "logits/rejected": -1.409148931503296, "logps/chosen": -2325.73095703125, "logps/rejected": -2067.62646484375, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": 0.9282833337783813, "rewards/margins": 0.2425541877746582, "rewards/rejected": 0.6857292056083679, "step": 360 }, { "epoch": 0.19, "learning_rate": 9.73483419567964e-07, "logits/chosen": -1.5681045055389404, "logits/rejected": -1.47848379611969, "logps/chosen": -2851.124267578125, "logps/rejected": -2266.677734375, "loss": 0.6686, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8608830571174622, "rewards/margins": 0.2859550416469574, "rewards/rejected": 0.5749280452728271, "step": 370 }, { "epoch": 0.2, "learning_rate": 9.70468236138494e-07, "logits/chosen": -1.5734655857086182, "logits/rejected": -1.4612947702407837, "logps/chosen": -2619.15576171875, "logps/rejected": -1996.1292724609375, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": 0.7406389117240906, "rewards/margins": 0.2604018747806549, "rewards/rejected": 0.4802371561527252, "step": 380 }, { "epoch": 0.2, "learning_rate": 9.672959201399155e-07, "logits/chosen": -1.4863954782485962, "logits/rejected": -1.4341216087341309, "logps/chosen": -2418.91748046875, "logps/rejected": -2210.710205078125, "loss": 0.6831, "rewards/accuracies": 0.625, "rewards/chosen": 0.9080713987350464, "rewards/margins": 0.19638116657733917, "rewards/rejected": 0.7116903066635132, "step": 390 }, { "epoch": 0.21, "learning_rate": 9.639675311000027e-07, "logits/chosen": -1.478477120399475, "logits/rejected": -1.4470995664596558, "logps/chosen": -2378.759521484375, "logps/rejected": -2213.616455078125, "loss": 0.689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5797199606895447, "rewards/margins": 0.15609867870807648, "rewards/rejected": 0.4236213266849518, "step": 400 }, { "epoch": 0.21, "eval_logits/chosen": -1.5029045343399048, "eval_logits/rejected": -1.4427672624588013, "eval_logps/chosen": -2591.764892578125, "eval_logps/rejected": -2180.5830078125, "eval_loss": 0.6528961658477783, "eval_rewards/accuracies": 0.628000020980835, "eval_rewards/chosen": 0.8101032376289368, "eval_rewards/margins": 0.31076449155807495, "eval_rewards/rejected": 0.49933871626853943, "eval_runtime": 300.9467, "eval_samples_per_second": 6.646, "eval_steps_per_second": 0.415, "step": 400 }, { "epoch": 0.21, "learning_rate": 9.60484180673657e-07, "logits/chosen": -1.4771575927734375, "logits/rejected": -1.449158787727356, "logps/chosen": -2471.6416015625, "logps/rejected": -2168.50439453125, "loss": 0.7235, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.5357500314712524, "rewards/margins": 0.03546437621116638, "rewards/rejected": 0.5002856254577637, "step": 410 }, { "epoch": 0.22, "learning_rate": 9.568470322716246e-07, "logits/chosen": -1.461313247680664, "logits/rejected": -1.3947060108184814, "logps/chosen": -2724.66748046875, "logps/rejected": -2191.56787109375, "loss": 0.672, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7562235593795776, "rewards/margins": 0.328954815864563, "rewards/rejected": 0.4272686541080475, "step": 420 }, { "epoch": 0.23, "learning_rate": 9.530573006719263e-07, "logits/chosen": -1.5015565156936646, "logits/rejected": -1.4776034355163574, "logps/chosen": -2666.500732421875, "logps/rejected": -2279.621826171875, "loss": 0.6588, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5253168344497681, "rewards/margins": 0.28119999170303345, "rewards/rejected": 0.24411681294441223, "step": 430 }, { "epoch": 0.23, "learning_rate": 9.491162516141307e-07, "logits/chosen": -1.4172331094741821, "logits/rejected": -1.422502040863037, "logps/chosen": -2282.531005859375, "logps/rejected": -2387.561767578125, "loss": 0.6692, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.860162615776062, "rewards/margins": 0.07978199422359467, "rewards/rejected": 0.7803806662559509, "step": 440 }, { "epoch": 0.24, "learning_rate": 9.450252013766092e-07, "logits/chosen": -1.3361685276031494, "logits/rejected": -1.2606579065322876, "logps/chosen": -2627.769775390625, "logps/rejected": -2308.65380859375, "loss": 0.6375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5678683519363403, "rewards/margins": 0.21432606875896454, "rewards/rejected": 0.3535422682762146, "step": 450 }, { "epoch": 0.24, "learning_rate": 9.407855163369078e-07, "logits/chosen": -1.306783676147461, "logits/rejected": -1.2825387716293335, "logps/chosen": -2633.41162109375, "logps/rejected": -2218.27294921875, "loss": 0.6678, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.9192908406257629, "rewards/margins": 0.24978260695934296, "rewards/rejected": 0.669508159160614, "step": 460 }, { "epoch": 0.25, "learning_rate": 9.3639861251539e-07, "logits/chosen": -1.2543857097625732, "logits/rejected": -1.195093035697937, "logps/chosen": -2341.584228515625, "logps/rejected": -1947.591796875, "loss": 0.6284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5097464919090271, "rewards/margins": 0.36333781480789185, "rewards/rejected": 0.14640869200229645, "step": 470 }, { "epoch": 0.25, "learning_rate": 9.318659551022955e-07, "logits/chosen": -1.3397210836410522, "logits/rejected": -1.281937837600708, "logps/chosen": -2238.00732421875, "logps/rejected": -1736.181640625, "loss": 0.6609, "rewards/accuracies": 0.625, "rewards/chosen": 0.786676287651062, "rewards/margins": 0.2685468792915344, "rewards/rejected": 0.5181293487548828, "step": 480 }, { "epoch": 0.26, "learning_rate": 9.271890579683804e-07, "logits/chosen": -1.4926373958587646, "logits/rejected": -1.4876558780670166, "logps/chosen": -2662.705322265625, "logps/rejected": -2349.420166015625, "loss": 0.7143, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7370970845222473, "rewards/margins": 0.34762194752693176, "rewards/rejected": 0.38947516679763794, "step": 490 }, { "epoch": 0.26, "learning_rate": 9.223694831592952e-07, "logits/chosen": -1.5373231172561646, "logits/rejected": -1.4849967956542969, "logps/chosen": -2402.5634765625, "logps/rejected": -2132.68701171875, "loss": 0.6682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7446134090423584, "rewards/margins": 0.32214781641960144, "rewards/rejected": 0.42246556282043457, "step": 500 }, { "epoch": 0.26, "eval_logits/chosen": -1.5664644241333008, "eval_logits/rejected": -1.5148077011108398, "eval_logps/chosen": -2576.100830078125, "eval_logps/rejected": -2169.265380859375, "eval_loss": 0.6673685312271118, "eval_rewards/accuracies": 0.6420000195503235, "eval_rewards/chosen": 0.966746985912323, "eval_rewards/margins": 0.3542312681674957, "eval_rewards/rejected": 0.6125158071517944, "eval_runtime": 302.6642, "eval_samples_per_second": 6.608, "eval_steps_per_second": 0.413, "step": 500 }, { "epoch": 0.27, "learning_rate": 9.174088403738755e-07, "logits/chosen": -1.5560601949691772, "logits/rejected": -1.5580723285675049, "logps/chosen": -2103.93310546875, "logps/rejected": -2181.848876953125, "loss": 0.6493, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6312128305435181, "rewards/margins": 0.3005504906177521, "rewards/rejected": 0.330662339925766, "step": 510 }, { "epoch": 0.27, "learning_rate": 9.123087864265147e-07, "logits/chosen": -1.543971061706543, "logits/rejected": -1.5191954374313354, "logps/chosen": -2323.391357421875, "logps/rejected": -2031.1025390625, "loss": 0.6736, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.41579127311706543, "rewards/margins": 0.1768406629562378, "rewards/rejected": 0.23895065486431122, "step": 520 }, { "epoch": 0.28, "learning_rate": 9.070710246938016e-07, "logits/chosen": -1.5579715967178345, "logits/rejected": -1.5655916929244995, "logps/chosen": -2268.76318359375, "logps/rejected": -2190.51318359375, "loss": 0.6519, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6393724083900452, "rewards/margins": 0.3283298909664154, "rewards/rejected": 0.3110424876213074, "step": 530 }, { "epoch": 0.28, "learning_rate": 9.016973045456073e-07, "logits/chosen": -1.6396840810775757, "logits/rejected": -1.6098705530166626, "logps/chosen": -2668.9462890625, "logps/rejected": -2160.803955078125, "loss": 0.669, "rewards/accuracies": 0.625, "rewards/chosen": 0.8928348422050476, "rewards/margins": 0.47784289717674255, "rewards/rejected": 0.41499200463294983, "step": 540 }, { "epoch": 0.29, "learning_rate": 8.961894207608087e-07, "logits/chosen": -1.6586135625839233, "logits/rejected": -1.6290054321289062, "logps/chosen": -2212.68994140625, "logps/rejected": -2054.17626953125, "loss": 0.6597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.49731844663619995, "rewards/margins": 0.16844932734966278, "rewards/rejected": 0.32886913418769836, "step": 550 }, { "epoch": 0.29, "learning_rate": 8.905492129278477e-07, "logits/chosen": -1.6478192806243896, "logits/rejected": -1.5791934728622437, "logps/chosen": -2915.1103515625, "logps/rejected": -2492.820068359375, "loss": 0.6553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6440809965133667, "rewards/margins": 0.29220613837242126, "rewards/rejected": 0.35187482833862305, "step": 560 }, { "epoch": 0.3, "learning_rate": 8.847785648303233e-07, "logits/chosen": -1.648879051208496, "logits/rejected": -1.5808627605438232, "logps/chosen": -2345.06787109375, "logps/rejected": -1874.7965087890625, "loss": 0.6562, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5588332414627075, "rewards/margins": 0.2794465720653534, "rewards/rejected": 0.2793866991996765, "step": 570 }, { "epoch": 0.3, "learning_rate": 8.788794038178232e-07, "logits/chosen": -1.646813154220581, "logits/rejected": -1.5900137424468994, "logps/chosen": -2427.92822265625, "logps/rejected": -1974.943359375, "loss": 0.6286, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.0534051656723022, "rewards/margins": 0.35476142168045044, "rewards/rejected": 0.6986437439918518, "step": 580 }, { "epoch": 0.31, "learning_rate": 8.728537001622049e-07, "logits/chosen": -1.6359336376190186, "logits/rejected": -1.5665844678878784, "logps/chosen": -2346.7265625, "logps/rejected": -1916.209716796875, "loss": 0.6555, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7451139092445374, "rewards/margins": 0.24112704396247864, "rewards/rejected": 0.5039868354797363, "step": 590 }, { "epoch": 0.31, "learning_rate": 8.667034663995408e-07, "logits/chosen": -1.6207376718521118, "logits/rejected": -1.5811537504196167, "logps/chosen": -2380.62939453125, "logps/rejected": -2060.835205078125, "loss": 0.6309, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8570950627326965, "rewards/margins": 0.32400840520858765, "rewards/rejected": 0.5330866575241089, "step": 600 }, { "epoch": 0.31, "eval_logits/chosen": -1.6448516845703125, "eval_logits/rejected": -1.588512897491455, "eval_logps/chosen": -2589.297119140625, "eval_logps/rejected": -2183.78515625, "eval_loss": 0.6445065140724182, "eval_rewards/accuracies": 0.6579999923706055, "eval_rewards/chosen": 0.834783673286438, "eval_rewards/margins": 0.3674681782722473, "eval_rewards/rejected": 0.4673156440258026, "eval_runtime": 306.3454, "eval_samples_per_second": 6.529, "eval_steps_per_second": 0.408, "step": 600 }, { "epoch": 0.32, "learning_rate": 8.604307566579472e-07, "logits/chosen": -1.5816807746887207, "logits/rejected": -1.6054216623306274, "logps/chosen": -2258.828857421875, "logps/rejected": -2473.440185546875, "loss": 0.6656, "rewards/accuracies": 0.625, "rewards/chosen": 0.3196907639503479, "rewards/margins": 0.4189208149909973, "rewards/rejected": -0.09922999143600464, "step": 610 }, { "epoch": 0.32, "learning_rate": 8.540376659715225e-07, "logits/chosen": -1.6599409580230713, "logits/rejected": -1.5913432836532593, "logps/chosen": -2412.462890625, "logps/rejected": -2083.058837890625, "loss": 0.6291, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8317147493362427, "rewards/margins": 0.3438655138015747, "rewards/rejected": 0.48784923553466797, "step": 620 }, { "epoch": 0.33, "learning_rate": 8.47526329580623e-07, "logits/chosen": -1.535036325454712, "logits/rejected": -1.5678516626358032, "logps/chosen": -2142.04931640625, "logps/rejected": -2099.13720703125, "loss": 0.633, "rewards/accuracies": 0.625, "rewards/chosen": 0.9528681635856628, "rewards/margins": 0.24734528362751007, "rewards/rejected": 0.7055227756500244, "step": 630 }, { "epoch": 0.33, "learning_rate": 8.408989222187096e-07, "logits/chosen": -1.5995115041732788, "logits/rejected": -1.5139375925064087, "logps/chosen": -3065.62451171875, "logps/rejected": -2365.10107421875, "loss": 0.6969, "rewards/accuracies": 0.625, "rewards/chosen": 0.9795970916748047, "rewards/margins": 0.47979211807250977, "rewards/rejected": 0.49980488419532776, "step": 640 }, { "epoch": 0.34, "learning_rate": 8.341576573860047e-07, "logits/chosen": -1.5332003831863403, "logits/rejected": -1.4982550144195557, "logps/chosen": -2392.21728515625, "logps/rejected": -1984.2425537109375, "loss": 0.694, "rewards/accuracies": 0.625, "rewards/chosen": 0.8843706846237183, "rewards/margins": 0.32931455969810486, "rewards/rejected": 0.5550561547279358, "step": 650 }, { "epoch": 0.35, "learning_rate": 8.27304786610201e-07, "logits/chosen": -1.5626050233840942, "logits/rejected": -1.5275344848632812, "logps/chosen": -2318.65625, "logps/rejected": -1863.1956787109375, "loss": 0.6323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.8664724230766296, "rewards/margins": 0.5049992799758911, "rewards/rejected": 0.3614731729030609, "step": 660 }, { "epoch": 0.35, "learning_rate": 8.203425986944696e-07, "logits/chosen": -1.5559314489364624, "logits/rejected": -1.5068961381912231, "logps/chosen": -2837.03369140625, "logps/rejected": -2028.3587646484375, "loss": 0.6661, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7081668972969055, "rewards/margins": 0.37415772676467896, "rewards/rejected": 0.3340091109275818, "step": 670 }, { "epoch": 0.36, "learning_rate": 8.132734189530182e-07, "logits/chosen": -1.569585919380188, "logits/rejected": -1.5583667755126953, "logps/chosen": -2081.708984375, "logps/rejected": -2073.14892578125, "loss": 0.7058, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.22897915542125702, "rewards/margins": 0.06144998222589493, "rewards/rejected": 0.1675291508436203, "step": 680 }, { "epoch": 0.36, "learning_rate": 8.060996084344553e-07, "logits/chosen": -1.6668421030044556, "logits/rejected": -1.6300331354141235, "logps/chosen": -2808.94140625, "logps/rejected": -2424.194580078125, "loss": 0.6651, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9484899640083313, "rewards/margins": 0.38452741503715515, "rewards/rejected": 0.5639625787734985, "step": 690 }, { "epoch": 0.37, "learning_rate": 7.98823563133219e-07, "logits/chosen": -1.6251919269561768, "logits/rejected": -1.6152589321136475, "logps/chosen": -2532.464111328125, "logps/rejected": -2264.97802734375, "loss": 0.6467, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8040878176689148, "rewards/margins": 0.3939053416252136, "rewards/rejected": 0.4101824164390564, "step": 700 }, { "epoch": 0.37, "eval_logits/chosen": -1.7105224132537842, "eval_logits/rejected": -1.6561530828475952, "eval_logps/chosen": -2584.251220703125, "eval_logps/rejected": -2175.965087890625, "eval_loss": 0.6481595635414124, "eval_rewards/accuracies": 0.6240000128746033, "eval_rewards/chosen": 0.8852397799491882, "eval_rewards/margins": 0.3397220969200134, "eval_rewards/rejected": 0.54551762342453, "eval_runtime": 303.8379, "eval_samples_per_second": 6.582, "eval_steps_per_second": 0.411, "step": 700 }, { "epoch": 0.37, "learning_rate": 7.914477131893342e-07, "logits/chosen": -1.71377432346344, "logits/rejected": -1.708833932876587, "logps/chosen": -2544.854248046875, "logps/rejected": -2375.308349609375, "loss": 0.6722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6972166299819946, "rewards/margins": 0.06426803767681122, "rewards/rejected": 0.6329485774040222, "step": 710 }, { "epoch": 0.38, "learning_rate": 7.839745220767661e-07, "logits/chosen": -1.694154143333435, "logits/rejected": -1.669390320777893, "logps/chosen": -2534.442626953125, "logps/rejected": -2229.87158203125, "loss": 0.6723, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.48106852173805237, "rewards/margins": 0.24986381828784943, "rewards/rejected": 0.23120474815368652, "step": 720 }, { "epoch": 0.38, "learning_rate": 7.764064857806389e-07, "logits/chosen": -1.6268012523651123, "logits/rejected": -1.575046420097351, "logps/chosen": -2722.456298828125, "logps/rejected": -2351.8857421875, "loss": 0.643, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7691014409065247, "rewards/margins": 0.34405142068862915, "rewards/rejected": 0.42504996061325073, "step": 730 }, { "epoch": 0.39, "learning_rate": 7.68746131963598e-07, "logits/chosen": -1.6478900909423828, "logits/rejected": -1.597701072692871, "logps/chosen": -2222.41259765625, "logps/rejected": -1990.4273681640625, "loss": 0.6243, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6448198556900024, "rewards/margins": 0.2648247182369232, "rewards/rejected": 0.37999510765075684, "step": 740 }, { "epoch": 0.39, "learning_rate": 7.609960191215909e-07, "logits/chosen": -1.6781095266342163, "logits/rejected": -1.6269840002059937, "logps/chosen": -2453.95068359375, "logps/rejected": -2161.110595703125, "loss": 0.6632, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6921306252479553, "rewards/margins": 0.11647888273000717, "rewards/rejected": 0.5756517648696899, "step": 750 }, { "epoch": 0.4, "learning_rate": 7.531587357293505e-07, "logits/chosen": -1.6048580408096313, "logits/rejected": -1.6003602743148804, "logps/chosen": -2562.139404296875, "logps/rejected": -2293.66943359375, "loss": 0.6594, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7623199820518494, "rewards/margins": 0.2832568287849426, "rewards/rejected": 0.4790631830692291, "step": 760 }, { "epoch": 0.4, "learning_rate": 7.452368993758645e-07, "logits/chosen": -1.585092544555664, "logits/rejected": -1.557943344116211, "logps/chosen": -2426.169677734375, "logps/rejected": -2058.61083984375, "loss": 0.6519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.553870677947998, "rewards/margins": 0.39466503262519836, "rewards/rejected": 0.15920567512512207, "step": 770 }, { "epoch": 0.41, "learning_rate": 7.372331558901237e-07, "logits/chosen": -1.5951181650161743, "logits/rejected": -1.55776846408844, "logps/chosen": -2530.603515625, "logps/rejected": -2058.31494140625, "loss": 0.663, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5011290311813354, "rewards/margins": 0.12420739978551865, "rewards/rejected": 0.3769216239452362, "step": 780 }, { "epoch": 0.41, "learning_rate": 7.291501784574355e-07, "logits/chosen": -1.7254797220230103, "logits/rejected": -1.6313526630401611, "logps/chosen": -2754.68408203125, "logps/rejected": -2185.399169921875, "loss": 0.6073, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6229848265647888, "rewards/margins": 0.35090917348861694, "rewards/rejected": 0.27207568287849426, "step": 790 }, { "epoch": 0.42, "learning_rate": 7.209906667266017e-07, "logits/chosen": -1.7093772888183594, "logits/rejected": -1.6865718364715576, "logps/chosen": -2462.615478515625, "logps/rejected": -2213.93798828125, "loss": 0.6215, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1175382137298584, "rewards/margins": 0.40151238441467285, "rewards/rejected": 0.7160258293151855, "step": 800 }, { "epoch": 0.42, "eval_logits/chosen": -1.7084823846817017, "eval_logits/rejected": -1.6541036367416382, "eval_logps/chosen": -2563.754638671875, "eval_logps/rejected": -2162.267822265625, "eval_loss": 0.6452978253364563, "eval_rewards/accuracies": 0.6380000114440918, "eval_rewards/chosen": 1.0902061462402344, "eval_rewards/margins": 0.4077164828777313, "eval_rewards/rejected": 0.6824895739555359, "eval_runtime": 301.7419, "eval_samples_per_second": 6.628, "eval_steps_per_second": 0.414, "step": 800 }, { "epoch": 0.42, "learning_rate": 7.12757345908258e-07, "logits/chosen": -1.7412763833999634, "logits/rejected": -1.6791282892227173, "logps/chosen": -2606.15283203125, "logps/rejected": -1956.8831787109375, "loss": 0.6358, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9316846132278442, "rewards/margins": 0.45442262291908264, "rewards/rejected": 0.47726184129714966, "step": 810 }, { "epoch": 0.43, "learning_rate": 7.044529658646761e-07, "logits/chosen": -1.710146188735962, "logits/rejected": -1.7056090831756592, "logps/chosen": -2651.176513671875, "logps/rejected": -2550.99755859375, "loss": 0.6601, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6641424298286438, "rewards/margins": 0.2352844774723053, "rewards/rejected": 0.4288579821586609, "step": 820 }, { "epoch": 0.43, "learning_rate": 6.960803001913314e-07, "logits/chosen": -1.6102991104125977, "logits/rejected": -1.5880324840545654, "logps/chosen": -1818.771484375, "logps/rejected": -1763.439208984375, "loss": 0.6175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4126269817352295, "rewards/margins": 0.2377271205186844, "rewards/rejected": 0.1748998463153839, "step": 830 }, { "epoch": 0.44, "learning_rate": 6.876421452905448e-07, "logits/chosen": -1.6048507690429688, "logits/rejected": -1.5550066232681274, "logps/chosen": -2419.88818359375, "logps/rejected": -1979.8333740234375, "loss": 0.672, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1539905071258545, "rewards/margins": 0.44714298844337463, "rewards/rejected": 0.7068475484848022, "step": 840 }, { "epoch": 0.44, "learning_rate": 6.791413194375076e-07, "logits/chosen": -1.5756229162216187, "logits/rejected": -1.5317662954330444, "logps/chosen": -2326.3671875, "logps/rejected": -2082.76123046875, "loss": 0.6358, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8061001896858215, "rewards/margins": 0.20508570969104767, "rewards/rejected": 0.6010144948959351, "step": 850 }, { "epoch": 0.45, "learning_rate": 6.705806618389997e-07, "logits/chosen": -1.6245572566986084, "logits/rejected": -1.6081863641738892, "logps/chosen": -2542.473876953125, "logps/rejected": -2442.247314453125, "loss": 0.6751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8134121894836426, "rewards/margins": 0.18775935471057892, "rewards/rejected": 0.6256529092788696, "step": 860 }, { "epoch": 0.46, "learning_rate": 6.619630316851182e-07, "logits/chosen": -1.6937329769134521, "logits/rejected": -1.6594982147216797, "logps/chosen": -2513.98046875, "logps/rejected": -2264.63623046875, "loss": 0.6902, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6390259861946106, "rewards/margins": 0.21240201592445374, "rewards/rejected": 0.4266239106655121, "step": 870 }, { "epoch": 0.46, "learning_rate": 6.532913071943307e-07, "logits/chosen": -1.6279165744781494, "logits/rejected": -1.5716134309768677, "logps/chosen": -2358.2890625, "logps/rejected": -2005.8092041015625, "loss": 0.6588, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1509922742843628, "rewards/margins": 0.4300170838832855, "rewards/rejected": 0.7209752798080444, "step": 880 }, { "epoch": 0.47, "learning_rate": 6.445683846521738e-07, "logits/chosen": -1.458832025527954, "logits/rejected": -1.3705499172210693, "logps/chosen": -2031.3890380859375, "logps/rejected": -1786.692626953125, "loss": 0.6727, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4857109487056732, "rewards/margins": 0.11415307223796844, "rewards/rejected": 0.3715578615665436, "step": 890 }, { "epoch": 0.47, "learning_rate": 6.357971774439177e-07, "logits/chosen": -1.446877360343933, "logits/rejected": -1.4010428190231323, "logps/chosen": -2083.528564453125, "logps/rejected": -2091.34228515625, "loss": 0.6674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3722456991672516, "rewards/margins": 0.18528583645820618, "rewards/rejected": 0.18695983290672302, "step": 900 }, { "epoch": 0.47, "eval_logits/chosen": -1.5652438402175903, "eval_logits/rejected": -1.5145412683486938, "eval_logps/chosen": -2594.7568359375, "eval_logps/rejected": -2185.613525390625, "eval_loss": 0.6415941119194031, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 0.780185878276825, "eval_rewards/margins": 0.33115366101264954, "eval_rewards/rejected": 0.44903212785720825, "eval_runtime": 290.6591, "eval_samples_per_second": 6.881, "eval_steps_per_second": 0.43, "step": 900 }, { "epoch": 0.48, "learning_rate": 6.269806150815187e-07, "logits/chosen": -1.580451250076294, "logits/rejected": -1.5398848056793213, "logps/chosen": -2756.412109375, "logps/rejected": -2110.937255859375, "loss": 0.5836, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.1578181982040405, "rewards/margins": 0.4512609839439392, "rewards/rejected": 0.7065572738647461, "step": 910 }, { "epoch": 0.48, "learning_rate": 6.181216422251862e-07, "logits/chosen": -1.6002380847930908, "logits/rejected": -1.5482442378997803, "logps/chosen": -2669.18408203125, "logps/rejected": -2383.2392578125, "loss": 0.6651, "rewards/accuracies": 0.625, "rewards/chosen": 1.3831857442855835, "rewards/margins": 0.3688461184501648, "rewards/rejected": 1.014339566230774, "step": 920 }, { "epoch": 0.49, "learning_rate": 6.092232176998897e-07, "logits/chosen": -1.5446488857269287, "logits/rejected": -1.5036358833312988, "logps/chosen": -2283.471923828125, "logps/rejected": -2156.527587890625, "loss": 0.6389, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8198372721672058, "rewards/margins": 0.23020341992378235, "rewards/rejected": 0.5896340012550354, "step": 930 }, { "epoch": 0.49, "learning_rate": 6.002883135071362e-07, "logits/chosen": -1.4674952030181885, "logits/rejected": -1.3860971927642822, "logps/chosen": -2495.39794921875, "logps/rejected": -2081.33544921875, "loss": 0.6479, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.46686476469039917, "rewards/margins": 0.33061760663986206, "rewards/rejected": 0.1362471729516983, "step": 940 }, { "epoch": 0.5, "learning_rate": 5.913199138323448e-07, "logits/chosen": -1.5902820825576782, "logits/rejected": -1.5817844867706299, "logps/chosen": -2237.93603515625, "logps/rejected": -2165.838623046875, "loss": 0.699, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5283821821212769, "rewards/margins": 0.3398032486438751, "rewards/rejected": 0.18857893347740173, "step": 950 }, { "epoch": 0.5, "learning_rate": 5.82321014048154e-07, "logits/chosen": -1.5519543886184692, "logits/rejected": -1.5687713623046875, "logps/chosen": -2170.23583984375, "logps/rejected": -2091.04248046875, "loss": 0.6617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2492622435092926, "rewards/margins": 0.2455929070711136, "rewards/rejected": 0.00366935133934021, "step": 960 }, { "epoch": 0.51, "learning_rate": 5.732946197139906e-07, "logits/chosen": -1.5598348379135132, "logits/rejected": -1.5337880849838257, "logps/chosen": -2266.143310546875, "logps/rejected": -2009.6168212890625, "loss": 0.6497, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.789315402507782, "rewards/margins": 0.16782251000404358, "rewards/rejected": 0.6214929223060608, "step": 970 }, { "epoch": 0.51, "learning_rate": 5.642437455722381e-07, "logits/chosen": -1.5074641704559326, "logits/rejected": -1.4456851482391357, "logps/chosen": -2503.286865234375, "logps/rejected": -2021.8304443359375, "loss": 0.6258, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.141226053237915, "rewards/margins": 0.3969436287879944, "rewards/rejected": 0.7442826628684998, "step": 980 }, { "epoch": 0.52, "learning_rate": 5.551714145413368e-07, "logits/chosen": -1.468330979347229, "logits/rejected": -1.3824667930603027, "logps/chosen": -2575.858154296875, "logps/rejected": -1971.8447265625, "loss": 0.647, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.726246178150177, "rewards/margins": 0.32752370834350586, "rewards/rejected": 0.39872246980667114, "step": 990 }, { "epoch": 0.52, "learning_rate": 5.460806567061533e-07, "logits/chosen": -1.5170243978500366, "logits/rejected": -1.4751875400543213, "logps/chosen": -2752.580322265625, "logps/rejected": -2291.04833984375, "loss": 0.644, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8746698498725891, "rewards/margins": 0.38163238763809204, "rewards/rejected": 0.4930374026298523, "step": 1000 }, { "epoch": 0.52, "eval_logits/chosen": -1.5046511888504028, "eval_logits/rejected": -1.4505603313446045, "eval_logps/chosen": -2602.00390625, "eval_logps/rejected": -2193.728515625, "eval_loss": 0.6499609351158142, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": 0.7077119946479797, "eval_rewards/margins": 0.3398290276527405, "eval_rewards/rejected": 0.36788299679756165, "eval_runtime": 299.5822, "eval_samples_per_second": 6.676, "eval_steps_per_second": 0.417, "step": 1000 }, { "epoch": 0.53, "learning_rate": 5.369745083059577e-07, "logits/chosen": -1.490482211112976, "logits/rejected": -1.424222707748413, "logps/chosen": -2471.395263671875, "logps/rejected": -1937.520751953125, "loss": 0.6353, "rewards/accuracies": 0.6875, "rewards/chosen": 0.47275876998901367, "rewards/margins": 0.2599312365055084, "rewards/rejected": 0.21282756328582764, "step": 1010 }, { "epoch": 0.53, "learning_rate": 5.278560107203437e-07, "logits/chosen": -1.459146499633789, "logits/rejected": -1.4577230215072632, "logps/chosen": -2559.42724609375, "logps/rejected": -2042.339599609375, "loss": 0.6634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7685127258300781, "rewards/margins": 0.3085792660713196, "rewards/rejected": 0.45993345975875854, "step": 1020 }, { "epoch": 0.54, "learning_rate": 5.18728209453432e-07, "logits/chosen": -1.5719316005706787, "logits/rejected": -1.5082643032073975, "logps/chosen": -2554.538818359375, "logps/rejected": -2257.06201171875, "loss": 0.6673, "rewards/accuracies": 0.5625, "rewards/chosen": 0.9037872552871704, "rewards/margins": 0.3130941092967987, "rewards/rejected": 0.5906931161880493, "step": 1030 }, { "epoch": 0.54, "learning_rate": 5.095941531166982e-07, "logits/chosen": -1.5710715055465698, "logits/rejected": -1.5428146123886108, "logps/chosen": -2587.89111328125, "logps/rejected": -2198.08056640625, "loss": 0.6266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7090158462524414, "rewards/margins": 0.3786148130893707, "rewards/rejected": 0.33040106296539307, "step": 1040 }, { "epoch": 0.55, "learning_rate": 5.004568924107598e-07, "logits/chosen": -1.6318562030792236, "logits/rejected": -1.5859413146972656, "logps/chosen": -2931.807373046875, "logps/rejected": -2507.31298828125, "loss": 0.6294, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7447463274002075, "rewards/margins": 0.2536779046058655, "rewards/rejected": 0.49106842279434204, "step": 1050 }, { "epoch": 0.55, "learning_rate": 4.913194791064675e-07, "logits/chosen": -1.639493703842163, "logits/rejected": -1.5823523998260498, "logps/chosen": -2601.8447265625, "logps/rejected": -2357.34814453125, "loss": 0.6441, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7931571006774902, "rewards/margins": 0.5028332471847534, "rewards/rejected": 0.2903238832950592, "step": 1060 }, { "epoch": 0.56, "learning_rate": 4.82184965025639e-07, "logits/chosen": -1.5899850130081177, "logits/rejected": -1.5473779439926147, "logps/chosen": -2727.800537109375, "logps/rejected": -2362.034423828125, "loss": 0.6419, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9983813166618347, "rewards/margins": 0.39955899119377136, "rewards/rejected": 0.5988222360610962, "step": 1070 }, { "epoch": 0.57, "learning_rate": 4.73056401021775e-07, "logits/chosen": -1.5197970867156982, "logits/rejected": -1.4553916454315186, "logps/chosen": -2388.419921875, "logps/rejected": -2081.69775390625, "loss": 0.6171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.601919949054718, "rewards/margins": 0.227634459733963, "rewards/rejected": 0.374285489320755, "step": 1080 }, { "epoch": 0.57, "learning_rate": 4.639368359610982e-07, "logits/chosen": -1.4987363815307617, "logits/rejected": -1.4325814247131348, "logps/chosen": -2522.322509765625, "logps/rejected": -2121.84912109375, "loss": 0.6571, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5968645215034485, "rewards/margins": 0.3043002486228943, "rewards/rejected": 0.2925642132759094, "step": 1090 }, { "epoch": 0.58, "learning_rate": 4.5482931570425803e-07, "logits/chosen": -1.5703797340393066, "logits/rejected": -1.5181505680084229, "logps/chosen": -2581.994140625, "logps/rejected": -2270.20166015625, "loss": 0.6539, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6347614526748657, "rewards/margins": 0.321241557598114, "rewards/rejected": 0.31352001428604126, "step": 1100 }, { "epoch": 0.58, "eval_logits/chosen": -1.5226702690124512, "eval_logits/rejected": -1.4696787595748901, "eval_logps/chosen": -2588.0068359375, "eval_logps/rejected": -2181.99365234375, "eval_loss": 0.6389243006706238, "eval_rewards/accuracies": 0.6499999761581421, "eval_rewards/chosen": 0.8476871848106384, "eval_rewards/margins": 0.362454891204834, "eval_rewards/rejected": 0.4852323532104492, "eval_runtime": 301.2203, "eval_samples_per_second": 6.64, "eval_steps_per_second": 0.415, "step": 1100 }, { "epoch": 0.58, "learning_rate": 4.4573688208903686e-07, "logits/chosen": -1.4915900230407715, "logits/rejected": -1.3990033864974976, "logps/chosen": -2177.49169921875, "logps/rejected": -1711.8460693359375, "loss": 0.6447, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6701298356056213, "rewards/margins": 0.3176502585411072, "rewards/rejected": 0.3524795174598694, "step": 1110 }, { "epoch": 0.59, "learning_rate": 4.366625719144016e-07, "logits/chosen": -1.5326006412506104, "logits/rejected": -1.4640724658966064, "logps/chosen": -2241.04052734375, "logps/rejected": -1938.517822265625, "loss": 0.6094, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.9225455522537231, "rewards/margins": 0.3196006417274475, "rewards/rejected": 0.6029448509216309, "step": 1120 }, { "epoch": 0.59, "learning_rate": 4.276094159262368e-07, "logits/chosen": -1.459031343460083, "logits/rejected": -1.4118678569793701, "logps/chosen": -2329.41943359375, "logps/rejected": -2065.614501953125, "loss": 0.6114, "rewards/accuracies": 0.625, "rewards/chosen": 1.0603306293487549, "rewards/margins": 0.38362884521484375, "rewards/rejected": 0.6767016649246216, "step": 1130 }, { "epoch": 0.6, "learning_rate": 4.1858043780510135e-07, "logits/chosen": -1.4943807125091553, "logits/rejected": -1.4440956115722656, "logps/chosen": -2648.4462890625, "logps/rejected": -2317.19970703125, "loss": 0.6521, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9549520611763, "rewards/margins": 0.1597224771976471, "rewards/rejected": 0.7952295541763306, "step": 1140 }, { "epoch": 0.6, "learning_rate": 4.0957865315634204e-07, "logits/chosen": -1.4685379266738892, "logits/rejected": -1.4013986587524414, "logps/chosen": -2750.71142578125, "logps/rejected": -2100.20068359375, "loss": 0.6027, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7830246686935425, "rewards/margins": 0.5725045204162598, "rewards/rejected": 0.2105201780796051, "step": 1150 }, { "epoch": 0.61, "learning_rate": 4.006070685029075e-07, "logits/chosen": -1.484535813331604, "logits/rejected": -1.4587595462799072, "logps/chosen": -2228.81787109375, "logps/rejected": -2157.81298828125, "loss": 0.6803, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4148440957069397, "rewards/margins": 0.10307104885578156, "rewards/rejected": 0.3117729723453522, "step": 1160 }, { "epoch": 0.61, "learning_rate": 3.916686802811927e-07, "logits/chosen": -1.3863401412963867, "logits/rejected": -1.4270175695419312, "logps/chosen": -2092.947998046875, "logps/rejected": -2140.6953125, "loss": 0.624, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6797593832015991, "rewards/margins": 0.16529296338558197, "rewards/rejected": 0.514466404914856, "step": 1170 }, { "epoch": 0.62, "learning_rate": 3.8276647384025467e-07, "logits/chosen": -1.4469492435455322, "logits/rejected": -1.3607311248779297, "logps/chosen": -2557.885009765625, "logps/rejected": -2165.09033203125, "loss": 0.6423, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6221494078636169, "rewards/margins": 0.2990773320198059, "rewards/rejected": 0.3230721354484558, "step": 1180 }, { "epoch": 0.62, "learning_rate": 3.7390342244472883e-07, "logits/chosen": -1.5888515710830688, "logits/rejected": -1.5609667301177979, "logps/chosen": -2778.28515625, "logps/rejected": -2496.6396484375, "loss": 0.6533, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.9935188293457031, "rewards/margins": 0.3621361255645752, "rewards/rejected": 0.6313827037811279, "step": 1190 }, { "epoch": 0.63, "learning_rate": 3.6508248628178446e-07, "logits/chosen": -1.6396839618682861, "logits/rejected": -1.5974278450012207, "logps/chosen": -2493.72216796875, "logps/rejected": -2359.435791015625, "loss": 0.7267, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.9704666137695312, "rewards/margins": 0.3994936943054199, "rewards/rejected": 0.5709729790687561, "step": 1200 }, { "epoch": 0.63, "eval_logits/chosen": -1.6800066232681274, "eval_logits/rejected": -1.6292266845703125, "eval_logps/chosen": -2618.873779296875, "eval_logps/rejected": -2207.94384765625, "eval_loss": 0.6421077847480774, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": 0.5390151143074036, "eval_rewards/margins": 0.3132854104042053, "eval_rewards/rejected": 0.22572976350784302, "eval_runtime": 304.4335, "eval_samples_per_second": 6.57, "eval_steps_per_second": 0.411, "step": 1200 }, { "epoch": 0.63, "learning_rate": 3.563066114724441e-07, "logits/chosen": -1.6271164417266846, "logits/rejected": -1.5858738422393799, "logps/chosen": -2807.364990234375, "logps/rejected": -2029.6510009765625, "loss": 0.6347, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6177263855934143, "rewards/margins": 0.27368754148483276, "rewards/rejected": 0.3440387547016144, "step": 1210 }, { "epoch": 0.64, "learning_rate": 3.475787290876055e-07, "logits/chosen": -1.5973155498504639, "logits/rejected": -1.558475375175476, "logps/chosen": -2490.0703125, "logps/rejected": -2087.466064453125, "loss": 0.6385, "rewards/accuracies": 0.625, "rewards/chosen": 0.8565654754638672, "rewards/margins": 0.4143308699131012, "rewards/rejected": 0.4422345757484436, "step": 1220 }, { "epoch": 0.64, "learning_rate": 3.389017541690854e-07, "logits/chosen": -1.5630786418914795, "logits/rejected": -1.548064947128296, "logps/chosen": -2276.59619140625, "logps/rejected": -1839.0726318359375, "loss": 0.6357, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7428416609764099, "rewards/margins": 0.3907639980316162, "rewards/rejected": 0.35207757353782654, "step": 1230 }, { "epoch": 0.65, "learning_rate": 3.30278584756021e-07, "logits/chosen": -1.548689365386963, "logits/rejected": -1.4891592264175415, "logps/chosen": -2640.1591796875, "logps/rejected": -2317.181396484375, "loss": 0.6184, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.910789966583252, "rewards/margins": 0.37699228525161743, "rewards/rejected": 0.5337976217269897, "step": 1240 }, { "epoch": 0.65, "learning_rate": 3.2171210091694735e-07, "logits/chosen": -1.608028769493103, "logits/rejected": -1.5826674699783325, "logps/chosen": -2531.904296875, "logps/rejected": -2342.30419921875, "loss": 0.6087, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7222377061843872, "rewards/margins": 0.4060022830963135, "rewards/rejected": 0.3162355422973633, "step": 1250 }, { "epoch": 0.66, "learning_rate": 3.132051637878789e-07, "logits/chosen": -1.5921976566314697, "logits/rejected": -1.4880411624908447, "logps/chosen": -2295.463134765625, "logps/rejected": -1800.047119140625, "loss": 0.6709, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8092087507247925, "rewards/margins": 0.39788728952407837, "rewards/rejected": 0.4113215506076813, "step": 1260 }, { "epoch": 0.66, "learning_rate": 3.0476061461671155e-07, "logits/chosen": -1.5929429531097412, "logits/rejected": -1.560585856437683, "logps/chosen": -2178.914306640625, "logps/rejected": -2029.7672119140625, "loss": 0.6315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8661308288574219, "rewards/margins": 0.3666331171989441, "rewards/rejected": 0.4994977116584778, "step": 1270 }, { "epoch": 0.67, "learning_rate": 2.9638127381427127e-07, "logits/chosen": -1.4586659669876099, "logits/rejected": -1.4546220302581787, "logps/chosen": -2244.927978515625, "logps/rejected": -2030.598876953125, "loss": 0.5909, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7213354110717773, "rewards/margins": 0.37115171551704407, "rewards/rejected": 0.3501836955547333, "step": 1280 }, { "epoch": 0.68, "learning_rate": 2.8806994001231766e-07, "logits/chosen": -1.462428092956543, "logits/rejected": -1.4601207971572876, "logps/chosen": -2553.372314453125, "logps/rejected": -2366.053955078125, "loss": 0.6324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9080332517623901, "rewards/margins": 0.3693556487560272, "rewards/rejected": 0.5386777520179749, "step": 1290 }, { "epoch": 0.68, "learning_rate": 2.7982938912882544e-07, "logits/chosen": -1.5518906116485596, "logits/rejected": -1.47800874710083, "logps/chosen": -2843.82421875, "logps/rejected": -2309.199951171875, "loss": 0.5746, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.102667212486267, "rewards/margins": 0.6155067682266235, "rewards/rejected": 0.48716044425964355, "step": 1300 }, { "epoch": 0.68, "eval_logits/chosen": -1.5460779666900635, "eval_logits/rejected": -1.4993510246276855, "eval_logps/chosen": -2582.20947265625, "eval_logps/rejected": -2181.592041015625, "eval_loss": 0.6300790905952454, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": 0.9056587815284729, "eval_rewards/margins": 0.41641080379486084, "eval_rewards/rejected": 0.48924797773361206, "eval_runtime": 299.2617, "eval_samples_per_second": 6.683, "eval_steps_per_second": 0.418, "step": 1300 }, { "epoch": 0.69, "learning_rate": 2.716623734408488e-07, "logits/chosen": -1.5478688478469849, "logits/rejected": -1.509421944618225, "logps/chosen": -2733.4658203125, "logps/rejected": -2210.788330078125, "loss": 0.676, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.8321071863174438, "rewards/margins": 0.17042401432991028, "rewards/rejected": 0.661683201789856, "step": 1310 }, { "epoch": 0.69, "learning_rate": 2.635716206652843e-07, "logits/chosen": -1.51913321018219, "logits/rejected": -1.5177617073059082, "logps/chosen": -2348.56005859375, "logps/rejected": -2216.1884765625, "loss": 0.5911, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6093713045120239, "rewards/margins": 0.3202818036079407, "rewards/rejected": 0.28908950090408325, "step": 1320 }, { "epoch": 0.7, "learning_rate": 2.5555983304783515e-07, "logits/chosen": -1.4471040964126587, "logits/rejected": -1.4324887990951538, "logps/chosen": -2042.9017333984375, "logps/rejected": -1859.039306640625, "loss": 0.6168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.49268943071365356, "rewards/margins": 0.3076168894767761, "rewards/rejected": 0.18507252633571625, "step": 1330 }, { "epoch": 0.7, "learning_rate": 2.4762968646048356e-07, "logits/chosen": -1.4452800750732422, "logits/rejected": -1.3810513019561768, "logps/chosen": -2950.53271484375, "logps/rejected": -2301.14892578125, "loss": 0.6184, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.9146178364753723, "rewards/margins": 0.5878747701644897, "rewards/rejected": 0.326742947101593, "step": 1340 }, { "epoch": 0.71, "learning_rate": 2.397838295077703e-07, "logits/chosen": -1.4514172077178955, "logits/rejected": -1.430443525314331, "logps/chosen": -2407.11181640625, "logps/rejected": -2338.7666015625, "loss": 0.6172, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6004685163497925, "rewards/margins": 0.15282198786735535, "rewards/rejected": 0.44764652848243713, "step": 1350 }, { "epoch": 0.71, "learning_rate": 2.3202488264218357e-07, "logits/chosen": -1.4685500860214233, "logits/rejected": -1.3829035758972168, "logps/chosen": -2675.003173828125, "logps/rejected": -2091.812744140625, "loss": 0.61, "rewards/accuracies": 0.625, "rewards/chosen": 0.828966498374939, "rewards/margins": 0.3263750672340393, "rewards/rejected": 0.5025915503501892, "step": 1360 }, { "epoch": 0.72, "learning_rate": 2.243554372889479e-07, "logits/chosen": -1.4399888515472412, "logits/rejected": -1.3919384479522705, "logps/chosen": -2576.9365234375, "logps/rejected": -2010.0601806640625, "loss": 0.597, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.9715896844863892, "rewards/margins": 0.460097074508667, "rewards/rejected": 0.5114925503730774, "step": 1370 }, { "epoch": 0.72, "learning_rate": 2.1677805498050998e-07, "logits/chosen": -1.3894431591033936, "logits/rejected": -1.3669414520263672, "logps/chosen": -1986.740966796875, "logps/rejected": -1580.8253173828125, "loss": 0.6499, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5728658437728882, "rewards/margins": 0.245010107755661, "rewards/rejected": 0.32785576581954956, "step": 1380 }, { "epoch": 0.73, "learning_rate": 2.0929526650100716e-07, "logits/chosen": -1.4540735483169556, "logits/rejected": -1.3499418497085571, "logps/chosen": -2753.11669921875, "logps/rejected": -2095.53466796875, "loss": 0.6456, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8334699869155884, "rewards/margins": 0.7317672967910767, "rewards/rejected": 0.10170261561870575, "step": 1390 }, { "epoch": 0.73, "learning_rate": 2.0190957104100692e-07, "logits/chosen": -1.4822982549667358, "logits/rejected": -1.4137917757034302, "logps/chosen": -2363.976806640625, "logps/rejected": -1997.6536865234375, "loss": 0.6053, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7185107469558716, "rewards/margins": 0.401099294424057, "rewards/rejected": 0.3174114525318146, "step": 1400 }, { "epoch": 0.73, "eval_logits/chosen": -1.4891161918640137, "eval_logits/rejected": -1.4439697265625, "eval_logps/chosen": -2585.19140625, "eval_logps/rejected": -2184.890869140625, "eval_loss": 0.6342132091522217, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": 0.8758403062820435, "eval_rewards/margins": 0.4195804297924042, "eval_rewards/rejected": 0.4562598764896393, "eval_runtime": 299.1063, "eval_samples_per_second": 6.687, "eval_steps_per_second": 0.418, "step": 1400 }, { "epoch": 0.74, "learning_rate": 1.9462343536279612e-07, "logits/chosen": -1.475975751876831, "logits/rejected": -1.4379873275756836, "logps/chosen": -2481.176025390625, "logps/rejected": -2232.84912109375, "loss": 0.6145, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9434836506843567, "rewards/margins": 0.4186176657676697, "rewards/rejected": 0.524865984916687, "step": 1410 }, { "epoch": 0.74, "learning_rate": 1.874392929765044e-07, "logits/chosen": -1.4733283519744873, "logits/rejected": -1.3902546167373657, "logps/chosen": -2782.106689453125, "logps/rejected": -2127.639404296875, "loss": 0.5946, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1233876943588257, "rewards/margins": 0.5207871198654175, "rewards/rejected": 0.6026005148887634, "step": 1420 }, { "epoch": 0.75, "learning_rate": 1.8035954332732889e-07, "logits/chosen": -1.4501025676727295, "logits/rejected": -1.4023559093475342, "logps/chosen": -2202.23974609375, "logps/rejected": -1934.811279296875, "loss": 0.6426, "rewards/accuracies": 0.625, "rewards/chosen": 0.644204318523407, "rewards/margins": 0.34255489706993103, "rewards/rejected": 0.30164945125579834, "step": 1430 }, { "epoch": 0.75, "learning_rate": 1.733865509941419e-07, "logits/chosen": -1.4848979711532593, "logits/rejected": -1.445502519607544, "logps/chosen": -2633.660888671875, "logps/rejected": -2392.826416015625, "loss": 0.6303, "rewards/accuracies": 0.625, "rewards/chosen": 0.8459588885307312, "rewards/margins": 0.4044179916381836, "rewards/rejected": 0.4415409564971924, "step": 1440 }, { "epoch": 0.76, "learning_rate": 1.6652264489973861e-07, "logits/chosen": -1.4826475381851196, "logits/rejected": -1.426309585571289, "logps/chosen": -2556.17626953125, "logps/rejected": -1992.7232666015625, "loss": 0.6061, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6768069267272949, "rewards/margins": 0.32333052158355713, "rewards/rejected": 0.3534763753414154, "step": 1450 }, { "epoch": 0.76, "learning_rate": 1.5977011753299724e-07, "logits/chosen": -1.5091631412506104, "logits/rejected": -1.4753676652908325, "logps/chosen": -2201.044921875, "logps/rejected": -1877.4302978515625, "loss": 0.612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7366055250167847, "rewards/margins": 0.2949199378490448, "rewards/rejected": 0.44168558716773987, "step": 1460 }, { "epoch": 0.77, "learning_rate": 1.5313122418320496e-07, "logits/chosen": -1.5059702396392822, "logits/rejected": -1.4471460580825806, "logps/chosen": -2972.50439453125, "logps/rejected": -2307.0458984375, "loss": 0.6042, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.0683201551437378, "rewards/margins": 0.5777542591094971, "rewards/rejected": 0.49056586623191833, "step": 1470 }, { "epoch": 0.77, "learning_rate": 1.4660818218681125e-07, "logits/chosen": -1.4828715324401855, "logits/rejected": -1.4702181816101074, "logps/chosen": -2593.748046875, "logps/rejected": -2591.448974609375, "loss": 0.588, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.9806830286979675, "rewards/margins": 0.4084799885749817, "rewards/rejected": 0.5722029805183411, "step": 1480 }, { "epoch": 0.78, "learning_rate": 1.4020317018685362e-07, "logits/chosen": -1.456514596939087, "logits/rejected": -1.390700101852417, "logps/chosen": -2405.19482421875, "logps/rejected": -1981.04296875, "loss": 0.6567, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.614848256111145, "rewards/margins": 0.3052050471305847, "rewards/rejected": 0.3096432089805603, "step": 1490 }, { "epoch": 0.79, "learning_rate": 1.3391832740531055e-07, "logits/chosen": -1.4236390590667725, "logits/rejected": -1.3956820964813232, "logps/chosen": -2446.695068359375, "logps/rejected": -2376.41259765625, "loss": 0.6232, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7929419875144958, "rewards/margins": 0.35024353861808777, "rewards/rejected": 0.44269853830337524, "step": 1500 }, { "epoch": 0.79, "eval_logits/chosen": -1.4759258031845093, "eval_logits/rejected": -1.4282684326171875, "eval_logps/chosen": -2592.221923828125, "eval_logps/rejected": -2190.57958984375, "eval_loss": 0.6323803663253784, "eval_rewards/accuracies": 0.6579999923706055, "eval_rewards/chosen": 0.8055330514907837, "eval_rewards/margins": 0.40616247057914734, "eval_rewards/rejected": 0.39937061071395874, "eval_runtime": 299.6311, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.417, "step": 1500 }, { "epoch": 0.79, "learning_rate": 1.2775575292861707e-07, "logits/chosen": -1.4745705127716064, "logits/rejected": -1.4221175909042358, "logps/chosen": -2639.8076171875, "logps/rejected": -2123.642578125, "loss": 0.6056, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.9188385009765625, "rewards/margins": 0.5551499128341675, "rewards/rejected": 0.3636886477470398, "step": 1510 }, { "epoch": 0.8, "learning_rate": 1.21717505006588e-07, "logits/chosen": -1.4603058099746704, "logits/rejected": -1.4439467191696167, "logps/chosen": -2664.22119140625, "logps/rejected": -2496.781005859375, "loss": 0.6213, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9416143298149109, "rewards/margins": 0.3402588963508606, "rewards/rejected": 0.6013555526733398, "step": 1520 }, { "epoch": 0.8, "learning_rate": 1.1580560036497877e-07, "logits/chosen": -1.473534345626831, "logits/rejected": -1.4060730934143066, "logps/chosen": -2819.74462890625, "logps/rejected": -2299.840576171875, "loss": 0.6071, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.8828972578048706, "rewards/margins": 0.5179694294929504, "rewards/rejected": 0.3649279475212097, "step": 1530 }, { "epoch": 0.81, "learning_rate": 1.1002201353191521e-07, "logits/chosen": -1.4415251016616821, "logits/rejected": -1.461745023727417, "logps/chosen": -2390.272705078125, "logps/rejected": -2447.08642578125, "loss": 0.6433, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.6030459403991699, "rewards/margins": 0.1989385038614273, "rewards/rejected": 0.4041074216365814, "step": 1540 }, { "epoch": 0.81, "learning_rate": 1.0436867617841766e-07, "logits/chosen": -1.4779837131500244, "logits/rejected": -1.443192958831787, "logps/chosen": -2101.65771484375, "logps/rejected": -1614.459228515625, "loss": 0.5839, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.38758862018585205, "rewards/margins": 0.41923385858535767, "rewards/rejected": -0.03164520859718323, "step": 1550 }, { "epoch": 0.82, "learning_rate": 9.884747647323854e-08, "logits/chosen": -1.4118781089782715, "logits/rejected": -1.398271083831787, "logps/chosen": -2657.19287109375, "logps/rejected": -2414.64990234375, "loss": 0.6554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6775075793266296, "rewards/margins": 0.22664561867713928, "rewards/rejected": 0.4508620798587799, "step": 1560 }, { "epoch": 0.82, "learning_rate": 9.346025845222871e-08, "logits/chosen": -1.4589564800262451, "logits/rejected": -1.4241827726364136, "logps/chosen": -2566.69384765625, "logps/rejected": -2381.8310546875, "loss": 0.6699, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7393044829368591, "rewards/margins": 0.26446717977523804, "rewards/rejected": 0.4748373031616211, "step": 1570 }, { "epoch": 0.83, "learning_rate": 8.82088214024454e-08, "logits/chosen": -1.4593846797943115, "logits/rejected": -1.4349015951156616, "logps/chosen": -2314.169189453125, "logps/rejected": -2187.58544921875, "loss": 0.6497, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4934845566749573, "rewards/margins": 0.21952751278877258, "rewards/rejected": 0.2739570140838623, "step": 1580 }, { "epoch": 0.83, "learning_rate": 8.309491926120393e-08, "logits/chosen": -1.4479442834854126, "logits/rejected": -1.388183832168579, "logps/chosen": -2701.14111328125, "logps/rejected": -2293.677001953125, "loss": 0.6347, "rewards/accuracies": 0.625, "rewards/chosen": 0.6762970089912415, "rewards/margins": 0.43245062232017517, "rewards/rejected": 0.2438463717699051, "step": 1590 }, { "epoch": 0.84, "learning_rate": 7.812026003027771e-08, "logits/chosen": -1.2826584577560425, "logits/rejected": -1.2632884979248047, "logps/chosen": -2654.244873046875, "logps/rejected": -2260.9638671875, "loss": 0.6326, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.40709176659584045, "rewards/margins": 0.25365540385246277, "rewards/rejected": 0.15343639254570007, "step": 1600 }, { "epoch": 0.84, "eval_logits/chosen": -1.4959200620651245, "eval_logits/rejected": -1.450128436088562, "eval_logps/chosen": -2627.5283203125, "eval_logps/rejected": -2220.19970703125, "eval_loss": 0.6391750574111938, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": 0.4524710476398468, "eval_rewards/margins": 0.3492998778820038, "eval_rewards/rejected": 0.10317116975784302, "eval_runtime": 302.5644, "eval_samples_per_second": 6.61, "eval_steps_per_second": 0.413, "step": 1600 }, { "epoch": 0.84, "learning_rate": 7.328650520543906e-08, "logits/chosen": -1.4119188785552979, "logits/rejected": -1.2946244478225708, "logps/chosen": -2411.543701171875, "logps/rejected": -1841.427978515625, "loss": 0.6211, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.3078997731208801, "rewards/margins": 0.17288625240325928, "rewards/rejected": 0.13501352071762085, "step": 1610 }, { "epoch": 0.85, "learning_rate": 6.859526922153352e-08, "logits/chosen": -1.4251132011413574, "logits/rejected": -1.3843073844909668, "logps/chosen": -2429.940185546875, "logps/rejected": -1990.4915771484375, "loss": 0.6556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5606139898300171, "rewards/margins": 0.2745349407196045, "rewards/rejected": 0.286079078912735, "step": 1620 }, { "epoch": 0.85, "learning_rate": 6.40481189132711e-08, "logits/chosen": -1.4726622104644775, "logits/rejected": -1.4261372089385986, "logps/chosen": -2766.93115234375, "logps/rejected": -2061.09912109375, "loss": 0.6425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5808910131454468, "rewards/margins": 0.43574967980384827, "rewards/rejected": 0.1451413631439209, "step": 1630 }, { "epoch": 0.86, "learning_rate": 5.964657299191711e-08, "logits/chosen": -1.4473376274108887, "logits/rejected": -1.4126627445220947, "logps/chosen": -2487.42919921875, "logps/rejected": -2065.8955078125, "loss": 0.6381, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8268505930900574, "rewards/margins": 0.4533798098564148, "rewards/rejected": 0.37347084283828735, "step": 1640 }, { "epoch": 0.86, "learning_rate": 5.53921015380539e-08, "logits/chosen": -1.428260087966919, "logits/rejected": -1.4423437118530273, "logps/chosen": -2295.45556640625, "logps/rejected": -2376.85595703125, "loss": 0.6173, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6111637353897095, "rewards/margins": 0.18756714463233948, "rewards/rejected": 0.4235965311527252, "step": 1650 }, { "epoch": 0.87, "learning_rate": 5.1286125510586805e-08, "logits/chosen": -1.462693452835083, "logits/rejected": -1.4421815872192383, "logps/chosen": -2543.067626953125, "logps/rejected": -2478.81494140625, "loss": 0.613, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8606179356575012, "rewards/margins": 0.37834784388542175, "rewards/rejected": 0.48227009177207947, "step": 1660 }, { "epoch": 0.87, "learning_rate": 4.733001627215466e-08, "logits/chosen": -1.4652189016342163, "logits/rejected": -1.4526941776275635, "logps/chosen": -2576.45556640625, "logps/rejected": -2486.090576171875, "loss": 0.6675, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7547820210456848, "rewards/margins": 0.23100514709949493, "rewards/rejected": 0.5237768292427063, "step": 1670 }, { "epoch": 0.88, "learning_rate": 4.352509513110658e-08, "logits/chosen": -1.4286987781524658, "logits/rejected": -1.4079492092132568, "logps/chosen": -2363.428955078125, "logps/rejected": -2208.08740234375, "loss": 0.6258, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.574379026889801, "rewards/margins": 0.24226748943328857, "rewards/rejected": 0.33211153745651245, "step": 1680 }, { "epoch": 0.88, "learning_rate": 3.9872632900194936e-08, "logits/chosen": -1.4842069149017334, "logits/rejected": -1.415021300315857, "logps/chosen": -2913.2490234375, "logps/rejected": -2346.609619140625, "loss": 0.6436, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.6599145531654358, "rewards/margins": 0.26596465706825256, "rewards/rejected": 0.3939499258995056, "step": 1690 }, { "epoch": 0.89, "learning_rate": 3.6373849472134954e-08, "logits/chosen": -1.4031012058258057, "logits/rejected": -1.3779500722885132, "logps/chosen": -2266.2158203125, "logps/rejected": -1981.5833740234375, "loss": 0.6469, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5494025945663452, "rewards/margins": 0.21553239226341248, "rewards/rejected": 0.33387020230293274, "step": 1700 }, { "epoch": 0.89, "eval_logits/chosen": -1.4758340120315552, "eval_logits/rejected": -1.4289432764053345, "eval_logps/chosen": -2598.2412109375, "eval_logps/rejected": -2195.535888671875, "eval_loss": 0.6306354403495789, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": 0.7453421354293823, "eval_rewards/margins": 0.3955351710319519, "eval_rewards/rejected": 0.3498069643974304, "eval_runtime": 295.7456, "eval_samples_per_second": 6.763, "eval_steps_per_second": 0.423, "step": 1700 }, { "epoch": 0.9, "learning_rate": 3.302991341216976e-08, "logits/chosen": -1.4159257411956787, "logits/rejected": -1.392617106437683, "logps/chosen": -2077.9482421875, "logps/rejected": -1972.2515869140625, "loss": 0.6409, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5421566367149353, "rewards/margins": 0.2578433156013489, "rewards/rejected": 0.28431329131126404, "step": 1710 }, { "epoch": 0.9, "learning_rate": 2.9841941567779474e-08, "logits/chosen": -1.4799764156341553, "logits/rejected": -1.4051799774169922, "logps/chosen": -2897.63232421875, "logps/rejected": -2480.90625, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": 0.8838424682617188, "rewards/margins": 0.3801085352897644, "rewards/rejected": 0.5037339925765991, "step": 1720 }, { "epoch": 0.91, "learning_rate": 2.681099869566328e-08, "logits/chosen": -1.4630422592163086, "logits/rejected": -1.4653818607330322, "logps/chosen": -2166.15966796875, "logps/rejected": -2133.84326171875, "loss": 0.6171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5266101956367493, "rewards/margins": 0.21218034625053406, "rewards/rejected": 0.3144298195838928, "step": 1730 }, { "epoch": 0.91, "learning_rate": 2.3938097106119216e-08, "logits/chosen": -1.4574975967407227, "logits/rejected": -1.4154255390167236, "logps/chosen": -2208.398681640625, "logps/rejected": -1935.158203125, "loss": 0.6305, "rewards/accuracies": 0.625, "rewards/chosen": 0.6478286981582642, "rewards/margins": 0.3098670542240143, "rewards/rejected": 0.33796167373657227, "step": 1740 }, { "epoch": 0.92, "learning_rate": 2.12241963249406e-08, "logits/chosen": -1.4689569473266602, "logits/rejected": -1.4307196140289307, "logps/chosen": -2519.071044921875, "logps/rejected": -2212.586181640625, "loss": 0.6578, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6326580047607422, "rewards/margins": 0.3157083988189697, "rewards/rejected": 0.31694963574409485, "step": 1750 }, { "epoch": 0.92, "learning_rate": 1.8670202772942568e-08, "logits/chosen": -1.4382356405258179, "logits/rejected": -1.3769454956054688, "logps/chosen": -2694.0830078125, "logps/rejected": -2166.41845703125, "loss": 0.6341, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7784560322761536, "rewards/margins": 0.3015449643135071, "rewards/rejected": 0.4769110679626465, "step": 1760 }, { "epoch": 0.93, "learning_rate": 1.6276969463224545e-08, "logits/chosen": -1.4650015830993652, "logits/rejected": -1.463744878768921, "logps/chosen": -2586.126220703125, "logps/rejected": -2591.75439453125, "loss": 0.6103, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6145761609077454, "rewards/margins": 0.4303979277610779, "rewards/rejected": 0.18417824804782867, "step": 1770 }, { "epoch": 0.93, "learning_rate": 1.4045295716271e-08, "logits/chosen": -1.4920063018798828, "logits/rejected": -1.450634241104126, "logps/chosen": -2605.60986328125, "logps/rejected": -2116.304931640625, "loss": 0.608, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.645778477191925, "rewards/margins": 0.3450910151004791, "rewards/rejected": 0.30068737268447876, "step": 1780 }, { "epoch": 0.94, "learning_rate": 1.1975926892984766e-08, "logits/chosen": -1.4100964069366455, "logits/rejected": -1.3769333362579346, "logps/chosen": -2435.0087890625, "logps/rejected": -2033.880126953125, "loss": 0.6496, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6221305727958679, "rewards/margins": 0.3294012248516083, "rewards/rejected": 0.29272931814193726, "step": 1790 }, { "epoch": 0.94, "learning_rate": 1.0069554145742787e-08, "logits/chosen": -1.395265817642212, "logits/rejected": -1.3731589317321777, "logps/chosen": -2578.064697265625, "logps/rejected": -2280.887451171875, "loss": 0.669, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6557528972625732, "rewards/margins": 0.4573606848716736, "rewards/rejected": 0.1983920931816101, "step": 1800 }, { "epoch": 0.94, "eval_logits/chosen": -1.4769095182418823, "eval_logits/rejected": -1.4307643175125122, "eval_logps/chosen": -2607.336669921875, "eval_logps/rejected": -2203.039306640625, "eval_loss": 0.6322839260101318, "eval_rewards/accuracies": 0.6600000262260437, "eval_rewards/chosen": 0.6543857455253601, "eval_rewards/margins": 0.3796128034591675, "eval_rewards/rejected": 0.2747729420661926, "eval_runtime": 293.77, "eval_samples_per_second": 6.808, "eval_steps_per_second": 0.426, "step": 1800 }, { "epoch": 0.95, "learning_rate": 8.326814187556485e-09, "logits/chosen": -1.4078927040100098, "logits/rejected": -1.380299687385559, "logps/chosen": -2524.50439453125, "logps/rejected": -2226.43994140625, "loss": 0.6208, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5907411575317383, "rewards/margins": 0.25163906812667847, "rewards/rejected": 0.3391020894050598, "step": 1810 }, { "epoch": 0.95, "learning_rate": 6.7482890794151594e-09, "logits/chosen": -1.4838191270828247, "logits/rejected": -1.4362868070602417, "logps/chosen": -2814.218017578125, "logps/rejected": -2245.9033203125, "loss": 0.632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8923807144165039, "rewards/margins": 0.4581494927406311, "rewards/rejected": 0.4342312812805176, "step": 1820 }, { "epoch": 0.96, "learning_rate": 5.334506035882036e-09, "logits/chosen": -1.370774507522583, "logits/rejected": -1.3359023332595825, "logps/chosen": -2687.776123046875, "logps/rejected": -2035.099609375, "loss": 0.6014, "rewards/accuracies": 0.75, "rewards/chosen": 0.6979535818099976, "rewards/margins": 0.42525219917297363, "rewards/rejected": 0.27270132303237915, "step": 1830 }, { "epoch": 0.96, "learning_rate": 4.0859372490090194e-09, "logits/chosen": -1.4562771320343018, "logits/rejected": -1.4093388319015503, "logps/chosen": -2788.104248046875, "logps/rejected": -2335.853759765625, "loss": 0.6116, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7869713306427002, "rewards/margins": 0.4374913275241852, "rewards/rejected": 0.3494799733161926, "step": 1840 }, { "epoch": 0.97, "learning_rate": 3.0029997306283416e-09, "logits/chosen": -1.4756406545639038, "logits/rejected": -1.3986704349517822, "logps/chosen": -2574.64111328125, "logps/rejected": -1893.6328125, "loss": 0.6546, "rewards/accuracies": 0.625, "rewards/chosen": 0.6360118389129639, "rewards/margins": 0.3881533145904541, "rewards/rejected": 0.24785849452018738, "step": 1850 }, { "epoch": 0.97, "learning_rate": 2.0860551730742526e-09, "logits/chosen": -1.4544508457183838, "logits/rejected": -1.419983983039856, "logps/chosen": -2375.126220703125, "logps/rejected": -2017.3466796875, "loss": 0.5584, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.8510934710502625, "rewards/margins": 0.6245936155319214, "rewards/rejected": 0.22649994492530823, "step": 1860 }, { "epoch": 0.98, "learning_rate": 1.3354098283802628e-09, "logits/chosen": -1.4696677923202515, "logits/rejected": -1.4230769872665405, "logps/chosen": -2438.054931640625, "logps/rejected": -2103.46044921875, "loss": 0.621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7339269518852234, "rewards/margins": 0.35110199451446533, "rewards/rejected": 0.38282495737075806, "step": 1870 }, { "epoch": 0.98, "learning_rate": 7.513144059937415e-10, "logits/chosen": -1.4952335357666016, "logits/rejected": -1.442657232284546, "logps/chosen": -2848.296630859375, "logps/rejected": -2374.80126953125, "loss": 0.6061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7486821413040161, "rewards/margins": 0.31393861770629883, "rewards/rejected": 0.43474358320236206, "step": 1880 }, { "epoch": 0.99, "learning_rate": 3.3396398904106393e-10, "logits/chosen": -1.4425480365753174, "logits/rejected": -1.4436792135238647, "logps/chosen": -2551.7880859375, "logps/rejected": -2169.797607421875, "loss": 0.6124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5796520709991455, "rewards/margins": 0.4224782884120941, "rewards/rejected": 0.15717382729053497, "step": 1890 }, { "epoch": 0.99, "learning_rate": 8.349796917112018e-11, "logits/chosen": -1.4112383127212524, "logits/rejected": -1.3823628425598145, "logps/chosen": -2330.736083984375, "logps/rejected": -2090.098876953125, "loss": 0.6531, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.5138527750968933, "rewards/margins": 0.193558931350708, "rewards/rejected": 0.3202938437461853, "step": 1900 }, { "epoch": 0.99, "eval_logits/chosen": -1.4753704071044922, "eval_logits/rejected": -1.4289445877075195, "eval_logps/chosen": -2603.777587890625, "eval_logps/rejected": -2200.1181640625, "eval_loss": 0.6316895484924316, "eval_rewards/accuracies": 0.6639999747276306, "eval_rewards/chosen": 0.6899767518043518, "eval_rewards/margins": 0.38598912954330444, "eval_rewards/rejected": 0.30398762226104736, "eval_runtime": 302.6434, "eval_samples_per_second": 6.608, "eval_steps_per_second": 0.413, "step": 1900 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -1.4598416090011597, "logits/rejected": -1.4293019771575928, "logps/chosen": -2462.09912109375, "logps/rejected": -2050.02490234375, "loss": 0.6322, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5636069178581238, "rewards/margins": 0.25135958194732666, "rewards/rejected": 0.3122473955154419, "step": 1910 }, { "epoch": 1.0, "step": 1910, "total_flos": 0.0, "train_loss": 0.6480738864519209, "train_runtime": 26013.0665, "train_samples_per_second": 2.35, "train_steps_per_second": 0.073 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000000, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }