diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3022 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997382884061764, + "eval_steps": 100, + "global_step": 1910, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.235602094240838e-09, + "logits/chosen": -1.3201165199279785, + "logits/rejected": -1.2275193929672241, + "logps/chosen": -2993.4990234375, + "logps/rejected": -2222.55078125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 5.2356020942408376e-08, + "logits/chosen": -1.2813271284103394, + "logits/rejected": -1.2465020418167114, + "logps/chosen": -3047.636474609375, + "logps/rejected": -2742.105712890625, + "loss": 0.6973, + "rewards/accuracies": 0.4583333432674408, + "rewards/chosen": 0.00026022063684649765, + "rewards/margins": 0.0008929346804507077, + "rewards/rejected": -0.0006327141309157014, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.0471204188481675e-07, + "logits/chosen": -1.2586185932159424, + "logits/rejected": -1.1957629919052124, + "logps/chosen": -2689.84716796875, + "logps/rejected": -2126.1083984375, + "loss": 0.6916, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.014919064939022064, + "rewards/margins": 0.006186266429722309, + "rewards/rejected": 0.008732798509299755, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 1.5706806282722514e-07, + "logits/chosen": -1.175875186920166, + "logits/rejected": -1.1656105518341064, + "logps/chosen": -2198.431640625, + "logps/rejected": -2021.9176025390625, + "loss": 0.7049, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0053156702779233456, + "rewards/margins": -0.05735307186841965, + "rewards/rejected": 0.05203740671277046, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 2.094240837696335e-07, + "logits/chosen": -1.1858023405075073, + "logits/rejected": -1.1230406761169434, + "logps/chosen": -2056.973388671875, + "logps/rejected": -2170.3056640625, + "loss": 0.6906, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.1324843466281891, + "rewards/margins": -0.016001610085368156, + "rewards/rejected": 0.1484859436750412, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 2.6178010471204185e-07, + "logits/chosen": -1.2066991329193115, + "logits/rejected": -1.15940260887146, + "logps/chosen": -2678.28515625, + "logps/rejected": -2157.86376953125, + "loss": 0.6707, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.32956749200820923, + "rewards/margins": 0.08421512693166733, + "rewards/rejected": 0.2453523427248001, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 3.1413612565445027e-07, + "logits/chosen": -1.2342027425765991, + "logits/rejected": -1.1995573043823242, + "logps/chosen": -2410.271484375, + "logps/rejected": -2036.266845703125, + "loss": 0.6833, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.37566477060317993, + "rewards/margins": 0.07754239439964294, + "rewards/rejected": 0.2981223464012146, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 3.6649214659685864e-07, + "logits/chosen": -1.1794008016586304, + "logits/rejected": -1.1591062545776367, + "logps/chosen": -2638.678955078125, + "logps/rejected": -2372.677001953125, + "loss": 0.6778, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.43634381890296936, + "rewards/margins": 0.0520954504609108, + "rewards/rejected": 0.38424837589263916, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 4.18848167539267e-07, + "logits/chosen": -1.2023160457611084, + "logits/rejected": -1.1861956119537354, + "logps/chosen": -2399.763671875, + "logps/rejected": -2263.85888671875, + "loss": 0.6818, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.452880322933197, + "rewards/margins": 0.04662833362817764, + "rewards/rejected": 0.4062519967556, + "step": 80 + }, + { + "epoch": 0.05, + "learning_rate": 4.712041884816754e-07, + "logits/chosen": -1.2319462299346924, + "logits/rejected": -1.2353641986846924, + "logps/chosen": -2180.666259765625, + "logps/rejected": -2063.204345703125, + "loss": 0.6665, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.36227527260780334, + "rewards/margins": 0.02720705047249794, + "rewards/rejected": 0.3350681960582733, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 5.235602094240837e-07, + "logits/chosen": -1.2101176977157593, + "logits/rejected": -1.1575647592544556, + "logps/chosen": -2522.456298828125, + "logps/rejected": -2253.9931640625, + "loss": 0.6558, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5727291703224182, + "rewards/margins": 0.10190355777740479, + "rewards/rejected": 0.47082558274269104, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -1.2241016626358032, + "eval_logits/rejected": -1.182218313217163, + "eval_logps/chosen": -2595.654296875, + "eval_logps/rejected": -2172.529052734375, + "eval_loss": 0.6526807546615601, + "eval_rewards/accuracies": 0.5740000009536743, + "eval_rewards/chosen": 0.7712106108665466, + "eval_rewards/margins": 0.1913326531648636, + "eval_rewards/rejected": 0.5798779726028442, + "eval_runtime": 302.6088, + "eval_samples_per_second": 6.609, + "eval_steps_per_second": 0.413, + "step": 100 + }, + { + "epoch": 0.06, + "learning_rate": 5.759162303664922e-07, + "logits/chosen": -1.162023901939392, + "logits/rejected": -1.1786675453186035, + "logps/chosen": -2315.97216796875, + "logps/rejected": -2253.127685546875, + "loss": 0.6732, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7014600038528442, + "rewards/margins": 0.1181831955909729, + "rewards/rejected": 0.5832767486572266, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 6.282722513089005e-07, + "logits/chosen": -1.2144238948822021, + "logits/rejected": -1.1650540828704834, + "logps/chosen": -2668.5830078125, + "logps/rejected": -1998.516845703125, + "loss": 0.6723, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6677217483520508, + "rewards/margins": 0.20832547545433044, + "rewards/rejected": 0.45939627289772034, + "step": 120 + }, + { + "epoch": 0.07, + "learning_rate": 6.806282722513089e-07, + "logits/chosen": -1.220961332321167, + "logits/rejected": -1.1595335006713867, + "logps/chosen": -2847.095458984375, + "logps/rejected": -2245.98828125, + "loss": 0.6455, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8143318891525269, + "rewards/margins": 0.25173696875572205, + "rewards/rejected": 0.5625948905944824, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 7.329842931937173e-07, + "logits/chosen": -1.1750261783599854, + "logits/rejected": -1.1362488269805908, + "logps/chosen": -2556.08349609375, + "logps/rejected": -2165.498779296875, + "loss": 0.6639, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6861199140548706, + "rewards/margins": 0.11765004694461823, + "rewards/rejected": 0.5684698820114136, + "step": 140 + }, + { + "epoch": 0.08, + "learning_rate": 7.853403141361256e-07, + "logits/chosen": -1.213008165359497, + "logits/rejected": -1.1688684225082397, + "logps/chosen": -2662.8193359375, + "logps/rejected": -2211.24072265625, + "loss": 0.6339, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.870284914970398, + "rewards/margins": 0.22113271057605743, + "rewards/rejected": 0.6491522192955017, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 8.37696335078534e-07, + "logits/chosen": -1.1444575786590576, + "logits/rejected": -1.091567039489746, + "logps/chosen": -2689.31298828125, + "logps/rejected": -2391.873291015625, + "loss": 0.6469, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5730727314949036, + "rewards/margins": 0.2371658980846405, + "rewards/rejected": 0.33590689301490784, + "step": 160 + }, + { + "epoch": 0.09, + "learning_rate": 8.900523560209424e-07, + "logits/chosen": -1.1294758319854736, + "logits/rejected": -1.178647756576538, + "logps/chosen": -2683.22509765625, + "logps/rejected": -2484.3818359375, + "loss": 0.6628, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.4668382704257965, + "rewards/margins": 0.08485493808984756, + "rewards/rejected": 0.38198333978652954, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 9.424083769633508e-07, + "logits/chosen": -1.2192734479904175, + "logits/rejected": -1.1568591594696045, + "logps/chosen": -2561.9091796875, + "logps/rejected": -2213.013916015625, + "loss": 0.6581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.9690437316894531, + "rewards/margins": 0.3352271616458893, + "rewards/rejected": 0.6338165998458862, + "step": 180 + }, + { + "epoch": 0.1, + "learning_rate": 9.947643979057591e-07, + "logits/chosen": -1.184699535369873, + "logits/rejected": -1.1766315698623657, + "logps/chosen": -2123.99072265625, + "logps/rejected": -2111.645751953125, + "loss": 0.6809, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6798163652420044, + "rewards/margins": 0.07367928326129913, + "rewards/rejected": 0.6061369776725769, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 9.999323662872996e-07, + "logits/chosen": -1.2072479724884033, + "logits/rejected": -1.1839154958724976, + "logps/chosen": -2698.072998046875, + "logps/rejected": -2592.82861328125, + "loss": 0.6404, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6077369451522827, + "rewards/margins": 0.17234833538532257, + "rewards/rejected": 0.4353886544704437, + "step": 200 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -1.2423152923583984, + "eval_logits/rejected": -1.201860785484314, + "eval_logps/chosen": -2626.8759765625, + "eval_logps/rejected": -2203.748291015625, + "eval_loss": 0.6911113858222961, + "eval_rewards/accuracies": 0.5860000252723694, + "eval_rewards/chosen": 0.45899277925491333, + "eval_rewards/margins": 0.19130723178386688, + "eval_rewards/rejected": 0.26768550276756287, + "eval_runtime": 302.3649, + "eval_samples_per_second": 6.615, + "eval_steps_per_second": 0.413, + "step": 200 + }, + { + "epoch": 0.11, + "learning_rate": 9.996985942280678e-07, + "logits/chosen": -1.2993234395980835, + "logits/rejected": -1.2211077213287354, + "logps/chosen": -2626.205810546875, + "logps/rejected": -1850.9456787109375, + "loss": 0.6556, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6171352863311768, + "rewards/margins": 0.32769179344177246, + "rewards/rejected": 0.2894434928894043, + "step": 210 + }, + { + "epoch": 0.12, + "learning_rate": 9.99297926897573e-07, + "logits/chosen": -1.249463438987732, + "logits/rejected": -1.2620993852615356, + "logps/chosen": -2312.38427734375, + "logps/rejected": -2108.46826171875, + "loss": 0.6647, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7376146912574768, + "rewards/margins": 0.25427359342575073, + "rewards/rejected": 0.48334112763404846, + "step": 220 + }, + { + "epoch": 0.12, + "learning_rate": 9.987304981154493e-07, + "logits/chosen": -1.2905672788619995, + "logits/rejected": -1.2782526016235352, + "logps/chosen": -2793.2978515625, + "logps/rejected": -2365.16552734375, + "loss": 0.7268, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6005491018295288, + "rewards/margins": 0.08131317794322968, + "rewards/rejected": 0.5192359685897827, + "step": 230 + }, + { + "epoch": 0.13, + "learning_rate": 9.979964973983e-07, + "logits/chosen": -1.402222752571106, + "logits/rejected": -1.3204929828643799, + "logps/chosen": -2332.16650390625, + "logps/rejected": -1890.1295166015625, + "loss": 0.6892, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.3606724441051483, + "rewards/margins": 0.234793022274971, + "rewards/rejected": 0.1258794367313385, + "step": 240 + }, + { + "epoch": 0.13, + "learning_rate": 9.970961698964024e-07, + "logits/chosen": -1.399332046508789, + "logits/rejected": -1.3611127138137817, + "logps/chosen": -2618.633056640625, + "logps/rejected": -2216.18505859375, + "loss": 0.7038, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7694709897041321, + "rewards/margins": 0.19202515482902527, + "rewards/rejected": 0.577445924282074, + "step": 250 + }, + { + "epoch": 0.14, + "learning_rate": 9.960298163118284e-07, + "logits/chosen": -1.4756546020507812, + "logits/rejected": -1.3830201625823975, + "logps/chosen": -2662.10986328125, + "logps/rejected": -2112.115478515625, + "loss": 0.6914, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.9105646014213562, + "rewards/margins": 0.19633980095386505, + "rewards/rejected": 0.7142248749732971, + "step": 260 + }, + { + "epoch": 0.14, + "learning_rate": 9.94797792798013e-07, + "logits/chosen": -1.4841511249542236, + "logits/rejected": -1.4767415523529053, + "logps/chosen": -2305.857177734375, + "logps/rejected": -2128.56396484375, + "loss": 0.6626, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.2341788113117218, + "rewards/margins": 0.13304655253887177, + "rewards/rejected": 0.10113225132226944, + "step": 270 + }, + { + "epoch": 0.15, + "learning_rate": 9.934005108408016e-07, + "logits/chosen": -1.4331722259521484, + "logits/rejected": -1.3947049379348755, + "logps/chosen": -2292.278564453125, + "logps/rejected": -1913.346435546875, + "loss": 0.661, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.8132773637771606, + "rewards/margins": 0.22855396568775177, + "rewards/rejected": 0.5847233533859253, + "step": 280 + }, + { + "epoch": 0.15, + "learning_rate": 9.918384371210175e-07, + "logits/chosen": -1.4025981426239014, + "logits/rejected": -1.3736456632614136, + "logps/chosen": -2201.71044921875, + "logps/rejected": -2091.62255859375, + "loss": 0.6766, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7386767268180847, + "rewards/margins": 0.228462815284729, + "rewards/rejected": 0.5102138519287109, + "step": 290 + }, + { + "epoch": 0.16, + "learning_rate": 9.901120933585937e-07, + "logits/chosen": -1.3154966831207275, + "logits/rejected": -1.326516032218933, + "logps/chosen": -2670.81201171875, + "logps/rejected": -2235.08349609375, + "loss": 0.6725, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.7127049565315247, + "rewards/margins": 0.18496084213256836, + "rewards/rejected": 0.5277441143989563, + "step": 300 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -1.3645591735839844, + "eval_logits/rejected": -1.314851999282837, + "eval_logps/chosen": -2591.692138671875, + "eval_logps/rejected": -2178.205810546875, + "eval_loss": 0.6602776050567627, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": 0.8108287453651428, + "eval_rewards/margins": 0.28771865367889404, + "eval_rewards/rejected": 0.5231101512908936, + "eval_runtime": 302.3737, + "eval_samples_per_second": 6.614, + "eval_steps_per_second": 0.413, + "step": 300 + }, + { + "epoch": 0.16, + "learning_rate": 9.882220561383237e-07, + "logits/chosen": -1.3421976566314697, + "logits/rejected": -1.2967360019683838, + "logps/chosen": -2590.6484375, + "logps/rejected": -2214.814208984375, + "loss": 0.6749, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.7196224331855774, + "rewards/margins": 0.18787309527397156, + "rewards/rejected": 0.5317493081092834, + "step": 310 + }, + { + "epoch": 0.17, + "learning_rate": 9.861689567172849e-07, + "logits/chosen": -1.3033558130264282, + "logits/rejected": -1.2557708024978638, + "logps/chosen": -2364.27587890625, + "logps/rejected": -2370.61865234375, + "loss": 0.7144, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6515111923217773, + "rewards/margins": 0.11765609681606293, + "rewards/rejected": 0.5338551998138428, + "step": 320 + }, + { + "epoch": 0.17, + "learning_rate": 9.839534808140065e-07, + "logits/chosen": -1.2571797370910645, + "logits/rejected": -1.2486730813980103, + "logps/chosen": -2348.859130859375, + "logps/rejected": -1969.1402587890625, + "loss": 0.7502, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6785815954208374, + "rewards/margins": 0.024524565786123276, + "rewards/rejected": 0.6540570259094238, + "step": 330 + }, + { + "epoch": 0.18, + "learning_rate": 9.815763683794431e-07, + "logits/chosen": -1.2969481945037842, + "logits/rejected": -1.2044627666473389, + "logps/chosen": -2964.642578125, + "logps/rejected": -2117.79150390625, + "loss": 0.689, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3430386185646057, + "rewards/margins": 0.15022581815719604, + "rewards/rejected": 0.19281277060508728, + "step": 340 + }, + { + "epoch": 0.18, + "learning_rate": 9.790384133498377e-07, + "logits/chosen": -1.3875682353973389, + "logits/rejected": -1.3528212308883667, + "logps/chosen": -2609.759765625, + "logps/rejected": -2217.990234375, + "loss": 0.65, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.2748018503189087, + "rewards/margins": 0.3255355954170227, + "rewards/rejected": 0.9492664337158203, + "step": 350 + }, + { + "epoch": 0.19, + "learning_rate": 9.763404633815536e-07, + "logits/chosen": -1.4445443153381348, + "logits/rejected": -1.409148931503296, + "logps/chosen": -2325.73095703125, + "logps/rejected": -2067.62646484375, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9282833337783813, + "rewards/margins": 0.2425541877746582, + "rewards/rejected": 0.6857292056083679, + "step": 360 + }, + { + "epoch": 0.19, + "learning_rate": 9.73483419567964e-07, + "logits/chosen": -1.5681045055389404, + "logits/rejected": -1.47848379611969, + "logps/chosen": -2851.124267578125, + "logps/rejected": -2266.677734375, + "loss": 0.6686, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.8608830571174622, + "rewards/margins": 0.2859550416469574, + "rewards/rejected": 0.5749280452728271, + "step": 370 + }, + { + "epoch": 0.2, + "learning_rate": 9.70468236138494e-07, + "logits/chosen": -1.5734655857086182, + "logits/rejected": -1.4612947702407837, + "logps/chosen": -2619.15576171875, + "logps/rejected": -1996.1292724609375, + "loss": 0.6587, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7406389117240906, + "rewards/margins": 0.2604018747806549, + "rewards/rejected": 0.4802371561527252, + "step": 380 + }, + { + "epoch": 0.2, + "learning_rate": 9.672959201399155e-07, + "logits/chosen": -1.4863954782485962, + "logits/rejected": -1.4341216087341309, + "logps/chosen": -2418.91748046875, + "logps/rejected": -2210.710205078125, + "loss": 0.6831, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9080713987350464, + "rewards/margins": 0.19638116657733917, + "rewards/rejected": 0.7116903066635132, + "step": 390 + }, + { + "epoch": 0.21, + "learning_rate": 9.639675311000027e-07, + "logits/chosen": -1.478477120399475, + "logits/rejected": -1.4470995664596558, + "logps/chosen": -2378.759521484375, + "logps/rejected": -2213.616455078125, + "loss": 0.689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5797199606895447, + "rewards/margins": 0.15609867870807648, + "rewards/rejected": 0.4236213266849518, + "step": 400 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -1.5029045343399048, + "eval_logits/rejected": -1.4427672624588013, + "eval_logps/chosen": -2591.764892578125, + "eval_logps/rejected": -2180.5830078125, + "eval_loss": 0.6528961658477783, + "eval_rewards/accuracies": 0.628000020980835, + "eval_rewards/chosen": 0.8101032376289368, + "eval_rewards/margins": 0.31076449155807495, + "eval_rewards/rejected": 0.49933871626853943, + "eval_runtime": 300.9467, + "eval_samples_per_second": 6.646, + "eval_steps_per_second": 0.415, + "step": 400 + }, + { + "epoch": 0.21, + "learning_rate": 9.60484180673657e-07, + "logits/chosen": -1.4771575927734375, + "logits/rejected": -1.449158787727356, + "logps/chosen": -2471.6416015625, + "logps/rejected": -2168.50439453125, + "loss": 0.7235, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.5357500314712524, + "rewards/margins": 0.03546437621116638, + "rewards/rejected": 0.5002856254577637, + "step": 410 + }, + { + "epoch": 0.22, + "learning_rate": 9.568470322716246e-07, + "logits/chosen": -1.461313247680664, + "logits/rejected": -1.3947060108184814, + "logps/chosen": -2724.66748046875, + "logps/rejected": -2191.56787109375, + "loss": 0.672, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7562235593795776, + "rewards/margins": 0.328954815864563, + "rewards/rejected": 0.4272686541080475, + "step": 420 + }, + { + "epoch": 0.23, + "learning_rate": 9.530573006719263e-07, + "logits/chosen": -1.5015565156936646, + "logits/rejected": -1.4776034355163574, + "logps/chosen": -2666.500732421875, + "logps/rejected": -2279.621826171875, + "loss": 0.6588, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.5253168344497681, + "rewards/margins": 0.28119999170303345, + "rewards/rejected": 0.24411681294441223, + "step": 430 + }, + { + "epoch": 0.23, + "learning_rate": 9.491162516141307e-07, + "logits/chosen": -1.4172331094741821, + "logits/rejected": -1.422502040863037, + "logps/chosen": -2282.531005859375, + "logps/rejected": -2387.561767578125, + "loss": 0.6692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.860162615776062, + "rewards/margins": 0.07978199422359467, + "rewards/rejected": 0.7803806662559509, + "step": 440 + }, + { + "epoch": 0.24, + "learning_rate": 9.450252013766092e-07, + "logits/chosen": -1.3361685276031494, + "logits/rejected": -1.2606579065322876, + "logps/chosen": -2627.769775390625, + "logps/rejected": -2308.65380859375, + "loss": 0.6375, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5678683519363403, + "rewards/margins": 0.21432606875896454, + "rewards/rejected": 0.3535422682762146, + "step": 450 + }, + { + "epoch": 0.24, + "learning_rate": 9.407855163369078e-07, + "logits/chosen": -1.306783676147461, + "logits/rejected": -1.2825387716293335, + "logps/chosen": -2633.41162109375, + "logps/rejected": -2218.27294921875, + "loss": 0.6678, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.9192908406257629, + "rewards/margins": 0.24978260695934296, + "rewards/rejected": 0.669508159160614, + "step": 460 + }, + { + "epoch": 0.25, + "learning_rate": 9.3639861251539e-07, + "logits/chosen": -1.2543857097625732, + "logits/rejected": -1.195093035697937, + "logps/chosen": -2341.584228515625, + "logps/rejected": -1947.591796875, + "loss": 0.6284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5097464919090271, + "rewards/margins": 0.36333781480789185, + "rewards/rejected": 0.14640869200229645, + "step": 470 + }, + { + "epoch": 0.25, + "learning_rate": 9.318659551022955e-07, + "logits/chosen": -1.3397210836410522, + "logits/rejected": -1.281937837600708, + "logps/chosen": -2238.00732421875, + "logps/rejected": -1736.181640625, + "loss": 0.6609, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.786676287651062, + "rewards/margins": 0.2685468792915344, + "rewards/rejected": 0.5181293487548828, + "step": 480 + }, + { + "epoch": 0.26, + "learning_rate": 9.271890579683804e-07, + "logits/chosen": -1.4926373958587646, + "logits/rejected": -1.4876558780670166, + "logps/chosen": -2662.705322265625, + "logps/rejected": -2349.420166015625, + "loss": 0.7143, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7370970845222473, + "rewards/margins": 0.34762194752693176, + "rewards/rejected": 0.38947516679763794, + "step": 490 + }, + { + "epoch": 0.26, + "learning_rate": 9.223694831592952e-07, + "logits/chosen": -1.5373231172561646, + "logits/rejected": -1.4849967956542969, + "logps/chosen": -2402.5634765625, + "logps/rejected": -2132.68701171875, + "loss": 0.6682, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7446134090423584, + "rewards/margins": 0.32214781641960144, + "rewards/rejected": 0.42246556282043457, + "step": 500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -1.5664644241333008, + "eval_logits/rejected": -1.5148077011108398, + "eval_logps/chosen": -2576.100830078125, + "eval_logps/rejected": -2169.265380859375, + "eval_loss": 0.6673685312271118, + "eval_rewards/accuracies": 0.6420000195503235, + "eval_rewards/chosen": 0.966746985912323, + "eval_rewards/margins": 0.3542312681674957, + "eval_rewards/rejected": 0.6125158071517944, + "eval_runtime": 302.6642, + "eval_samples_per_second": 6.608, + "eval_steps_per_second": 0.413, + "step": 500 + }, + { + "epoch": 0.27, + "learning_rate": 9.174088403738755e-07, + "logits/chosen": -1.5560601949691772, + "logits/rejected": -1.5580723285675049, + "logps/chosen": -2103.93310546875, + "logps/rejected": -2181.848876953125, + "loss": 0.6493, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6312128305435181, + "rewards/margins": 0.3005504906177521, + "rewards/rejected": 0.330662339925766, + "step": 510 + }, + { + "epoch": 0.27, + "learning_rate": 9.123087864265147e-07, + "logits/chosen": -1.543971061706543, + "logits/rejected": -1.5191954374313354, + "logps/chosen": -2323.391357421875, + "logps/rejected": -2031.1025390625, + "loss": 0.6736, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.41579127311706543, + "rewards/margins": 0.1768406629562378, + "rewards/rejected": 0.23895065486431122, + "step": 520 + }, + { + "epoch": 0.28, + "learning_rate": 9.070710246938016e-07, + "logits/chosen": -1.5579715967178345, + "logits/rejected": -1.5655916929244995, + "logps/chosen": -2268.76318359375, + "logps/rejected": -2190.51318359375, + "loss": 0.6519, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6393724083900452, + "rewards/margins": 0.3283298909664154, + "rewards/rejected": 0.3110424876213074, + "step": 530 + }, + { + "epoch": 0.28, + "learning_rate": 9.016973045456073e-07, + "logits/chosen": -1.6396840810775757, + "logits/rejected": -1.6098705530166626, + "logps/chosen": -2668.9462890625, + "logps/rejected": -2160.803955078125, + "loss": 0.669, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8928348422050476, + "rewards/margins": 0.47784289717674255, + "rewards/rejected": 0.41499200463294983, + "step": 540 + }, + { + "epoch": 0.29, + "learning_rate": 8.961894207608087e-07, + "logits/chosen": -1.6586135625839233, + "logits/rejected": -1.6290054321289062, + "logps/chosen": -2212.68994140625, + "logps/rejected": -2054.17626953125, + "loss": 0.6597, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.49731844663619995, + "rewards/margins": 0.16844932734966278, + "rewards/rejected": 0.32886913418769836, + "step": 550 + }, + { + "epoch": 0.29, + "learning_rate": 8.905492129278477e-07, + "logits/chosen": -1.6478192806243896, + "logits/rejected": -1.5791934728622437, + "logps/chosen": -2915.1103515625, + "logps/rejected": -2492.820068359375, + "loss": 0.6553, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6440809965133667, + "rewards/margins": 0.29220613837242126, + "rewards/rejected": 0.35187482833862305, + "step": 560 + }, + { + "epoch": 0.3, + "learning_rate": 8.847785648303233e-07, + "logits/chosen": -1.648879051208496, + "logits/rejected": -1.5808627605438232, + "logps/chosen": -2345.06787109375, + "logps/rejected": -1874.7965087890625, + "loss": 0.6562, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.5588332414627075, + "rewards/margins": 0.2794465720653534, + "rewards/rejected": 0.2793866991996765, + "step": 570 + }, + { + "epoch": 0.3, + "learning_rate": 8.788794038178232e-07, + "logits/chosen": -1.646813154220581, + "logits/rejected": -1.5900137424468994, + "logps/chosen": -2427.92822265625, + "logps/rejected": -1974.943359375, + "loss": 0.6286, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 1.0534051656723022, + "rewards/margins": 0.35476142168045044, + "rewards/rejected": 0.6986437439918518, + "step": 580 + }, + { + "epoch": 0.31, + "learning_rate": 8.728537001622049e-07, + "logits/chosen": -1.6359336376190186, + "logits/rejected": -1.5665844678878784, + "logps/chosen": -2346.7265625, + "logps/rejected": -1916.209716796875, + "loss": 0.6555, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7451139092445374, + "rewards/margins": 0.24112704396247864, + "rewards/rejected": 0.5039868354797363, + "step": 590 + }, + { + "epoch": 0.31, + "learning_rate": 8.667034663995408e-07, + "logits/chosen": -1.6207376718521118, + "logits/rejected": -1.5811537504196167, + "logps/chosen": -2380.62939453125, + "logps/rejected": -2060.835205078125, + "loss": 0.6309, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.8570950627326965, + "rewards/margins": 0.32400840520858765, + "rewards/rejected": 0.5330866575241089, + "step": 600 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -1.6448516845703125, + "eval_logits/rejected": -1.588512897491455, + "eval_logps/chosen": -2589.297119140625, + "eval_logps/rejected": -2183.78515625, + "eval_loss": 0.6445065140724182, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": 0.834783673286438, + "eval_rewards/margins": 0.3674681782722473, + "eval_rewards/rejected": 0.4673156440258026, + "eval_runtime": 306.3454, + "eval_samples_per_second": 6.529, + "eval_steps_per_second": 0.408, + "step": 600 + }, + { + "epoch": 0.32, + "learning_rate": 8.604307566579472e-07, + "logits/chosen": -1.5816807746887207, + "logits/rejected": -1.6054216623306274, + "logps/chosen": -2258.828857421875, + "logps/rejected": -2473.440185546875, + "loss": 0.6656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3196907639503479, + "rewards/margins": 0.4189208149909973, + "rewards/rejected": -0.09922999143600464, + "step": 610 + }, + { + "epoch": 0.32, + "learning_rate": 8.540376659715225e-07, + "logits/chosen": -1.6599409580230713, + "logits/rejected": -1.5913432836532593, + "logps/chosen": -2412.462890625, + "logps/rejected": -2083.058837890625, + "loss": 0.6291, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8317147493362427, + "rewards/margins": 0.3438655138015747, + "rewards/rejected": 0.48784923553466797, + "step": 620 + }, + { + "epoch": 0.33, + "learning_rate": 8.47526329580623e-07, + "logits/chosen": -1.535036325454712, + "logits/rejected": -1.5678516626358032, + "logps/chosen": -2142.04931640625, + "logps/rejected": -2099.13720703125, + "loss": 0.633, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9528681635856628, + "rewards/margins": 0.24734528362751007, + "rewards/rejected": 0.7055227756500244, + "step": 630 + }, + { + "epoch": 0.33, + "learning_rate": 8.408989222187096e-07, + "logits/chosen": -1.5995115041732788, + "logits/rejected": -1.5139375925064087, + "logps/chosen": -3065.62451171875, + "logps/rejected": -2365.10107421875, + "loss": 0.6969, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9795970916748047, + "rewards/margins": 0.47979211807250977, + "rewards/rejected": 0.49980488419532776, + "step": 640 + }, + { + "epoch": 0.34, + "learning_rate": 8.341576573860047e-07, + "logits/chosen": -1.5332003831863403, + "logits/rejected": -1.4982550144195557, + "logps/chosen": -2392.21728515625, + "logps/rejected": -1984.2425537109375, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8843706846237183, + "rewards/margins": 0.32931455969810486, + "rewards/rejected": 0.5550561547279358, + "step": 650 + }, + { + "epoch": 0.35, + "learning_rate": 8.27304786610201e-07, + "logits/chosen": -1.5626050233840942, + "logits/rejected": -1.5275344848632812, + "logps/chosen": -2318.65625, + "logps/rejected": -1863.1956787109375, + "loss": 0.6323, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.8664724230766296, + "rewards/margins": 0.5049992799758911, + "rewards/rejected": 0.3614731729030609, + "step": 660 + }, + { + "epoch": 0.35, + "learning_rate": 8.203425986944696e-07, + "logits/chosen": -1.5559314489364624, + "logits/rejected": -1.5068961381912231, + "logps/chosen": -2837.03369140625, + "logps/rejected": -2028.3587646484375, + "loss": 0.6661, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7081668972969055, + "rewards/margins": 0.37415772676467896, + "rewards/rejected": 0.3340091109275818, + "step": 670 + }, + { + "epoch": 0.36, + "learning_rate": 8.132734189530182e-07, + "logits/chosen": -1.569585919380188, + "logits/rejected": -1.5583667755126953, + "logps/chosen": -2081.708984375, + "logps/rejected": -2073.14892578125, + "loss": 0.7058, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.22897915542125702, + "rewards/margins": 0.06144998222589493, + "rewards/rejected": 0.1675291508436203, + "step": 680 + }, + { + "epoch": 0.36, + "learning_rate": 8.060996084344553e-07, + "logits/chosen": -1.6668421030044556, + "logits/rejected": -1.6300331354141235, + "logps/chosen": -2808.94140625, + "logps/rejected": -2424.194580078125, + "loss": 0.6651, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.9484899640083313, + "rewards/margins": 0.38452741503715515, + "rewards/rejected": 0.5639625787734985, + "step": 690 + }, + { + "epoch": 0.37, + "learning_rate": 7.98823563133219e-07, + "logits/chosen": -1.6251919269561768, + "logits/rejected": -1.6152589321136475, + "logps/chosen": -2532.464111328125, + "logps/rejected": -2264.97802734375, + "loss": 0.6467, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8040878176689148, + "rewards/margins": 0.3939053416252136, + "rewards/rejected": 0.4101824164390564, + "step": 700 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -1.7105224132537842, + "eval_logits/rejected": -1.6561530828475952, + "eval_logps/chosen": -2584.251220703125, + "eval_logps/rejected": -2175.965087890625, + "eval_loss": 0.6481595635414124, + "eval_rewards/accuracies": 0.6240000128746033, + "eval_rewards/chosen": 0.8852397799491882, + "eval_rewards/margins": 0.3397220969200134, + "eval_rewards/rejected": 0.54551762342453, + "eval_runtime": 303.8379, + "eval_samples_per_second": 6.582, + "eval_steps_per_second": 0.411, + "step": 700 + }, + { + "epoch": 0.37, + "learning_rate": 7.914477131893342e-07, + "logits/chosen": -1.71377432346344, + "logits/rejected": -1.708833932876587, + "logps/chosen": -2544.854248046875, + "logps/rejected": -2375.308349609375, + "loss": 0.6722, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6972166299819946, + "rewards/margins": 0.06426803767681122, + "rewards/rejected": 0.6329485774040222, + "step": 710 + }, + { + "epoch": 0.38, + "learning_rate": 7.839745220767661e-07, + "logits/chosen": -1.694154143333435, + "logits/rejected": -1.669390320777893, + "logps/chosen": -2534.442626953125, + "logps/rejected": -2229.87158203125, + "loss": 0.6723, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.48106852173805237, + "rewards/margins": 0.24986381828784943, + "rewards/rejected": 0.23120474815368652, + "step": 720 + }, + { + "epoch": 0.38, + "learning_rate": 7.764064857806389e-07, + "logits/chosen": -1.6268012523651123, + "logits/rejected": -1.575046420097351, + "logps/chosen": -2722.456298828125, + "logps/rejected": -2351.8857421875, + "loss": 0.643, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7691014409065247, + "rewards/margins": 0.34405142068862915, + "rewards/rejected": 0.42504996061325073, + "step": 730 + }, + { + "epoch": 0.39, + "learning_rate": 7.68746131963598e-07, + "logits/chosen": -1.6478900909423828, + "logits/rejected": -1.597701072692871, + "logps/chosen": -2222.41259765625, + "logps/rejected": -1990.4273681640625, + "loss": 0.6243, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6448198556900024, + "rewards/margins": 0.2648247182369232, + "rewards/rejected": 0.37999510765075684, + "step": 740 + }, + { + "epoch": 0.39, + "learning_rate": 7.609960191215909e-07, + "logits/chosen": -1.6781095266342163, + "logits/rejected": -1.6269840002059937, + "logps/chosen": -2453.95068359375, + "logps/rejected": -2161.110595703125, + "loss": 0.6632, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.6921306252479553, + "rewards/margins": 0.11647888273000717, + "rewards/rejected": 0.5756517648696899, + "step": 750 + }, + { + "epoch": 0.4, + "learning_rate": 7.531587357293505e-07, + "logits/chosen": -1.6048580408096313, + "logits/rejected": -1.6003602743148804, + "logps/chosen": -2562.139404296875, + "logps/rejected": -2293.66943359375, + "loss": 0.6594, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7623199820518494, + "rewards/margins": 0.2832568287849426, + "rewards/rejected": 0.4790631830692291, + "step": 760 + }, + { + "epoch": 0.4, + "learning_rate": 7.452368993758645e-07, + "logits/chosen": -1.585092544555664, + "logits/rejected": -1.557943344116211, + "logps/chosen": -2426.169677734375, + "logps/rejected": -2058.61083984375, + "loss": 0.6519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.553870677947998, + "rewards/margins": 0.39466503262519836, + "rewards/rejected": 0.15920567512512207, + "step": 770 + }, + { + "epoch": 0.41, + "learning_rate": 7.372331558901237e-07, + "logits/chosen": -1.5951181650161743, + "logits/rejected": -1.55776846408844, + "logps/chosen": -2530.603515625, + "logps/rejected": -2058.31494140625, + "loss": 0.663, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5011290311813354, + "rewards/margins": 0.12420739978551865, + "rewards/rejected": 0.3769216239452362, + "step": 780 + }, + { + "epoch": 0.41, + "learning_rate": 7.291501784574355e-07, + "logits/chosen": -1.7254797220230103, + "logits/rejected": -1.6313526630401611, + "logps/chosen": -2754.68408203125, + "logps/rejected": -2185.399169921875, + "loss": 0.6073, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6229848265647888, + "rewards/margins": 0.35090917348861694, + "rewards/rejected": 0.27207568287849426, + "step": 790 + }, + { + "epoch": 0.42, + "learning_rate": 7.209906667266017e-07, + "logits/chosen": -1.7093772888183594, + "logits/rejected": -1.6865718364715576, + "logps/chosen": -2462.615478515625, + "logps/rejected": -2213.93798828125, + "loss": 0.6215, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 1.1175382137298584, + "rewards/margins": 0.40151238441467285, + "rewards/rejected": 0.7160258293151855, + "step": 800 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -1.7084823846817017, + "eval_logits/rejected": -1.6541036367416382, + "eval_logps/chosen": -2563.754638671875, + "eval_logps/rejected": -2162.267822265625, + "eval_loss": 0.6452978253364563, + "eval_rewards/accuracies": 0.6380000114440918, + "eval_rewards/chosen": 1.0902061462402344, + "eval_rewards/margins": 0.4077164828777313, + "eval_rewards/rejected": 0.6824895739555359, + "eval_runtime": 301.7419, + "eval_samples_per_second": 6.628, + "eval_steps_per_second": 0.414, + "step": 800 + }, + { + "epoch": 0.42, + "learning_rate": 7.12757345908258e-07, + "logits/chosen": -1.7412763833999634, + "logits/rejected": -1.6791282892227173, + "logps/chosen": -2606.15283203125, + "logps/rejected": -1956.8831787109375, + "loss": 0.6358, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9316846132278442, + "rewards/margins": 0.45442262291908264, + "rewards/rejected": 0.47726184129714966, + "step": 810 + }, + { + "epoch": 0.43, + "learning_rate": 7.044529658646761e-07, + "logits/chosen": -1.710146188735962, + "logits/rejected": -1.7056090831756592, + "logps/chosen": -2651.176513671875, + "logps/rejected": -2550.99755859375, + "loss": 0.6601, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6641424298286438, + "rewards/margins": 0.2352844774723053, + "rewards/rejected": 0.4288579821586609, + "step": 820 + }, + { + "epoch": 0.43, + "learning_rate": 6.960803001913314e-07, + "logits/chosen": -1.6102991104125977, + "logits/rejected": -1.5880324840545654, + "logps/chosen": -1818.771484375, + "logps/rejected": -1763.439208984375, + "loss": 0.6175, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4126269817352295, + "rewards/margins": 0.2377271205186844, + "rewards/rejected": 0.1748998463153839, + "step": 830 + }, + { + "epoch": 0.44, + "learning_rate": 6.876421452905448e-07, + "logits/chosen": -1.6048507690429688, + "logits/rejected": -1.5550066232681274, + "logps/chosen": -2419.88818359375, + "logps/rejected": -1979.8333740234375, + "loss": 0.672, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 1.1539905071258545, + "rewards/margins": 0.44714298844337463, + "rewards/rejected": 0.7068475484848022, + "step": 840 + }, + { + "epoch": 0.44, + "learning_rate": 6.791413194375076e-07, + "logits/chosen": -1.5756229162216187, + "logits/rejected": -1.5317662954330444, + "logps/chosen": -2326.3671875, + "logps/rejected": -2082.76123046875, + "loss": 0.6358, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.8061001896858215, + "rewards/margins": 0.20508570969104767, + "rewards/rejected": 0.6010144948959351, + "step": 850 + }, + { + "epoch": 0.45, + "learning_rate": 6.705806618389997e-07, + "logits/chosen": -1.6245572566986084, + "logits/rejected": -1.6081863641738892, + "logps/chosen": -2542.473876953125, + "logps/rejected": -2442.247314453125, + "loss": 0.6751, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.8134121894836426, + "rewards/margins": 0.18775935471057892, + "rewards/rejected": 0.6256529092788696, + "step": 860 + }, + { + "epoch": 0.46, + "learning_rate": 6.619630316851182e-07, + "logits/chosen": -1.6937329769134521, + "logits/rejected": -1.6594982147216797, + "logps/chosen": -2513.98046875, + "logps/rejected": -2264.63623046875, + "loss": 0.6902, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6390259861946106, + "rewards/margins": 0.21240201592445374, + "rewards/rejected": 0.4266239106655121, + "step": 870 + }, + { + "epoch": 0.46, + "learning_rate": 6.532913071943307e-07, + "logits/chosen": -1.6279165744781494, + "logits/rejected": -1.5716134309768677, + "logps/chosen": -2358.2890625, + "logps/rejected": -2005.8092041015625, + "loss": 0.6588, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 1.1509922742843628, + "rewards/margins": 0.4300170838832855, + "rewards/rejected": 0.7209752798080444, + "step": 880 + }, + { + "epoch": 0.47, + "learning_rate": 6.445683846521738e-07, + "logits/chosen": -1.458832025527954, + "logits/rejected": -1.3705499172210693, + "logps/chosen": -2031.3890380859375, + "logps/rejected": -1786.692626953125, + "loss": 0.6727, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.4857109487056732, + "rewards/margins": 0.11415307223796844, + "rewards/rejected": 0.3715578615665436, + "step": 890 + }, + { + "epoch": 0.47, + "learning_rate": 6.357971774439177e-07, + "logits/chosen": -1.446877360343933, + "logits/rejected": -1.4010428190231323, + "logps/chosen": -2083.528564453125, + "logps/rejected": -2091.34228515625, + "loss": 0.6674, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3722456991672516, + "rewards/margins": 0.18528583645820618, + "rewards/rejected": 0.18695983290672302, + "step": 900 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -1.5652438402175903, + "eval_logits/rejected": -1.5145412683486938, + "eval_logps/chosen": -2594.7568359375, + "eval_logps/rejected": -2185.613525390625, + "eval_loss": 0.6415941119194031, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": 0.780185878276825, + "eval_rewards/margins": 0.33115366101264954, + "eval_rewards/rejected": 0.44903212785720825, + "eval_runtime": 290.6591, + "eval_samples_per_second": 6.881, + "eval_steps_per_second": 0.43, + "step": 900 + }, + { + "epoch": 0.48, + "learning_rate": 6.269806150815187e-07, + "logits/chosen": -1.580451250076294, + "logits/rejected": -1.5398848056793213, + "logps/chosen": -2756.412109375, + "logps/rejected": -2110.937255859375, + "loss": 0.5836, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.1578181982040405, + "rewards/margins": 0.4512609839439392, + "rewards/rejected": 0.7065572738647461, + "step": 910 + }, + { + "epoch": 0.48, + "learning_rate": 6.181216422251862e-07, + "logits/chosen": -1.6002380847930908, + "logits/rejected": -1.5482442378997803, + "logps/chosen": -2669.18408203125, + "logps/rejected": -2383.2392578125, + "loss": 0.6651, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.3831857442855835, + "rewards/margins": 0.3688461184501648, + "rewards/rejected": 1.014339566230774, + "step": 920 + }, + { + "epoch": 0.49, + "learning_rate": 6.092232176998897e-07, + "logits/chosen": -1.5446488857269287, + "logits/rejected": -1.5036358833312988, + "logps/chosen": -2283.471923828125, + "logps/rejected": -2156.527587890625, + "loss": 0.6389, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.8198372721672058, + "rewards/margins": 0.23020341992378235, + "rewards/rejected": 0.5896340012550354, + "step": 930 + }, + { + "epoch": 0.49, + "learning_rate": 6.002883135071362e-07, + "logits/chosen": -1.4674952030181885, + "logits/rejected": -1.3860971927642822, + "logps/chosen": -2495.39794921875, + "logps/rejected": -2081.33544921875, + "loss": 0.6479, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.46686476469039917, + "rewards/margins": 0.33061760663986206, + "rewards/rejected": 0.1362471729516983, + "step": 940 + }, + { + "epoch": 0.5, + "learning_rate": 5.913199138323448e-07, + "logits/chosen": -1.5902820825576782, + "logits/rejected": -1.5817844867706299, + "logps/chosen": -2237.93603515625, + "logps/rejected": -2165.838623046875, + "loss": 0.699, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5283821821212769, + "rewards/margins": 0.3398032486438751, + "rewards/rejected": 0.18857893347740173, + "step": 950 + }, + { + "epoch": 0.5, + "learning_rate": 5.82321014048154e-07, + "logits/chosen": -1.5519543886184692, + "logits/rejected": -1.5687713623046875, + "logps/chosen": -2170.23583984375, + "logps/rejected": -2091.04248046875, + "loss": 0.6617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2492622435092926, + "rewards/margins": 0.2455929070711136, + "rewards/rejected": 0.00366935133934021, + "step": 960 + }, + { + "epoch": 0.51, + "learning_rate": 5.732946197139906e-07, + "logits/chosen": -1.5598348379135132, + "logits/rejected": -1.5337880849838257, + "logps/chosen": -2266.143310546875, + "logps/rejected": -2009.6168212890625, + "loss": 0.6497, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.789315402507782, + "rewards/margins": 0.16782251000404358, + "rewards/rejected": 0.6214929223060608, + "step": 970 + }, + { + "epoch": 0.51, + "learning_rate": 5.642437455722381e-07, + "logits/chosen": -1.5074641704559326, + "logits/rejected": -1.4456851482391357, + "logps/chosen": -2503.286865234375, + "logps/rejected": -2021.8304443359375, + "loss": 0.6258, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 1.141226053237915, + "rewards/margins": 0.3969436287879944, + "rewards/rejected": 0.7442826628684998, + "step": 980 + }, + { + "epoch": 0.52, + "learning_rate": 5.551714145413368e-07, + "logits/chosen": -1.468330979347229, + "logits/rejected": -1.3824667930603027, + "logps/chosen": -2575.858154296875, + "logps/rejected": -1971.8447265625, + "loss": 0.647, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.726246178150177, + "rewards/margins": 0.32752370834350586, + "rewards/rejected": 0.39872246980667114, + "step": 990 + }, + { + "epoch": 0.52, + "learning_rate": 5.460806567061533e-07, + "logits/chosen": -1.5170243978500366, + "logits/rejected": -1.4751875400543213, + "logps/chosen": -2752.580322265625, + "logps/rejected": -2291.04833984375, + "loss": 0.644, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8746698498725891, + "rewards/margins": 0.38163238763809204, + "rewards/rejected": 0.4930374026298523, + "step": 1000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -1.5046511888504028, + "eval_logits/rejected": -1.4505603313446045, + "eval_logps/chosen": -2602.00390625, + "eval_logps/rejected": -2193.728515625, + "eval_loss": 0.6499609351158142, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": 0.7077119946479797, + "eval_rewards/margins": 0.3398290276527405, + "eval_rewards/rejected": 0.36788299679756165, + "eval_runtime": 299.5822, + "eval_samples_per_second": 6.676, + "eval_steps_per_second": 0.417, + "step": 1000 + }, + { + "epoch": 0.53, + "learning_rate": 5.369745083059577e-07, + "logits/chosen": -1.490482211112976, + "logits/rejected": -1.424222707748413, + "logps/chosen": -2471.395263671875, + "logps/rejected": -1937.520751953125, + "loss": 0.6353, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.47275876998901367, + "rewards/margins": 0.2599312365055084, + "rewards/rejected": 0.21282756328582764, + "step": 1010 + }, + { + "epoch": 0.53, + "learning_rate": 5.278560107203437e-07, + "logits/chosen": -1.459146499633789, + "logits/rejected": -1.4577230215072632, + "logps/chosen": -2559.42724609375, + "logps/rejected": -2042.339599609375, + "loss": 0.6634, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7685127258300781, + "rewards/margins": 0.3085792660713196, + "rewards/rejected": 0.45993345975875854, + "step": 1020 + }, + { + "epoch": 0.54, + "learning_rate": 5.18728209453432e-07, + "logits/chosen": -1.5719316005706787, + "logits/rejected": -1.5082643032073975, + "logps/chosen": -2554.538818359375, + "logps/rejected": -2257.06201171875, + "loss": 0.6673, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.9037872552871704, + "rewards/margins": 0.3130941092967987, + "rewards/rejected": 0.5906931161880493, + "step": 1030 + }, + { + "epoch": 0.54, + "learning_rate": 5.095941531166982e-07, + "logits/chosen": -1.5710715055465698, + "logits/rejected": -1.5428146123886108, + "logps/chosen": -2587.89111328125, + "logps/rejected": -2198.08056640625, + "loss": 0.6266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7090158462524414, + "rewards/margins": 0.3786148130893707, + "rewards/rejected": 0.33040106296539307, + "step": 1040 + }, + { + "epoch": 0.55, + "learning_rate": 5.004568924107598e-07, + "logits/chosen": -1.6318562030792236, + "logits/rejected": -1.5859413146972656, + "logps/chosen": -2931.807373046875, + "logps/rejected": -2507.31298828125, + "loss": 0.6294, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7447463274002075, + "rewards/margins": 0.2536779046058655, + "rewards/rejected": 0.49106842279434204, + "step": 1050 + }, + { + "epoch": 0.55, + "learning_rate": 4.913194791064675e-07, + "logits/chosen": -1.639493703842163, + "logits/rejected": -1.5823523998260498, + "logps/chosen": -2601.8447265625, + "logps/rejected": -2357.34814453125, + "loss": 0.6441, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7931571006774902, + "rewards/margins": 0.5028332471847534, + "rewards/rejected": 0.2903238832950592, + "step": 1060 + }, + { + "epoch": 0.56, + "learning_rate": 4.82184965025639e-07, + "logits/chosen": -1.5899850130081177, + "logits/rejected": -1.5473779439926147, + "logps/chosen": -2727.800537109375, + "logps/rejected": -2362.034423828125, + "loss": 0.6419, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.9983813166618347, + "rewards/margins": 0.39955899119377136, + "rewards/rejected": 0.5988222360610962, + "step": 1070 + }, + { + "epoch": 0.57, + "learning_rate": 4.73056401021775e-07, + "logits/chosen": -1.5197970867156982, + "logits/rejected": -1.4553916454315186, + "logps/chosen": -2388.419921875, + "logps/rejected": -2081.69775390625, + "loss": 0.6171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.601919949054718, + "rewards/margins": 0.227634459733963, + "rewards/rejected": 0.374285489320755, + "step": 1080 + }, + { + "epoch": 0.57, + "learning_rate": 4.639368359610982e-07, + "logits/chosen": -1.4987363815307617, + "logits/rejected": -1.4325814247131348, + "logps/chosen": -2522.322509765625, + "logps/rejected": -2121.84912109375, + "loss": 0.6571, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5968645215034485, + "rewards/margins": 0.3043002486228943, + "rewards/rejected": 0.2925642132759094, + "step": 1090 + }, + { + "epoch": 0.58, + "learning_rate": 4.5482931570425803e-07, + "logits/chosen": -1.5703797340393066, + "logits/rejected": -1.5181505680084229, + "logps/chosen": -2581.994140625, + "logps/rejected": -2270.20166015625, + "loss": 0.6539, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6347614526748657, + "rewards/margins": 0.321241557598114, + "rewards/rejected": 0.31352001428604126, + "step": 1100 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.5226702690124512, + "eval_logits/rejected": -1.4696787595748901, + "eval_logps/chosen": -2588.0068359375, + "eval_logps/rejected": -2181.99365234375, + "eval_loss": 0.6389243006706238, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": 0.8476871848106384, + "eval_rewards/margins": 0.362454891204834, + "eval_rewards/rejected": 0.4852323532104492, + "eval_runtime": 301.2203, + "eval_samples_per_second": 6.64, + "eval_steps_per_second": 0.415, + "step": 1100 + }, + { + "epoch": 0.58, + "learning_rate": 4.4573688208903686e-07, + "logits/chosen": -1.4915900230407715, + "logits/rejected": -1.3990033864974976, + "logps/chosen": -2177.49169921875, + "logps/rejected": -1711.8460693359375, + "loss": 0.6447, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6701298356056213, + "rewards/margins": 0.3176502585411072, + "rewards/rejected": 0.3524795174598694, + "step": 1110 + }, + { + "epoch": 0.59, + "learning_rate": 4.366625719144016e-07, + "logits/chosen": -1.5326006412506104, + "logits/rejected": -1.4640724658966064, + "logps/chosen": -2241.04052734375, + "logps/rejected": -1938.517822265625, + "loss": 0.6094, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.9225455522537231, + "rewards/margins": 0.3196006417274475, + "rewards/rejected": 0.6029448509216309, + "step": 1120 + }, + { + "epoch": 0.59, + "learning_rate": 4.276094159262368e-07, + "logits/chosen": -1.459031343460083, + "logits/rejected": -1.4118678569793701, + "logps/chosen": -2329.41943359375, + "logps/rejected": -2065.614501953125, + "loss": 0.6114, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.0603306293487549, + "rewards/margins": 0.38362884521484375, + "rewards/rejected": 0.6767016649246216, + "step": 1130 + }, + { + "epoch": 0.6, + "learning_rate": 4.1858043780510135e-07, + "logits/chosen": -1.4943807125091553, + "logits/rejected": -1.4440956115722656, + "logps/chosen": -2648.4462890625, + "logps/rejected": -2317.19970703125, + "loss": 0.6521, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9549520611763, + "rewards/margins": 0.1597224771976471, + "rewards/rejected": 0.7952295541763306, + "step": 1140 + }, + { + "epoch": 0.6, + "learning_rate": 4.0957865315634204e-07, + "logits/chosen": -1.4685379266738892, + "logits/rejected": -1.4013986587524414, + "logps/chosen": -2750.71142578125, + "logps/rejected": -2100.20068359375, + "loss": 0.6027, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.7830246686935425, + "rewards/margins": 0.5725045204162598, + "rewards/rejected": 0.2105201780796051, + "step": 1150 + }, + { + "epoch": 0.61, + "learning_rate": 4.006070685029075e-07, + "logits/chosen": -1.484535813331604, + "logits/rejected": -1.4587595462799072, + "logps/chosen": -2228.81787109375, + "logps/rejected": -2157.81298828125, + "loss": 0.6803, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.4148440957069397, + "rewards/margins": 0.10307104885578156, + "rewards/rejected": 0.3117729723453522, + "step": 1160 + }, + { + "epoch": 0.61, + "learning_rate": 3.916686802811927e-07, + "logits/chosen": -1.3863401412963867, + "logits/rejected": -1.4270175695419312, + "logps/chosen": -2092.947998046875, + "logps/rejected": -2140.6953125, + "loss": 0.624, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6797593832015991, + "rewards/margins": 0.16529296338558197, + "rewards/rejected": 0.514466404914856, + "step": 1170 + }, + { + "epoch": 0.62, + "learning_rate": 3.8276647384025467e-07, + "logits/chosen": -1.4469492435455322, + "logits/rejected": -1.3607311248779297, + "logps/chosen": -2557.885009765625, + "logps/rejected": -2165.09033203125, + "loss": 0.6423, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6221494078636169, + "rewards/margins": 0.2990773320198059, + "rewards/rejected": 0.3230721354484558, + "step": 1180 + }, + { + "epoch": 0.62, + "learning_rate": 3.7390342244472883e-07, + "logits/chosen": -1.5888515710830688, + "logits/rejected": -1.5609667301177979, + "logps/chosen": -2778.28515625, + "logps/rejected": -2496.6396484375, + "loss": 0.6533, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.9935188293457031, + "rewards/margins": 0.3621361255645752, + "rewards/rejected": 0.6313827037811279, + "step": 1190 + }, + { + "epoch": 0.63, + "learning_rate": 3.6508248628178446e-07, + "logits/chosen": -1.6396839618682861, + "logits/rejected": -1.5974278450012207, + "logps/chosen": -2493.72216796875, + "logps/rejected": -2359.435791015625, + "loss": 0.7267, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.9704666137695312, + "rewards/margins": 0.3994936943054199, + "rewards/rejected": 0.5709729790687561, + "step": 1200 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -1.6800066232681274, + "eval_logits/rejected": -1.6292266845703125, + "eval_logps/chosen": -2618.873779296875, + "eval_logps/rejected": -2207.94384765625, + "eval_loss": 0.6421077847480774, + "eval_rewards/accuracies": 0.6620000004768372, + "eval_rewards/chosen": 0.5390151143074036, + "eval_rewards/margins": 0.3132854104042053, + "eval_rewards/rejected": 0.22572976350784302, + "eval_runtime": 304.4335, + "eval_samples_per_second": 6.57, + "eval_steps_per_second": 0.411, + "step": 1200 + }, + { + "epoch": 0.63, + "learning_rate": 3.563066114724441e-07, + "logits/chosen": -1.6271164417266846, + "logits/rejected": -1.5858738422393799, + "logps/chosen": -2807.364990234375, + "logps/rejected": -2029.6510009765625, + "loss": 0.6347, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6177263855934143, + "rewards/margins": 0.27368754148483276, + "rewards/rejected": 0.3440387547016144, + "step": 1210 + }, + { + "epoch": 0.64, + "learning_rate": 3.475787290876055e-07, + "logits/chosen": -1.5973155498504639, + "logits/rejected": -1.558475375175476, + "logps/chosen": -2490.0703125, + "logps/rejected": -2087.466064453125, + "loss": 0.6385, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8565654754638672, + "rewards/margins": 0.4143308699131012, + "rewards/rejected": 0.4422345757484436, + "step": 1220 + }, + { + "epoch": 0.64, + "learning_rate": 3.389017541690854e-07, + "logits/chosen": -1.5630786418914795, + "logits/rejected": -1.548064947128296, + "logps/chosen": -2276.59619140625, + "logps/rejected": -1839.0726318359375, + "loss": 0.6357, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.7428416609764099, + "rewards/margins": 0.3907639980316162, + "rewards/rejected": 0.35207757353782654, + "step": 1230 + }, + { + "epoch": 0.65, + "learning_rate": 3.30278584756021e-07, + "logits/chosen": -1.548689365386963, + "logits/rejected": -1.4891592264175415, + "logps/chosen": -2640.1591796875, + "logps/rejected": -2317.181396484375, + "loss": 0.6184, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.910789966583252, + "rewards/margins": 0.37699228525161743, + "rewards/rejected": 0.5337976217269897, + "step": 1240 + }, + { + "epoch": 0.65, + "learning_rate": 3.2171210091694735e-07, + "logits/chosen": -1.608028769493103, + "logits/rejected": -1.5826674699783325, + "logps/chosen": -2531.904296875, + "logps/rejected": -2342.30419921875, + "loss": 0.6087, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.7222377061843872, + "rewards/margins": 0.4060022830963135, + "rewards/rejected": 0.3162355422973633, + "step": 1250 + }, + { + "epoch": 0.66, + "learning_rate": 3.132051637878789e-07, + "logits/chosen": -1.5921976566314697, + "logits/rejected": -1.4880411624908447, + "logps/chosen": -2295.463134765625, + "logps/rejected": -1800.047119140625, + "loss": 0.6709, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8092087507247925, + "rewards/margins": 0.39788728952407837, + "rewards/rejected": 0.4113215506076813, + "step": 1260 + }, + { + "epoch": 0.66, + "learning_rate": 3.0476061461671155e-07, + "logits/chosen": -1.5929429531097412, + "logits/rejected": -1.560585856437683, + "logps/chosen": -2178.914306640625, + "logps/rejected": -2029.7672119140625, + "loss": 0.6315, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.8661308288574219, + "rewards/margins": 0.3666331171989441, + "rewards/rejected": 0.4994977116584778, + "step": 1270 + }, + { + "epoch": 0.67, + "learning_rate": 2.9638127381427127e-07, + "logits/chosen": -1.4586659669876099, + "logits/rejected": -1.4546220302581787, + "logps/chosen": -2244.927978515625, + "logps/rejected": -2030.598876953125, + "loss": 0.5909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7213354110717773, + "rewards/margins": 0.37115171551704407, + "rewards/rejected": 0.3501836955547333, + "step": 1280 + }, + { + "epoch": 0.68, + "learning_rate": 2.8806994001231766e-07, + "logits/chosen": -1.462428092956543, + "logits/rejected": -1.4601207971572876, + "logps/chosen": -2553.372314453125, + "logps/rejected": -2366.053955078125, + "loss": 0.6324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.9080332517623901, + "rewards/margins": 0.3693556487560272, + "rewards/rejected": 0.5386777520179749, + "step": 1290 + }, + { + "epoch": 0.68, + "learning_rate": 2.7982938912882544e-07, + "logits/chosen": -1.5518906116485596, + "logits/rejected": -1.47800874710083, + "logps/chosen": -2843.82421875, + "logps/rejected": -2309.199951171875, + "loss": 0.5746, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 1.102667212486267, + "rewards/margins": 0.6155067682266235, + "rewards/rejected": 0.48716044425964355, + "step": 1300 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -1.5460779666900635, + "eval_logits/rejected": -1.4993510246276855, + "eval_logps/chosen": -2582.20947265625, + "eval_logps/rejected": -2181.592041015625, + "eval_loss": 0.6300790905952454, + "eval_rewards/accuracies": 0.6660000085830688, + "eval_rewards/chosen": 0.9056587815284729, + "eval_rewards/margins": 0.41641080379486084, + "eval_rewards/rejected": 0.48924797773361206, + "eval_runtime": 299.2617, + "eval_samples_per_second": 6.683, + "eval_steps_per_second": 0.418, + "step": 1300 + }, + { + "epoch": 0.69, + "learning_rate": 2.716623734408488e-07, + "logits/chosen": -1.5478688478469849, + "logits/rejected": -1.509421944618225, + "logps/chosen": -2733.4658203125, + "logps/rejected": -2210.788330078125, + "loss": 0.676, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.8321071863174438, + "rewards/margins": 0.17042401432991028, + "rewards/rejected": 0.661683201789856, + "step": 1310 + }, + { + "epoch": 0.69, + "learning_rate": 2.635716206652843e-07, + "logits/chosen": -1.51913321018219, + "logits/rejected": -1.5177617073059082, + "logps/chosen": -2348.56005859375, + "logps/rejected": -2216.1884765625, + "loss": 0.5911, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6093713045120239, + "rewards/margins": 0.3202818036079407, + "rewards/rejected": 0.28908950090408325, + "step": 1320 + }, + { + "epoch": 0.7, + "learning_rate": 2.5555983304783515e-07, + "logits/chosen": -1.4471040964126587, + "logits/rejected": -1.4324887990951538, + "logps/chosen": -2042.9017333984375, + "logps/rejected": -1859.039306640625, + "loss": 0.6168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.49268943071365356, + "rewards/margins": 0.3076168894767761, + "rewards/rejected": 0.18507252633571625, + "step": 1330 + }, + { + "epoch": 0.7, + "learning_rate": 2.4762968646048356e-07, + "logits/chosen": -1.4452800750732422, + "logits/rejected": -1.3810513019561768, + "logps/chosen": -2950.53271484375, + "logps/rejected": -2301.14892578125, + "loss": 0.6184, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.9146178364753723, + "rewards/margins": 0.5878747701644897, + "rewards/rejected": 0.326742947101593, + "step": 1340 + }, + { + "epoch": 0.71, + "learning_rate": 2.397838295077703e-07, + "logits/chosen": -1.4514172077178955, + "logits/rejected": -1.430443525314331, + "logps/chosen": -2407.11181640625, + "logps/rejected": -2338.7666015625, + "loss": 0.6172, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6004685163497925, + "rewards/margins": 0.15282198786735535, + "rewards/rejected": 0.44764652848243713, + "step": 1350 + }, + { + "epoch": 0.71, + "learning_rate": 2.3202488264218357e-07, + "logits/chosen": -1.4685500860214233, + "logits/rejected": -1.3829035758972168, + "logps/chosen": -2675.003173828125, + "logps/rejected": -2091.812744140625, + "loss": 0.61, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.828966498374939, + "rewards/margins": 0.3263750672340393, + "rewards/rejected": 0.5025915503501892, + "step": 1360 + }, + { + "epoch": 0.72, + "learning_rate": 2.243554372889479e-07, + "logits/chosen": -1.4399888515472412, + "logits/rejected": -1.3919384479522705, + "logps/chosen": -2576.9365234375, + "logps/rejected": -2010.0601806640625, + "loss": 0.597, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.9715896844863892, + "rewards/margins": 0.460097074508667, + "rewards/rejected": 0.5114925503730774, + "step": 1370 + }, + { + "epoch": 0.72, + "learning_rate": 2.1677805498050998e-07, + "logits/chosen": -1.3894431591033936, + "logits/rejected": -1.3669414520263672, + "logps/chosen": -1986.740966796875, + "logps/rejected": -1580.8253173828125, + "loss": 0.6499, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5728658437728882, + "rewards/margins": 0.245010107755661, + "rewards/rejected": 0.32785576581954956, + "step": 1380 + }, + { + "epoch": 0.73, + "learning_rate": 2.0929526650100716e-07, + "logits/chosen": -1.4540735483169556, + "logits/rejected": -1.3499418497085571, + "logps/chosen": -2753.11669921875, + "logps/rejected": -2095.53466796875, + "loss": 0.6456, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8334699869155884, + "rewards/margins": 0.7317672967910767, + "rewards/rejected": 0.10170261561870575, + "step": 1390 + }, + { + "epoch": 0.73, + "learning_rate": 2.0190957104100692e-07, + "logits/chosen": -1.4822982549667358, + "logits/rejected": -1.4137917757034302, + "logps/chosen": -2363.976806640625, + "logps/rejected": -1997.6536865234375, + "loss": 0.6053, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.7185107469558716, + "rewards/margins": 0.401099294424057, + "rewards/rejected": 0.3174114525318146, + "step": 1400 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -1.4891161918640137, + "eval_logits/rejected": -1.4439697265625, + "eval_logps/chosen": -2585.19140625, + "eval_logps/rejected": -2184.890869140625, + "eval_loss": 0.6342132091522217, + "eval_rewards/accuracies": 0.6660000085830688, + "eval_rewards/chosen": 0.8758403062820435, + "eval_rewards/margins": 0.4195804297924042, + "eval_rewards/rejected": 0.4562598764896393, + "eval_runtime": 299.1063, + "eval_samples_per_second": 6.687, + "eval_steps_per_second": 0.418, + "step": 1400 + }, + { + "epoch": 0.74, + "learning_rate": 1.9462343536279612e-07, + "logits/chosen": -1.475975751876831, + "logits/rejected": -1.4379873275756836, + "logps/chosen": -2481.176025390625, + "logps/rejected": -2232.84912109375, + "loss": 0.6145, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.9434836506843567, + "rewards/margins": 0.4186176657676697, + "rewards/rejected": 0.524865984916687, + "step": 1410 + }, + { + "epoch": 0.74, + "learning_rate": 1.874392929765044e-07, + "logits/chosen": -1.4733283519744873, + "logits/rejected": -1.3902546167373657, + "logps/chosen": -2782.106689453125, + "logps/rejected": -2127.639404296875, + "loss": 0.5946, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 1.1233876943588257, + "rewards/margins": 0.5207871198654175, + "rewards/rejected": 0.6026005148887634, + "step": 1420 + }, + { + "epoch": 0.75, + "learning_rate": 1.8035954332732889e-07, + "logits/chosen": -1.4501025676727295, + "logits/rejected": -1.4023559093475342, + "logps/chosen": -2202.23974609375, + "logps/rejected": -1934.811279296875, + "loss": 0.6426, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.644204318523407, + "rewards/margins": 0.34255489706993103, + "rewards/rejected": 0.30164945125579834, + "step": 1430 + }, + { + "epoch": 0.75, + "learning_rate": 1.733865509941419e-07, + "logits/chosen": -1.4848979711532593, + "logits/rejected": -1.445502519607544, + "logps/chosen": -2633.660888671875, + "logps/rejected": -2392.826416015625, + "loss": 0.6303, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8459588885307312, + "rewards/margins": 0.4044179916381836, + "rewards/rejected": 0.4415409564971924, + "step": 1440 + }, + { + "epoch": 0.76, + "learning_rate": 1.6652264489973861e-07, + "logits/chosen": -1.4826475381851196, + "logits/rejected": -1.426309585571289, + "logps/chosen": -2556.17626953125, + "logps/rejected": -1992.7232666015625, + "loss": 0.6061, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6768069267272949, + "rewards/margins": 0.32333052158355713, + "rewards/rejected": 0.3534763753414154, + "step": 1450 + }, + { + "epoch": 0.76, + "learning_rate": 1.5977011753299724e-07, + "logits/chosen": -1.5091631412506104, + "logits/rejected": -1.4753676652908325, + "logps/chosen": -2201.044921875, + "logps/rejected": -1877.4302978515625, + "loss": 0.612, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7366055250167847, + "rewards/margins": 0.2949199378490448, + "rewards/rejected": 0.44168558716773987, + "step": 1460 + }, + { + "epoch": 0.77, + "learning_rate": 1.5313122418320496e-07, + "logits/chosen": -1.5059702396392822, + "logits/rejected": -1.4471460580825806, + "logps/chosen": -2972.50439453125, + "logps/rejected": -2307.0458984375, + "loss": 0.6042, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 1.0683201551437378, + "rewards/margins": 0.5777542591094971, + "rewards/rejected": 0.49056586623191833, + "step": 1470 + }, + { + "epoch": 0.77, + "learning_rate": 1.4660818218681125e-07, + "logits/chosen": -1.4828715324401855, + "logits/rejected": -1.4702181816101074, + "logps/chosen": -2593.748046875, + "logps/rejected": -2591.448974609375, + "loss": 0.588, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.9806830286979675, + "rewards/margins": 0.4084799885749817, + "rewards/rejected": 0.5722029805183411, + "step": 1480 + }, + { + "epoch": 0.78, + "learning_rate": 1.4020317018685362e-07, + "logits/chosen": -1.456514596939087, + "logits/rejected": -1.390700101852417, + "logps/chosen": -2405.19482421875, + "logps/rejected": -1981.04296875, + "loss": 0.6567, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.614848256111145, + "rewards/margins": 0.3052050471305847, + "rewards/rejected": 0.3096432089805603, + "step": 1490 + }, + { + "epoch": 0.79, + "learning_rate": 1.3391832740531055e-07, + "logits/chosen": -1.4236390590667725, + "logits/rejected": -1.3956820964813232, + "logps/chosen": -2446.695068359375, + "logps/rejected": -2376.41259765625, + "loss": 0.6232, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7929419875144958, + "rewards/margins": 0.35024353861808777, + "rewards/rejected": 0.44269853830337524, + "step": 1500 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -1.4759258031845093, + "eval_logits/rejected": -1.4282684326171875, + "eval_logps/chosen": -2592.221923828125, + "eval_logps/rejected": -2190.57958984375, + "eval_loss": 0.6323803663253784, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": 0.8055330514907837, + "eval_rewards/margins": 0.40616247057914734, + "eval_rewards/rejected": 0.39937061071395874, + "eval_runtime": 299.6311, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.417, + "step": 1500 + }, + { + "epoch": 0.79, + "learning_rate": 1.2775575292861707e-07, + "logits/chosen": -1.4745705127716064, + "logits/rejected": -1.4221175909042358, + "logps/chosen": -2639.8076171875, + "logps/rejected": -2123.642578125, + "loss": 0.6056, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.9188385009765625, + "rewards/margins": 0.5551499128341675, + "rewards/rejected": 0.3636886477470398, + "step": 1510 + }, + { + "epoch": 0.8, + "learning_rate": 1.21717505006588e-07, + "logits/chosen": -1.4603058099746704, + "logits/rejected": -1.4439467191696167, + "logps/chosen": -2664.22119140625, + "logps/rejected": -2496.781005859375, + "loss": 0.6213, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9416143298149109, + "rewards/margins": 0.3402588963508606, + "rewards/rejected": 0.6013555526733398, + "step": 1520 + }, + { + "epoch": 0.8, + "learning_rate": 1.1580560036497877e-07, + "logits/chosen": -1.473534345626831, + "logits/rejected": -1.4060730934143066, + "logps/chosen": -2819.74462890625, + "logps/rejected": -2299.840576171875, + "loss": 0.6071, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.8828972578048706, + "rewards/margins": 0.5179694294929504, + "rewards/rejected": 0.3649279475212097, + "step": 1530 + }, + { + "epoch": 0.81, + "learning_rate": 1.1002201353191521e-07, + "logits/chosen": -1.4415251016616821, + "logits/rejected": -1.461745023727417, + "logps/chosen": -2390.272705078125, + "logps/rejected": -2447.08642578125, + "loss": 0.6433, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.6030459403991699, + "rewards/margins": 0.1989385038614273, + "rewards/rejected": 0.4041074216365814, + "step": 1540 + }, + { + "epoch": 0.81, + "learning_rate": 1.0436867617841766e-07, + "logits/chosen": -1.4779837131500244, + "logits/rejected": -1.443192958831787, + "logps/chosen": -2101.65771484375, + "logps/rejected": -1614.459228515625, + "loss": 0.5839, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.38758862018585205, + "rewards/margins": 0.41923385858535767, + "rewards/rejected": -0.03164520859718323, + "step": 1550 + }, + { + "epoch": 0.82, + "learning_rate": 9.884747647323854e-08, + "logits/chosen": -1.4118781089782715, + "logits/rejected": -1.398271083831787, + "logps/chosen": -2657.19287109375, + "logps/rejected": -2414.64990234375, + "loss": 0.6554, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6775075793266296, + "rewards/margins": 0.22664561867713928, + "rewards/rejected": 0.4508620798587799, + "step": 1560 + }, + { + "epoch": 0.82, + "learning_rate": 9.346025845222871e-08, + "logits/chosen": -1.4589564800262451, + "logits/rejected": -1.4241827726364136, + "logps/chosen": -2566.69384765625, + "logps/rejected": -2381.8310546875, + "loss": 0.6699, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7393044829368591, + "rewards/margins": 0.26446717977523804, + "rewards/rejected": 0.4748373031616211, + "step": 1570 + }, + { + "epoch": 0.83, + "learning_rate": 8.82088214024454e-08, + "logits/chosen": -1.4593846797943115, + "logits/rejected": -1.4349015951156616, + "logps/chosen": -2314.169189453125, + "logps/rejected": -2187.58544921875, + "loss": 0.6497, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4934845566749573, + "rewards/margins": 0.21952751278877258, + "rewards/rejected": 0.2739570140838623, + "step": 1580 + }, + { + "epoch": 0.83, + "learning_rate": 8.309491926120393e-08, + "logits/chosen": -1.4479442834854126, + "logits/rejected": -1.388183832168579, + "logps/chosen": -2701.14111328125, + "logps/rejected": -2293.677001953125, + "loss": 0.6347, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6762970089912415, + "rewards/margins": 0.43245062232017517, + "rewards/rejected": 0.2438463717699051, + "step": 1590 + }, + { + "epoch": 0.84, + "learning_rate": 7.812026003027771e-08, + "logits/chosen": -1.2826584577560425, + "logits/rejected": -1.2632884979248047, + "logps/chosen": -2654.244873046875, + "logps/rejected": -2260.9638671875, + "loss": 0.6326, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.40709176659584045, + "rewards/margins": 0.25365540385246277, + "rewards/rejected": 0.15343639254570007, + "step": 1600 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -1.4959200620651245, + "eval_logits/rejected": -1.450128436088562, + "eval_logps/chosen": -2627.5283203125, + "eval_logps/rejected": -2220.19970703125, + "eval_loss": 0.6391750574111938, + "eval_rewards/accuracies": 0.656000018119812, + "eval_rewards/chosen": 0.4524710476398468, + "eval_rewards/margins": 0.3492998778820038, + "eval_rewards/rejected": 0.10317116975784302, + "eval_runtime": 302.5644, + "eval_samples_per_second": 6.61, + "eval_steps_per_second": 0.413, + "step": 1600 + }, + { + "epoch": 0.84, + "learning_rate": 7.328650520543906e-08, + "logits/chosen": -1.4119188785552979, + "logits/rejected": -1.2946244478225708, + "logps/chosen": -2411.543701171875, + "logps/rejected": -1841.427978515625, + "loss": 0.6211, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.3078997731208801, + "rewards/margins": 0.17288625240325928, + "rewards/rejected": 0.13501352071762085, + "step": 1610 + }, + { + "epoch": 0.85, + "learning_rate": 6.859526922153352e-08, + "logits/chosen": -1.4251132011413574, + "logits/rejected": -1.3843073844909668, + "logps/chosen": -2429.940185546875, + "logps/rejected": -1990.4915771484375, + "loss": 0.6556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5606139898300171, + "rewards/margins": 0.2745349407196045, + "rewards/rejected": 0.286079078912735, + "step": 1620 + }, + { + "epoch": 0.85, + "learning_rate": 6.40481189132711e-08, + "logits/chosen": -1.4726622104644775, + "logits/rejected": -1.4261372089385986, + "logps/chosen": -2766.93115234375, + "logps/rejected": -2061.09912109375, + "loss": 0.6425, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5808910131454468, + "rewards/margins": 0.43574967980384827, + "rewards/rejected": 0.1451413631439209, + "step": 1630 + }, + { + "epoch": 0.86, + "learning_rate": 5.964657299191711e-08, + "logits/chosen": -1.4473376274108887, + "logits/rejected": -1.4126627445220947, + "logps/chosen": -2487.42919921875, + "logps/rejected": -2065.8955078125, + "loss": 0.6381, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8268505930900574, + "rewards/margins": 0.4533798098564148, + "rewards/rejected": 0.37347084283828735, + "step": 1640 + }, + { + "epoch": 0.86, + "learning_rate": 5.53921015380539e-08, + "logits/chosen": -1.428260087966919, + "logits/rejected": -1.4423437118530273, + "logps/chosen": -2295.45556640625, + "logps/rejected": -2376.85595703125, + "loss": 0.6173, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6111637353897095, + "rewards/margins": 0.18756714463233948, + "rewards/rejected": 0.4235965311527252, + "step": 1650 + }, + { + "epoch": 0.87, + "learning_rate": 5.1286125510586805e-08, + "logits/chosen": -1.462693452835083, + "logits/rejected": -1.4421815872192383, + "logps/chosen": -2543.067626953125, + "logps/rejected": -2478.81494140625, + "loss": 0.613, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8606179356575012, + "rewards/margins": 0.37834784388542175, + "rewards/rejected": 0.48227009177207947, + "step": 1660 + }, + { + "epoch": 0.87, + "learning_rate": 4.733001627215466e-08, + "logits/chosen": -1.4652189016342163, + "logits/rejected": -1.4526941776275635, + "logps/chosen": -2576.45556640625, + "logps/rejected": -2486.090576171875, + "loss": 0.6675, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7547820210456848, + "rewards/margins": 0.23100514709949493, + "rewards/rejected": 0.5237768292427063, + "step": 1670 + }, + { + "epoch": 0.88, + "learning_rate": 4.352509513110658e-08, + "logits/chosen": -1.4286987781524658, + "logits/rejected": -1.4079492092132568, + "logps/chosen": -2363.428955078125, + "logps/rejected": -2208.08740234375, + "loss": 0.6258, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.574379026889801, + "rewards/margins": 0.24226748943328857, + "rewards/rejected": 0.33211153745651245, + "step": 1680 + }, + { + "epoch": 0.88, + "learning_rate": 3.9872632900194936e-08, + "logits/chosen": -1.4842069149017334, + "logits/rejected": -1.415021300315857, + "logps/chosen": -2913.2490234375, + "logps/rejected": -2346.609619140625, + "loss": 0.6436, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.6599145531654358, + "rewards/margins": 0.26596465706825256, + "rewards/rejected": 0.3939499258995056, + "step": 1690 + }, + { + "epoch": 0.89, + "learning_rate": 3.6373849472134954e-08, + "logits/chosen": -1.4031012058258057, + "logits/rejected": -1.3779500722885132, + "logps/chosen": -2266.2158203125, + "logps/rejected": -1981.5833740234375, + "loss": 0.6469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5494025945663452, + "rewards/margins": 0.21553239226341248, + "rewards/rejected": 0.33387020230293274, + "step": 1700 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -1.4758340120315552, + "eval_logits/rejected": -1.4289432764053345, + "eval_logps/chosen": -2598.2412109375, + "eval_logps/rejected": -2195.535888671875, + "eval_loss": 0.6306354403495789, + "eval_rewards/accuracies": 0.6660000085830688, + "eval_rewards/chosen": 0.7453421354293823, + "eval_rewards/margins": 0.3955351710319519, + "eval_rewards/rejected": 0.3498069643974304, + "eval_runtime": 295.7456, + "eval_samples_per_second": 6.763, + "eval_steps_per_second": 0.423, + "step": 1700 + }, + { + "epoch": 0.9, + "learning_rate": 3.302991341216976e-08, + "logits/chosen": -1.4159257411956787, + "logits/rejected": -1.392617106437683, + "logps/chosen": -2077.9482421875, + "logps/rejected": -1972.2515869140625, + "loss": 0.6409, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.5421566367149353, + "rewards/margins": 0.2578433156013489, + "rewards/rejected": 0.28431329131126404, + "step": 1710 + }, + { + "epoch": 0.9, + "learning_rate": 2.9841941567779474e-08, + "logits/chosen": -1.4799764156341553, + "logits/rejected": -1.4051799774169922, + "logps/chosen": -2897.63232421875, + "logps/rejected": -2480.90625, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8838424682617188, + "rewards/margins": 0.3801085352897644, + "rewards/rejected": 0.5037339925765991, + "step": 1720 + }, + { + "epoch": 0.91, + "learning_rate": 2.681099869566328e-08, + "logits/chosen": -1.4630422592163086, + "logits/rejected": -1.4653818607330322, + "logps/chosen": -2166.15966796875, + "logps/rejected": -2133.84326171875, + "loss": 0.6171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5266101956367493, + "rewards/margins": 0.21218034625053406, + "rewards/rejected": 0.3144298195838928, + "step": 1730 + }, + { + "epoch": 0.91, + "learning_rate": 2.3938097106119216e-08, + "logits/chosen": -1.4574975967407227, + "logits/rejected": -1.4154255390167236, + "logps/chosen": -2208.398681640625, + "logps/rejected": -1935.158203125, + "loss": 0.6305, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6478286981582642, + "rewards/margins": 0.3098670542240143, + "rewards/rejected": 0.33796167373657227, + "step": 1740 + }, + { + "epoch": 0.92, + "learning_rate": 2.12241963249406e-08, + "logits/chosen": -1.4689569473266602, + "logits/rejected": -1.4307196140289307, + "logps/chosen": -2519.071044921875, + "logps/rejected": -2212.586181640625, + "loss": 0.6578, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6326580047607422, + "rewards/margins": 0.3157083988189697, + "rewards/rejected": 0.31694963574409485, + "step": 1750 + }, + { + "epoch": 0.92, + "learning_rate": 1.8670202772942568e-08, + "logits/chosen": -1.4382356405258179, + "logits/rejected": -1.3769454956054688, + "logps/chosen": -2694.0830078125, + "logps/rejected": -2166.41845703125, + "loss": 0.6341, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7784560322761536, + "rewards/margins": 0.3015449643135071, + "rewards/rejected": 0.4769110679626465, + "step": 1760 + }, + { + "epoch": 0.93, + "learning_rate": 1.6276969463224545e-08, + "logits/chosen": -1.4650015830993652, + "logits/rejected": -1.463744878768921, + "logps/chosen": -2586.126220703125, + "logps/rejected": -2591.75439453125, + "loss": 0.6103, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6145761609077454, + "rewards/margins": 0.4303979277610779, + "rewards/rejected": 0.18417824804782867, + "step": 1770 + }, + { + "epoch": 0.93, + "learning_rate": 1.4045295716271e-08, + "logits/chosen": -1.4920063018798828, + "logits/rejected": -1.450634241104126, + "logps/chosen": -2605.60986328125, + "logps/rejected": -2116.304931640625, + "loss": 0.608, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.645778477191925, + "rewards/margins": 0.3450910151004791, + "rewards/rejected": 0.30068737268447876, + "step": 1780 + }, + { + "epoch": 0.94, + "learning_rate": 1.1975926892984766e-08, + "logits/chosen": -1.4100964069366455, + "logits/rejected": -1.3769333362579346, + "logps/chosen": -2435.0087890625, + "logps/rejected": -2033.880126953125, + "loss": 0.6496, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6221305727958679, + "rewards/margins": 0.3294012248516083, + "rewards/rejected": 0.29272931814193726, + "step": 1790 + }, + { + "epoch": 0.94, + "learning_rate": 1.0069554145742787e-08, + "logits/chosen": -1.395265817642212, + "logits/rejected": -1.3731589317321777, + "logps/chosen": -2578.064697265625, + "logps/rejected": -2280.887451171875, + "loss": 0.669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6557528972625732, + "rewards/margins": 0.4573606848716736, + "rewards/rejected": 0.1983920931816101, + "step": 1800 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -1.4769095182418823, + "eval_logits/rejected": -1.4307643175125122, + "eval_logps/chosen": -2607.336669921875, + "eval_logps/rejected": -2203.039306640625, + "eval_loss": 0.6322839260101318, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": 0.6543857455253601, + "eval_rewards/margins": 0.3796128034591675, + "eval_rewards/rejected": 0.2747729420661926, + "eval_runtime": 293.77, + "eval_samples_per_second": 6.808, + "eval_steps_per_second": 0.426, + "step": 1800 + }, + { + "epoch": 0.95, + "learning_rate": 8.326814187556485e-09, + "logits/chosen": -1.4078927040100098, + "logits/rejected": -1.380299687385559, + "logps/chosen": -2524.50439453125, + "logps/rejected": -2226.43994140625, + "loss": 0.6208, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5907411575317383, + "rewards/margins": 0.25163906812667847, + "rewards/rejected": 0.3391020894050598, + "step": 1810 + }, + { + "epoch": 0.95, + "learning_rate": 6.7482890794151594e-09, + "logits/chosen": -1.4838191270828247, + "logits/rejected": -1.4362868070602417, + "logps/chosen": -2814.218017578125, + "logps/rejected": -2245.9033203125, + "loss": 0.632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8923807144165039, + "rewards/margins": 0.4581494927406311, + "rewards/rejected": 0.4342312812805176, + "step": 1820 + }, + { + "epoch": 0.96, + "learning_rate": 5.334506035882036e-09, + "logits/chosen": -1.370774507522583, + "logits/rejected": -1.3359023332595825, + "logps/chosen": -2687.776123046875, + "logps/rejected": -2035.099609375, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6979535818099976, + "rewards/margins": 0.42525219917297363, + "rewards/rejected": 0.27270132303237915, + "step": 1830 + }, + { + "epoch": 0.96, + "learning_rate": 4.0859372490090194e-09, + "logits/chosen": -1.4562771320343018, + "logits/rejected": -1.4093388319015503, + "logps/chosen": -2788.104248046875, + "logps/rejected": -2335.853759765625, + "loss": 0.6116, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7869713306427002, + "rewards/margins": 0.4374913275241852, + "rewards/rejected": 0.3494799733161926, + "step": 1840 + }, + { + "epoch": 0.97, + "learning_rate": 3.0029997306283416e-09, + "logits/chosen": -1.4756406545639038, + "logits/rejected": -1.3986704349517822, + "logps/chosen": -2574.64111328125, + "logps/rejected": -1893.6328125, + "loss": 0.6546, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6360118389129639, + "rewards/margins": 0.3881533145904541, + "rewards/rejected": 0.24785849452018738, + "step": 1850 + }, + { + "epoch": 0.97, + "learning_rate": 2.0860551730742526e-09, + "logits/chosen": -1.4544508457183838, + "logits/rejected": -1.419983983039856, + "logps/chosen": -2375.126220703125, + "logps/rejected": -2017.3466796875, + "loss": 0.5584, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.8510934710502625, + "rewards/margins": 0.6245936155319214, + "rewards/rejected": 0.22649994492530823, + "step": 1860 + }, + { + "epoch": 0.98, + "learning_rate": 1.3354098283802628e-09, + "logits/chosen": -1.4696677923202515, + "logits/rejected": -1.4230769872665405, + "logps/chosen": -2438.054931640625, + "logps/rejected": -2103.46044921875, + "loss": 0.621, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7339269518852234, + "rewards/margins": 0.35110199451446533, + "rewards/rejected": 0.38282495737075806, + "step": 1870 + }, + { + "epoch": 0.98, + "learning_rate": 7.513144059937415e-10, + "logits/chosen": -1.4952335357666016, + "logits/rejected": -1.442657232284546, + "logps/chosen": -2848.296630859375, + "logps/rejected": -2374.80126953125, + "loss": 0.6061, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7486821413040161, + "rewards/margins": 0.31393861770629883, + "rewards/rejected": 0.43474358320236206, + "step": 1880 + }, + { + "epoch": 0.99, + "learning_rate": 3.3396398904106393e-10, + "logits/chosen": -1.4425480365753174, + "logits/rejected": -1.4436792135238647, + "logps/chosen": -2551.7880859375, + "logps/rejected": -2169.797607421875, + "loss": 0.6124, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5796520709991455, + "rewards/margins": 0.4224782884120941, + "rewards/rejected": 0.15717382729053497, + "step": 1890 + }, + { + "epoch": 0.99, + "learning_rate": 8.349796917112018e-11, + "logits/chosen": -1.4112383127212524, + "logits/rejected": -1.3823628425598145, + "logps/chosen": -2330.736083984375, + "logps/rejected": -2090.098876953125, + "loss": 0.6531, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.5138527750968933, + "rewards/margins": 0.193558931350708, + "rewards/rejected": 0.3202938437461853, + "step": 1900 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -1.4753704071044922, + "eval_logits/rejected": -1.4289445877075195, + "eval_logps/chosen": -2603.777587890625, + "eval_logps/rejected": -2200.1181640625, + "eval_loss": 0.6316895484924316, + "eval_rewards/accuracies": 0.6639999747276306, + "eval_rewards/chosen": 0.6899767518043518, + "eval_rewards/margins": 0.38598912954330444, + "eval_rewards/rejected": 0.30398762226104736, + "eval_runtime": 302.6434, + "eval_samples_per_second": 6.608, + "eval_steps_per_second": 0.413, + "step": 1900 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.4598416090011597, + "logits/rejected": -1.4293019771575928, + "logps/chosen": -2462.09912109375, + "logps/rejected": -2050.02490234375, + "loss": 0.6322, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5636069178581238, + "rewards/margins": 0.25135958194732666, + "rewards/rejected": 0.3122473955154419, + "step": 1910 + }, + { + "epoch": 1.0, + "step": 1910, + "total_flos": 0.0, + "train_loss": 0.6480738864519209, + "train_runtime": 26013.0665, + "train_samples_per_second": 2.35, + "train_steps_per_second": 0.073 + } + ], + "logging_steps": 10, + "max_steps": 1910, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100000000, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}