llama-8b-dpo-full / trainer_state.json
fenguhao's picture
Model save
b002b79 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5.235602094240838e-09,
"logits/chosen": -1.3201165199279785,
"logits/rejected": -1.2275193929672241,
"logps/chosen": -2993.4990234375,
"logps/rejected": -2222.55078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 5.2356020942408376e-08,
"logits/chosen": -1.2813271284103394,
"logits/rejected": -1.2465020418167114,
"logps/chosen": -3047.636474609375,
"logps/rejected": -2742.105712890625,
"loss": 0.6973,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": 0.00026022063684649765,
"rewards/margins": 0.0008929346804507077,
"rewards/rejected": -0.0006327141309157014,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 1.0471204188481675e-07,
"logits/chosen": -1.2586185932159424,
"logits/rejected": -1.1957629919052124,
"logps/chosen": -2689.84716796875,
"logps/rejected": -2126.1083984375,
"loss": 0.6916,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.014919064939022064,
"rewards/margins": 0.006186266429722309,
"rewards/rejected": 0.008732798509299755,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 1.5706806282722514e-07,
"logits/chosen": -1.175875186920166,
"logits/rejected": -1.1656105518341064,
"logps/chosen": -2198.431640625,
"logps/rejected": -2021.9176025390625,
"loss": 0.7049,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0053156702779233456,
"rewards/margins": -0.05735307186841965,
"rewards/rejected": 0.05203740671277046,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 2.094240837696335e-07,
"logits/chosen": -1.1858023405075073,
"logits/rejected": -1.1230406761169434,
"logps/chosen": -2056.973388671875,
"logps/rejected": -2170.3056640625,
"loss": 0.6906,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.1324843466281891,
"rewards/margins": -0.016001610085368156,
"rewards/rejected": 0.1484859436750412,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 2.6178010471204185e-07,
"logits/chosen": -1.2066991329193115,
"logits/rejected": -1.15940260887146,
"logps/chosen": -2678.28515625,
"logps/rejected": -2157.86376953125,
"loss": 0.6707,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.32956749200820923,
"rewards/margins": 0.08421512693166733,
"rewards/rejected": 0.2453523427248001,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 3.1413612565445027e-07,
"logits/chosen": -1.2342027425765991,
"logits/rejected": -1.1995573043823242,
"logps/chosen": -2410.271484375,
"logps/rejected": -2036.266845703125,
"loss": 0.6833,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.37566477060317993,
"rewards/margins": 0.07754239439964294,
"rewards/rejected": 0.2981223464012146,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 3.6649214659685864e-07,
"logits/chosen": -1.1794008016586304,
"logits/rejected": -1.1591062545776367,
"logps/chosen": -2638.678955078125,
"logps/rejected": -2372.677001953125,
"loss": 0.6778,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.43634381890296936,
"rewards/margins": 0.0520954504609108,
"rewards/rejected": 0.38424837589263916,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 4.18848167539267e-07,
"logits/chosen": -1.2023160457611084,
"logits/rejected": -1.1861956119537354,
"logps/chosen": -2399.763671875,
"logps/rejected": -2263.85888671875,
"loss": 0.6818,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.452880322933197,
"rewards/margins": 0.04662833362817764,
"rewards/rejected": 0.4062519967556,
"step": 80
},
{
"epoch": 0.05,
"learning_rate": 4.712041884816754e-07,
"logits/chosen": -1.2319462299346924,
"logits/rejected": -1.2353641986846924,
"logps/chosen": -2180.666259765625,
"logps/rejected": -2063.204345703125,
"loss": 0.6665,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.36227527260780334,
"rewards/margins": 0.02720705047249794,
"rewards/rejected": 0.3350681960582733,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 5.235602094240837e-07,
"logits/chosen": -1.2101176977157593,
"logits/rejected": -1.1575647592544556,
"logps/chosen": -2522.456298828125,
"logps/rejected": -2253.9931640625,
"loss": 0.6558,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.5727291703224182,
"rewards/margins": 0.10190355777740479,
"rewards/rejected": 0.47082558274269104,
"step": 100
},
{
"epoch": 0.05,
"eval_logits/chosen": -1.2241016626358032,
"eval_logits/rejected": -1.182218313217163,
"eval_logps/chosen": -2595.654296875,
"eval_logps/rejected": -2172.529052734375,
"eval_loss": 0.6526807546615601,
"eval_rewards/accuracies": 0.5740000009536743,
"eval_rewards/chosen": 0.7712106108665466,
"eval_rewards/margins": 0.1913326531648636,
"eval_rewards/rejected": 0.5798779726028442,
"eval_runtime": 302.6088,
"eval_samples_per_second": 6.609,
"eval_steps_per_second": 0.413,
"step": 100
},
{
"epoch": 0.06,
"learning_rate": 5.759162303664922e-07,
"logits/chosen": -1.162023901939392,
"logits/rejected": -1.1786675453186035,
"logps/chosen": -2315.97216796875,
"logps/rejected": -2253.127685546875,
"loss": 0.6732,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.7014600038528442,
"rewards/margins": 0.1181831955909729,
"rewards/rejected": 0.5832767486572266,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 6.282722513089005e-07,
"logits/chosen": -1.2144238948822021,
"logits/rejected": -1.1650540828704834,
"logps/chosen": -2668.5830078125,
"logps/rejected": -1998.516845703125,
"loss": 0.6723,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6677217483520508,
"rewards/margins": 0.20832547545433044,
"rewards/rejected": 0.45939627289772034,
"step": 120
},
{
"epoch": 0.07,
"learning_rate": 6.806282722513089e-07,
"logits/chosen": -1.220961332321167,
"logits/rejected": -1.1595335006713867,
"logps/chosen": -2847.095458984375,
"logps/rejected": -2245.98828125,
"loss": 0.6455,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.8143318891525269,
"rewards/margins": 0.25173696875572205,
"rewards/rejected": 0.5625948905944824,
"step": 130
},
{
"epoch": 0.07,
"learning_rate": 7.329842931937173e-07,
"logits/chosen": -1.1750261783599854,
"logits/rejected": -1.1362488269805908,
"logps/chosen": -2556.08349609375,
"logps/rejected": -2165.498779296875,
"loss": 0.6639,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6861199140548706,
"rewards/margins": 0.11765004694461823,
"rewards/rejected": 0.5684698820114136,
"step": 140
},
{
"epoch": 0.08,
"learning_rate": 7.853403141361256e-07,
"logits/chosen": -1.213008165359497,
"logits/rejected": -1.1688684225082397,
"logps/chosen": -2662.8193359375,
"logps/rejected": -2211.24072265625,
"loss": 0.6339,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.870284914970398,
"rewards/margins": 0.22113271057605743,
"rewards/rejected": 0.6491522192955017,
"step": 150
},
{
"epoch": 0.08,
"learning_rate": 8.37696335078534e-07,
"logits/chosen": -1.1444575786590576,
"logits/rejected": -1.091567039489746,
"logps/chosen": -2689.31298828125,
"logps/rejected": -2391.873291015625,
"loss": 0.6469,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.5730727314949036,
"rewards/margins": 0.2371658980846405,
"rewards/rejected": 0.33590689301490784,
"step": 160
},
{
"epoch": 0.09,
"learning_rate": 8.900523560209424e-07,
"logits/chosen": -1.1294758319854736,
"logits/rejected": -1.178647756576538,
"logps/chosen": -2683.22509765625,
"logps/rejected": -2484.3818359375,
"loss": 0.6628,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.4668382704257965,
"rewards/margins": 0.08485493808984756,
"rewards/rejected": 0.38198333978652954,
"step": 170
},
{
"epoch": 0.09,
"learning_rate": 9.424083769633508e-07,
"logits/chosen": -1.2192734479904175,
"logits/rejected": -1.1568591594696045,
"logps/chosen": -2561.9091796875,
"logps/rejected": -2213.013916015625,
"loss": 0.6581,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.9690437316894531,
"rewards/margins": 0.3352271616458893,
"rewards/rejected": 0.6338165998458862,
"step": 180
},
{
"epoch": 0.1,
"learning_rate": 9.947643979057591e-07,
"logits/chosen": -1.184699535369873,
"logits/rejected": -1.1766315698623657,
"logps/chosen": -2123.99072265625,
"logps/rejected": -2111.645751953125,
"loss": 0.6809,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.6798163652420044,
"rewards/margins": 0.07367928326129913,
"rewards/rejected": 0.6061369776725769,
"step": 190
},
{
"epoch": 0.1,
"learning_rate": 9.999323662872996e-07,
"logits/chosen": -1.2072479724884033,
"logits/rejected": -1.1839154958724976,
"logps/chosen": -2698.072998046875,
"logps/rejected": -2592.82861328125,
"loss": 0.6404,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6077369451522827,
"rewards/margins": 0.17234833538532257,
"rewards/rejected": 0.4353886544704437,
"step": 200
},
{
"epoch": 0.1,
"eval_logits/chosen": -1.2423152923583984,
"eval_logits/rejected": -1.201860785484314,
"eval_logps/chosen": -2626.8759765625,
"eval_logps/rejected": -2203.748291015625,
"eval_loss": 0.6911113858222961,
"eval_rewards/accuracies": 0.5860000252723694,
"eval_rewards/chosen": 0.45899277925491333,
"eval_rewards/margins": 0.19130723178386688,
"eval_rewards/rejected": 0.26768550276756287,
"eval_runtime": 302.3649,
"eval_samples_per_second": 6.615,
"eval_steps_per_second": 0.413,
"step": 200
},
{
"epoch": 0.11,
"learning_rate": 9.996985942280678e-07,
"logits/chosen": -1.2993234395980835,
"logits/rejected": -1.2211077213287354,
"logps/chosen": -2626.205810546875,
"logps/rejected": -1850.9456787109375,
"loss": 0.6556,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.6171352863311768,
"rewards/margins": 0.32769179344177246,
"rewards/rejected": 0.2894434928894043,
"step": 210
},
{
"epoch": 0.12,
"learning_rate": 9.99297926897573e-07,
"logits/chosen": -1.249463438987732,
"logits/rejected": -1.2620993852615356,
"logps/chosen": -2312.38427734375,
"logps/rejected": -2108.46826171875,
"loss": 0.6647,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.7376146912574768,
"rewards/margins": 0.25427359342575073,
"rewards/rejected": 0.48334112763404846,
"step": 220
},
{
"epoch": 0.12,
"learning_rate": 9.987304981154493e-07,
"logits/chosen": -1.2905672788619995,
"logits/rejected": -1.2782526016235352,
"logps/chosen": -2793.2978515625,
"logps/rejected": -2365.16552734375,
"loss": 0.7268,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.6005491018295288,
"rewards/margins": 0.08131317794322968,
"rewards/rejected": 0.5192359685897827,
"step": 230
},
{
"epoch": 0.13,
"learning_rate": 9.979964973983e-07,
"logits/chosen": -1.402222752571106,
"logits/rejected": -1.3204929828643799,
"logps/chosen": -2332.16650390625,
"logps/rejected": -1890.1295166015625,
"loss": 0.6892,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.3606724441051483,
"rewards/margins": 0.234793022274971,
"rewards/rejected": 0.1258794367313385,
"step": 240
},
{
"epoch": 0.13,
"learning_rate": 9.970961698964024e-07,
"logits/chosen": -1.399332046508789,
"logits/rejected": -1.3611127138137817,
"logps/chosen": -2618.633056640625,
"logps/rejected": -2216.18505859375,
"loss": 0.7038,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.7694709897041321,
"rewards/margins": 0.19202515482902527,
"rewards/rejected": 0.577445924282074,
"step": 250
},
{
"epoch": 0.14,
"learning_rate": 9.960298163118284e-07,
"logits/chosen": -1.4756546020507812,
"logits/rejected": -1.3830201625823975,
"logps/chosen": -2662.10986328125,
"logps/rejected": -2112.115478515625,
"loss": 0.6914,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.9105646014213562,
"rewards/margins": 0.19633980095386505,
"rewards/rejected": 0.7142248749732971,
"step": 260
},
{
"epoch": 0.14,
"learning_rate": 9.94797792798013e-07,
"logits/chosen": -1.4841511249542236,
"logits/rejected": -1.4767415523529053,
"logps/chosen": -2305.857177734375,
"logps/rejected": -2128.56396484375,
"loss": 0.6626,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.2341788113117218,
"rewards/margins": 0.13304655253887177,
"rewards/rejected": 0.10113225132226944,
"step": 270
},
{
"epoch": 0.15,
"learning_rate": 9.934005108408016e-07,
"logits/chosen": -1.4331722259521484,
"logits/rejected": -1.3947049379348755,
"logps/chosen": -2292.278564453125,
"logps/rejected": -1913.346435546875,
"loss": 0.661,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.8132773637771606,
"rewards/margins": 0.22855396568775177,
"rewards/rejected": 0.5847233533859253,
"step": 280
},
{
"epoch": 0.15,
"learning_rate": 9.918384371210175e-07,
"logits/chosen": -1.4025981426239014,
"logits/rejected": -1.3736456632614136,
"logps/chosen": -2201.71044921875,
"logps/rejected": -2091.62255859375,
"loss": 0.6766,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.7386767268180847,
"rewards/margins": 0.228462815284729,
"rewards/rejected": 0.5102138519287109,
"step": 290
},
{
"epoch": 0.16,
"learning_rate": 9.901120933585937e-07,
"logits/chosen": -1.3154966831207275,
"logits/rejected": -1.326516032218933,
"logps/chosen": -2670.81201171875,
"logps/rejected": -2235.08349609375,
"loss": 0.6725,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.7127049565315247,
"rewards/margins": 0.18496084213256836,
"rewards/rejected": 0.5277441143989563,
"step": 300
},
{
"epoch": 0.16,
"eval_logits/chosen": -1.3645591735839844,
"eval_logits/rejected": -1.314851999282837,
"eval_logps/chosen": -2591.692138671875,
"eval_logps/rejected": -2178.205810546875,
"eval_loss": 0.6602776050567627,
"eval_rewards/accuracies": 0.6320000290870667,
"eval_rewards/chosen": 0.8108287453651428,
"eval_rewards/margins": 0.28771865367889404,
"eval_rewards/rejected": 0.5231101512908936,
"eval_runtime": 302.3737,
"eval_samples_per_second": 6.614,
"eval_steps_per_second": 0.413,
"step": 300
},
{
"epoch": 0.16,
"learning_rate": 9.882220561383237e-07,
"logits/chosen": -1.3421976566314697,
"logits/rejected": -1.2967360019683838,
"logps/chosen": -2590.6484375,
"logps/rejected": -2214.814208984375,
"loss": 0.6749,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.7196224331855774,
"rewards/margins": 0.18787309527397156,
"rewards/rejected": 0.5317493081092834,
"step": 310
},
{
"epoch": 0.17,
"learning_rate": 9.861689567172849e-07,
"logits/chosen": -1.3033558130264282,
"logits/rejected": -1.2557708024978638,
"logps/chosen": -2364.27587890625,
"logps/rejected": -2370.61865234375,
"loss": 0.7144,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6515111923217773,
"rewards/margins": 0.11765609681606293,
"rewards/rejected": 0.5338551998138428,
"step": 320
},
{
"epoch": 0.17,
"learning_rate": 9.839534808140065e-07,
"logits/chosen": -1.2571797370910645,
"logits/rejected": -1.2486730813980103,
"logps/chosen": -2348.859130859375,
"logps/rejected": -1969.1402587890625,
"loss": 0.7502,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.6785815954208374,
"rewards/margins": 0.024524565786123276,
"rewards/rejected": 0.6540570259094238,
"step": 330
},
{
"epoch": 0.18,
"learning_rate": 9.815763683794431e-07,
"logits/chosen": -1.2969481945037842,
"logits/rejected": -1.2044627666473389,
"logps/chosen": -2964.642578125,
"logps/rejected": -2117.79150390625,
"loss": 0.689,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.3430386185646057,
"rewards/margins": 0.15022581815719604,
"rewards/rejected": 0.19281277060508728,
"step": 340
},
{
"epoch": 0.18,
"learning_rate": 9.790384133498377e-07,
"logits/chosen": -1.3875682353973389,
"logits/rejected": -1.3528212308883667,
"logps/chosen": -2609.759765625,
"logps/rejected": -2217.990234375,
"loss": 0.65,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.2748018503189087,
"rewards/margins": 0.3255355954170227,
"rewards/rejected": 0.9492664337158203,
"step": 350
},
{
"epoch": 0.19,
"learning_rate": 9.763404633815536e-07,
"logits/chosen": -1.4445443153381348,
"logits/rejected": -1.409148931503296,
"logps/chosen": -2325.73095703125,
"logps/rejected": -2067.62646484375,
"loss": 0.6703,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9282833337783813,
"rewards/margins": 0.2425541877746582,
"rewards/rejected": 0.6857292056083679,
"step": 360
},
{
"epoch": 0.19,
"learning_rate": 9.73483419567964e-07,
"logits/chosen": -1.5681045055389404,
"logits/rejected": -1.47848379611969,
"logps/chosen": -2851.124267578125,
"logps/rejected": -2266.677734375,
"loss": 0.6686,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.8608830571174622,
"rewards/margins": 0.2859550416469574,
"rewards/rejected": 0.5749280452728271,
"step": 370
},
{
"epoch": 0.2,
"learning_rate": 9.70468236138494e-07,
"logits/chosen": -1.5734655857086182,
"logits/rejected": -1.4612947702407837,
"logps/chosen": -2619.15576171875,
"logps/rejected": -1996.1292724609375,
"loss": 0.6587,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.7406389117240906,
"rewards/margins": 0.2604018747806549,
"rewards/rejected": 0.4802371561527252,
"step": 380
},
{
"epoch": 0.2,
"learning_rate": 9.672959201399155e-07,
"logits/chosen": -1.4863954782485962,
"logits/rejected": -1.4341216087341309,
"logps/chosen": -2418.91748046875,
"logps/rejected": -2210.710205078125,
"loss": 0.6831,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9080713987350464,
"rewards/margins": 0.19638116657733917,
"rewards/rejected": 0.7116903066635132,
"step": 390
},
{
"epoch": 0.21,
"learning_rate": 9.639675311000027e-07,
"logits/chosen": -1.478477120399475,
"logits/rejected": -1.4470995664596558,
"logps/chosen": -2378.759521484375,
"logps/rejected": -2213.616455078125,
"loss": 0.689,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.5797199606895447,
"rewards/margins": 0.15609867870807648,
"rewards/rejected": 0.4236213266849518,
"step": 400
},
{
"epoch": 0.21,
"eval_logits/chosen": -1.5029045343399048,
"eval_logits/rejected": -1.4427672624588013,
"eval_logps/chosen": -2591.764892578125,
"eval_logps/rejected": -2180.5830078125,
"eval_loss": 0.6528961658477783,
"eval_rewards/accuracies": 0.628000020980835,
"eval_rewards/chosen": 0.8101032376289368,
"eval_rewards/margins": 0.31076449155807495,
"eval_rewards/rejected": 0.49933871626853943,
"eval_runtime": 300.9467,
"eval_samples_per_second": 6.646,
"eval_steps_per_second": 0.415,
"step": 400
},
{
"epoch": 0.21,
"learning_rate": 9.60484180673657e-07,
"logits/chosen": -1.4771575927734375,
"logits/rejected": -1.449158787727356,
"logps/chosen": -2471.6416015625,
"logps/rejected": -2168.50439453125,
"loss": 0.7235,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.5357500314712524,
"rewards/margins": 0.03546437621116638,
"rewards/rejected": 0.5002856254577637,
"step": 410
},
{
"epoch": 0.22,
"learning_rate": 9.568470322716246e-07,
"logits/chosen": -1.461313247680664,
"logits/rejected": -1.3947060108184814,
"logps/chosen": -2724.66748046875,
"logps/rejected": -2191.56787109375,
"loss": 0.672,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.7562235593795776,
"rewards/margins": 0.328954815864563,
"rewards/rejected": 0.4272686541080475,
"step": 420
},
{
"epoch": 0.23,
"learning_rate": 9.530573006719263e-07,
"logits/chosen": -1.5015565156936646,
"logits/rejected": -1.4776034355163574,
"logps/chosen": -2666.500732421875,
"logps/rejected": -2279.621826171875,
"loss": 0.6588,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.5253168344497681,
"rewards/margins": 0.28119999170303345,
"rewards/rejected": 0.24411681294441223,
"step": 430
},
{
"epoch": 0.23,
"learning_rate": 9.491162516141307e-07,
"logits/chosen": -1.4172331094741821,
"logits/rejected": -1.422502040863037,
"logps/chosen": -2282.531005859375,
"logps/rejected": -2387.561767578125,
"loss": 0.6692,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.860162615776062,
"rewards/margins": 0.07978199422359467,
"rewards/rejected": 0.7803806662559509,
"step": 440
},
{
"epoch": 0.24,
"learning_rate": 9.450252013766092e-07,
"logits/chosen": -1.3361685276031494,
"logits/rejected": -1.2606579065322876,
"logps/chosen": -2627.769775390625,
"logps/rejected": -2308.65380859375,
"loss": 0.6375,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.5678683519363403,
"rewards/margins": 0.21432606875896454,
"rewards/rejected": 0.3535422682762146,
"step": 450
},
{
"epoch": 0.24,
"learning_rate": 9.407855163369078e-07,
"logits/chosen": -1.306783676147461,
"logits/rejected": -1.2825387716293335,
"logps/chosen": -2633.41162109375,
"logps/rejected": -2218.27294921875,
"loss": 0.6678,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.9192908406257629,
"rewards/margins": 0.24978260695934296,
"rewards/rejected": 0.669508159160614,
"step": 460
},
{
"epoch": 0.25,
"learning_rate": 9.3639861251539e-07,
"logits/chosen": -1.2543857097625732,
"logits/rejected": -1.195093035697937,
"logps/chosen": -2341.584228515625,
"logps/rejected": -1947.591796875,
"loss": 0.6284,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.5097464919090271,
"rewards/margins": 0.36333781480789185,
"rewards/rejected": 0.14640869200229645,
"step": 470
},
{
"epoch": 0.25,
"learning_rate": 9.318659551022955e-07,
"logits/chosen": -1.3397210836410522,
"logits/rejected": -1.281937837600708,
"logps/chosen": -2238.00732421875,
"logps/rejected": -1736.181640625,
"loss": 0.6609,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.786676287651062,
"rewards/margins": 0.2685468792915344,
"rewards/rejected": 0.5181293487548828,
"step": 480
},
{
"epoch": 0.26,
"learning_rate": 9.271890579683804e-07,
"logits/chosen": -1.4926373958587646,
"logits/rejected": -1.4876558780670166,
"logps/chosen": -2662.705322265625,
"logps/rejected": -2349.420166015625,
"loss": 0.7143,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.7370970845222473,
"rewards/margins": 0.34762194752693176,
"rewards/rejected": 0.38947516679763794,
"step": 490
},
{
"epoch": 0.26,
"learning_rate": 9.223694831592952e-07,
"logits/chosen": -1.5373231172561646,
"logits/rejected": -1.4849967956542969,
"logps/chosen": -2402.5634765625,
"logps/rejected": -2132.68701171875,
"loss": 0.6682,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.7446134090423584,
"rewards/margins": 0.32214781641960144,
"rewards/rejected": 0.42246556282043457,
"step": 500
},
{
"epoch": 0.26,
"eval_logits/chosen": -1.5664644241333008,
"eval_logits/rejected": -1.5148077011108398,
"eval_logps/chosen": -2576.100830078125,
"eval_logps/rejected": -2169.265380859375,
"eval_loss": 0.6673685312271118,
"eval_rewards/accuracies": 0.6420000195503235,
"eval_rewards/chosen": 0.966746985912323,
"eval_rewards/margins": 0.3542312681674957,
"eval_rewards/rejected": 0.6125158071517944,
"eval_runtime": 302.6642,
"eval_samples_per_second": 6.608,
"eval_steps_per_second": 0.413,
"step": 500
},
{
"epoch": 0.27,
"learning_rate": 9.174088403738755e-07,
"logits/chosen": -1.5560601949691772,
"logits/rejected": -1.5580723285675049,
"logps/chosen": -2103.93310546875,
"logps/rejected": -2181.848876953125,
"loss": 0.6493,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.6312128305435181,
"rewards/margins": 0.3005504906177521,
"rewards/rejected": 0.330662339925766,
"step": 510
},
{
"epoch": 0.27,
"learning_rate": 9.123087864265147e-07,
"logits/chosen": -1.543971061706543,
"logits/rejected": -1.5191954374313354,
"logps/chosen": -2323.391357421875,
"logps/rejected": -2031.1025390625,
"loss": 0.6736,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.41579127311706543,
"rewards/margins": 0.1768406629562378,
"rewards/rejected": 0.23895065486431122,
"step": 520
},
{
"epoch": 0.28,
"learning_rate": 9.070710246938016e-07,
"logits/chosen": -1.5579715967178345,
"logits/rejected": -1.5655916929244995,
"logps/chosen": -2268.76318359375,
"logps/rejected": -2190.51318359375,
"loss": 0.6519,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.6393724083900452,
"rewards/margins": 0.3283298909664154,
"rewards/rejected": 0.3110424876213074,
"step": 530
},
{
"epoch": 0.28,
"learning_rate": 9.016973045456073e-07,
"logits/chosen": -1.6396840810775757,
"logits/rejected": -1.6098705530166626,
"logps/chosen": -2668.9462890625,
"logps/rejected": -2160.803955078125,
"loss": 0.669,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.8928348422050476,
"rewards/margins": 0.47784289717674255,
"rewards/rejected": 0.41499200463294983,
"step": 540
},
{
"epoch": 0.29,
"learning_rate": 8.961894207608087e-07,
"logits/chosen": -1.6586135625839233,
"logits/rejected": -1.6290054321289062,
"logps/chosen": -2212.68994140625,
"logps/rejected": -2054.17626953125,
"loss": 0.6597,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.49731844663619995,
"rewards/margins": 0.16844932734966278,
"rewards/rejected": 0.32886913418769836,
"step": 550
},
{
"epoch": 0.29,
"learning_rate": 8.905492129278477e-07,
"logits/chosen": -1.6478192806243896,
"logits/rejected": -1.5791934728622437,
"logps/chosen": -2915.1103515625,
"logps/rejected": -2492.820068359375,
"loss": 0.6553,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6440809965133667,
"rewards/margins": 0.29220613837242126,
"rewards/rejected": 0.35187482833862305,
"step": 560
},
{
"epoch": 0.3,
"learning_rate": 8.847785648303233e-07,
"logits/chosen": -1.648879051208496,
"logits/rejected": -1.5808627605438232,
"logps/chosen": -2345.06787109375,
"logps/rejected": -1874.7965087890625,
"loss": 0.6562,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.5588332414627075,
"rewards/margins": 0.2794465720653534,
"rewards/rejected": 0.2793866991996765,
"step": 570
},
{
"epoch": 0.3,
"learning_rate": 8.788794038178232e-07,
"logits/chosen": -1.646813154220581,
"logits/rejected": -1.5900137424468994,
"logps/chosen": -2427.92822265625,
"logps/rejected": -1974.943359375,
"loss": 0.6286,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 1.0534051656723022,
"rewards/margins": 0.35476142168045044,
"rewards/rejected": 0.6986437439918518,
"step": 580
},
{
"epoch": 0.31,
"learning_rate": 8.728537001622049e-07,
"logits/chosen": -1.6359336376190186,
"logits/rejected": -1.5665844678878784,
"logps/chosen": -2346.7265625,
"logps/rejected": -1916.209716796875,
"loss": 0.6555,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.7451139092445374,
"rewards/margins": 0.24112704396247864,
"rewards/rejected": 0.5039868354797363,
"step": 590
},
{
"epoch": 0.31,
"learning_rate": 8.667034663995408e-07,
"logits/chosen": -1.6207376718521118,
"logits/rejected": -1.5811537504196167,
"logps/chosen": -2380.62939453125,
"logps/rejected": -2060.835205078125,
"loss": 0.6309,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.8570950627326965,
"rewards/margins": 0.32400840520858765,
"rewards/rejected": 0.5330866575241089,
"step": 600
},
{
"epoch": 0.31,
"eval_logits/chosen": -1.6448516845703125,
"eval_logits/rejected": -1.588512897491455,
"eval_logps/chosen": -2589.297119140625,
"eval_logps/rejected": -2183.78515625,
"eval_loss": 0.6445065140724182,
"eval_rewards/accuracies": 0.6579999923706055,
"eval_rewards/chosen": 0.834783673286438,
"eval_rewards/margins": 0.3674681782722473,
"eval_rewards/rejected": 0.4673156440258026,
"eval_runtime": 306.3454,
"eval_samples_per_second": 6.529,
"eval_steps_per_second": 0.408,
"step": 600
},
{
"epoch": 0.32,
"learning_rate": 8.604307566579472e-07,
"logits/chosen": -1.5816807746887207,
"logits/rejected": -1.6054216623306274,
"logps/chosen": -2258.828857421875,
"logps/rejected": -2473.440185546875,
"loss": 0.6656,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.3196907639503479,
"rewards/margins": 0.4189208149909973,
"rewards/rejected": -0.09922999143600464,
"step": 610
},
{
"epoch": 0.32,
"learning_rate": 8.540376659715225e-07,
"logits/chosen": -1.6599409580230713,
"logits/rejected": -1.5913432836532593,
"logps/chosen": -2412.462890625,
"logps/rejected": -2083.058837890625,
"loss": 0.6291,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.8317147493362427,
"rewards/margins": 0.3438655138015747,
"rewards/rejected": 0.48784923553466797,
"step": 620
},
{
"epoch": 0.33,
"learning_rate": 8.47526329580623e-07,
"logits/chosen": -1.535036325454712,
"logits/rejected": -1.5678516626358032,
"logps/chosen": -2142.04931640625,
"logps/rejected": -2099.13720703125,
"loss": 0.633,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9528681635856628,
"rewards/margins": 0.24734528362751007,
"rewards/rejected": 0.7055227756500244,
"step": 630
},
{
"epoch": 0.33,
"learning_rate": 8.408989222187096e-07,
"logits/chosen": -1.5995115041732788,
"logits/rejected": -1.5139375925064087,
"logps/chosen": -3065.62451171875,
"logps/rejected": -2365.10107421875,
"loss": 0.6969,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9795970916748047,
"rewards/margins": 0.47979211807250977,
"rewards/rejected": 0.49980488419532776,
"step": 640
},
{
"epoch": 0.34,
"learning_rate": 8.341576573860047e-07,
"logits/chosen": -1.5332003831863403,
"logits/rejected": -1.4982550144195557,
"logps/chosen": -2392.21728515625,
"logps/rejected": -1984.2425537109375,
"loss": 0.694,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.8843706846237183,
"rewards/margins": 0.32931455969810486,
"rewards/rejected": 0.5550561547279358,
"step": 650
},
{
"epoch": 0.35,
"learning_rate": 8.27304786610201e-07,
"logits/chosen": -1.5626050233840942,
"logits/rejected": -1.5275344848632812,
"logps/chosen": -2318.65625,
"logps/rejected": -1863.1956787109375,
"loss": 0.6323,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.8664724230766296,
"rewards/margins": 0.5049992799758911,
"rewards/rejected": 0.3614731729030609,
"step": 660
},
{
"epoch": 0.35,
"learning_rate": 8.203425986944696e-07,
"logits/chosen": -1.5559314489364624,
"logits/rejected": -1.5068961381912231,
"logps/chosen": -2837.03369140625,
"logps/rejected": -2028.3587646484375,
"loss": 0.6661,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.7081668972969055,
"rewards/margins": 0.37415772676467896,
"rewards/rejected": 0.3340091109275818,
"step": 670
},
{
"epoch": 0.36,
"learning_rate": 8.132734189530182e-07,
"logits/chosen": -1.569585919380188,
"logits/rejected": -1.5583667755126953,
"logps/chosen": -2081.708984375,
"logps/rejected": -2073.14892578125,
"loss": 0.7058,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.22897915542125702,
"rewards/margins": 0.06144998222589493,
"rewards/rejected": 0.1675291508436203,
"step": 680
},
{
"epoch": 0.36,
"learning_rate": 8.060996084344553e-07,
"logits/chosen": -1.6668421030044556,
"logits/rejected": -1.6300331354141235,
"logps/chosen": -2808.94140625,
"logps/rejected": -2424.194580078125,
"loss": 0.6651,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.9484899640083313,
"rewards/margins": 0.38452741503715515,
"rewards/rejected": 0.5639625787734985,
"step": 690
},
{
"epoch": 0.37,
"learning_rate": 7.98823563133219e-07,
"logits/chosen": -1.6251919269561768,
"logits/rejected": -1.6152589321136475,
"logps/chosen": -2532.464111328125,
"logps/rejected": -2264.97802734375,
"loss": 0.6467,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.8040878176689148,
"rewards/margins": 0.3939053416252136,
"rewards/rejected": 0.4101824164390564,
"step": 700
},
{
"epoch": 0.37,
"eval_logits/chosen": -1.7105224132537842,
"eval_logits/rejected": -1.6561530828475952,
"eval_logps/chosen": -2584.251220703125,
"eval_logps/rejected": -2175.965087890625,
"eval_loss": 0.6481595635414124,
"eval_rewards/accuracies": 0.6240000128746033,
"eval_rewards/chosen": 0.8852397799491882,
"eval_rewards/margins": 0.3397220969200134,
"eval_rewards/rejected": 0.54551762342453,
"eval_runtime": 303.8379,
"eval_samples_per_second": 6.582,
"eval_steps_per_second": 0.411,
"step": 700
},
{
"epoch": 0.37,
"learning_rate": 7.914477131893342e-07,
"logits/chosen": -1.71377432346344,
"logits/rejected": -1.708833932876587,
"logps/chosen": -2544.854248046875,
"logps/rejected": -2375.308349609375,
"loss": 0.6722,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6972166299819946,
"rewards/margins": 0.06426803767681122,
"rewards/rejected": 0.6329485774040222,
"step": 710
},
{
"epoch": 0.38,
"learning_rate": 7.839745220767661e-07,
"logits/chosen": -1.694154143333435,
"logits/rejected": -1.669390320777893,
"logps/chosen": -2534.442626953125,
"logps/rejected": -2229.87158203125,
"loss": 0.6723,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.48106852173805237,
"rewards/margins": 0.24986381828784943,
"rewards/rejected": 0.23120474815368652,
"step": 720
},
{
"epoch": 0.38,
"learning_rate": 7.764064857806389e-07,
"logits/chosen": -1.6268012523651123,
"logits/rejected": -1.575046420097351,
"logps/chosen": -2722.456298828125,
"logps/rejected": -2351.8857421875,
"loss": 0.643,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.7691014409065247,
"rewards/margins": 0.34405142068862915,
"rewards/rejected": 0.42504996061325073,
"step": 730
},
{
"epoch": 0.39,
"learning_rate": 7.68746131963598e-07,
"logits/chosen": -1.6478900909423828,
"logits/rejected": -1.597701072692871,
"logps/chosen": -2222.41259765625,
"logps/rejected": -1990.4273681640625,
"loss": 0.6243,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.6448198556900024,
"rewards/margins": 0.2648247182369232,
"rewards/rejected": 0.37999510765075684,
"step": 740
},
{
"epoch": 0.39,
"learning_rate": 7.609960191215909e-07,
"logits/chosen": -1.6781095266342163,
"logits/rejected": -1.6269840002059937,
"logps/chosen": -2453.95068359375,
"logps/rejected": -2161.110595703125,
"loss": 0.6632,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.6921306252479553,
"rewards/margins": 0.11647888273000717,
"rewards/rejected": 0.5756517648696899,
"step": 750
},
{
"epoch": 0.4,
"learning_rate": 7.531587357293505e-07,
"logits/chosen": -1.6048580408096313,
"logits/rejected": -1.6003602743148804,
"logps/chosen": -2562.139404296875,
"logps/rejected": -2293.66943359375,
"loss": 0.6594,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.7623199820518494,
"rewards/margins": 0.2832568287849426,
"rewards/rejected": 0.4790631830692291,
"step": 760
},
{
"epoch": 0.4,
"learning_rate": 7.452368993758645e-07,
"logits/chosen": -1.585092544555664,
"logits/rejected": -1.557943344116211,
"logps/chosen": -2426.169677734375,
"logps/rejected": -2058.61083984375,
"loss": 0.6519,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.553870677947998,
"rewards/margins": 0.39466503262519836,
"rewards/rejected": 0.15920567512512207,
"step": 770
},
{
"epoch": 0.41,
"learning_rate": 7.372331558901237e-07,
"logits/chosen": -1.5951181650161743,
"logits/rejected": -1.55776846408844,
"logps/chosen": -2530.603515625,
"logps/rejected": -2058.31494140625,
"loss": 0.663,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.5011290311813354,
"rewards/margins": 0.12420739978551865,
"rewards/rejected": 0.3769216239452362,
"step": 780
},
{
"epoch": 0.41,
"learning_rate": 7.291501784574355e-07,
"logits/chosen": -1.7254797220230103,
"logits/rejected": -1.6313526630401611,
"logps/chosen": -2754.68408203125,
"logps/rejected": -2185.399169921875,
"loss": 0.6073,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6229848265647888,
"rewards/margins": 0.35090917348861694,
"rewards/rejected": 0.27207568287849426,
"step": 790
},
{
"epoch": 0.42,
"learning_rate": 7.209906667266017e-07,
"logits/chosen": -1.7093772888183594,
"logits/rejected": -1.6865718364715576,
"logps/chosen": -2462.615478515625,
"logps/rejected": -2213.93798828125,
"loss": 0.6215,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.1175382137298584,
"rewards/margins": 0.40151238441467285,
"rewards/rejected": 0.7160258293151855,
"step": 800
},
{
"epoch": 0.42,
"eval_logits/chosen": -1.7084823846817017,
"eval_logits/rejected": -1.6541036367416382,
"eval_logps/chosen": -2563.754638671875,
"eval_logps/rejected": -2162.267822265625,
"eval_loss": 0.6452978253364563,
"eval_rewards/accuracies": 0.6380000114440918,
"eval_rewards/chosen": 1.0902061462402344,
"eval_rewards/margins": 0.4077164828777313,
"eval_rewards/rejected": 0.6824895739555359,
"eval_runtime": 301.7419,
"eval_samples_per_second": 6.628,
"eval_steps_per_second": 0.414,
"step": 800
},
{
"epoch": 0.42,
"learning_rate": 7.12757345908258e-07,
"logits/chosen": -1.7412763833999634,
"logits/rejected": -1.6791282892227173,
"logps/chosen": -2606.15283203125,
"logps/rejected": -1956.8831787109375,
"loss": 0.6358,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.9316846132278442,
"rewards/margins": 0.45442262291908264,
"rewards/rejected": 0.47726184129714966,
"step": 810
},
{
"epoch": 0.43,
"learning_rate": 7.044529658646761e-07,
"logits/chosen": -1.710146188735962,
"logits/rejected": -1.7056090831756592,
"logps/chosen": -2651.176513671875,
"logps/rejected": -2550.99755859375,
"loss": 0.6601,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6641424298286438,
"rewards/margins": 0.2352844774723053,
"rewards/rejected": 0.4288579821586609,
"step": 820
},
{
"epoch": 0.43,
"learning_rate": 6.960803001913314e-07,
"logits/chosen": -1.6102991104125977,
"logits/rejected": -1.5880324840545654,
"logps/chosen": -1818.771484375,
"logps/rejected": -1763.439208984375,
"loss": 0.6175,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.4126269817352295,
"rewards/margins": 0.2377271205186844,
"rewards/rejected": 0.1748998463153839,
"step": 830
},
{
"epoch": 0.44,
"learning_rate": 6.876421452905448e-07,
"logits/chosen": -1.6048507690429688,
"logits/rejected": -1.5550066232681274,
"logps/chosen": -2419.88818359375,
"logps/rejected": -1979.8333740234375,
"loss": 0.672,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.1539905071258545,
"rewards/margins": 0.44714298844337463,
"rewards/rejected": 0.7068475484848022,
"step": 840
},
{
"epoch": 0.44,
"learning_rate": 6.791413194375076e-07,
"logits/chosen": -1.5756229162216187,
"logits/rejected": -1.5317662954330444,
"logps/chosen": -2326.3671875,
"logps/rejected": -2082.76123046875,
"loss": 0.6358,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.8061001896858215,
"rewards/margins": 0.20508570969104767,
"rewards/rejected": 0.6010144948959351,
"step": 850
},
{
"epoch": 0.45,
"learning_rate": 6.705806618389997e-07,
"logits/chosen": -1.6245572566986084,
"logits/rejected": -1.6081863641738892,
"logps/chosen": -2542.473876953125,
"logps/rejected": -2442.247314453125,
"loss": 0.6751,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.8134121894836426,
"rewards/margins": 0.18775935471057892,
"rewards/rejected": 0.6256529092788696,
"step": 860
},
{
"epoch": 0.46,
"learning_rate": 6.619630316851182e-07,
"logits/chosen": -1.6937329769134521,
"logits/rejected": -1.6594982147216797,
"logps/chosen": -2513.98046875,
"logps/rejected": -2264.63623046875,
"loss": 0.6902,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.6390259861946106,
"rewards/margins": 0.21240201592445374,
"rewards/rejected": 0.4266239106655121,
"step": 870
},
{
"epoch": 0.46,
"learning_rate": 6.532913071943307e-07,
"logits/chosen": -1.6279165744781494,
"logits/rejected": -1.5716134309768677,
"logps/chosen": -2358.2890625,
"logps/rejected": -2005.8092041015625,
"loss": 0.6588,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 1.1509922742843628,
"rewards/margins": 0.4300170838832855,
"rewards/rejected": 0.7209752798080444,
"step": 880
},
{
"epoch": 0.47,
"learning_rate": 6.445683846521738e-07,
"logits/chosen": -1.458832025527954,
"logits/rejected": -1.3705499172210693,
"logps/chosen": -2031.3890380859375,
"logps/rejected": -1786.692626953125,
"loss": 0.6727,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.4857109487056732,
"rewards/margins": 0.11415307223796844,
"rewards/rejected": 0.3715578615665436,
"step": 890
},
{
"epoch": 0.47,
"learning_rate": 6.357971774439177e-07,
"logits/chosen": -1.446877360343933,
"logits/rejected": -1.4010428190231323,
"logps/chosen": -2083.528564453125,
"logps/rejected": -2091.34228515625,
"loss": 0.6674,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.3722456991672516,
"rewards/margins": 0.18528583645820618,
"rewards/rejected": 0.18695983290672302,
"step": 900
},
{
"epoch": 0.47,
"eval_logits/chosen": -1.5652438402175903,
"eval_logits/rejected": -1.5145412683486938,
"eval_logps/chosen": -2594.7568359375,
"eval_logps/rejected": -2185.613525390625,
"eval_loss": 0.6415941119194031,
"eval_rewards/accuracies": 0.6439999938011169,
"eval_rewards/chosen": 0.780185878276825,
"eval_rewards/margins": 0.33115366101264954,
"eval_rewards/rejected": 0.44903212785720825,
"eval_runtime": 290.6591,
"eval_samples_per_second": 6.881,
"eval_steps_per_second": 0.43,
"step": 900
},
{
"epoch": 0.48,
"learning_rate": 6.269806150815187e-07,
"logits/chosen": -1.580451250076294,
"logits/rejected": -1.5398848056793213,
"logps/chosen": -2756.412109375,
"logps/rejected": -2110.937255859375,
"loss": 0.5836,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.1578181982040405,
"rewards/margins": 0.4512609839439392,
"rewards/rejected": 0.7065572738647461,
"step": 910
},
{
"epoch": 0.48,
"learning_rate": 6.181216422251862e-07,
"logits/chosen": -1.6002380847930908,
"logits/rejected": -1.5482442378997803,
"logps/chosen": -2669.18408203125,
"logps/rejected": -2383.2392578125,
"loss": 0.6651,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.3831857442855835,
"rewards/margins": 0.3688461184501648,
"rewards/rejected": 1.014339566230774,
"step": 920
},
{
"epoch": 0.49,
"learning_rate": 6.092232176998897e-07,
"logits/chosen": -1.5446488857269287,
"logits/rejected": -1.5036358833312988,
"logps/chosen": -2283.471923828125,
"logps/rejected": -2156.527587890625,
"loss": 0.6389,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.8198372721672058,
"rewards/margins": 0.23020341992378235,
"rewards/rejected": 0.5896340012550354,
"step": 930
},
{
"epoch": 0.49,
"learning_rate": 6.002883135071362e-07,
"logits/chosen": -1.4674952030181885,
"logits/rejected": -1.3860971927642822,
"logps/chosen": -2495.39794921875,
"logps/rejected": -2081.33544921875,
"loss": 0.6479,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.46686476469039917,
"rewards/margins": 0.33061760663986206,
"rewards/rejected": 0.1362471729516983,
"step": 940
},
{
"epoch": 0.5,
"learning_rate": 5.913199138323448e-07,
"logits/chosen": -1.5902820825576782,
"logits/rejected": -1.5817844867706299,
"logps/chosen": -2237.93603515625,
"logps/rejected": -2165.838623046875,
"loss": 0.699,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.5283821821212769,
"rewards/margins": 0.3398032486438751,
"rewards/rejected": 0.18857893347740173,
"step": 950
},
{
"epoch": 0.5,
"learning_rate": 5.82321014048154e-07,
"logits/chosen": -1.5519543886184692,
"logits/rejected": -1.5687713623046875,
"logps/chosen": -2170.23583984375,
"logps/rejected": -2091.04248046875,
"loss": 0.6617,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.2492622435092926,
"rewards/margins": 0.2455929070711136,
"rewards/rejected": 0.00366935133934021,
"step": 960
},
{
"epoch": 0.51,
"learning_rate": 5.732946197139906e-07,
"logits/chosen": -1.5598348379135132,
"logits/rejected": -1.5337880849838257,
"logps/chosen": -2266.143310546875,
"logps/rejected": -2009.6168212890625,
"loss": 0.6497,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.789315402507782,
"rewards/margins": 0.16782251000404358,
"rewards/rejected": 0.6214929223060608,
"step": 970
},
{
"epoch": 0.51,
"learning_rate": 5.642437455722381e-07,
"logits/chosen": -1.5074641704559326,
"logits/rejected": -1.4456851482391357,
"logps/chosen": -2503.286865234375,
"logps/rejected": -2021.8304443359375,
"loss": 0.6258,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 1.141226053237915,
"rewards/margins": 0.3969436287879944,
"rewards/rejected": 0.7442826628684998,
"step": 980
},
{
"epoch": 0.52,
"learning_rate": 5.551714145413368e-07,
"logits/chosen": -1.468330979347229,
"logits/rejected": -1.3824667930603027,
"logps/chosen": -2575.858154296875,
"logps/rejected": -1971.8447265625,
"loss": 0.647,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.726246178150177,
"rewards/margins": 0.32752370834350586,
"rewards/rejected": 0.39872246980667114,
"step": 990
},
{
"epoch": 0.52,
"learning_rate": 5.460806567061533e-07,
"logits/chosen": -1.5170243978500366,
"logits/rejected": -1.4751875400543213,
"logps/chosen": -2752.580322265625,
"logps/rejected": -2291.04833984375,
"loss": 0.644,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.8746698498725891,
"rewards/margins": 0.38163238763809204,
"rewards/rejected": 0.4930374026298523,
"step": 1000
},
{
"epoch": 0.52,
"eval_logits/chosen": -1.5046511888504028,
"eval_logits/rejected": -1.4505603313446045,
"eval_logps/chosen": -2602.00390625,
"eval_logps/rejected": -2193.728515625,
"eval_loss": 0.6499609351158142,
"eval_rewards/accuracies": 0.6399999856948853,
"eval_rewards/chosen": 0.7077119946479797,
"eval_rewards/margins": 0.3398290276527405,
"eval_rewards/rejected": 0.36788299679756165,
"eval_runtime": 299.5822,
"eval_samples_per_second": 6.676,
"eval_steps_per_second": 0.417,
"step": 1000
},
{
"epoch": 0.53,
"learning_rate": 5.369745083059577e-07,
"logits/chosen": -1.490482211112976,
"logits/rejected": -1.424222707748413,
"logps/chosen": -2471.395263671875,
"logps/rejected": -1937.520751953125,
"loss": 0.6353,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.47275876998901367,
"rewards/margins": 0.2599312365055084,
"rewards/rejected": 0.21282756328582764,
"step": 1010
},
{
"epoch": 0.53,
"learning_rate": 5.278560107203437e-07,
"logits/chosen": -1.459146499633789,
"logits/rejected": -1.4577230215072632,
"logps/chosen": -2559.42724609375,
"logps/rejected": -2042.339599609375,
"loss": 0.6634,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.7685127258300781,
"rewards/margins": 0.3085792660713196,
"rewards/rejected": 0.45993345975875854,
"step": 1020
},
{
"epoch": 0.54,
"learning_rate": 5.18728209453432e-07,
"logits/chosen": -1.5719316005706787,
"logits/rejected": -1.5082643032073975,
"logps/chosen": -2554.538818359375,
"logps/rejected": -2257.06201171875,
"loss": 0.6673,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.9037872552871704,
"rewards/margins": 0.3130941092967987,
"rewards/rejected": 0.5906931161880493,
"step": 1030
},
{
"epoch": 0.54,
"learning_rate": 5.095941531166982e-07,
"logits/chosen": -1.5710715055465698,
"logits/rejected": -1.5428146123886108,
"logps/chosen": -2587.89111328125,
"logps/rejected": -2198.08056640625,
"loss": 0.6266,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.7090158462524414,
"rewards/margins": 0.3786148130893707,
"rewards/rejected": 0.33040106296539307,
"step": 1040
},
{
"epoch": 0.55,
"learning_rate": 5.004568924107598e-07,
"logits/chosen": -1.6318562030792236,
"logits/rejected": -1.5859413146972656,
"logps/chosen": -2931.807373046875,
"logps/rejected": -2507.31298828125,
"loss": 0.6294,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.7447463274002075,
"rewards/margins": 0.2536779046058655,
"rewards/rejected": 0.49106842279434204,
"step": 1050
},
{
"epoch": 0.55,
"learning_rate": 4.913194791064675e-07,
"logits/chosen": -1.639493703842163,
"logits/rejected": -1.5823523998260498,
"logps/chosen": -2601.8447265625,
"logps/rejected": -2357.34814453125,
"loss": 0.6441,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.7931571006774902,
"rewards/margins": 0.5028332471847534,
"rewards/rejected": 0.2903238832950592,
"step": 1060
},
{
"epoch": 0.56,
"learning_rate": 4.82184965025639e-07,
"logits/chosen": -1.5899850130081177,
"logits/rejected": -1.5473779439926147,
"logps/chosen": -2727.800537109375,
"logps/rejected": -2362.034423828125,
"loss": 0.6419,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.9983813166618347,
"rewards/margins": 0.39955899119377136,
"rewards/rejected": 0.5988222360610962,
"step": 1070
},
{
"epoch": 0.57,
"learning_rate": 4.73056401021775e-07,
"logits/chosen": -1.5197970867156982,
"logits/rejected": -1.4553916454315186,
"logps/chosen": -2388.419921875,
"logps/rejected": -2081.69775390625,
"loss": 0.6171,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.601919949054718,
"rewards/margins": 0.227634459733963,
"rewards/rejected": 0.374285489320755,
"step": 1080
},
{
"epoch": 0.57,
"learning_rate": 4.639368359610982e-07,
"logits/chosen": -1.4987363815307617,
"logits/rejected": -1.4325814247131348,
"logps/chosen": -2522.322509765625,
"logps/rejected": -2121.84912109375,
"loss": 0.6571,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.5968645215034485,
"rewards/margins": 0.3043002486228943,
"rewards/rejected": 0.2925642132759094,
"step": 1090
},
{
"epoch": 0.58,
"learning_rate": 4.5482931570425803e-07,
"logits/chosen": -1.5703797340393066,
"logits/rejected": -1.5181505680084229,
"logps/chosen": -2581.994140625,
"logps/rejected": -2270.20166015625,
"loss": 0.6539,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6347614526748657,
"rewards/margins": 0.321241557598114,
"rewards/rejected": 0.31352001428604126,
"step": 1100
},
{
"epoch": 0.58,
"eval_logits/chosen": -1.5226702690124512,
"eval_logits/rejected": -1.4696787595748901,
"eval_logps/chosen": -2588.0068359375,
"eval_logps/rejected": -2181.99365234375,
"eval_loss": 0.6389243006706238,
"eval_rewards/accuracies": 0.6499999761581421,
"eval_rewards/chosen": 0.8476871848106384,
"eval_rewards/margins": 0.362454891204834,
"eval_rewards/rejected": 0.4852323532104492,
"eval_runtime": 301.2203,
"eval_samples_per_second": 6.64,
"eval_steps_per_second": 0.415,
"step": 1100
},
{
"epoch": 0.58,
"learning_rate": 4.4573688208903686e-07,
"logits/chosen": -1.4915900230407715,
"logits/rejected": -1.3990033864974976,
"logps/chosen": -2177.49169921875,
"logps/rejected": -1711.8460693359375,
"loss": 0.6447,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.6701298356056213,
"rewards/margins": 0.3176502585411072,
"rewards/rejected": 0.3524795174598694,
"step": 1110
},
{
"epoch": 0.59,
"learning_rate": 4.366625719144016e-07,
"logits/chosen": -1.5326006412506104,
"logits/rejected": -1.4640724658966064,
"logps/chosen": -2241.04052734375,
"logps/rejected": -1938.517822265625,
"loss": 0.6094,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.9225455522537231,
"rewards/margins": 0.3196006417274475,
"rewards/rejected": 0.6029448509216309,
"step": 1120
},
{
"epoch": 0.59,
"learning_rate": 4.276094159262368e-07,
"logits/chosen": -1.459031343460083,
"logits/rejected": -1.4118678569793701,
"logps/chosen": -2329.41943359375,
"logps/rejected": -2065.614501953125,
"loss": 0.6114,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.0603306293487549,
"rewards/margins": 0.38362884521484375,
"rewards/rejected": 0.6767016649246216,
"step": 1130
},
{
"epoch": 0.6,
"learning_rate": 4.1858043780510135e-07,
"logits/chosen": -1.4943807125091553,
"logits/rejected": -1.4440956115722656,
"logps/chosen": -2648.4462890625,
"logps/rejected": -2317.19970703125,
"loss": 0.6521,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.9549520611763,
"rewards/margins": 0.1597224771976471,
"rewards/rejected": 0.7952295541763306,
"step": 1140
},
{
"epoch": 0.6,
"learning_rate": 4.0957865315634204e-07,
"logits/chosen": -1.4685379266738892,
"logits/rejected": -1.4013986587524414,
"logps/chosen": -2750.71142578125,
"logps/rejected": -2100.20068359375,
"loss": 0.6027,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.7830246686935425,
"rewards/margins": 0.5725045204162598,
"rewards/rejected": 0.2105201780796051,
"step": 1150
},
{
"epoch": 0.61,
"learning_rate": 4.006070685029075e-07,
"logits/chosen": -1.484535813331604,
"logits/rejected": -1.4587595462799072,
"logps/chosen": -2228.81787109375,
"logps/rejected": -2157.81298828125,
"loss": 0.6803,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.4148440957069397,
"rewards/margins": 0.10307104885578156,
"rewards/rejected": 0.3117729723453522,
"step": 1160
},
{
"epoch": 0.61,
"learning_rate": 3.916686802811927e-07,
"logits/chosen": -1.3863401412963867,
"logits/rejected": -1.4270175695419312,
"logps/chosen": -2092.947998046875,
"logps/rejected": -2140.6953125,
"loss": 0.624,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.6797593832015991,
"rewards/margins": 0.16529296338558197,
"rewards/rejected": 0.514466404914856,
"step": 1170
},
{
"epoch": 0.62,
"learning_rate": 3.8276647384025467e-07,
"logits/chosen": -1.4469492435455322,
"logits/rejected": -1.3607311248779297,
"logps/chosen": -2557.885009765625,
"logps/rejected": -2165.09033203125,
"loss": 0.6423,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.6221494078636169,
"rewards/margins": 0.2990773320198059,
"rewards/rejected": 0.3230721354484558,
"step": 1180
},
{
"epoch": 0.62,
"learning_rate": 3.7390342244472883e-07,
"logits/chosen": -1.5888515710830688,
"logits/rejected": -1.5609667301177979,
"logps/chosen": -2778.28515625,
"logps/rejected": -2496.6396484375,
"loss": 0.6533,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.9935188293457031,
"rewards/margins": 0.3621361255645752,
"rewards/rejected": 0.6313827037811279,
"step": 1190
},
{
"epoch": 0.63,
"learning_rate": 3.6508248628178446e-07,
"logits/chosen": -1.6396839618682861,
"logits/rejected": -1.5974278450012207,
"logps/chosen": -2493.72216796875,
"logps/rejected": -2359.435791015625,
"loss": 0.7267,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.9704666137695312,
"rewards/margins": 0.3994936943054199,
"rewards/rejected": 0.5709729790687561,
"step": 1200
},
{
"epoch": 0.63,
"eval_logits/chosen": -1.6800066232681274,
"eval_logits/rejected": -1.6292266845703125,
"eval_logps/chosen": -2618.873779296875,
"eval_logps/rejected": -2207.94384765625,
"eval_loss": 0.6421077847480774,
"eval_rewards/accuracies": 0.6620000004768372,
"eval_rewards/chosen": 0.5390151143074036,
"eval_rewards/margins": 0.3132854104042053,
"eval_rewards/rejected": 0.22572976350784302,
"eval_runtime": 304.4335,
"eval_samples_per_second": 6.57,
"eval_steps_per_second": 0.411,
"step": 1200
},
{
"epoch": 0.63,
"learning_rate": 3.563066114724441e-07,
"logits/chosen": -1.6271164417266846,
"logits/rejected": -1.5858738422393799,
"logps/chosen": -2807.364990234375,
"logps/rejected": -2029.6510009765625,
"loss": 0.6347,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6177263855934143,
"rewards/margins": 0.27368754148483276,
"rewards/rejected": 0.3440387547016144,
"step": 1210
},
{
"epoch": 0.64,
"learning_rate": 3.475787290876055e-07,
"logits/chosen": -1.5973155498504639,
"logits/rejected": -1.558475375175476,
"logps/chosen": -2490.0703125,
"logps/rejected": -2087.466064453125,
"loss": 0.6385,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.8565654754638672,
"rewards/margins": 0.4143308699131012,
"rewards/rejected": 0.4422345757484436,
"step": 1220
},
{
"epoch": 0.64,
"learning_rate": 3.389017541690854e-07,
"logits/chosen": -1.5630786418914795,
"logits/rejected": -1.548064947128296,
"logps/chosen": -2276.59619140625,
"logps/rejected": -1839.0726318359375,
"loss": 0.6357,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.7428416609764099,
"rewards/margins": 0.3907639980316162,
"rewards/rejected": 0.35207757353782654,
"step": 1230
},
{
"epoch": 0.65,
"learning_rate": 3.30278584756021e-07,
"logits/chosen": -1.548689365386963,
"logits/rejected": -1.4891592264175415,
"logps/chosen": -2640.1591796875,
"logps/rejected": -2317.181396484375,
"loss": 0.6184,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.910789966583252,
"rewards/margins": 0.37699228525161743,
"rewards/rejected": 0.5337976217269897,
"step": 1240
},
{
"epoch": 0.65,
"learning_rate": 3.2171210091694735e-07,
"logits/chosen": -1.608028769493103,
"logits/rejected": -1.5826674699783325,
"logps/chosen": -2531.904296875,
"logps/rejected": -2342.30419921875,
"loss": 0.6087,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.7222377061843872,
"rewards/margins": 0.4060022830963135,
"rewards/rejected": 0.3162355422973633,
"step": 1250
},
{
"epoch": 0.66,
"learning_rate": 3.132051637878789e-07,
"logits/chosen": -1.5921976566314697,
"logits/rejected": -1.4880411624908447,
"logps/chosen": -2295.463134765625,
"logps/rejected": -1800.047119140625,
"loss": 0.6709,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.8092087507247925,
"rewards/margins": 0.39788728952407837,
"rewards/rejected": 0.4113215506076813,
"step": 1260
},
{
"epoch": 0.66,
"learning_rate": 3.0476061461671155e-07,
"logits/chosen": -1.5929429531097412,
"logits/rejected": -1.560585856437683,
"logps/chosen": -2178.914306640625,
"logps/rejected": -2029.7672119140625,
"loss": 0.6315,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.8661308288574219,
"rewards/margins": 0.3666331171989441,
"rewards/rejected": 0.4994977116584778,
"step": 1270
},
{
"epoch": 0.67,
"learning_rate": 2.9638127381427127e-07,
"logits/chosen": -1.4586659669876099,
"logits/rejected": -1.4546220302581787,
"logps/chosen": -2244.927978515625,
"logps/rejected": -2030.598876953125,
"loss": 0.5909,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.7213354110717773,
"rewards/margins": 0.37115171551704407,
"rewards/rejected": 0.3501836955547333,
"step": 1280
},
{
"epoch": 0.68,
"learning_rate": 2.8806994001231766e-07,
"logits/chosen": -1.462428092956543,
"logits/rejected": -1.4601207971572876,
"logps/chosen": -2553.372314453125,
"logps/rejected": -2366.053955078125,
"loss": 0.6324,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.9080332517623901,
"rewards/margins": 0.3693556487560272,
"rewards/rejected": 0.5386777520179749,
"step": 1290
},
{
"epoch": 0.68,
"learning_rate": 2.7982938912882544e-07,
"logits/chosen": -1.5518906116485596,
"logits/rejected": -1.47800874710083,
"logps/chosen": -2843.82421875,
"logps/rejected": -2309.199951171875,
"loss": 0.5746,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 1.102667212486267,
"rewards/margins": 0.6155067682266235,
"rewards/rejected": 0.48716044425964355,
"step": 1300
},
{
"epoch": 0.68,
"eval_logits/chosen": -1.5460779666900635,
"eval_logits/rejected": -1.4993510246276855,
"eval_logps/chosen": -2582.20947265625,
"eval_logps/rejected": -2181.592041015625,
"eval_loss": 0.6300790905952454,
"eval_rewards/accuracies": 0.6660000085830688,
"eval_rewards/chosen": 0.9056587815284729,
"eval_rewards/margins": 0.41641080379486084,
"eval_rewards/rejected": 0.48924797773361206,
"eval_runtime": 299.2617,
"eval_samples_per_second": 6.683,
"eval_steps_per_second": 0.418,
"step": 1300
},
{
"epoch": 0.69,
"learning_rate": 2.716623734408488e-07,
"logits/chosen": -1.5478688478469849,
"logits/rejected": -1.509421944618225,
"logps/chosen": -2733.4658203125,
"logps/rejected": -2210.788330078125,
"loss": 0.676,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.8321071863174438,
"rewards/margins": 0.17042401432991028,
"rewards/rejected": 0.661683201789856,
"step": 1310
},
{
"epoch": 0.69,
"learning_rate": 2.635716206652843e-07,
"logits/chosen": -1.51913321018219,
"logits/rejected": -1.5177617073059082,
"logps/chosen": -2348.56005859375,
"logps/rejected": -2216.1884765625,
"loss": 0.5911,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6093713045120239,
"rewards/margins": 0.3202818036079407,
"rewards/rejected": 0.28908950090408325,
"step": 1320
},
{
"epoch": 0.7,
"learning_rate": 2.5555983304783515e-07,
"logits/chosen": -1.4471040964126587,
"logits/rejected": -1.4324887990951538,
"logps/chosen": -2042.9017333984375,
"logps/rejected": -1859.039306640625,
"loss": 0.6168,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.49268943071365356,
"rewards/margins": 0.3076168894767761,
"rewards/rejected": 0.18507252633571625,
"step": 1330
},
{
"epoch": 0.7,
"learning_rate": 2.4762968646048356e-07,
"logits/chosen": -1.4452800750732422,
"logits/rejected": -1.3810513019561768,
"logps/chosen": -2950.53271484375,
"logps/rejected": -2301.14892578125,
"loss": 0.6184,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.9146178364753723,
"rewards/margins": 0.5878747701644897,
"rewards/rejected": 0.326742947101593,
"step": 1340
},
{
"epoch": 0.71,
"learning_rate": 2.397838295077703e-07,
"logits/chosen": -1.4514172077178955,
"logits/rejected": -1.430443525314331,
"logps/chosen": -2407.11181640625,
"logps/rejected": -2338.7666015625,
"loss": 0.6172,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.6004685163497925,
"rewards/margins": 0.15282198786735535,
"rewards/rejected": 0.44764652848243713,
"step": 1350
},
{
"epoch": 0.71,
"learning_rate": 2.3202488264218357e-07,
"logits/chosen": -1.4685500860214233,
"logits/rejected": -1.3829035758972168,
"logps/chosen": -2675.003173828125,
"logps/rejected": -2091.812744140625,
"loss": 0.61,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.828966498374939,
"rewards/margins": 0.3263750672340393,
"rewards/rejected": 0.5025915503501892,
"step": 1360
},
{
"epoch": 0.72,
"learning_rate": 2.243554372889479e-07,
"logits/chosen": -1.4399888515472412,
"logits/rejected": -1.3919384479522705,
"logps/chosen": -2576.9365234375,
"logps/rejected": -2010.0601806640625,
"loss": 0.597,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.9715896844863892,
"rewards/margins": 0.460097074508667,
"rewards/rejected": 0.5114925503730774,
"step": 1370
},
{
"epoch": 0.72,
"learning_rate": 2.1677805498050998e-07,
"logits/chosen": -1.3894431591033936,
"logits/rejected": -1.3669414520263672,
"logps/chosen": -1986.740966796875,
"logps/rejected": -1580.8253173828125,
"loss": 0.6499,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.5728658437728882,
"rewards/margins": 0.245010107755661,
"rewards/rejected": 0.32785576581954956,
"step": 1380
},
{
"epoch": 0.73,
"learning_rate": 2.0929526650100716e-07,
"logits/chosen": -1.4540735483169556,
"logits/rejected": -1.3499418497085571,
"logps/chosen": -2753.11669921875,
"logps/rejected": -2095.53466796875,
"loss": 0.6456,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.8334699869155884,
"rewards/margins": 0.7317672967910767,
"rewards/rejected": 0.10170261561870575,
"step": 1390
},
{
"epoch": 0.73,
"learning_rate": 2.0190957104100692e-07,
"logits/chosen": -1.4822982549667358,
"logits/rejected": -1.4137917757034302,
"logps/chosen": -2363.976806640625,
"logps/rejected": -1997.6536865234375,
"loss": 0.6053,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.7185107469558716,
"rewards/margins": 0.401099294424057,
"rewards/rejected": 0.3174114525318146,
"step": 1400
},
{
"epoch": 0.73,
"eval_logits/chosen": -1.4891161918640137,
"eval_logits/rejected": -1.4439697265625,
"eval_logps/chosen": -2585.19140625,
"eval_logps/rejected": -2184.890869140625,
"eval_loss": 0.6342132091522217,
"eval_rewards/accuracies": 0.6660000085830688,
"eval_rewards/chosen": 0.8758403062820435,
"eval_rewards/margins": 0.4195804297924042,
"eval_rewards/rejected": 0.4562598764896393,
"eval_runtime": 299.1063,
"eval_samples_per_second": 6.687,
"eval_steps_per_second": 0.418,
"step": 1400
},
{
"epoch": 0.74,
"learning_rate": 1.9462343536279612e-07,
"logits/chosen": -1.475975751876831,
"logits/rejected": -1.4379873275756836,
"logps/chosen": -2481.176025390625,
"logps/rejected": -2232.84912109375,
"loss": 0.6145,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.9434836506843567,
"rewards/margins": 0.4186176657676697,
"rewards/rejected": 0.524865984916687,
"step": 1410
},
{
"epoch": 0.74,
"learning_rate": 1.874392929765044e-07,
"logits/chosen": -1.4733283519744873,
"logits/rejected": -1.3902546167373657,
"logps/chosen": -2782.106689453125,
"logps/rejected": -2127.639404296875,
"loss": 0.5946,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 1.1233876943588257,
"rewards/margins": 0.5207871198654175,
"rewards/rejected": 0.6026005148887634,
"step": 1420
},
{
"epoch": 0.75,
"learning_rate": 1.8035954332732889e-07,
"logits/chosen": -1.4501025676727295,
"logits/rejected": -1.4023559093475342,
"logps/chosen": -2202.23974609375,
"logps/rejected": -1934.811279296875,
"loss": 0.6426,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.644204318523407,
"rewards/margins": 0.34255489706993103,
"rewards/rejected": 0.30164945125579834,
"step": 1430
},
{
"epoch": 0.75,
"learning_rate": 1.733865509941419e-07,
"logits/chosen": -1.4848979711532593,
"logits/rejected": -1.445502519607544,
"logps/chosen": -2633.660888671875,
"logps/rejected": -2392.826416015625,
"loss": 0.6303,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.8459588885307312,
"rewards/margins": 0.4044179916381836,
"rewards/rejected": 0.4415409564971924,
"step": 1440
},
{
"epoch": 0.76,
"learning_rate": 1.6652264489973861e-07,
"logits/chosen": -1.4826475381851196,
"logits/rejected": -1.426309585571289,
"logps/chosen": -2556.17626953125,
"logps/rejected": -1992.7232666015625,
"loss": 0.6061,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6768069267272949,
"rewards/margins": 0.32333052158355713,
"rewards/rejected": 0.3534763753414154,
"step": 1450
},
{
"epoch": 0.76,
"learning_rate": 1.5977011753299724e-07,
"logits/chosen": -1.5091631412506104,
"logits/rejected": -1.4753676652908325,
"logps/chosen": -2201.044921875,
"logps/rejected": -1877.4302978515625,
"loss": 0.612,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.7366055250167847,
"rewards/margins": 0.2949199378490448,
"rewards/rejected": 0.44168558716773987,
"step": 1460
},
{
"epoch": 0.77,
"learning_rate": 1.5313122418320496e-07,
"logits/chosen": -1.5059702396392822,
"logits/rejected": -1.4471460580825806,
"logps/chosen": -2972.50439453125,
"logps/rejected": -2307.0458984375,
"loss": 0.6042,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 1.0683201551437378,
"rewards/margins": 0.5777542591094971,
"rewards/rejected": 0.49056586623191833,
"step": 1470
},
{
"epoch": 0.77,
"learning_rate": 1.4660818218681125e-07,
"logits/chosen": -1.4828715324401855,
"logits/rejected": -1.4702181816101074,
"logps/chosen": -2593.748046875,
"logps/rejected": -2591.448974609375,
"loss": 0.588,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.9806830286979675,
"rewards/margins": 0.4084799885749817,
"rewards/rejected": 0.5722029805183411,
"step": 1480
},
{
"epoch": 0.78,
"learning_rate": 1.4020317018685362e-07,
"logits/chosen": -1.456514596939087,
"logits/rejected": -1.390700101852417,
"logps/chosen": -2405.19482421875,
"logps/rejected": -1981.04296875,
"loss": 0.6567,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.614848256111145,
"rewards/margins": 0.3052050471305847,
"rewards/rejected": 0.3096432089805603,
"step": 1490
},
{
"epoch": 0.79,
"learning_rate": 1.3391832740531055e-07,
"logits/chosen": -1.4236390590667725,
"logits/rejected": -1.3956820964813232,
"logps/chosen": -2446.695068359375,
"logps/rejected": -2376.41259765625,
"loss": 0.6232,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.7929419875144958,
"rewards/margins": 0.35024353861808777,
"rewards/rejected": 0.44269853830337524,
"step": 1500
},
{
"epoch": 0.79,
"eval_logits/chosen": -1.4759258031845093,
"eval_logits/rejected": -1.4282684326171875,
"eval_logps/chosen": -2592.221923828125,
"eval_logps/rejected": -2190.57958984375,
"eval_loss": 0.6323803663253784,
"eval_rewards/accuracies": 0.6579999923706055,
"eval_rewards/chosen": 0.8055330514907837,
"eval_rewards/margins": 0.40616247057914734,
"eval_rewards/rejected": 0.39937061071395874,
"eval_runtime": 299.6311,
"eval_samples_per_second": 6.675,
"eval_steps_per_second": 0.417,
"step": 1500
},
{
"epoch": 0.79,
"learning_rate": 1.2775575292861707e-07,
"logits/chosen": -1.4745705127716064,
"logits/rejected": -1.4221175909042358,
"logps/chosen": -2639.8076171875,
"logps/rejected": -2123.642578125,
"loss": 0.6056,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.9188385009765625,
"rewards/margins": 0.5551499128341675,
"rewards/rejected": 0.3636886477470398,
"step": 1510
},
{
"epoch": 0.8,
"learning_rate": 1.21717505006588e-07,
"logits/chosen": -1.4603058099746704,
"logits/rejected": -1.4439467191696167,
"logps/chosen": -2664.22119140625,
"logps/rejected": -2496.781005859375,
"loss": 0.6213,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.9416143298149109,
"rewards/margins": 0.3402588963508606,
"rewards/rejected": 0.6013555526733398,
"step": 1520
},
{
"epoch": 0.8,
"learning_rate": 1.1580560036497877e-07,
"logits/chosen": -1.473534345626831,
"logits/rejected": -1.4060730934143066,
"logps/chosen": -2819.74462890625,
"logps/rejected": -2299.840576171875,
"loss": 0.6071,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.8828972578048706,
"rewards/margins": 0.5179694294929504,
"rewards/rejected": 0.3649279475212097,
"step": 1530
},
{
"epoch": 0.81,
"learning_rate": 1.1002201353191521e-07,
"logits/chosen": -1.4415251016616821,
"logits/rejected": -1.461745023727417,
"logps/chosen": -2390.272705078125,
"logps/rejected": -2447.08642578125,
"loss": 0.6433,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.6030459403991699,
"rewards/margins": 0.1989385038614273,
"rewards/rejected": 0.4041074216365814,
"step": 1540
},
{
"epoch": 0.81,
"learning_rate": 1.0436867617841766e-07,
"logits/chosen": -1.4779837131500244,
"logits/rejected": -1.443192958831787,
"logps/chosen": -2101.65771484375,
"logps/rejected": -1614.459228515625,
"loss": 0.5839,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.38758862018585205,
"rewards/margins": 0.41923385858535767,
"rewards/rejected": -0.03164520859718323,
"step": 1550
},
{
"epoch": 0.82,
"learning_rate": 9.884747647323854e-08,
"logits/chosen": -1.4118781089782715,
"logits/rejected": -1.398271083831787,
"logps/chosen": -2657.19287109375,
"logps/rejected": -2414.64990234375,
"loss": 0.6554,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6775075793266296,
"rewards/margins": 0.22664561867713928,
"rewards/rejected": 0.4508620798587799,
"step": 1560
},
{
"epoch": 0.82,
"learning_rate": 9.346025845222871e-08,
"logits/chosen": -1.4589564800262451,
"logits/rejected": -1.4241827726364136,
"logps/chosen": -2566.69384765625,
"logps/rejected": -2381.8310546875,
"loss": 0.6699,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.7393044829368591,
"rewards/margins": 0.26446717977523804,
"rewards/rejected": 0.4748373031616211,
"step": 1570
},
{
"epoch": 0.83,
"learning_rate": 8.82088214024454e-08,
"logits/chosen": -1.4593846797943115,
"logits/rejected": -1.4349015951156616,
"logps/chosen": -2314.169189453125,
"logps/rejected": -2187.58544921875,
"loss": 0.6497,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.4934845566749573,
"rewards/margins": 0.21952751278877258,
"rewards/rejected": 0.2739570140838623,
"step": 1580
},
{
"epoch": 0.83,
"learning_rate": 8.309491926120393e-08,
"logits/chosen": -1.4479442834854126,
"logits/rejected": -1.388183832168579,
"logps/chosen": -2701.14111328125,
"logps/rejected": -2293.677001953125,
"loss": 0.6347,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.6762970089912415,
"rewards/margins": 0.43245062232017517,
"rewards/rejected": 0.2438463717699051,
"step": 1590
},
{
"epoch": 0.84,
"learning_rate": 7.812026003027771e-08,
"logits/chosen": -1.2826584577560425,
"logits/rejected": -1.2632884979248047,
"logps/chosen": -2654.244873046875,
"logps/rejected": -2260.9638671875,
"loss": 0.6326,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.40709176659584045,
"rewards/margins": 0.25365540385246277,
"rewards/rejected": 0.15343639254570007,
"step": 1600
},
{
"epoch": 0.84,
"eval_logits/chosen": -1.4959200620651245,
"eval_logits/rejected": -1.450128436088562,
"eval_logps/chosen": -2627.5283203125,
"eval_logps/rejected": -2220.19970703125,
"eval_loss": 0.6391750574111938,
"eval_rewards/accuracies": 0.656000018119812,
"eval_rewards/chosen": 0.4524710476398468,
"eval_rewards/margins": 0.3492998778820038,
"eval_rewards/rejected": 0.10317116975784302,
"eval_runtime": 302.5644,
"eval_samples_per_second": 6.61,
"eval_steps_per_second": 0.413,
"step": 1600
},
{
"epoch": 0.84,
"learning_rate": 7.328650520543906e-08,
"logits/chosen": -1.4119188785552979,
"logits/rejected": -1.2946244478225708,
"logps/chosen": -2411.543701171875,
"logps/rejected": -1841.427978515625,
"loss": 0.6211,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.3078997731208801,
"rewards/margins": 0.17288625240325928,
"rewards/rejected": 0.13501352071762085,
"step": 1610
},
{
"epoch": 0.85,
"learning_rate": 6.859526922153352e-08,
"logits/chosen": -1.4251132011413574,
"logits/rejected": -1.3843073844909668,
"logps/chosen": -2429.940185546875,
"logps/rejected": -1990.4915771484375,
"loss": 0.6556,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.5606139898300171,
"rewards/margins": 0.2745349407196045,
"rewards/rejected": 0.286079078912735,
"step": 1620
},
{
"epoch": 0.85,
"learning_rate": 6.40481189132711e-08,
"logits/chosen": -1.4726622104644775,
"logits/rejected": -1.4261372089385986,
"logps/chosen": -2766.93115234375,
"logps/rejected": -2061.09912109375,
"loss": 0.6425,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.5808910131454468,
"rewards/margins": 0.43574967980384827,
"rewards/rejected": 0.1451413631439209,
"step": 1630
},
{
"epoch": 0.86,
"learning_rate": 5.964657299191711e-08,
"logits/chosen": -1.4473376274108887,
"logits/rejected": -1.4126627445220947,
"logps/chosen": -2487.42919921875,
"logps/rejected": -2065.8955078125,
"loss": 0.6381,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.8268505930900574,
"rewards/margins": 0.4533798098564148,
"rewards/rejected": 0.37347084283828735,
"step": 1640
},
{
"epoch": 0.86,
"learning_rate": 5.53921015380539e-08,
"logits/chosen": -1.428260087966919,
"logits/rejected": -1.4423437118530273,
"logps/chosen": -2295.45556640625,
"logps/rejected": -2376.85595703125,
"loss": 0.6173,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.6111637353897095,
"rewards/margins": 0.18756714463233948,
"rewards/rejected": 0.4235965311527252,
"step": 1650
},
{
"epoch": 0.87,
"learning_rate": 5.1286125510586805e-08,
"logits/chosen": -1.462693452835083,
"logits/rejected": -1.4421815872192383,
"logps/chosen": -2543.067626953125,
"logps/rejected": -2478.81494140625,
"loss": 0.613,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.8606179356575012,
"rewards/margins": 0.37834784388542175,
"rewards/rejected": 0.48227009177207947,
"step": 1660
},
{
"epoch": 0.87,
"learning_rate": 4.733001627215466e-08,
"logits/chosen": -1.4652189016342163,
"logits/rejected": -1.4526941776275635,
"logps/chosen": -2576.45556640625,
"logps/rejected": -2486.090576171875,
"loss": 0.6675,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.7547820210456848,
"rewards/margins": 0.23100514709949493,
"rewards/rejected": 0.5237768292427063,
"step": 1670
},
{
"epoch": 0.88,
"learning_rate": 4.352509513110658e-08,
"logits/chosen": -1.4286987781524658,
"logits/rejected": -1.4079492092132568,
"logps/chosen": -2363.428955078125,
"logps/rejected": -2208.08740234375,
"loss": 0.6258,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.574379026889801,
"rewards/margins": 0.24226748943328857,
"rewards/rejected": 0.33211153745651245,
"step": 1680
},
{
"epoch": 0.88,
"learning_rate": 3.9872632900194936e-08,
"logits/chosen": -1.4842069149017334,
"logits/rejected": -1.415021300315857,
"logps/chosen": -2913.2490234375,
"logps/rejected": -2346.609619140625,
"loss": 0.6436,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.6599145531654358,
"rewards/margins": 0.26596465706825256,
"rewards/rejected": 0.3939499258995056,
"step": 1690
},
{
"epoch": 0.89,
"learning_rate": 3.6373849472134954e-08,
"logits/chosen": -1.4031012058258057,
"logits/rejected": -1.3779500722885132,
"logps/chosen": -2266.2158203125,
"logps/rejected": -1981.5833740234375,
"loss": 0.6469,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.5494025945663452,
"rewards/margins": 0.21553239226341248,
"rewards/rejected": 0.33387020230293274,
"step": 1700
},
{
"epoch": 0.89,
"eval_logits/chosen": -1.4758340120315552,
"eval_logits/rejected": -1.4289432764053345,
"eval_logps/chosen": -2598.2412109375,
"eval_logps/rejected": -2195.535888671875,
"eval_loss": 0.6306354403495789,
"eval_rewards/accuracies": 0.6660000085830688,
"eval_rewards/chosen": 0.7453421354293823,
"eval_rewards/margins": 0.3955351710319519,
"eval_rewards/rejected": 0.3498069643974304,
"eval_runtime": 295.7456,
"eval_samples_per_second": 6.763,
"eval_steps_per_second": 0.423,
"step": 1700
},
{
"epoch": 0.9,
"learning_rate": 3.302991341216976e-08,
"logits/chosen": -1.4159257411956787,
"logits/rejected": -1.392617106437683,
"logps/chosen": -2077.9482421875,
"logps/rejected": -1972.2515869140625,
"loss": 0.6409,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.5421566367149353,
"rewards/margins": 0.2578433156013489,
"rewards/rejected": 0.28431329131126404,
"step": 1710
},
{
"epoch": 0.9,
"learning_rate": 2.9841941567779474e-08,
"logits/chosen": -1.4799764156341553,
"logits/rejected": -1.4051799774169922,
"logps/chosen": -2897.63232421875,
"logps/rejected": -2480.90625,
"loss": 0.6257,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.8838424682617188,
"rewards/margins": 0.3801085352897644,
"rewards/rejected": 0.5037339925765991,
"step": 1720
},
{
"epoch": 0.91,
"learning_rate": 2.681099869566328e-08,
"logits/chosen": -1.4630422592163086,
"logits/rejected": -1.4653818607330322,
"logps/chosen": -2166.15966796875,
"logps/rejected": -2133.84326171875,
"loss": 0.6171,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.5266101956367493,
"rewards/margins": 0.21218034625053406,
"rewards/rejected": 0.3144298195838928,
"step": 1730
},
{
"epoch": 0.91,
"learning_rate": 2.3938097106119216e-08,
"logits/chosen": -1.4574975967407227,
"logits/rejected": -1.4154255390167236,
"logps/chosen": -2208.398681640625,
"logps/rejected": -1935.158203125,
"loss": 0.6305,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.6478286981582642,
"rewards/margins": 0.3098670542240143,
"rewards/rejected": 0.33796167373657227,
"step": 1740
},
{
"epoch": 0.92,
"learning_rate": 2.12241963249406e-08,
"logits/chosen": -1.4689569473266602,
"logits/rejected": -1.4307196140289307,
"logps/chosen": -2519.071044921875,
"logps/rejected": -2212.586181640625,
"loss": 0.6578,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6326580047607422,
"rewards/margins": 0.3157083988189697,
"rewards/rejected": 0.31694963574409485,
"step": 1750
},
{
"epoch": 0.92,
"learning_rate": 1.8670202772942568e-08,
"logits/chosen": -1.4382356405258179,
"logits/rejected": -1.3769454956054688,
"logps/chosen": -2694.0830078125,
"logps/rejected": -2166.41845703125,
"loss": 0.6341,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.7784560322761536,
"rewards/margins": 0.3015449643135071,
"rewards/rejected": 0.4769110679626465,
"step": 1760
},
{
"epoch": 0.93,
"learning_rate": 1.6276969463224545e-08,
"logits/chosen": -1.4650015830993652,
"logits/rejected": -1.463744878768921,
"logps/chosen": -2586.126220703125,
"logps/rejected": -2591.75439453125,
"loss": 0.6103,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.6145761609077454,
"rewards/margins": 0.4303979277610779,
"rewards/rejected": 0.18417824804782867,
"step": 1770
},
{
"epoch": 0.93,
"learning_rate": 1.4045295716271e-08,
"logits/chosen": -1.4920063018798828,
"logits/rejected": -1.450634241104126,
"logps/chosen": -2605.60986328125,
"logps/rejected": -2116.304931640625,
"loss": 0.608,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.645778477191925,
"rewards/margins": 0.3450910151004791,
"rewards/rejected": 0.30068737268447876,
"step": 1780
},
{
"epoch": 0.94,
"learning_rate": 1.1975926892984766e-08,
"logits/chosen": -1.4100964069366455,
"logits/rejected": -1.3769333362579346,
"logps/chosen": -2435.0087890625,
"logps/rejected": -2033.880126953125,
"loss": 0.6496,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.6221305727958679,
"rewards/margins": 0.3294012248516083,
"rewards/rejected": 0.29272931814193726,
"step": 1790
},
{
"epoch": 0.94,
"learning_rate": 1.0069554145742787e-08,
"logits/chosen": -1.395265817642212,
"logits/rejected": -1.3731589317321777,
"logps/chosen": -2578.064697265625,
"logps/rejected": -2280.887451171875,
"loss": 0.669,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.6557528972625732,
"rewards/margins": 0.4573606848716736,
"rewards/rejected": 0.1983920931816101,
"step": 1800
},
{
"epoch": 0.94,
"eval_logits/chosen": -1.4769095182418823,
"eval_logits/rejected": -1.4307643175125122,
"eval_logps/chosen": -2607.336669921875,
"eval_logps/rejected": -2203.039306640625,
"eval_loss": 0.6322839260101318,
"eval_rewards/accuracies": 0.6600000262260437,
"eval_rewards/chosen": 0.6543857455253601,
"eval_rewards/margins": 0.3796128034591675,
"eval_rewards/rejected": 0.2747729420661926,
"eval_runtime": 293.77,
"eval_samples_per_second": 6.808,
"eval_steps_per_second": 0.426,
"step": 1800
},
{
"epoch": 0.95,
"learning_rate": 8.326814187556485e-09,
"logits/chosen": -1.4078927040100098,
"logits/rejected": -1.380299687385559,
"logps/chosen": -2524.50439453125,
"logps/rejected": -2226.43994140625,
"loss": 0.6208,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.5907411575317383,
"rewards/margins": 0.25163906812667847,
"rewards/rejected": 0.3391020894050598,
"step": 1810
},
{
"epoch": 0.95,
"learning_rate": 6.7482890794151594e-09,
"logits/chosen": -1.4838191270828247,
"logits/rejected": -1.4362868070602417,
"logps/chosen": -2814.218017578125,
"logps/rejected": -2245.9033203125,
"loss": 0.632,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.8923807144165039,
"rewards/margins": 0.4581494927406311,
"rewards/rejected": 0.4342312812805176,
"step": 1820
},
{
"epoch": 0.96,
"learning_rate": 5.334506035882036e-09,
"logits/chosen": -1.370774507522583,
"logits/rejected": -1.3359023332595825,
"logps/chosen": -2687.776123046875,
"logps/rejected": -2035.099609375,
"loss": 0.6014,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.6979535818099976,
"rewards/margins": 0.42525219917297363,
"rewards/rejected": 0.27270132303237915,
"step": 1830
},
{
"epoch": 0.96,
"learning_rate": 4.0859372490090194e-09,
"logits/chosen": -1.4562771320343018,
"logits/rejected": -1.4093388319015503,
"logps/chosen": -2788.104248046875,
"logps/rejected": -2335.853759765625,
"loss": 0.6116,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.7869713306427002,
"rewards/margins": 0.4374913275241852,
"rewards/rejected": 0.3494799733161926,
"step": 1840
},
{
"epoch": 0.97,
"learning_rate": 3.0029997306283416e-09,
"logits/chosen": -1.4756406545639038,
"logits/rejected": -1.3986704349517822,
"logps/chosen": -2574.64111328125,
"logps/rejected": -1893.6328125,
"loss": 0.6546,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.6360118389129639,
"rewards/margins": 0.3881533145904541,
"rewards/rejected": 0.24785849452018738,
"step": 1850
},
{
"epoch": 0.97,
"learning_rate": 2.0860551730742526e-09,
"logits/chosen": -1.4544508457183838,
"logits/rejected": -1.419983983039856,
"logps/chosen": -2375.126220703125,
"logps/rejected": -2017.3466796875,
"loss": 0.5584,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.8510934710502625,
"rewards/margins": 0.6245936155319214,
"rewards/rejected": 0.22649994492530823,
"step": 1860
},
{
"epoch": 0.98,
"learning_rate": 1.3354098283802628e-09,
"logits/chosen": -1.4696677923202515,
"logits/rejected": -1.4230769872665405,
"logps/chosen": -2438.054931640625,
"logps/rejected": -2103.46044921875,
"loss": 0.621,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.7339269518852234,
"rewards/margins": 0.35110199451446533,
"rewards/rejected": 0.38282495737075806,
"step": 1870
},
{
"epoch": 0.98,
"learning_rate": 7.513144059937415e-10,
"logits/chosen": -1.4952335357666016,
"logits/rejected": -1.442657232284546,
"logps/chosen": -2848.296630859375,
"logps/rejected": -2374.80126953125,
"loss": 0.6061,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.7486821413040161,
"rewards/margins": 0.31393861770629883,
"rewards/rejected": 0.43474358320236206,
"step": 1880
},
{
"epoch": 0.99,
"learning_rate": 3.3396398904106393e-10,
"logits/chosen": -1.4425480365753174,
"logits/rejected": -1.4436792135238647,
"logps/chosen": -2551.7880859375,
"logps/rejected": -2169.797607421875,
"loss": 0.6124,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.5796520709991455,
"rewards/margins": 0.4224782884120941,
"rewards/rejected": 0.15717382729053497,
"step": 1890
},
{
"epoch": 0.99,
"learning_rate": 8.349796917112018e-11,
"logits/chosen": -1.4112383127212524,
"logits/rejected": -1.3823628425598145,
"logps/chosen": -2330.736083984375,
"logps/rejected": -2090.098876953125,
"loss": 0.6531,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.5138527750968933,
"rewards/margins": 0.193558931350708,
"rewards/rejected": 0.3202938437461853,
"step": 1900
},
{
"epoch": 0.99,
"eval_logits/chosen": -1.4753704071044922,
"eval_logits/rejected": -1.4289445877075195,
"eval_logps/chosen": -2603.777587890625,
"eval_logps/rejected": -2200.1181640625,
"eval_loss": 0.6316895484924316,
"eval_rewards/accuracies": 0.6639999747276306,
"eval_rewards/chosen": 0.6899767518043518,
"eval_rewards/margins": 0.38598912954330444,
"eval_rewards/rejected": 0.30398762226104736,
"eval_runtime": 302.6434,
"eval_samples_per_second": 6.608,
"eval_steps_per_second": 0.413,
"step": 1900
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": -1.4598416090011597,
"logits/rejected": -1.4293019771575928,
"logps/chosen": -2462.09912109375,
"logps/rejected": -2050.02490234375,
"loss": 0.6322,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.5636069178581238,
"rewards/margins": 0.25135958194732666,
"rewards/rejected": 0.3122473955154419,
"step": 1910
},
{
"epoch": 1.0,
"step": 1910,
"total_flos": 0.0,
"train_loss": 0.6480738864519209,
"train_runtime": 26013.0665,
"train_samples_per_second": 2.35,
"train_steps_per_second": 0.073
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000000,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}