Jimmy19991222's picture
Upload folder using huggingface_hub
c501c5e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01068804275217101,
"grad_norm": 9.442932838948966,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.0071109533309937,
"logits/rejected": -0.9781900644302368,
"logps/chosen": -0.2738580107688904,
"logps/rejected": -0.27158379554748535,
"loss": 1.0523,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5477160215377808,
"rewards/margins": -0.004548341501504183,
"rewards/rejected": -0.5431675910949707,
"step": 5
},
{
"epoch": 0.02137608550434202,
"grad_norm": 6.34423728622988,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -1.0404982566833496,
"logits/rejected": -0.9738548398017883,
"logps/chosen": -0.2942856252193451,
"logps/rejected": -0.2995370030403137,
"loss": 1.0442,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.5885712504386902,
"rewards/margins": 0.010502670891582966,
"rewards/rejected": -0.5990740060806274,
"step": 10
},
{
"epoch": 0.03206412825651302,
"grad_norm": 6.854457761517512,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.9717105031013489,
"logits/rejected": -0.9914683103561401,
"logps/chosen": -0.2636018991470337,
"logps/rejected": -0.3009588122367859,
"loss": 1.0229,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5272037982940674,
"rewards/margins": 0.07471387088298798,
"rewards/rejected": -0.6019176244735718,
"step": 15
},
{
"epoch": 0.04275217100868404,
"grad_norm": 16.17238672181369,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -0.9552351236343384,
"logits/rejected": -0.9299653768539429,
"logps/chosen": -0.27658405900001526,
"logps/rejected": -0.2946491837501526,
"loss": 1.0348,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5531681180000305,
"rewards/margins": 0.03613026812672615,
"rewards/rejected": -0.5892983675003052,
"step": 20
},
{
"epoch": 0.053440213760855046,
"grad_norm": 7.914459513231275,
"learning_rate": 1.0638297872340424e-06,
"logits/chosen": -1.0123283863067627,
"logits/rejected": -0.9839458465576172,
"logps/chosen": -0.2764621078968048,
"logps/rejected": -0.29262328147888184,
"loss": 1.0216,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.5529242157936096,
"rewards/margins": 0.03232245892286301,
"rewards/rejected": -0.5852465629577637,
"step": 25
},
{
"epoch": 0.06412825651302605,
"grad_norm": 13.510536818444182,
"learning_rate": 1.276595744680851e-06,
"logits/chosen": -0.9960908889770508,
"logits/rejected": -0.9520798921585083,
"logps/chosen": -0.3060453534126282,
"logps/rejected": -0.3202216625213623,
"loss": 1.0213,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.6120907068252563,
"rewards/margins": 0.028352651745080948,
"rewards/rejected": -0.6404433250427246,
"step": 30
},
{
"epoch": 0.07481629926519706,
"grad_norm": 10.603480288342643,
"learning_rate": 1.4893617021276594e-06,
"logits/chosen": -1.0775905847549438,
"logits/rejected": -1.0043548345565796,
"logps/chosen": -0.33030545711517334,
"logps/rejected": -0.3744826912879944,
"loss": 1.0195,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.6606109142303467,
"rewards/margins": 0.08835448324680328,
"rewards/rejected": -0.7489653825759888,
"step": 35
},
{
"epoch": 0.08550434201736808,
"grad_norm": 14.893194407448227,
"learning_rate": 1.702127659574468e-06,
"logits/chosen": -1.0553807020187378,
"logits/rejected": -1.0140490531921387,
"logps/chosen": -0.3645663559436798,
"logps/rejected": -0.461661159992218,
"loss": 1.0284,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.7291327118873596,
"rewards/margins": 0.19418945908546448,
"rewards/rejected": -0.923322319984436,
"step": 40
},
{
"epoch": 0.09619238476953908,
"grad_norm": 7.362675910290458,
"learning_rate": 1.9148936170212767e-06,
"logits/chosen": -1.1070150136947632,
"logits/rejected": -1.0679465532302856,
"logps/chosen": -0.4404965341091156,
"logps/rejected": -0.5644907355308533,
"loss": 1.0179,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.8809930682182312,
"rewards/margins": 0.24798834323883057,
"rewards/rejected": -1.1289814710617065,
"step": 45
},
{
"epoch": 0.10688042752171009,
"grad_norm": 16.23414874505975,
"learning_rate": 1.9997482349425066e-06,
"logits/chosen": -1.0770556926727295,
"logits/rejected": -1.0299774408340454,
"logps/chosen": -0.3946690261363983,
"logps/rejected": -0.47187358140945435,
"loss": 1.0123,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.7893380522727966,
"rewards/margins": 0.15440911054611206,
"rewards/rejected": -0.9437471628189087,
"step": 50
},
{
"epoch": 0.11756847027388109,
"grad_norm": 11.88283791262975,
"learning_rate": 1.998210129767735e-06,
"logits/chosen": -1.0645383596420288,
"logits/rejected": -1.035369873046875,
"logps/chosen": -0.3778243362903595,
"logps/rejected": -0.48207464814186096,
"loss": 0.9951,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.755648672580719,
"rewards/margins": 0.2085006982088089,
"rewards/rejected": -0.9641492962837219,
"step": 55
},
{
"epoch": 0.1282565130260521,
"grad_norm": 10.444389026599103,
"learning_rate": 1.995275937465126e-06,
"logits/chosen": -1.082425594329834,
"logits/rejected": -1.0538678169250488,
"logps/chosen": -0.4237767159938812,
"logps/rejected": -0.4713103175163269,
"loss": 0.9836,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.8475534319877625,
"rewards/margins": 0.0950673446059227,
"rewards/rejected": -0.9426206350326538,
"step": 60
},
{
"epoch": 0.13894455577822312,
"grad_norm": 12.992830889875604,
"learning_rate": 1.9909497617679347e-06,
"logits/chosen": -0.9931782484054565,
"logits/rejected": -0.9680334329605103,
"logps/chosen": -0.5701107382774353,
"logps/rejected": -0.7114989757537842,
"loss": 0.9774,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.1402214765548706,
"rewards/margins": 0.2827766239643097,
"rewards/rejected": -1.4229979515075684,
"step": 65
},
{
"epoch": 0.14963259853039412,
"grad_norm": 8.908123494624329,
"learning_rate": 1.985237653224059e-06,
"logits/chosen": -0.9891507029533386,
"logits/rejected": -0.9734717607498169,
"logps/chosen": -0.5873534679412842,
"logps/rejected": -0.7440844774246216,
"loss": 0.9571,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1747069358825684,
"rewards/margins": 0.3134620785713196,
"rewards/rejected": -1.4881689548492432,
"step": 70
},
{
"epoch": 0.16032064128256512,
"grad_norm": 15.06224561163384,
"learning_rate": 1.9781476007338054e-06,
"logits/chosen": -0.9478601217269897,
"logits/rejected": -0.8844977617263794,
"logps/chosen": -0.6380752921104431,
"logps/rejected": -0.7878230810165405,
"loss": 0.9386,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.2761505842208862,
"rewards/margins": 0.29949551820755005,
"rewards/rejected": -1.575646162033081,
"step": 75
},
{
"epoch": 0.17100868403473615,
"grad_norm": 10.129109213694903,
"learning_rate": 1.9696895203766866e-06,
"logits/chosen": -0.9139761924743652,
"logits/rejected": -0.9103153944015503,
"logps/chosen": -0.7025324702262878,
"logps/rejected": -0.9276626706123352,
"loss": 0.8866,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.4050649404525757,
"rewards/margins": 0.45026034116744995,
"rewards/rejected": -1.8553253412246704,
"step": 80
},
{
"epoch": 0.18169672678690715,
"grad_norm": 16.035849628874075,
"learning_rate": 1.9598752415428888e-06,
"logits/chosen": -0.9445829391479492,
"logits/rejected": -0.9311642646789551,
"logps/chosen": -0.8271282315254211,
"logps/rejected": -1.0663609504699707,
"loss": 0.8879,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6542564630508423,
"rewards/margins": 0.4784657061100006,
"rewards/rejected": -2.1327219009399414,
"step": 85
},
{
"epoch": 0.19238476953907815,
"grad_norm": 15.552471664159093,
"learning_rate": 1.9487184903887996e-06,
"logits/chosen": -0.9677060842514038,
"logits/rejected": -0.9533635377883911,
"logps/chosen": -1.1237901449203491,
"logps/rejected": -1.4190008640289307,
"loss": 0.9043,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.2475802898406982,
"rewards/margins": 0.5904213786125183,
"rewards/rejected": -2.8380017280578613,
"step": 90
},
{
"epoch": 0.20307281229124916,
"grad_norm": 21.38276928877544,
"learning_rate": 1.936234870639737e-06,
"logits/chosen": -1.0183446407318115,
"logits/rejected": -0.9617747068405151,
"logps/chosen": -1.5094763040542603,
"logps/rejected": -1.7956956624984741,
"loss": 0.8115,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.0189526081085205,
"rewards/margins": 0.5724390745162964,
"rewards/rejected": -3.5913913249969482,
"step": 95
},
{
"epoch": 0.21376085504342018,
"grad_norm": 31.546005742023485,
"learning_rate": 1.922441841766729e-06,
"logits/chosen": -0.8167861104011536,
"logits/rejected": -0.8134365081787109,
"logps/chosen": -1.9628349542617798,
"logps/rejected": -2.347581148147583,
"loss": 0.841,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.9256699085235596,
"rewards/margins": 0.7694929838180542,
"rewards/rejected": -4.695162296295166,
"step": 100
},
{
"epoch": 0.22444889779559118,
"grad_norm": 31.175237667862007,
"learning_rate": 1.907358694567865e-06,
"logits/chosen": -0.7257764935493469,
"logits/rejected": -0.682075560092926,
"logps/chosen": -2.4148917198181152,
"logps/rejected": -2.919673204421997,
"loss": 0.8144,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.8297834396362305,
"rewards/margins": 1.0095628499984741,
"rewards/rejected": -5.839346408843994,
"step": 105
},
{
"epoch": 0.23513694054776219,
"grad_norm": 42.60812515694024,
"learning_rate": 1.8910065241883678e-06,
"logits/chosen": -0.5907033681869507,
"logits/rejected": -0.5452768206596375,
"logps/chosen": -2.7082858085632324,
"logps/rejected": -3.285773515701294,
"loss": 0.7803,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.416571617126465,
"rewards/margins": 1.1549749374389648,
"rewards/rejected": -6.571547031402588,
"step": 110
},
{
"epoch": 0.2458249832999332,
"grad_norm": 33.770352812549774,
"learning_rate": 1.8734082006171296e-06,
"logits/chosen": -0.6769031286239624,
"logits/rejected": -0.6223554611206055,
"logps/chosen": -2.841639995574951,
"logps/rejected": -3.499586820602417,
"loss": 0.7724,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -5.683279991149902,
"rewards/margins": 1.3158934116363525,
"rewards/rejected": -6.999173641204834,
"step": 115
},
{
"epoch": 0.2565130260521042,
"grad_norm": 25.195574765320742,
"learning_rate": 1.8545883367009615e-06,
"logits/chosen": -0.7494109272956848,
"logits/rejected": -0.6586568355560303,
"logps/chosen": -2.6896004676818848,
"logps/rejected": -3.3795294761657715,
"loss": 0.7034,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -5.3792009353637695,
"rewards/margins": 1.3798582553863525,
"rewards/rejected": -6.759058952331543,
"step": 120
},
{
"epoch": 0.26720106880427524,
"grad_norm": 20.698310297934206,
"learning_rate": 1.8345732537213026e-06,
"logits/chosen": -0.8739752769470215,
"logits/rejected": -0.8345277905464172,
"logps/chosen": -2.600498676300049,
"logps/rejected": -3.1906166076660156,
"loss": 0.6515,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.200997352600098,
"rewards/margins": 1.1802361011505127,
"rewards/rejected": -6.381233215332031,
"step": 125
},
{
"epoch": 0.27788911155644624,
"grad_norm": 31.900476449074073,
"learning_rate": 1.8133909445815276e-06,
"logits/chosen": -0.876822829246521,
"logits/rejected": -0.8683232069015503,
"logps/chosen": -2.75192928314209,
"logps/rejected": -3.620870590209961,
"loss": 0.6498,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -5.50385856628418,
"rewards/margins": 1.7378835678100586,
"rewards/rejected": -7.241741180419922,
"step": 130
},
{
"epoch": 0.28857715430861725,
"grad_norm": 30.23141141236411,
"learning_rate": 1.7910710346563413e-06,
"logits/chosen": -0.7084225416183472,
"logits/rejected": -0.650471568107605,
"logps/chosen": -3.4160752296447754,
"logps/rejected": -4.176965713500977,
"loss": 0.6394,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -6.832150459289551,
"rewards/margins": 1.5217812061309814,
"rewards/rejected": -8.353931427001953,
"step": 135
},
{
"epoch": 0.29926519706078825,
"grad_norm": 29.441980968776832,
"learning_rate": 1.767644740358011e-06,
"logits/chosen": -0.76490318775177,
"logits/rejected": -0.7356737852096558,
"logps/chosen": -3.500870943069458,
"logps/rejected": -4.334284782409668,
"loss": 0.5747,
"rewards/accuracies": 0.8125,
"rewards/chosen": -7.001741886138916,
"rewards/margins": 1.6668283939361572,
"rewards/rejected": -8.668569564819336,
"step": 140
},
{
"epoch": 0.30995323981295925,
"grad_norm": 28.87020107784321,
"learning_rate": 1.743144825477394e-06,
"logits/chosen": -0.6797415614128113,
"logits/rejected": -0.650688648223877,
"logps/chosen": -3.6205127239227295,
"logps/rejected": -4.511746406555176,
"loss": 0.6507,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -7.241025447845459,
"rewards/margins": 1.7824666500091553,
"rewards/rejected": -9.023492813110352,
"step": 145
},
{
"epoch": 0.32064128256513025,
"grad_norm": 29.827503183327266,
"learning_rate": 1.7176055553608117e-06,
"logits/chosen": -0.7169264554977417,
"logits/rejected": -0.6832514405250549,
"logps/chosen": -3.934389114379883,
"logps/rejected": -4.9375319480896,
"loss": 0.6128,
"rewards/accuracies": 0.8125,
"rewards/chosen": -7.868778228759766,
"rewards/margins": 2.0062854290008545,
"rewards/rejected": -9.8750638961792,
"step": 150
},
{
"epoch": 0.33132932531730125,
"grad_norm": 27.09179333048581,
"learning_rate": 1.6910626489868648e-06,
"logits/chosen": -0.8100920915603638,
"logits/rejected": -0.7742663621902466,
"logps/chosen": -3.824146270751953,
"logps/rejected": -5.090175628662109,
"loss": 0.6399,
"rewards/accuracies": 0.84375,
"rewards/chosen": -7.648292541503906,
"rewards/margins": 2.5320582389831543,
"rewards/rejected": -10.180351257324219,
"step": 155
},
{
"epoch": 0.3420173680694723,
"grad_norm": 36.65170099175081,
"learning_rate": 1.6635532290102113e-06,
"logits/chosen": -0.8540701866149902,
"logits/rejected": -0.8212080001831055,
"logps/chosen": -4.092007637023926,
"logps/rejected": -5.184715270996094,
"loss": 0.5601,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -8.184015274047852,
"rewards/margins": 2.185415744781494,
"rewards/rejected": -10.369430541992188,
"step": 160
},
{
"epoch": 0.3527054108216433,
"grad_norm": 44.09007725935235,
"learning_rate": 1.6351157698421788e-06,
"logits/chosen": -0.9053822755813599,
"logits/rejected": -0.8696815371513367,
"logps/chosen": -4.188479423522949,
"logps/rejected": -5.3639140129089355,
"loss": 0.5898,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -8.376958847045898,
"rewards/margins": 2.3508691787719727,
"rewards/rejected": -10.727828025817871,
"step": 165
},
{
"epoch": 0.3633934535738143,
"grad_norm": 34.800340553634506,
"learning_rate": 1.6057900438408199e-06,
"logits/chosen": -0.8616800308227539,
"logits/rejected": -0.8292746543884277,
"logps/chosen": -4.644923686981201,
"logps/rejected": -6.038055896759033,
"loss": 0.5397,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -9.289847373962402,
"rewards/margins": 2.7862656116485596,
"rewards/rejected": -12.076111793518066,
"step": 170
},
{
"epoch": 0.3740814963259853,
"grad_norm": 34.593547384833734,
"learning_rate": 1.5756170656856736e-06,
"logits/chosen": -0.9542654752731323,
"logits/rejected": -0.889543354511261,
"logps/chosen": -4.545766830444336,
"logps/rejected": -5.687682628631592,
"loss": 0.5562,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -9.091533660888672,
"rewards/margins": 2.28383207321167,
"rewards/rejected": -11.375365257263184,
"step": 175
},
{
"epoch": 0.3847695390781563,
"grad_norm": 22.61281693291947,
"learning_rate": 1.544639035015027e-06,
"logits/chosen": -0.9639078378677368,
"logits/rejected": -0.9341806173324585,
"logps/chosen": -4.075970649719238,
"logps/rejected": -5.5132246017456055,
"loss": 0.513,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.151941299438477,
"rewards/margins": 2.8745083808898926,
"rewards/rejected": -11.026449203491211,
"step": 180
},
{
"epoch": 0.3954575818303273,
"grad_norm": 21.446599953079577,
"learning_rate": 1.5128992774059062e-06,
"logits/chosen": -1.0559054613113403,
"logits/rejected": -0.9924653172492981,
"logps/chosen": -3.7231125831604004,
"logps/rejected": -5.130820274353027,
"loss": 0.4996,
"rewards/accuracies": 0.875,
"rewards/chosen": -7.446225166320801,
"rewards/margins": 2.815417766571045,
"rewards/rejected": -10.261640548706055,
"step": 185
},
{
"epoch": 0.4061456245824983,
"grad_norm": 24.863835996393608,
"learning_rate": 1.4804421837793377e-06,
"logits/chosen": -0.9934264421463013,
"logits/rejected": -0.9997881054878235,
"logps/chosen": -4.336796760559082,
"logps/rejected": -5.937041282653809,
"loss": 0.4682,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -8.673593521118164,
"rewards/margins": 3.200488328933716,
"rewards/rejected": -11.874082565307617,
"step": 190
},
{
"epoch": 0.4168336673346693,
"grad_norm": 34.56272131407248,
"learning_rate": 1.4473131483156324e-06,
"logits/chosen": -0.8811644315719604,
"logits/rejected": -0.8515303730964661,
"logps/chosen": -5.209665298461914,
"logps/rejected": -6.913350582122803,
"loss": 0.515,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -10.419330596923828,
"rewards/margins": 3.4073710441589355,
"rewards/rejected": -13.826701164245605,
"step": 195
},
{
"epoch": 0.42752171008684037,
"grad_norm": 26.404593181307447,
"learning_rate": 1.4135585049665206e-06,
"logits/chosen": -0.8241022825241089,
"logits/rejected": -0.7840823531150818,
"logps/chosen": -5.047942161560059,
"logps/rejected": -6.955193996429443,
"loss": 0.4519,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -10.095884323120117,
"rewards/margins": 3.8145041465759277,
"rewards/rejected": -13.910387992858887,
"step": 200
},
{
"epoch": 0.43820975283901137,
"grad_norm": 35.5838299296831,
"learning_rate": 1.3792254626529285e-06,
"logits/chosen": -0.8618327975273132,
"logits/rejected": -0.7756074666976929,
"logps/chosen": -5.758598327636719,
"logps/rejected": -7.596462249755859,
"loss": 0.5778,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -11.517196655273438,
"rewards/margins": 3.6757278442382812,
"rewards/rejected": -15.192924499511719,
"step": 205
},
{
"epoch": 0.44889779559118237,
"grad_norm": 42.32040382898782,
"learning_rate": 1.3443620392390349e-06,
"logits/chosen": -0.9941180944442749,
"logits/rejected": -0.9657033085823059,
"logps/chosen": -4.432991981506348,
"logps/rejected": -6.000949859619141,
"loss": 0.495,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -8.865983963012695,
"rewards/margins": 3.135915756225586,
"rewards/rejected": -12.001899719238281,
"step": 210
},
{
"epoch": 0.45958583834335337,
"grad_norm": 28.859222169675768,
"learning_rate": 1.3090169943749473e-06,
"logits/chosen": -0.948104739189148,
"logits/rejected": -0.9129034280776978,
"logps/chosen": -3.579448699951172,
"logps/rejected": -5.187192440032959,
"loss": 0.4532,
"rewards/accuracies": 0.875,
"rewards/chosen": -7.158897399902344,
"rewards/margins": 3.2154877185821533,
"rewards/rejected": -10.374384880065918,
"step": 215
},
{
"epoch": 0.47027388109552437,
"grad_norm": 33.6510053739595,
"learning_rate": 1.27323976130192e-06,
"logits/chosen": -0.9587677121162415,
"logits/rejected": -0.9107363820075989,
"logps/chosen": -4.461714744567871,
"logps/rejected": -6.2298054695129395,
"loss": 0.3885,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -8.923429489135742,
"rewards/margins": 3.536180019378662,
"rewards/rejected": -12.459610939025879,
"step": 220
},
{
"epoch": 0.48096192384769537,
"grad_norm": 39.16622543078335,
"learning_rate": 1.2370803777154975e-06,
"logits/chosen": -0.7982478141784668,
"logits/rejected": -0.7258783578872681,
"logps/chosen": -7.227081298828125,
"logps/rejected": -9.01085090637207,
"loss": 0.5453,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -14.45416259765625,
"rewards/margins": 3.5675411224365234,
"rewards/rejected": -18.02170181274414,
"step": 225
},
{
"epoch": 0.4916499665998664,
"grad_norm": 28.294600400326075,
"learning_rate": 1.2005894157832728e-06,
"logits/chosen": -0.9068690538406372,
"logits/rejected": -0.8007113337516785,
"logps/chosen": -5.985177516937256,
"logps/rejected": -8.007855415344238,
"loss": 0.4459,
"rewards/accuracies": 0.90625,
"rewards/chosen": -11.970355033874512,
"rewards/margins": 4.045356750488281,
"rewards/rejected": -16.015710830688477,
"step": 230
},
{
"epoch": 0.5023380093520374,
"grad_norm": 26.428195821183824,
"learning_rate": 1.1638179114151377e-06,
"logits/chosen": -1.0134648084640503,
"logits/rejected": -0.9478827714920044,
"logps/chosen": -4.030945301055908,
"logps/rejected": -5.84409761428833,
"loss": 0.4607,
"rewards/accuracies": 0.84375,
"rewards/chosen": -8.061890602111816,
"rewards/margins": 3.6263041496276855,
"rewards/rejected": -11.68819522857666,
"step": 235
},
{
"epoch": 0.5130260521042084,
"grad_norm": 28.506424636352925,
"learning_rate": 1.1268172928849485e-06,
"logits/chosen": -1.0107872486114502,
"logits/rejected": -0.9833100438117981,
"logps/chosen": -3.623994827270508,
"logps/rejected": -5.339346885681152,
"loss": 0.4664,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -7.247989654541016,
"rewards/margins": 3.4307048320770264,
"rewards/rejected": -10.678693771362305,
"step": 240
},
{
"epoch": 0.5237140948563794,
"grad_norm": 37.9874271990268,
"learning_rate": 1.0896393089034335e-06,
"logits/chosen": -1.0698987245559692,
"logits/rejected": -0.9614090919494629,
"logps/chosen": -4.2720537185668945,
"logps/rejected": -6.518821716308594,
"loss": 0.3759,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -8.544107437133789,
"rewards/margins": 4.493536472320557,
"rewards/rejected": -13.037643432617188,
"step": 245
},
{
"epoch": 0.5344021376085505,
"grad_norm": 37.6233219867946,
"learning_rate": 1.052335956242944e-06,
"logits/chosen": -0.9640167355537415,
"logits/rejected": -0.9025171399116516,
"logps/chosen": -5.073387622833252,
"logps/rejected": -7.112657070159912,
"loss": 0.3989,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -10.146775245666504,
"rewards/margins": 4.078539848327637,
"rewards/rejected": -14.225314140319824,
"step": 250
},
{
"epoch": 0.5450901803607214,
"grad_norm": 35.09471619941238,
"learning_rate": 1.0149594070152636e-06,
"logits/chosen": -0.9901530146598816,
"logits/rejected": -0.9247368574142456,
"logps/chosen": -6.148016452789307,
"logps/rejected": -8.221637725830078,
"loss": 0.4697,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -12.296032905578613,
"rewards/margins": 4.147244930267334,
"rewards/rejected": -16.443275451660156,
"step": 255
},
{
"epoch": 0.5557782231128925,
"grad_norm": 39.356165818725984,
"learning_rate": 9.77561935704195e-07,
"logits/chosen": -0.9357139468193054,
"logits/rejected": -0.858476459980011,
"logps/chosen": -6.003566741943359,
"logps/rejected": -8.099205017089844,
"loss": 0.4241,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -12.007133483886719,
"rewards/margins": 4.191277027130127,
"rewards/rejected": -16.198410034179688,
"step": 260
},
{
"epoch": 0.5664662658650634,
"grad_norm": 20.857149706425567,
"learning_rate": 9.401958460549657e-07,
"logits/chosen": -0.8877873420715332,
"logits/rejected": -0.8332953453063965,
"logps/chosen": -5.713176250457764,
"logps/rejected": -7.9226484298706055,
"loss": 0.4085,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -11.426352500915527,
"rewards/margins": 4.418946266174316,
"rewards/rejected": -15.845296859741211,
"step": 265
},
{
"epoch": 0.5771543086172345,
"grad_norm": 37.11096746877866,
"learning_rate": 9.029133979227118e-07,
"logits/chosen": -0.9584988355636597,
"logits/rejected": -0.9051562547683716,
"logps/chosen": -4.586709976196289,
"logps/rejected": -6.5038323402404785,
"loss": 0.4022,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -9.173419952392578,
"rewards/margins": 3.8342444896698,
"rewards/rejected": -13.007664680480957,
"step": 270
},
{
"epoch": 0.5878423513694054,
"grad_norm": 25.04133162285963,
"learning_rate": 8.657667341823448e-07,
"logits/chosen": -0.9564048051834106,
"logits/rejected": -0.8701663017272949,
"logps/chosen": -4.893515586853027,
"logps/rejected": -6.940362453460693,
"loss": 0.4312,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -9.787031173706055,
"rewards/margins": 4.093693256378174,
"rewards/rejected": -13.880724906921387,
"step": 275
},
{
"epoch": 0.5985303941215765,
"grad_norm": 31.64139590058085,
"learning_rate": 8.288078078020249e-07,
"logits/chosen": -1.0176098346710205,
"logits/rejected": -0.9464299082756042,
"logps/chosen": -5.894881248474121,
"logps/rejected": -8.109701156616211,
"loss": 0.4212,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -11.789762496948242,
"rewards/margins": 4.429640769958496,
"rewards/rejected": -16.219402313232422,
"step": 280
},
{
"epoch": 0.6092184368737475,
"grad_norm": 34.98704174006504,
"learning_rate": 7.920883091822408e-07,
"logits/chosen": -1.0222933292388916,
"logits/rejected": -0.9283574223518372,
"logps/chosen": -5.977299213409424,
"logps/rejected": -8.55643081665039,
"loss": 0.3473,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -11.954598426818848,
"rewards/margins": 5.158264636993408,
"rewards/rejected": -17.11286163330078,
"step": 285
},
{
"epoch": 0.6199064796259185,
"grad_norm": 43.7429550932754,
"learning_rate": 7.556595938621058e-07,
"logits/chosen": -1.0368258953094482,
"logits/rejected": -0.9450758099555969,
"logps/chosen": -6.416205406188965,
"logps/rejected": -8.702176094055176,
"loss": 0.4135,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -12.83241081237793,
"rewards/margins": 4.571939945220947,
"rewards/rejected": -17.40435218811035,
"step": 290
},
{
"epoch": 0.6305945223780896,
"grad_norm": 38.571708947108014,
"learning_rate": 7.195726106939973e-07,
"logits/chosen": -1.0127325057983398,
"logits/rejected": -0.9613968729972839,
"logps/chosen": -6.0891900062561035,
"logps/rejected": -8.455511093139648,
"loss": 0.3415,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -12.178380012512207,
"rewards/margins": 4.73264217376709,
"rewards/rejected": -16.911022186279297,
"step": 295
},
{
"epoch": 0.6412825651302605,
"grad_norm": 37.02547097442152,
"learning_rate": 6.838778305869759e-07,
"logits/chosen": -0.9378641247749329,
"logits/rejected": -0.8806314468383789,
"logps/chosen": -6.423588752746582,
"logps/rejected": -8.611102104187012,
"loss": 0.4404,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -12.847177505493164,
"rewards/margins": 4.375027656555176,
"rewards/rejected": -17.222204208374023,
"step": 300
},
{
"epoch": 0.6519706078824316,
"grad_norm": 35.55070245031894,
"learning_rate": 6.486251759186572e-07,
"logits/chosen": -1.0858322381973267,
"logits/rejected": -0.9954659342765808,
"logps/chosen": -5.805714130401611,
"logps/rejected": -7.78420877456665,
"loss": 0.4396,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -11.611428260803223,
"rewards/margins": 3.956988573074341,
"rewards/rejected": -15.5684175491333,
"step": 305
},
{
"epoch": 0.6626586506346025,
"grad_norm": 51.53731628000405,
"learning_rate": 6.138639507142538e-07,
"logits/chosen": -1.175060749053955,
"logits/rejected": -1.1142823696136475,
"logps/chosen": -5.7005181312561035,
"logps/rejected": -7.967810153961182,
"loss": 0.4227,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -11.401036262512207,
"rewards/margins": 4.534584999084473,
"rewards/rejected": -15.935620307922363,
"step": 310
},
{
"epoch": 0.6733466933867736,
"grad_norm": 29.428644028564324,
"learning_rate": 5.796427716904346e-07,
"logits/chosen": -1.1236612796783447,
"logits/rejected": -1.0238118171691895,
"logps/chosen": -6.259681701660156,
"logps/rejected": -8.45996379852295,
"loss": 0.3742,
"rewards/accuracies": 0.875,
"rewards/chosen": -12.519363403320312,
"rewards/margins": 4.400565147399902,
"rewards/rejected": -16.9199275970459,
"step": 315
},
{
"epoch": 0.6840347361389446,
"grad_norm": 38.71321431370745,
"learning_rate": 5.460095002604532e-07,
"logits/chosen": -1.11953866481781,
"logits/rejected": -1.0796916484832764,
"logps/chosen": -6.55707311630249,
"logps/rejected": -9.187610626220703,
"loss": 0.3626,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -13.11414623260498,
"rewards/margins": 5.2610764503479,
"rewards/rejected": -18.375221252441406,
"step": 320
},
{
"epoch": 0.6947227788911156,
"grad_norm": 38.929986299465604,
"learning_rate": 5.130111755956327e-07,
"logits/chosen": -1.1838449239730835,
"logits/rejected": -1.0870417356491089,
"logps/chosen": -6.676375389099121,
"logps/rejected": -9.317723274230957,
"loss": 0.4211,
"rewards/accuracies": 0.875,
"rewards/chosen": -13.352750778198242,
"rewards/margins": 5.282693862915039,
"rewards/rejected": -18.635446548461914,
"step": 325
},
{
"epoch": 0.7054108216432866,
"grad_norm": 26.360971338492213,
"learning_rate": 4.806939488368308e-07,
"logits/chosen": -1.0527994632720947,
"logits/rejected": -0.9714158177375793,
"logps/chosen": -6.790243625640869,
"logps/rejected": -8.82271671295166,
"loss": 0.3754,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -13.580487251281738,
"rewards/margins": 4.06494665145874,
"rewards/rejected": -17.64543342590332,
"step": 330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 37.25228754273986,
"learning_rate": 4.4910301854789755e-07,
"logits/chosen": -1.092002511024475,
"logits/rejected": -1.0370265245437622,
"logps/chosen": -6.746194362640381,
"logps/rejected": -8.957503318786621,
"loss": 0.379,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -13.492388725280762,
"rewards/margins": 4.4226179122924805,
"rewards/rejected": -17.915006637573242,
"step": 335
},
{
"epoch": 0.7267869071476286,
"grad_norm": 41.78732477890408,
"learning_rate": 4.1828256750139443e-07,
"logits/chosen": -1.15060555934906,
"logits/rejected": -1.0927339792251587,
"logps/chosen": -6.618721008300781,
"logps/rejected": -8.740182876586914,
"loss": 0.4272,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -13.237442016601562,
"rewards/margins": 4.242924213409424,
"rewards/rejected": -17.480365753173828,
"step": 340
},
{
"epoch": 0.7374749498997996,
"grad_norm": 31.334898714386284,
"learning_rate": 3.882757008849935e-07,
"logits/chosen": -1.1759268045425415,
"logits/rejected": -1.125778317451477,
"logps/chosen": -7.186532020568848,
"logps/rejected": -9.452940940856934,
"loss": 0.3551,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -14.373064041137695,
"rewards/margins": 4.532819747924805,
"rewards/rejected": -18.905881881713867,
"step": 345
},
{
"epoch": 0.7481629926519706,
"grad_norm": 29.63352796318247,
"learning_rate": 3.5912438601497584e-07,
"logits/chosen": -1.186089038848877,
"logits/rejected": -1.1533267498016357,
"logps/chosen": -6.283223628997803,
"logps/rejected": -8.40349006652832,
"loss": 0.3724,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -12.566447257995605,
"rewards/margins": 4.2405314445495605,
"rewards/rejected": -16.80698013305664,
"step": 350
},
{
"epoch": 0.7588510354041417,
"grad_norm": 37.40129439705042,
"learning_rate": 3.308693936411421e-07,
"logits/chosen": -1.0497562885284424,
"logits/rejected": -1.0346195697784424,
"logps/chosen": -6.789434909820557,
"logps/rejected": -9.07376766204834,
"loss": 0.3605,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -13.578869819641113,
"rewards/margins": 4.568666458129883,
"rewards/rejected": -18.14753532409668,
"step": 355
},
{
"epoch": 0.7695390781563126,
"grad_norm": 42.85252213793353,
"learning_rate": 3.035502409252333e-07,
"logits/chosen": -1.11203134059906,
"logits/rejected": -1.0642902851104736,
"logps/chosen": -6.502237796783447,
"logps/rejected": -9.193612098693848,
"loss": 0.4275,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -13.004475593566895,
"rewards/margins": 5.382746696472168,
"rewards/rejected": -18.387224197387695,
"step": 360
},
{
"epoch": 0.7802271209084837,
"grad_norm": 45.248127741114246,
"learning_rate": 2.7720513617260855e-07,
"logits/chosen": -1.1741015911102295,
"logits/rejected": -1.0450173616409302,
"logps/chosen": -6.776492118835449,
"logps/rejected": -9.342794418334961,
"loss": 0.3758,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -13.552984237670898,
"rewards/margins": 5.132604598999023,
"rewards/rejected": -18.685588836669922,
"step": 365
},
{
"epoch": 0.7909151636606546,
"grad_norm": 70.88406643518205,
"learning_rate": 2.5187092539447294e-07,
"logits/chosen": -1.1018563508987427,
"logits/rejected": -1.0579187870025635,
"logps/chosen": -6.298445701599121,
"logps/rejected": -8.556467056274414,
"loss": 0.3808,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -12.596891403198242,
"rewards/margins": 4.516043663024902,
"rewards/rejected": -17.112934112548828,
"step": 370
},
{
"epoch": 0.8016032064128257,
"grad_norm": 41.80456248679069,
"learning_rate": 2.2758304077540058e-07,
"logits/chosen": -1.1480379104614258,
"logits/rejected": -1.1150692701339722,
"logps/chosen": -6.318451881408691,
"logps/rejected": -8.656303405761719,
"loss": 0.3586,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -12.636903762817383,
"rewards/margins": 4.675703525543213,
"rewards/rejected": -17.312606811523438,
"step": 375
},
{
"epoch": 0.8122912491649966,
"grad_norm": 31.017236490830967,
"learning_rate": 2.043754511182191e-07,
"logits/chosen": -1.1511554718017578,
"logits/rejected": -1.0976629257202148,
"logps/chosen": -6.138351917266846,
"logps/rejected": -8.664915084838867,
"loss": 0.4,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -12.276703834533691,
"rewards/margins": 5.053128242492676,
"rewards/rejected": -17.329830169677734,
"step": 380
},
{
"epoch": 0.8229792919171677,
"grad_norm": 26.351372088988093,
"learning_rate": 1.8228061433556864e-07,
"logits/chosen": -1.1164242029190063,
"logits/rejected": -1.0599582195281982,
"logps/chosen": -6.1393351554870605,
"logps/rejected": -8.908954620361328,
"loss": 0.3271,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -12.278670310974121,
"rewards/margins": 5.539238929748535,
"rewards/rejected": -17.817909240722656,
"step": 385
},
{
"epoch": 0.8336673346693386,
"grad_norm": 40.004488570738765,
"learning_rate": 1.6132943205457606e-07,
"logits/chosen": -1.1820439100265503,
"logits/rejected": -1.1261646747589111,
"logps/chosen": -6.401742458343506,
"logps/rejected": -8.99330997467041,
"loss": 0.4273,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -12.803484916687012,
"rewards/margins": 5.183135032653809,
"rewards/rejected": -17.98661994934082,
"step": 390
},
{
"epoch": 0.8443553774215097,
"grad_norm": 41.3303995282676,
"learning_rate": 1.415512063981339e-07,
"logits/chosen": -1.1933691501617432,
"logits/rejected": -1.143477201461792,
"logps/chosen": -6.095961093902588,
"logps/rejected": -8.315205574035645,
"loss": 0.3615,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -12.191922187805176,
"rewards/margins": 4.438488960266113,
"rewards/rejected": -16.63041114807129,
"step": 395
},
{
"epoch": 0.8550434201736807,
"grad_norm": 30.146673376540157,
"learning_rate": 1.2297359900323344e-07,
"logits/chosen": -1.185856819152832,
"logits/rejected": -1.149908423423767,
"logps/chosen": -6.064610958099365,
"logps/rejected": -8.274811744689941,
"loss": 0.3805,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -12.12922191619873,
"rewards/margins": 4.420398712158203,
"rewards/rejected": -16.549623489379883,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": -1.3878380060195923,
"eval_logits/rejected": -1.3844929933547974,
"eval_logps/chosen": -5.970302104949951,
"eval_logps/rejected": -8.178492546081543,
"eval_loss": 0.34991469979286194,
"eval_rewards/accuracies": 0.9004064798355103,
"eval_rewards/chosen": -11.940604209899902,
"eval_rewards/margins": 4.416379928588867,
"eval_rewards/rejected": -16.356985092163086,
"eval_runtime": 98.864,
"eval_samples_per_second": 19.835,
"eval_steps_per_second": 1.244,
"step": 400
},
{
"epoch": 0.8657314629258517,
"grad_norm": 39.23606930955491,
"learning_rate": 1.0562259233366333e-07,
"logits/chosen": -1.1601266860961914,
"logits/rejected": -1.1533467769622803,
"logps/chosen": -6.3432416915893555,
"logps/rejected": -8.685356140136719,
"loss": 0.3527,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -12.686483383178711,
"rewards/margins": 4.684228420257568,
"rewards/rejected": -17.370712280273438,
"step": 405
},
{
"epoch": 0.8764195056780227,
"grad_norm": 44.84060293631811,
"learning_rate": 8.952245334118413e-08,
"logits/chosen": -1.1762316226959229,
"logits/rejected": -1.1400468349456787,
"logps/chosen": -5.951014041900635,
"logps/rejected": -8.487456321716309,
"loss": 0.372,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -11.90202808380127,
"rewards/margins": 5.072883605957031,
"rewards/rejected": -16.974912643432617,
"step": 410
},
{
"epoch": 0.8871075484301937,
"grad_norm": 31.58697079899106,
"learning_rate": 7.46956995260033e-08,
"logits/chosen": -1.1965105533599854,
"logits/rejected": -1.0948525667190552,
"logps/chosen": -5.939952373504639,
"logps/rejected": -8.576761245727539,
"loss": 0.3642,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -11.879904747009277,
"rewards/margins": 5.273618698120117,
"rewards/rejected": -17.153522491455078,
"step": 415
},
{
"epoch": 0.8977955911823647,
"grad_norm": 56.62718923940337,
"learning_rate": 6.11630674440139e-08,
"logits/chosen": -1.2364650964736938,
"logits/rejected": -1.1493674516677856,
"logps/chosen": -5.8380866050720215,
"logps/rejected": -8.528668403625488,
"loss": 0.3543,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -11.676173210144043,
"rewards/margins": 5.381163597106934,
"rewards/rejected": -17.057336807250977,
"step": 420
},
{
"epoch": 0.9084836339345357,
"grad_norm": 26.88857335924454,
"learning_rate": 4.8943483704846465e-08,
"logits/chosen": -1.2132270336151123,
"logits/rejected": -1.1822996139526367,
"logps/chosen": -6.329747200012207,
"logps/rejected": -8.68973159790039,
"loss": 0.378,
"rewards/accuracies": 0.90625,
"rewards/chosen": -12.659494400024414,
"rewards/margins": 4.719969749450684,
"rewards/rejected": -17.37946319580078,
"step": 425
},
{
"epoch": 0.9191716766867067,
"grad_norm": 29.204672971590583,
"learning_rate": 3.805403850129407e-08,
"logits/chosen": -1.1887871026992798,
"logits/rejected": -1.1395562887191772,
"logps/chosen": -6.298637866973877,
"logps/rejected": -8.703396797180176,
"loss": 0.3701,
"rewards/accuracies": 0.90625,
"rewards/chosen": -12.597275733947754,
"rewards/margins": 4.809514999389648,
"rewards/rejected": -17.40679359436035,
"step": 430
},
{
"epoch": 0.9298597194388778,
"grad_norm": 41.83119701192464,
"learning_rate": 2.8509961707132492e-08,
"logits/chosen": -1.1526520252227783,
"logits/rejected": -1.087210774421692,
"logps/chosen": -5.99376106262207,
"logps/rejected": -8.27347183227539,
"loss": 0.3539,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -11.98752212524414,
"rewards/margins": 4.559422492980957,
"rewards/rejected": -16.54694366455078,
"step": 435
},
{
"epoch": 0.9405477621910487,
"grad_norm": 27.28448585229794,
"learning_rate": 2.032460157676452e-08,
"logits/chosen": -1.1298894882202148,
"logits/rejected": -1.049036979675293,
"logps/chosen": -6.4232072830200195,
"logps/rejected": -8.850305557250977,
"loss": 0.3414,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -12.846414566040039,
"rewards/margins": 4.854195594787598,
"rewards/rejected": -17.700611114501953,
"step": 440
},
{
"epoch": 0.9512358049432198,
"grad_norm": 31.150711268639814,
"learning_rate": 1.3509406076478659e-08,
"logits/chosen": -1.1100740432739258,
"logits/rejected": -1.0567227602005005,
"logps/chosen": -6.3755292892456055,
"logps/rejected": -9.159284591674805,
"loss": 0.344,
"rewards/accuracies": 0.9375,
"rewards/chosen": -12.751058578491211,
"rewards/margins": 5.567511081695557,
"rewards/rejected": -18.31856918334961,
"step": 445
},
{
"epoch": 0.9619238476953907,
"grad_norm": 33.19068830748795,
"learning_rate": 8.07390687343379e-09,
"logits/chosen": -1.250570297241211,
"logits/rejected": -1.1990430355072021,
"logps/chosen": -6.264920711517334,
"logps/rejected": -8.49793815612793,
"loss": 0.3294,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -12.529841423034668,
"rewards/margins": 4.466032981872559,
"rewards/rejected": -16.99587631225586,
"step": 450
},
{
"epoch": 0.9726118904475618,
"grad_norm": 42.76771467797157,
"learning_rate": 4.025706004760931e-09,
"logits/chosen": -1.1908820867538452,
"logits/rejected": -1.1271415948867798,
"logps/chosen": -6.330782890319824,
"logps/rejected": -9.00413703918457,
"loss": 0.3373,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": -12.661565780639648,
"rewards/margins": 5.346711158752441,
"rewards/rejected": -18.00827407836914,
"step": 455
},
{
"epoch": 0.9832999331997327,
"grad_norm": 28.947424332254975,
"learning_rate": 1.3704652454261667e-09,
"logits/chosen": -1.1968469619750977,
"logits/rejected": -1.1597331762313843,
"logps/chosen": -6.481853485107422,
"logps/rejected": -9.090927124023438,
"loss": 0.3515,
"rewards/accuracies": 0.90625,
"rewards/chosen": -12.963706970214844,
"rewards/margins": 5.218146800994873,
"rewards/rejected": -18.181854248046875,
"step": 460
},
{
"epoch": 0.9939879759519038,
"grad_norm": 39.23731303488194,
"learning_rate": 1.1189818972656696e-10,
"logits/chosen": -1.163874864578247,
"logits/rejected": -1.1393449306488037,
"logps/chosen": -6.374614715576172,
"logps/rejected": -9.016167640686035,
"loss": 0.3467,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -12.749229431152344,
"rewards/margins": 5.283105373382568,
"rewards/rejected": -18.03233528137207,
"step": 465
},
{
"epoch": 0.9982631930527722,
"step": 467,
"total_flos": 0.0,
"train_loss": 0.5656856803873622,
"train_runtime": 11731.0657,
"train_samples_per_second": 5.104,
"train_steps_per_second": 0.04
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}