{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01068804275217101,
"grad_norm": 57.237549195872155,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -1.0180665254592896,
"logits/rejected": -0.9884552955627441,
"logps/chosen": -0.27425095438957214,
"logps/rejected": -0.2716319262981415,
"loss": 3.1091,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -2.7425098419189453,
"rewards/margins": -0.02619057334959507,
"rewards/rejected": -2.7163190841674805,
"step": 5
},
{
"epoch": 0.02137608550434202,
"grad_norm": 36.2177280707271,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.047877311706543,
"logits/rejected": -0.9804394841194153,
"logps/chosen": -0.2944500744342804,
"logps/rejected": -0.29980722069740295,
"loss": 3.1522,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.944500684738159,
"rewards/margins": 0.05357087776064873,
"rewards/rejected": -2.9980719089508057,
"step": 10
},
{
"epoch": 0.03206412825651302,
"grad_norm": 51.02954591523818,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.9653420448303223,
"logits/rejected": -0.9844053983688354,
"logps/chosen": -0.26417964696884155,
"logps/rejected": -0.30082693696022034,
"loss": 3.2048,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.641796588897705,
"rewards/margins": 0.3664725720882416,
"rewards/rejected": -3.0082690715789795,
"step": 15
},
{
"epoch": 0.04275217100868404,
"grad_norm": 56.89476138009963,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.9597972631454468,
"logits/rejected": -0.9341325759887695,
"logps/chosen": -0.27756327390670776,
"logps/rejected": -0.2916925251483917,
"loss": 3.1321,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.7756330966949463,
"rewards/margins": 0.14129219949245453,
"rewards/rejected": -2.9169249534606934,
"step": 20
},
{
"epoch": 0.053440213760855046,
"grad_norm": 56.48955746474513,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.001181960105896,
"logits/rejected": -0.9730860590934753,
"logps/chosen": -0.2715573310852051,
"logps/rejected": -0.27819815278053284,
"loss": 3.3596,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.7155730724334717,
"rewards/margins": 0.06640852242708206,
"rewards/rejected": -2.7819817066192627,
"step": 25
},
{
"epoch": 0.06412825651302605,
"grad_norm": 47.66267593497189,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -1.0001966953277588,
"logits/rejected": -0.9549218416213989,
"logps/chosen": -0.2734990119934082,
"logps/rejected": -0.2796509861946106,
"loss": 2.9655,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -2.734990358352661,
"rewards/margins": 0.06151958554983139,
"rewards/rejected": -2.7965099811553955,
"step": 30
},
{
"epoch": 0.07481629926519706,
"grad_norm": 57.03302592987705,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.0495048761367798,
"logits/rejected": -0.9743221998214722,
"logps/chosen": -0.2940281331539154,
"logps/rejected": -0.31984126567840576,
"loss": 3.0572,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.940281391143799,
"rewards/margins": 0.25813135504722595,
"rewards/rejected": -3.1984126567840576,
"step": 35
},
{
"epoch": 0.08550434201736808,
"grad_norm": 64.29646368113443,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -1.0000861883163452,
"logits/rejected": -0.9559175372123718,
"logps/chosen": -0.28027427196502686,
"logps/rejected": -0.3249492049217224,
"loss": 3.0201,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.8027429580688477,
"rewards/margins": 0.4467490315437317,
"rewards/rejected": -3.2494919300079346,
"step": 40
},
{
"epoch": 0.09619238476953908,
"grad_norm": 34.0521027952876,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -1.049403429031372,
"logits/rejected": -1.0066633224487305,
"logps/chosen": -0.3022717535495758,
"logps/rejected": -0.355845183134079,
"loss": 3.1061,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -3.0227174758911133,
"rewards/margins": 0.5357345342636108,
"rewards/rejected": -3.5584518909454346,
"step": 45
},
{
"epoch": 0.10688042752171009,
"grad_norm": 50.184137131794785,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -1.0293877124786377,
"logits/rejected": -0.9806405901908875,
"logps/chosen": -0.3117847442626953,
"logps/rejected": -0.3513973653316498,
"loss": 3.1525,
"rewards/accuracies": 0.46875,
"rewards/chosen": -3.1178476810455322,
"rewards/margins": 0.39612606167793274,
"rewards/rejected": -3.5139732360839844,
"step": 50
},
{
"epoch": 0.11756847027388109,
"grad_norm": 139.4899548956689,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -1.0580527782440186,
"logits/rejected": -1.0236852169036865,
"logps/chosen": -0.29338452219963074,
"logps/rejected": -0.36238163709640503,
"loss": 2.8456,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.933845043182373,
"rewards/margins": 0.6899713277816772,
"rewards/rejected": -3.6238162517547607,
"step": 55
},
{
"epoch": 0.1282565130260521,
"grad_norm": 58.45122397836986,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -1.0895339250564575,
"logits/rejected": -1.0574713945388794,
"logps/chosen": -0.33461707830429077,
"logps/rejected": -0.35189467668533325,
"loss": 2.9738,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -3.3461709022521973,
"rewards/margins": 0.17277587950229645,
"rewards/rejected": -3.518946886062622,
"step": 60
},
{
"epoch": 0.13894455577822312,
"grad_norm": 100.62827839328082,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -1.011530876159668,
"logits/rejected": -0.9821838140487671,
"logps/chosen": -0.4006083011627197,
"logps/rejected": -0.464979887008667,
"loss": 2.9379,
"rewards/accuracies": 0.59375,
"rewards/chosen": -4.0060834884643555,
"rewards/margins": 0.6437152624130249,
"rewards/rejected": -4.649798393249512,
"step": 65
},
{
"epoch": 0.14963259853039412,
"grad_norm": 39.36526232625554,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -1.0184242725372314,
"logits/rejected": -0.9939621686935425,
"logps/chosen": -0.3619542419910431,
"logps/rejected": -0.4431312084197998,
"loss": 2.9573,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -3.619542360305786,
"rewards/margins": 0.8117697834968567,
"rewards/rejected": -4.43131160736084,
"step": 70
},
{
"epoch": 0.16032064128256512,
"grad_norm": 83.11076732917083,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.9596433639526367,
"logits/rejected": -0.8910166621208191,
"logps/chosen": -0.3588549494743347,
"logps/rejected": -0.4166484773159027,
"loss": 2.9742,
"rewards/accuracies": 0.53125,
"rewards/chosen": -3.588549852371216,
"rewards/margins": 0.5779348015785217,
"rewards/rejected": -4.1664838790893555,
"step": 75
},
{
"epoch": 0.17100868403473615,
"grad_norm": 52.281331982276065,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.9426174163818359,
"logits/rejected": -0.9289323687553406,
"logps/chosen": -0.35129761695861816,
"logps/rejected": -0.4580927789211273,
"loss": 2.9737,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.5129764080047607,
"rewards/margins": 1.067950963973999,
"rewards/rejected": -4.580927848815918,
"step": 80
},
{
"epoch": 0.18169672678690715,
"grad_norm": 61.53493979772547,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.9526857137680054,
"logits/rejected": -0.9304324388504028,
"logps/chosen": -0.34235039353370667,
"logps/rejected": -0.40353184938430786,
"loss": 2.7213,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -3.4235033988952637,
"rewards/margins": 0.6118148565292358,
"rewards/rejected": -4.035318851470947,
"step": 85
},
{
"epoch": 0.19238476953907815,
"grad_norm": 75.22407650978651,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -0.9911141395568848,
"logits/rejected": -0.9571215510368347,
"logps/chosen": -0.4391642212867737,
"logps/rejected": -0.5185960531234741,
"loss": 3.0403,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -4.391642093658447,
"rewards/margins": 0.7943190336227417,
"rewards/rejected": -5.1859612464904785,
"step": 90
},
{
"epoch": 0.20307281229124916,
"grad_norm": 46.673632090780266,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -1.079331636428833,
"logits/rejected": -0.996097207069397,
"logps/chosen": -0.4490174353122711,
"logps/rejected": -0.49736976623535156,
"loss": 2.8747,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -4.490174293518066,
"rewards/margins": 0.4835231900215149,
"rewards/rejected": -4.973697662353516,
"step": 95
},
{
"epoch": 0.21376085504342018,
"grad_norm": 78.85306309497338,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -0.9557577967643738,
"logits/rejected": -0.9308866262435913,
"logps/chosen": -0.4265132546424866,
"logps/rejected": -0.4960516393184662,
"loss": 2.9809,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -4.265132427215576,
"rewards/margins": 0.6953836679458618,
"rewards/rejected": -4.960515975952148,
"step": 100
},
{
"epoch": 0.22444889779559118,
"grad_norm": 80.40817210917017,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -0.9734071493148804,
"logits/rejected": -0.9203007817268372,
"logps/chosen": -0.4045742154121399,
"logps/rejected": -0.5108767747879028,
"loss": 2.9566,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -4.045742034912109,
"rewards/margins": 1.063025712966919,
"rewards/rejected": -5.108767509460449,
"step": 105
},
{
"epoch": 0.23513694054776219,
"grad_norm": 55.451042957143265,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.9206374883651733,
"logits/rejected": -0.8604587316513062,
"logps/chosen": -0.45949387550354004,
"logps/rejected": -0.6004349589347839,
"loss": 2.8412,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -4.594939231872559,
"rewards/margins": 1.4094107151031494,
"rewards/rejected": -6.004349708557129,
"step": 110
},
{
"epoch": 0.2458249832999332,
"grad_norm": 51.58223883398887,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -0.9696682691574097,
"logits/rejected": -0.9112384915351868,
"logps/chosen": -0.4893345832824707,
"logps/rejected": -0.5542086362838745,
"loss": 2.7495,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -4.893345355987549,
"rewards/margins": 0.6487414240837097,
"rewards/rejected": -5.542087078094482,
"step": 115
},
{
"epoch": 0.2565130260521042,
"grad_norm": 75.59919212642018,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.9438816905021667,
"logits/rejected": -0.8547528386116028,
"logps/chosen": -0.5028254985809326,
"logps/rejected": -0.7035338878631592,
"loss": 2.5628,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -5.028255462646484,
"rewards/margins": 2.0070836544036865,
"rewards/rejected": -7.03533935546875,
"step": 120
},
{
"epoch": 0.26720106880427524,
"grad_norm": 66.32992513821185,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -1.016081690788269,
"logits/rejected": -0.9737744331359863,
"logps/chosen": -0.5569332838058472,
"logps/rejected": -0.6537975072860718,
"loss": 2.4448,
"rewards/accuracies": 0.65625,
"rewards/chosen": -5.569332599639893,
"rewards/margins": 0.9686424136161804,
"rewards/rejected": -6.537975311279297,
"step": 125
},
{
"epoch": 0.27788911155644624,
"grad_norm": 139.11732623143496,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -1.032061219215393,
"logits/rejected": -1.0252352952957153,
"logps/chosen": -0.5443070530891418,
"logps/rejected": -0.8193408250808716,
"loss": 2.4333,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -5.443070411682129,
"rewards/margins": 2.750338077545166,
"rewards/rejected": -8.193408012390137,
"step": 130
},
{
"epoch": 0.28857715430861725,
"grad_norm": 107.42202232758989,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -1.0170912742614746,
"logits/rejected": -0.9671396017074585,
"logps/chosen": -0.6021947264671326,
"logps/rejected": -0.7191929221153259,
"loss": 2.517,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -6.021947383880615,
"rewards/margins": 1.169981837272644,
"rewards/rejected": -7.191929817199707,
"step": 135
},
{
"epoch": 0.29926519706078825,
"grad_norm": 77.05576180382866,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -1.0649584531784058,
"logits/rejected": -1.0430896282196045,
"logps/chosen": -0.6696725487709045,
"logps/rejected": -0.8106359243392944,
"loss": 2.3997,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -6.696726322174072,
"rewards/margins": 1.4096347093582153,
"rewards/rejected": -8.106359481811523,
"step": 140
},
{
"epoch": 0.30995323981295925,
"grad_norm": 68.63585118244188,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -0.9915879964828491,
"logits/rejected": -0.9645885229110718,
"logps/chosen": -0.6888564825057983,
"logps/rejected": -0.9088963270187378,
"loss": 2.0828,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -6.8885650634765625,
"rewards/margins": 2.2003989219665527,
"rewards/rejected": -9.088963508605957,
"step": 145
},
{
"epoch": 0.32064128256513025,
"grad_norm": 77.90508875376052,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -1.0322893857955933,
"logits/rejected": -1.0123205184936523,
"logps/chosen": -0.7648183107376099,
"logps/rejected": -0.9603475332260132,
"loss": 2.2673,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -7.6481828689575195,
"rewards/margins": 1.9552921056747437,
"rewards/rejected": -9.603475570678711,
"step": 150
},
{
"epoch": 0.33132932531730125,
"grad_norm": 75.78147375517075,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -1.0493463277816772,
"logits/rejected": -1.0279868841171265,
"logps/chosen": -0.8422037363052368,
"logps/rejected": -1.0871771574020386,
"loss": 2.2922,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -8.422037124633789,
"rewards/margins": 2.4497344493865967,
"rewards/rejected": -10.871770858764648,
"step": 155
},
{
"epoch": 0.3420173680694723,
"grad_norm": 86.49849369728787,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -1.0481699705123901,
"logits/rejected": -1.030601143836975,
"logps/chosen": -0.9466081857681274,
"logps/rejected": -1.3202154636383057,
"loss": 2.3755,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -9.466081619262695,
"rewards/margins": 3.736072540283203,
"rewards/rejected": -13.202154159545898,
"step": 160
},
{
"epoch": 0.3527054108216433,
"grad_norm": 66.82340849667754,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -1.0408477783203125,
"logits/rejected": -1.0174505710601807,
"logps/chosen": -0.9799006581306458,
"logps/rejected": -1.3342236280441284,
"loss": 2.1125,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -9.799007415771484,
"rewards/margins": 3.5432305335998535,
"rewards/rejected": -13.34223747253418,
"step": 165
},
{
"epoch": 0.3633934535738143,
"grad_norm": 97.43908089438905,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -1.0224933624267578,
"logits/rejected": -1.001030683517456,
"logps/chosen": -0.9700697064399719,
"logps/rejected": -1.358564853668213,
"loss": 1.9793,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -9.70069694519043,
"rewards/margins": 3.884951114654541,
"rewards/rejected": -13.585647583007812,
"step": 170
},
{
"epoch": 0.3740814963259853,
"grad_norm": 98.62704227490674,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -1.04830002784729,
"logits/rejected": -1.0014127492904663,
"logps/chosen": -1.0846463441848755,
"logps/rejected": -1.3184218406677246,
"loss": 1.8174,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -10.846463203430176,
"rewards/margins": 2.3377552032470703,
"rewards/rejected": -13.18421745300293,
"step": 175
},
{
"epoch": 0.3847695390781563,
"grad_norm": 74.26153272998572,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -0.9784607887268066,
"logits/rejected": -0.9590786099433899,
"logps/chosen": -1.03909432888031,
"logps/rejected": -1.3960068225860596,
"loss": 1.8592,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -10.39094352722168,
"rewards/margins": 3.5691237449645996,
"rewards/rejected": -13.960065841674805,
"step": 180
},
{
"epoch": 0.3954575818303273,
"grad_norm": 88.60468047988923,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -1.0223743915557861,
"logits/rejected": -0.9691470861434937,
"logps/chosen": -1.0873353481292725,
"logps/rejected": -1.4810540676116943,
"loss": 1.8506,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -10.873353958129883,
"rewards/margins": 3.937185764312744,
"rewards/rejected": -14.810541152954102,
"step": 185
},
{
"epoch": 0.4061456245824983,
"grad_norm": 79.84179637831463,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -0.9927349090576172,
"logits/rejected": -1.0011526346206665,
"logps/chosen": -1.2325414419174194,
"logps/rejected": -1.739311933517456,
"loss": 1.6742,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -12.325414657592773,
"rewards/margins": 5.067704200744629,
"rewards/rejected": -17.393117904663086,
"step": 190
},
{
"epoch": 0.4168336673346693,
"grad_norm": 75.55606036176057,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -0.9720694422721863,
"logits/rejected": -0.9535917043685913,
"logps/chosen": -1.2131645679473877,
"logps/rejected": -1.5727177858352661,
"loss": 1.7343,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -12.131647109985352,
"rewards/margins": 3.5955300331115723,
"rewards/rejected": -15.727177619934082,
"step": 195
},
{
"epoch": 0.42752171008684037,
"grad_norm": 95.29175169321584,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -0.9580856561660767,
"logits/rejected": -0.9478925466537476,
"logps/chosen": -1.2784286737442017,
"logps/rejected": -1.7080621719360352,
"loss": 1.7675,
"rewards/accuracies": 0.78125,
"rewards/chosen": -12.784285545349121,
"rewards/margins": 4.296335220336914,
"rewards/rejected": -17.08062171936035,
"step": 200
},
{
"epoch": 0.43820975283901137,
"grad_norm": 97.56349101288879,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -1.003482460975647,
"logits/rejected": -0.9547850489616394,
"logps/chosen": -1.410736322402954,
"logps/rejected": -1.8478959798812866,
"loss": 1.8853,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -14.107362747192383,
"rewards/margins": 4.371596336364746,
"rewards/rejected": -18.478958129882812,
"step": 205
},
{
"epoch": 0.44889779559118237,
"grad_norm": 124.17700452204937,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -1.0298535823822021,
"logits/rejected": -1.020567774772644,
"logps/chosen": -1.4878171682357788,
"logps/rejected": -1.9283632040023804,
"loss": 1.7977,
"rewards/accuracies": 0.84375,
"rewards/chosen": -14.878171920776367,
"rewards/margins": 4.405461311340332,
"rewards/rejected": -19.283634185791016,
"step": 210
},
{
"epoch": 0.45958583834335337,
"grad_norm": 122.41736903454225,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -0.9621469378471375,
"logits/rejected": -0.9473578333854675,
"logps/chosen": -1.558885097503662,
"logps/rejected": -2.0420405864715576,
"loss": 1.712,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -15.588850021362305,
"rewards/margins": 4.8315558433532715,
"rewards/rejected": -20.420406341552734,
"step": 215
},
{
"epoch": 0.47027388109552437,
"grad_norm": 118.87466737296252,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -1.0177868604660034,
"logits/rejected": -1.0038330554962158,
"logps/chosen": -1.6249806880950928,
"logps/rejected": -2.1466097831726074,
"loss": 1.6798,
"rewards/accuracies": 0.75,
"rewards/chosen": -16.249807357788086,
"rewards/margins": 5.216291904449463,
"rewards/rejected": -21.46609878540039,
"step": 220
},
{
"epoch": 0.48096192384769537,
"grad_norm": 71.45686372104745,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -1.0141699314117432,
"logits/rejected": -0.9860795736312866,
"logps/chosen": -1.6077144145965576,
"logps/rejected": -2.097548723220825,
"loss": 1.5264,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -16.077144622802734,
"rewards/margins": 4.898342132568359,
"rewards/rejected": -20.975486755371094,
"step": 225
},
{
"epoch": 0.4916499665998664,
"grad_norm": 96.03329426013343,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -1.1012922525405884,
"logits/rejected": -1.0541749000549316,
"logps/chosen": -1.536604881286621,
"logps/rejected": -1.9562132358551025,
"loss": 1.5597,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -15.366048812866211,
"rewards/margins": 4.196080207824707,
"rewards/rejected": -19.562129974365234,
"step": 230
},
{
"epoch": 0.5023380093520374,
"grad_norm": 92.4386577422302,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -1.1283349990844727,
"logits/rejected": -1.1022907495498657,
"logps/chosen": -1.527305245399475,
"logps/rejected": -2.0704562664031982,
"loss": 1.4964,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -15.273053169250488,
"rewards/margins": 5.431510925292969,
"rewards/rejected": -20.70456314086914,
"step": 235
},
{
"epoch": 0.5130260521042084,
"grad_norm": 99.65700789182705,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -1.098283290863037,
"logits/rejected": -1.1012353897094727,
"logps/chosen": -1.430646538734436,
"logps/rejected": -1.9294793605804443,
"loss": 1.5595,
"rewards/accuracies": 0.8125,
"rewards/chosen": -14.306467056274414,
"rewards/margins": 4.98832893371582,
"rewards/rejected": -19.294795989990234,
"step": 240
},
{
"epoch": 0.5237140948563794,
"grad_norm": 129.78574414242638,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -1.1896294355392456,
"logits/rejected": -1.1353044509887695,
"logps/chosen": -1.4528030157089233,
"logps/rejected": -2.033853054046631,
"loss": 1.475,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -14.52802848815918,
"rewards/margins": 5.810500144958496,
"rewards/rejected": -20.33852767944336,
"step": 245
},
{
"epoch": 0.5344021376085505,
"grad_norm": 128.20283155514042,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -1.1316919326782227,
"logits/rejected": -1.1171941757202148,
"logps/chosen": -1.538417100906372,
"logps/rejected": -2.1081037521362305,
"loss": 1.4196,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -15.384170532226562,
"rewards/margins": 5.696866512298584,
"rewards/rejected": -21.081039428710938,
"step": 250
},
{
"epoch": 0.5450901803607214,
"grad_norm": 297.4344168039998,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -1.1720324754714966,
"logits/rejected": -1.1475986242294312,
"logps/chosen": -1.6590303182601929,
"logps/rejected": -2.150458335876465,
"loss": 1.6319,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -16.590303421020508,
"rewards/margins": 4.914281845092773,
"rewards/rejected": -21.50458335876465,
"step": 255
},
{
"epoch": 0.5557782231128925,
"grad_norm": 110.4855476557986,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -1.145662546157837,
"logits/rejected": -1.1171993017196655,
"logps/chosen": -1.5531560182571411,
"logps/rejected": -2.0303704738616943,
"loss": 1.4324,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.531560897827148,
"rewards/margins": 4.7721452713012695,
"rewards/rejected": -20.303707122802734,
"step": 260
},
{
"epoch": 0.5664662658650634,
"grad_norm": 83.14465015789618,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -1.1167972087860107,
"logits/rejected": -1.1012585163116455,
"logps/chosen": -1.6575731039047241,
"logps/rejected": -2.175945997238159,
"loss": 1.5468,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -16.575729370117188,
"rewards/margins": 5.183730125427246,
"rewards/rejected": -21.75946044921875,
"step": 265
},
{
"epoch": 0.5771543086172345,
"grad_norm": 116.06958067335016,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -1.1125719547271729,
"logits/rejected": -1.0850471258163452,
"logps/chosen": -1.4920045137405396,
"logps/rejected": -2.0412135124206543,
"loss": 1.411,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -14.920045852661133,
"rewards/margins": 5.492091655731201,
"rewards/rejected": -20.41213607788086,
"step": 270
},
{
"epoch": 0.5878423513694054,
"grad_norm": 93.41636467602738,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -1.0854105949401855,
"logits/rejected": -1.0501768589019775,
"logps/chosen": -1.501579999923706,
"logps/rejected": -1.966059684753418,
"loss": 1.6514,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -15.015800476074219,
"rewards/margins": 4.644796848297119,
"rewards/rejected": -19.66059684753418,
"step": 275
},
{
"epoch": 0.5985303941215765,
"grad_norm": 76.77398013161601,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -1.1719205379486084,
"logits/rejected": -1.148206114768982,
"logps/chosen": -1.5624816417694092,
"logps/rejected": -2.106921672821045,
"loss": 1.5084,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.6248140335083,
"rewards/margins": 5.44440221786499,
"rewards/rejected": -21.069217681884766,
"step": 280
},
{
"epoch": 0.6092184368737475,
"grad_norm": 111.58387969589569,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -1.1303155422210693,
"logits/rejected": -1.0974434614181519,
"logps/chosen": -1.6112607717514038,
"logps/rejected": -2.212517261505127,
"loss": 1.3065,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -16.112607955932617,
"rewards/margins": 6.012566089630127,
"rewards/rejected": -22.125173568725586,
"step": 285
},
{
"epoch": 0.6199064796259185,
"grad_norm": 101.64126956685786,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -1.1603832244873047,
"logits/rejected": -1.117941975593567,
"logps/chosen": -1.6240203380584717,
"logps/rejected": -2.1277661323547363,
"loss": 1.5293,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -16.240203857421875,
"rewards/margins": 5.037457466125488,
"rewards/rejected": -21.277660369873047,
"step": 290
},
{
"epoch": 0.6305945223780896,
"grad_norm": 107.06654753863799,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -1.0859363079071045,
"logits/rejected": -1.0710818767547607,
"logps/chosen": -1.6715633869171143,
"logps/rejected": -2.20039701461792,
"loss": 1.4051,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.715633392333984,
"rewards/margins": 5.288336753845215,
"rewards/rejected": -22.003969192504883,
"step": 295
},
{
"epoch": 0.6412825651302605,
"grad_norm": 85.09256280069626,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -1.015700340270996,
"logits/rejected": -0.9886563420295715,
"logps/chosen": -1.7295385599136353,
"logps/rejected": -2.197303056716919,
"loss": 1.7183,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -17.295387268066406,
"rewards/margins": 4.677645206451416,
"rewards/rejected": -21.973031997680664,
"step": 300
},
{
"epoch": 0.6519706078824316,
"grad_norm": 98.45045929238997,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -1.1253305673599243,
"logits/rejected": -1.078313946723938,
"logps/chosen": -1.716840386390686,
"logps/rejected": -2.1928813457489014,
"loss": 1.4901,
"rewards/accuracies": 0.78125,
"rewards/chosen": -17.16840171813965,
"rewards/margins": 4.760410785675049,
"rewards/rejected": -21.928813934326172,
"step": 305
},
{
"epoch": 0.6626586506346025,
"grad_norm": 141.52088188995467,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -1.1675808429718018,
"logits/rejected": -1.1469465494155884,
"logps/chosen": -1.7660911083221436,
"logps/rejected": -2.279519557952881,
"loss": 1.6871,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.660913467407227,
"rewards/margins": 5.134285926818848,
"rewards/rejected": -22.795196533203125,
"step": 310
},
{
"epoch": 0.6733466933867736,
"grad_norm": 101.86538699204806,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -1.153141736984253,
"logits/rejected": -1.097063660621643,
"logps/chosen": -1.705733299255371,
"logps/rejected": -2.247840166091919,
"loss": 1.5097,
"rewards/accuracies": 0.84375,
"rewards/chosen": -17.05733299255371,
"rewards/margins": 5.421066761016846,
"rewards/rejected": -22.47840118408203,
"step": 315
},
{
"epoch": 0.6840347361389446,
"grad_norm": 122.36169835495791,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -1.136850357055664,
"logits/rejected": -1.1315498352050781,
"logps/chosen": -1.7248958349227905,
"logps/rejected": -2.382091760635376,
"loss": 1.446,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -17.248958587646484,
"rewards/margins": 6.571959018707275,
"rewards/rejected": -23.8209171295166,
"step": 320
},
{
"epoch": 0.6947227788911156,
"grad_norm": 100.7599593802445,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -1.158361792564392,
"logits/rejected": -1.1068694591522217,
"logps/chosen": -1.8045142889022827,
"logps/rejected": -2.512817859649658,
"loss": 1.4403,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -18.04514503479004,
"rewards/margins": 7.083035469055176,
"rewards/rejected": -25.128177642822266,
"step": 325
},
{
"epoch": 0.7054108216432866,
"grad_norm": 82.7307588668697,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -1.0738600492477417,
"logits/rejected": -1.030057430267334,
"logps/chosen": -1.734301209449768,
"logps/rejected": -2.244229793548584,
"loss": 1.4411,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.3430118560791,
"rewards/margins": 5.099286079406738,
"rewards/rejected": -22.442298889160156,
"step": 330
},
{
"epoch": 0.7160988643954576,
"grad_norm": 111.19834108528815,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -1.1105704307556152,
"logits/rejected": -1.092313289642334,
"logps/chosen": -1.7023674249649048,
"logps/rejected": -2.2848927974700928,
"loss": 1.3002,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.023672103881836,
"rewards/margins": 5.82525634765625,
"rewards/rejected": -22.84893226623535,
"step": 335
},
{
"epoch": 0.7267869071476286,
"grad_norm": 124.14973601872444,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -1.1307401657104492,
"logits/rejected": -1.0960733890533447,
"logps/chosen": -1.639500617980957,
"logps/rejected": -2.2025198936462402,
"loss": 1.4763,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.395008087158203,
"rewards/margins": 5.630189895629883,
"rewards/rejected": -22.025196075439453,
"step": 340
},
{
"epoch": 0.7374749498997996,
"grad_norm": 89.15665757381706,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -1.1599509716033936,
"logits/rejected": -1.135851502418518,
"logps/chosen": -1.72158682346344,
"logps/rejected": -2.364271402359009,
"loss": 1.5031,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.215869903564453,
"rewards/margins": 6.426844596862793,
"rewards/rejected": -23.642711639404297,
"step": 345
},
{
"epoch": 0.7481629926519706,
"grad_norm": 110.91770853185307,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -1.1471744775772095,
"logits/rejected": -1.1494718790054321,
"logps/chosen": -1.5995361804962158,
"logps/rejected": -2.1568686962127686,
"loss": 1.4483,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -15.995361328125,
"rewards/margins": 5.57332706451416,
"rewards/rejected": -21.568689346313477,
"step": 350
},
{
"epoch": 0.7588510354041417,
"grad_norm": 92.35558706588404,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -1.0737619400024414,
"logits/rejected": -1.0871598720550537,
"logps/chosen": -1.583603858947754,
"logps/rejected": -2.1480062007904053,
"loss": 1.2137,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -15.836038589477539,
"rewards/margins": 5.6440229415893555,
"rewards/rejected": -21.480064392089844,
"step": 355
},
{
"epoch": 0.7695390781563126,
"grad_norm": 97.35140511945166,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -1.1281821727752686,
"logits/rejected": -1.1263208389282227,
"logps/chosen": -1.5962927341461182,
"logps/rejected": -2.2565903663635254,
"loss": 1.4883,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -15.962926864624023,
"rewards/margins": 6.602975368499756,
"rewards/rejected": -22.565902709960938,
"step": 360
},
{
"epoch": 0.7802271209084837,
"grad_norm": 97.26488316442018,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -1.1745531558990479,
"logits/rejected": -1.1077674627304077,
"logps/chosen": -1.6627562046051025,
"logps/rejected": -2.321105480194092,
"loss": 1.4521,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.627561569213867,
"rewards/margins": 6.583495140075684,
"rewards/rejected": -23.211057662963867,
"step": 365
},
{
"epoch": 0.7909151636606546,
"grad_norm": 112.70224926269489,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -1.0878835916519165,
"logits/rejected": -1.075674295425415,
"logps/chosen": -1.6165920495986938,
"logps/rejected": -2.117642402648926,
"loss": 1.4406,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -16.165922164916992,
"rewards/margins": 5.010504245758057,
"rewards/rejected": -21.17642593383789,
"step": 370
},
{
"epoch": 0.8016032064128257,
"grad_norm": 116.18618106964092,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -1.1332778930664062,
"logits/rejected": -1.1369507312774658,
"logps/chosen": -1.7694313526153564,
"logps/rejected": -2.3713538646698,
"loss": 1.4837,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -17.694313049316406,
"rewards/margins": 6.019228935241699,
"rewards/rejected": -23.71354103088379,
"step": 375
},
{
"epoch": 0.8122912491649966,
"grad_norm": 133.07010667525282,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -1.1531364917755127,
"logits/rejected": -1.1320288181304932,
"logps/chosen": -1.6155163049697876,
"logps/rejected": -2.160113573074341,
"loss": 1.5728,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.155162811279297,
"rewards/margins": 5.4459710121154785,
"rewards/rejected": -21.60113525390625,
"step": 380
},
{
"epoch": 0.8229792919171677,
"grad_norm": 92.80555931457836,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -1.14119553565979,
"logits/rejected": -1.1190695762634277,
"logps/chosen": -1.6841446161270142,
"logps/rejected": -2.4013619422912598,
"loss": 1.2863,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -16.841445922851562,
"rewards/margins": 7.172172546386719,
"rewards/rejected": -24.013620376586914,
"step": 385
},
{
"epoch": 0.8336673346693386,
"grad_norm": 106.02842713910356,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -1.164880633354187,
"logits/rejected": -1.1471444368362427,
"logps/chosen": -1.7512538433074951,
"logps/rejected": -2.3633463382720947,
"loss": 1.4838,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -17.512537002563477,
"rewards/margins": 6.120924949645996,
"rewards/rejected": -23.633460998535156,
"step": 390
},
{
"epoch": 0.8443553774215097,
"grad_norm": 91.21482574420814,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -1.158891201019287,
"logits/rejected": -1.1362249851226807,
"logps/chosen": -1.6644847393035889,
"logps/rejected": -2.2324166297912598,
"loss": 1.3638,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -16.644847869873047,
"rewards/margins": 5.679316997528076,
"rewards/rejected": -22.32416534423828,
"step": 395
},
{
"epoch": 0.8550434201736807,
"grad_norm": 67.57151749023619,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -1.1628615856170654,
"logits/rejected": -1.1462781429290771,
"logps/chosen": -1.6841766834259033,
"logps/rejected": -2.238058567047119,
"loss": 1.2555,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.841764450073242,
"rewards/margins": 5.538818359375,
"rewards/rejected": -22.380582809448242,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": -1.3520146608352661,
"eval_logits/rejected": -1.3613466024398804,
"eval_logps/chosen": -1.698158621788025,
"eval_logps/rejected": -2.272404670715332,
"eval_loss": 1.3605413436889648,
"eval_rewards/accuracies": 0.8455284833908081,
"eval_rewards/chosen": -16.981586456298828,
"eval_rewards/margins": 5.742460250854492,
"eval_rewards/rejected": -22.724044799804688,
"eval_runtime": 96.7859,
"eval_samples_per_second": 20.261,
"eval_steps_per_second": 1.271,
"step": 400
},
{
"epoch": 0.8657314629258517,
"grad_norm": 113.07658741149837,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -1.1250704526901245,
"logits/rejected": -1.142858624458313,
"logps/chosen": -1.7644565105438232,
"logps/rejected": -2.3112316131591797,
"loss": 1.3959,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -17.64456558227539,
"rewards/margins": 5.467750549316406,
"rewards/rejected": -23.112316131591797,
"step": 405
},
{
"epoch": 0.8764195056780227,
"grad_norm": 135.58367842014283,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -1.148863434791565,
"logits/rejected": -1.1359000205993652,
"logps/chosen": -1.7182337045669556,
"logps/rejected": -2.268091917037964,
"loss": 1.5344,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.182336807250977,
"rewards/margins": 5.4985833168029785,
"rewards/rejected": -22.680919647216797,
"step": 410
},
{
"epoch": 0.8871075484301937,
"grad_norm": 114.17576942626454,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -1.1382702589035034,
"logits/rejected": -1.0858891010284424,
"logps/chosen": -1.606128454208374,
"logps/rejected": -2.25667142868042,
"loss": 1.6467,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -16.061288833618164,
"rewards/margins": 6.505424499511719,
"rewards/rejected": -22.56671142578125,
"step": 415
},
{
"epoch": 0.8977955911823647,
"grad_norm": 101.01008878090249,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -1.1730471849441528,
"logits/rejected": -1.1232213973999023,
"logps/chosen": -1.5842628479003906,
"logps/rejected": -2.213921308517456,
"loss": 1.3771,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.842630386352539,
"rewards/margins": 6.296584129333496,
"rewards/rejected": -22.13921356201172,
"step": 420
},
{
"epoch": 0.9084836339345357,
"grad_norm": 111.86270544120462,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -1.1721917390823364,
"logits/rejected": -1.1620614528656006,
"logps/chosen": -1.77499258518219,
"logps/rejected": -2.305689573287964,
"loss": 1.493,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -17.74992561340332,
"rewards/margins": 5.306972980499268,
"rewards/rejected": -23.05689811706543,
"step": 425
},
{
"epoch": 0.9191716766867067,
"grad_norm": 86.63448539967071,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -1.1510651111602783,
"logits/rejected": -1.1352717876434326,
"logps/chosen": -1.7862894535064697,
"logps/rejected": -2.381641149520874,
"loss": 1.411,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -17.862895965576172,
"rewards/margins": 5.953517436981201,
"rewards/rejected": -23.816410064697266,
"step": 430
},
{
"epoch": 0.9298597194388778,
"grad_norm": 102.66949123740247,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -1.1117022037506104,
"logits/rejected": -1.0704118013381958,
"logps/chosen": -1.6134551763534546,
"logps/rejected": -2.211256742477417,
"loss": 1.3969,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -16.134552001953125,
"rewards/margins": 5.9780168533325195,
"rewards/rejected": -22.112567901611328,
"step": 435
},
{
"epoch": 0.9405477621910487,
"grad_norm": 112.7340895477117,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -1.1366580724716187,
"logits/rejected": -1.0769071578979492,
"logps/chosen": -1.7496669292449951,
"logps/rejected": -2.3226873874664307,
"loss": 1.3153,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -17.49666976928711,
"rewards/margins": 5.7302045822143555,
"rewards/rejected": -23.226871490478516,
"step": 440
},
{
"epoch": 0.9512358049432198,
"grad_norm": 92.65086353146947,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -1.0847865343093872,
"logits/rejected": -1.0684945583343506,
"logps/chosen": -1.726458191871643,
"logps/rejected": -2.406322956085205,
"loss": 1.2646,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -17.26458168029785,
"rewards/margins": 6.798647403717041,
"rewards/rejected": -24.063228607177734,
"step": 445
},
{
"epoch": 0.9619238476953907,
"grad_norm": 81.82402808973579,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -1.1987271308898926,
"logits/rejected": -1.1772375106811523,
"logps/chosen": -1.6759332418441772,
"logps/rejected": -2.2297987937927246,
"loss": 1.4218,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -16.759334564208984,
"rewards/margins": 5.5386552810668945,
"rewards/rejected": -22.297988891601562,
"step": 450
},
{
"epoch": 0.9726118904475618,
"grad_norm": 114.73913428926,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -1.1624139547348022,
"logits/rejected": -1.1274266242980957,
"logps/chosen": -1.695810317993164,
"logps/rejected": -2.3724701404571533,
"loss": 1.1616,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -16.958105087280273,
"rewards/margins": 6.766595363616943,
"rewards/rejected": -23.724700927734375,
"step": 455
},
{
"epoch": 0.9832999331997327,
"grad_norm": 105.12442076336163,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -1.1637624502182007,
"logits/rejected": -1.1525938510894775,
"logps/chosen": -1.7872707843780518,
"logps/rejected": -2.419015407562256,
"loss": 1.334,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -17.87270736694336,
"rewards/margins": 6.317442893981934,
"rewards/rejected": -24.19015121459961,
"step": 460
},
{
"epoch": 0.9939879759519038,
"grad_norm": 103.69467200755768,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -1.1328219175338745,
"logits/rejected": -1.139512300491333,
"logps/chosen": -1.773741364479065,
"logps/rejected": -2.4201297760009766,
"loss": 1.4736,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -17.737415313720703,
"rewards/margins": 6.4638848304748535,
"rewards/rejected": -24.201297760009766,
"step": 465
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}