ds_coder6.7b_adamw_iter6 / trainer_state.json
yiran-wang3's picture
End of training
594ce7c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 32,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": 26.208711624145508,
"debug/policy_chosen_logps": -419.32049560546875,
"debug/policy_rejected_logits": 27.114166259765625,
"debug/policy_rejected_logps": -409.2409362792969,
"debug/reference_chosen_logps": -419.32049560546875,
"debug/reference_rejected_logps": -409.2409362792969,
"epoch": 0.03125,
"grad_norm": 5.94402702532091,
"learning_rate": 1e-06,
"logits/chosen": 26.208711624145508,
"logits/rejected": 27.114166259765625,
"logps/chosen": -419.32049560546875,
"logps/rejected": -409.2409362792969,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": 27.364856719970703,
"debug/policy_chosen_logps": -384.8145446777344,
"debug/policy_rejected_logits": 30.614818572998047,
"debug/policy_rejected_logps": -389.1242980957031,
"debug/reference_chosen_logps": -385.05755615234375,
"debug/reference_rejected_logps": -389.2853698730469,
"epoch": 0.0625,
"grad_norm": 5.403682848041972,
"learning_rate": 1e-06,
"logits/chosen": 27.364856719970703,
"logits/rejected": 30.614818572998047,
"logps/chosen": -384.8145446777344,
"logps/rejected": -389.1242980957031,
"loss": 0.5005,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.002430190797895193,
"rewards/margins": 0.0008192063542082906,
"rewards/rejected": 0.001610984792932868,
"step": 2
},
{
"debug/policy_chosen_logits": 27.064964294433594,
"debug/policy_chosen_logps": -393.77032470703125,
"debug/policy_rejected_logits": 24.79046058654785,
"debug/policy_rejected_logps": -380.7741394042969,
"debug/reference_chosen_logps": -393.5738525390625,
"debug/reference_rejected_logps": -381.0361022949219,
"epoch": 0.09375,
"grad_norm": 5.494546355154901,
"learning_rate": 1e-06,
"logits/chosen": 27.064964294433594,
"logits/rejected": 24.79046058654785,
"logps/chosen": -393.77032470703125,
"logps/rejected": -380.7741394042969,
"loss": 0.4975,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0019646836444735527,
"rewards/margins": -0.004584388807415962,
"rewards/rejected": 0.0026197051629424095,
"step": 3
},
{
"debug/policy_chosen_logits": 28.537710189819336,
"debug/policy_chosen_logps": -393.27496337890625,
"debug/policy_rejected_logits": 26.111160278320312,
"debug/policy_rejected_logps": -371.4323425292969,
"debug/reference_chosen_logps": -394.78082275390625,
"debug/reference_rejected_logps": -371.3717041015625,
"epoch": 0.125,
"grad_norm": 5.6289119594347605,
"learning_rate": 1e-06,
"logits/chosen": 28.537710189819336,
"logits/rejected": 26.111160278320312,
"logps/chosen": -393.27496337890625,
"logps/rejected": -371.4323425292969,
"loss": 0.496,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.015058821067214012,
"rewards/margins": 0.015665167942643166,
"rewards/rejected": -0.0006063459441065788,
"step": 4
},
{
"debug/policy_chosen_logits": 27.205202102661133,
"debug/policy_chosen_logps": -372.9059753417969,
"debug/policy_rejected_logits": 26.36362075805664,
"debug/policy_rejected_logps": -375.8589172363281,
"debug/reference_chosen_logps": -373.8547668457031,
"debug/reference_rejected_logps": -376.77655029296875,
"epoch": 0.15625,
"grad_norm": 4.945940952646883,
"learning_rate": 1e-06,
"logits/chosen": 27.205202102661133,
"logits/rejected": 26.36362075805664,
"logps/chosen": -372.9059753417969,
"logps/rejected": -375.8589172363281,
"loss": 0.501,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.009487838484346867,
"rewards/margins": 0.0003112029517069459,
"rewards/rejected": 0.00917663611471653,
"step": 5
},
{
"debug/policy_chosen_logits": 26.221603393554688,
"debug/policy_chosen_logps": -389.5379638671875,
"debug/policy_rejected_logits": 28.28006362915039,
"debug/policy_rejected_logps": -388.9604187011719,
"debug/reference_chosen_logps": -390.29022216796875,
"debug/reference_rejected_logps": -389.8282775878906,
"epoch": 0.1875,
"grad_norm": 5.296809968342226,
"learning_rate": 1e-06,
"logits/chosen": 26.221603393554688,
"logits/rejected": 28.28006362915039,
"logps/chosen": -389.5379638671875,
"logps/rejected": -388.9604187011719,
"loss": 0.5017,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00752284936606884,
"rewards/margins": -0.0011558537371456623,
"rewards/rejected": 0.008678702637553215,
"step": 6
},
{
"debug/policy_chosen_logits": 26.66923713684082,
"debug/policy_chosen_logps": -394.6316223144531,
"debug/policy_rejected_logits": 25.254545211791992,
"debug/policy_rejected_logps": -364.38299560546875,
"debug/reference_chosen_logps": -395.24407958984375,
"debug/reference_rejected_logps": -364.6360168457031,
"epoch": 0.21875,
"grad_norm": 5.136828721896042,
"learning_rate": 1e-06,
"logits/chosen": 26.66923713684082,
"logits/rejected": 25.254545211791992,
"logps/chosen": -394.6316223144531,
"logps/rejected": -364.38299560546875,
"loss": 0.4987,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.006124534644186497,
"rewards/margins": 0.0035945128183811903,
"rewards/rejected": 0.0025300215929746628,
"step": 7
},
{
"debug/policy_chosen_logits": 23.028860092163086,
"debug/policy_chosen_logps": -385.67840576171875,
"debug/policy_rejected_logits": 25.49799919128418,
"debug/policy_rejected_logps": -404.09539794921875,
"debug/reference_chosen_logps": -386.231201171875,
"debug/reference_rejected_logps": -403.6939392089844,
"epoch": 0.25,
"grad_norm": 5.138563586224073,
"learning_rate": 1e-06,
"logits/chosen": 23.028860092163086,
"logits/rejected": 25.49799919128418,
"logps/chosen": -385.67840576171875,
"logps/rejected": -404.09539794921875,
"loss": 0.4964,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.005527915433049202,
"rewards/margins": 0.009542694315314293,
"rewards/rejected": -0.004014777950942516,
"step": 8
},
{
"debug/policy_chosen_logits": 27.734926223754883,
"debug/policy_chosen_logps": -377.7676696777344,
"debug/policy_rejected_logits": 26.249759674072266,
"debug/policy_rejected_logps": -390.58367919921875,
"debug/reference_chosen_logps": -377.9134216308594,
"debug/reference_rejected_logps": -389.7505798339844,
"epoch": 0.28125,
"grad_norm": 5.126237673188204,
"learning_rate": 1e-06,
"logits/chosen": 27.734926223754883,
"logits/rejected": 26.249759674072266,
"logps/chosen": -377.7676696777344,
"logps/rejected": -390.58367919921875,
"loss": 0.4953,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.001457519712857902,
"rewards/margins": 0.009788626804947853,
"rewards/rejected": -0.008331108838319778,
"step": 9
},
{
"debug/policy_chosen_logits": 27.97374725341797,
"debug/policy_chosen_logps": -372.5227966308594,
"debug/policy_rejected_logits": 26.68909454345703,
"debug/policy_rejected_logps": -371.6484375,
"debug/reference_chosen_logps": -371.6475830078125,
"debug/reference_rejected_logps": -371.02789306640625,
"epoch": 0.3125,
"grad_norm": 5.826967853852101,
"learning_rate": 1e-06,
"logits/chosen": 27.97374725341797,
"logits/rejected": 26.68909454345703,
"logps/chosen": -372.5227966308594,
"logps/rejected": -371.6484375,
"loss": 0.5013,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.008752173744142056,
"rewards/margins": -0.002546653850004077,
"rewards/rejected": -0.0062055205926299095,
"step": 10
},
{
"debug/policy_chosen_logits": 25.17486572265625,
"debug/policy_chosen_logps": -387.5612487792969,
"debug/policy_rejected_logits": 25.225900650024414,
"debug/policy_rejected_logps": -394.29156494140625,
"debug/reference_chosen_logps": -389.58349609375,
"debug/reference_rejected_logps": -395.47021484375,
"epoch": 0.34375,
"grad_norm": 5.487120274982643,
"learning_rate": 1e-06,
"logits/chosen": 25.17486572265625,
"logits/rejected": 25.225900650024414,
"logps/chosen": -387.5612487792969,
"logps/rejected": -394.29156494140625,
"loss": 0.4966,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.020222166553139687,
"rewards/margins": 0.008435897529125214,
"rewards/rejected": 0.011786269955337048,
"step": 11
},
{
"debug/policy_chosen_logits": 25.62350082397461,
"debug/policy_chosen_logps": -391.43536376953125,
"debug/policy_rejected_logits": 26.19980239868164,
"debug/policy_rejected_logps": -385.09454345703125,
"debug/reference_chosen_logps": -391.71038818359375,
"debug/reference_rejected_logps": -385.83221435546875,
"epoch": 0.375,
"grad_norm": 5.260475370442769,
"learning_rate": 1e-06,
"logits/chosen": 25.62350082397461,
"logits/rejected": 26.19980239868164,
"logps/chosen": -391.43536376953125,
"logps/rejected": -385.09454345703125,
"loss": 0.4959,
"rewards/accuracies": 0.25,
"rewards/chosen": 0.0027502821758389473,
"rewards/margins": -0.004626387730240822,
"rewards/rejected": 0.007376670837402344,
"step": 12
},
{
"debug/policy_chosen_logits": 26.92571449279785,
"debug/policy_chosen_logps": -368.7294006347656,
"debug/policy_rejected_logits": 27.75008201599121,
"debug/policy_rejected_logps": -406.4366149902344,
"debug/reference_chosen_logps": -368.6622619628906,
"debug/reference_rejected_logps": -406.2249450683594,
"epoch": 0.40625,
"grad_norm": 4.7848487788456415,
"learning_rate": 1e-06,
"logits/chosen": 26.92571449279785,
"logits/rejected": 27.75008201599121,
"logps/chosen": -368.7294006347656,
"logps/rejected": -406.4366149902344,
"loss": 0.4929,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0006715008057653904,
"rewards/margins": 0.0014451986644417048,
"rewards/rejected": -0.002116698771715164,
"step": 13
},
{
"debug/policy_chosen_logits": 30.04401969909668,
"debug/policy_chosen_logps": -375.1585998535156,
"debug/policy_rejected_logits": 26.233510971069336,
"debug/policy_rejected_logps": -391.10772705078125,
"debug/reference_chosen_logps": -374.92877197265625,
"debug/reference_rejected_logps": -390.80914306640625,
"epoch": 0.4375,
"grad_norm": 4.836397952988449,
"learning_rate": 1e-06,
"logits/chosen": 30.04401969909668,
"logits/rejected": 26.233510971069336,
"logps/chosen": -375.1585998535156,
"logps/rejected": -391.10772705078125,
"loss": 0.4978,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0022980873472988605,
"rewards/margins": 0.0006880564615130424,
"rewards/rejected": -0.0029861442744731903,
"step": 14
},
{
"debug/policy_chosen_logits": 26.359272003173828,
"debug/policy_chosen_logps": -380.72393798828125,
"debug/policy_rejected_logits": 25.454103469848633,
"debug/policy_rejected_logps": -372.8114013671875,
"debug/reference_chosen_logps": -381.0364990234375,
"debug/reference_rejected_logps": -374.25555419921875,
"epoch": 0.46875,
"grad_norm": 5.802453188137246,
"learning_rate": 1e-06,
"logits/chosen": 26.359272003173828,
"logits/rejected": 25.454103469848633,
"logps/chosen": -380.72393798828125,
"logps/rejected": -372.8114013671875,
"loss": 0.5021,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0031254198402166367,
"rewards/margins": -0.011316032148897648,
"rewards/rejected": 0.01444145105779171,
"step": 15
},
{
"debug/policy_chosen_logits": 27.719757080078125,
"debug/policy_chosen_logps": -387.31719970703125,
"debug/policy_rejected_logits": 28.422988891601562,
"debug/policy_rejected_logps": -394.9139404296875,
"debug/reference_chosen_logps": -387.28955078125,
"debug/reference_rejected_logps": -392.607666015625,
"epoch": 0.5,
"grad_norm": 5.284125486448873,
"learning_rate": 1e-06,
"logits/chosen": 27.719757080078125,
"logits/rejected": 28.422988891601562,
"logps/chosen": -387.31719970703125,
"logps/rejected": -394.9139404296875,
"loss": 0.4931,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.00027671828866004944,
"rewards/margins": 0.022786367684602737,
"rewards/rejected": -0.023063087835907936,
"step": 16
},
{
"debug/policy_chosen_logits": 26.237287521362305,
"debug/policy_chosen_logps": -396.80377197265625,
"debug/policy_rejected_logits": 26.251012802124023,
"debug/policy_rejected_logps": -415.9134521484375,
"debug/reference_chosen_logps": -396.8568115234375,
"debug/reference_rejected_logps": -414.50372314453125,
"epoch": 0.53125,
"grad_norm": 5.422306895885996,
"learning_rate": 1e-06,
"logits/chosen": 26.237287521362305,
"logits/rejected": 26.251012802124023,
"logps/chosen": -396.80377197265625,
"logps/rejected": -415.9134521484375,
"loss": 0.4936,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0005303573561832309,
"rewards/margins": 0.014627190306782722,
"rewards/rejected": -0.014096831902861595,
"step": 17
},
{
"debug/policy_chosen_logits": 27.24745750427246,
"debug/policy_chosen_logps": -400.6668701171875,
"debug/policy_rejected_logits": 28.665220260620117,
"debug/policy_rejected_logps": -404.18115234375,
"debug/reference_chosen_logps": -400.91192626953125,
"debug/reference_rejected_logps": -404.140625,
"epoch": 0.5625,
"grad_norm": 6.154342037709508,
"learning_rate": 1e-06,
"logits/chosen": 27.24745750427246,
"logits/rejected": 28.665220260620117,
"logps/chosen": -400.6668701171875,
"logps/rejected": -404.18115234375,
"loss": 0.504,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0024504470638930798,
"rewards/margins": 0.002855682745575905,
"rewards/rejected": -0.0004052352160215378,
"step": 18
},
{
"debug/policy_chosen_logits": 26.074438095092773,
"debug/policy_chosen_logps": -368.0001220703125,
"debug/policy_rejected_logits": 28.75902557373047,
"debug/policy_rejected_logps": -402.38555908203125,
"debug/reference_chosen_logps": -369.5516357421875,
"debug/reference_rejected_logps": -402.15142822265625,
"epoch": 0.59375,
"grad_norm": 4.942495765333941,
"learning_rate": 1e-06,
"logits/chosen": 26.074438095092773,
"logits/rejected": 28.75902557373047,
"logps/chosen": -368.0001220703125,
"logps/rejected": -402.38555908203125,
"loss": 0.4971,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.015514831990003586,
"rewards/margins": 0.017856139689683914,
"rewards/rejected": -0.002341308631002903,
"step": 19
},
{
"debug/policy_chosen_logits": 22.95450782775879,
"debug/policy_chosen_logps": -387.22039794921875,
"debug/policy_rejected_logits": 23.25604820251465,
"debug/policy_rejected_logps": -407.40460205078125,
"debug/reference_chosen_logps": -387.4991760253906,
"debug/reference_rejected_logps": -406.79705810546875,
"epoch": 0.625,
"grad_norm": 5.841637379604238,
"learning_rate": 1e-06,
"logits/chosen": 22.95450782775879,
"logits/rejected": 23.25604820251465,
"logps/chosen": -387.22039794921875,
"logps/rejected": -407.40460205078125,
"loss": 0.4981,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0027874757070094347,
"rewards/margins": 0.008862762711942196,
"rewards/rejected": -0.00607528630644083,
"step": 20
},
{
"debug/policy_chosen_logits": 25.369287490844727,
"debug/policy_chosen_logps": -384.18011474609375,
"debug/policy_rejected_logits": 27.09587287902832,
"debug/policy_rejected_logps": -374.6014709472656,
"debug/reference_chosen_logps": -384.5386962890625,
"debug/reference_rejected_logps": -374.75848388671875,
"epoch": 0.65625,
"grad_norm": 5.35593100802686,
"learning_rate": 1e-06,
"logits/chosen": 25.369287490844727,
"logits/rejected": 27.09587287902832,
"logps/chosen": -384.18011474609375,
"logps/rejected": -374.6014709472656,
"loss": 0.5014,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.003585891332477331,
"rewards/margins": 0.0020158004481345415,
"rewards/rejected": 0.0015700910007581115,
"step": 21
},
{
"debug/policy_chosen_logits": 29.95963478088379,
"debug/policy_chosen_logps": -412.9902648925781,
"debug/policy_rejected_logits": 29.491188049316406,
"debug/policy_rejected_logps": -388.817138671875,
"debug/reference_chosen_logps": -413.6742248535156,
"debug/reference_rejected_logps": -388.7025146484375,
"epoch": 0.6875,
"grad_norm": 5.750706618647552,
"learning_rate": 1e-06,
"logits/chosen": 29.95963478088379,
"logits/rejected": 29.491188049316406,
"logps/chosen": -412.9902648925781,
"logps/rejected": -388.817138671875,
"loss": 0.5016,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0068396758288145065,
"rewards/margins": 0.007985686883330345,
"rewards/rejected": -0.0011460117530077696,
"step": 22
},
{
"debug/policy_chosen_logits": 27.35663414001465,
"debug/policy_chosen_logps": -381.54901123046875,
"debug/policy_rejected_logits": 27.309879302978516,
"debug/policy_rejected_logps": -408.41070556640625,
"debug/reference_chosen_logps": -381.82025146484375,
"debug/reference_rejected_logps": -406.42999267578125,
"epoch": 0.71875,
"grad_norm": 5.238622917805314,
"learning_rate": 1e-06,
"logits/chosen": 27.35663414001465,
"logits/rejected": 27.309879302978516,
"logps/chosen": -381.54901123046875,
"logps/rejected": -408.41070556640625,
"loss": 0.4947,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.002712326357141137,
"rewards/margins": 0.022518998011946678,
"rewards/rejected": -0.019806671887636185,
"step": 23
},
{
"debug/policy_chosen_logits": 24.55692481994629,
"debug/policy_chosen_logps": -373.54644775390625,
"debug/policy_rejected_logits": 22.0833740234375,
"debug/policy_rejected_logps": -383.18634033203125,
"debug/reference_chosen_logps": -373.8763732910156,
"debug/reference_rejected_logps": -384.98699951171875,
"epoch": 0.75,
"grad_norm": 5.752704029646181,
"learning_rate": 1e-06,
"logits/chosen": 24.55692481994629,
"logits/rejected": 22.0833740234375,
"logps/chosen": -373.54644775390625,
"logps/rejected": -383.18634033203125,
"loss": 0.5047,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.00329933175817132,
"rewards/margins": -0.014707411639392376,
"rewards/rejected": 0.018006745725870132,
"step": 24
},
{
"debug/policy_chosen_logits": 24.850391387939453,
"debug/policy_chosen_logps": -386.2474365234375,
"debug/policy_rejected_logits": 29.567493438720703,
"debug/policy_rejected_logps": -422.58990478515625,
"debug/reference_chosen_logps": -386.5673828125,
"debug/reference_rejected_logps": -422.57989501953125,
"epoch": 0.78125,
"grad_norm": 5.457291481304808,
"learning_rate": 1e-06,
"logits/chosen": 24.850391387939453,
"logits/rejected": 29.567493438720703,
"logps/chosen": -386.2474365234375,
"logps/rejected": -422.58990478515625,
"loss": 0.4937,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.003199271857738495,
"rewards/margins": 0.0032989501487463713,
"rewards/rejected": -9.96782910078764e-05,
"step": 25
},
{
"debug/policy_chosen_logits": 24.353761672973633,
"debug/policy_chosen_logps": -365.70977783203125,
"debug/policy_rejected_logits": 27.883697509765625,
"debug/policy_rejected_logps": -387.9311828613281,
"debug/reference_chosen_logps": -365.4722900390625,
"debug/reference_rejected_logps": -386.3861083984375,
"epoch": 0.8125,
"grad_norm": 6.268906937225549,
"learning_rate": 1e-06,
"logits/chosen": 24.353761672973633,
"logits/rejected": 27.883697509765625,
"logps/chosen": -365.70977783203125,
"logps/rejected": -387.9311828613281,
"loss": 0.492,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0023751831613481045,
"rewards/margins": 0.013075370341539383,
"rewards/rejected": -0.0154505530372262,
"step": 26
},
{
"debug/policy_chosen_logits": 29.132570266723633,
"debug/policy_chosen_logps": -383.54681396484375,
"debug/policy_rejected_logits": 30.820039749145508,
"debug/policy_rejected_logps": -391.74847412109375,
"debug/reference_chosen_logps": -384.86737060546875,
"debug/reference_rejected_logps": -390.72613525390625,
"epoch": 0.84375,
"grad_norm": 5.319109456599871,
"learning_rate": 1e-06,
"logits/chosen": 29.132570266723633,
"logits/rejected": 30.820039749145508,
"logps/chosen": -383.54681396484375,
"logps/rejected": -391.74847412109375,
"loss": 0.4953,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.013205718249082565,
"rewards/margins": 0.023428915068507195,
"rewards/rejected": -0.010223197750747204,
"step": 27
},
{
"debug/policy_chosen_logits": 21.42544174194336,
"debug/policy_chosen_logps": -359.3302917480469,
"debug/policy_rejected_logits": 23.681625366210938,
"debug/policy_rejected_logps": -420.1634826660156,
"debug/reference_chosen_logps": -360.5409240722656,
"debug/reference_rejected_logps": -418.49676513671875,
"epoch": 0.875,
"grad_norm": 5.766637040961972,
"learning_rate": 1e-06,
"logits/chosen": 21.42544174194336,
"logits/rejected": 23.681625366210938,
"logps/chosen": -359.3302917480469,
"logps/rejected": -420.1634826660156,
"loss": 0.4952,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.012106247246265411,
"rewards/margins": 0.028773421421647072,
"rewards/rejected": -0.01666717603802681,
"step": 28
},
{
"debug/policy_chosen_logits": 23.576292037963867,
"debug/policy_chosen_logps": -389.7811279296875,
"debug/policy_rejected_logits": 25.178329467773438,
"debug/policy_rejected_logps": -380.7835998535156,
"debug/reference_chosen_logps": -388.56658935546875,
"debug/reference_rejected_logps": -380.28936767578125,
"epoch": 0.90625,
"grad_norm": 5.149123243810572,
"learning_rate": 1e-06,
"logits/chosen": 23.576292037963867,
"logits/rejected": 25.178329467773438,
"logps/chosen": -389.7811279296875,
"logps/rejected": -380.7835998535156,
"loss": 0.5005,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.01214546151459217,
"rewards/margins": -0.007203062996268272,
"rewards/rejected": -0.004942398518323898,
"step": 29
},
{
"debug/policy_chosen_logits": 25.0103702545166,
"debug/policy_chosen_logps": -374.01544189453125,
"debug/policy_rejected_logits": 24.813074111938477,
"debug/policy_rejected_logps": -402.4690246582031,
"debug/reference_chosen_logps": -373.6084899902344,
"debug/reference_rejected_logps": -400.3970947265625,
"epoch": 0.9375,
"grad_norm": 5.054574585698812,
"learning_rate": 1e-06,
"logits/chosen": 25.0103702545166,
"logits/rejected": 24.813074111938477,
"logps/chosen": -374.01544189453125,
"logps/rejected": -402.4690246582031,
"loss": 0.4868,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.00406951829791069,
"rewards/margins": 0.016650084406137466,
"rewards/rejected": -0.020719602704048157,
"step": 30
},
{
"debug/policy_chosen_logits": 24.653635025024414,
"debug/policy_chosen_logps": -389.5201721191406,
"debug/policy_rejected_logits": 26.097084045410156,
"debug/policy_rejected_logps": -385.3985290527344,
"debug/reference_chosen_logps": -388.95758056640625,
"debug/reference_rejected_logps": -384.1873779296875,
"epoch": 0.96875,
"grad_norm": 5.42566873511884,
"learning_rate": 1e-06,
"logits/chosen": 24.653635025024414,
"logits/rejected": 26.097084045410156,
"logps/chosen": -389.5201721191406,
"logps/rejected": -385.3985290527344,
"loss": 0.4971,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.005625876598060131,
"rewards/margins": 0.006485518999397755,
"rewards/rejected": -0.012111397460103035,
"step": 31
},
{
"debug/policy_chosen_logits": 26.803712844848633,
"debug/policy_chosen_logps": -409.23480224609375,
"debug/policy_rejected_logits": 27.91266632080078,
"debug/policy_rejected_logps": -441.89007568359375,
"debug/reference_chosen_logps": -409.35791015625,
"debug/reference_rejected_logps": -439.5430908203125,
"epoch": 1.0,
"grad_norm": 5.122199540354154,
"learning_rate": 1e-06,
"logits/chosen": 26.803712844848633,
"logits/rejected": 27.91266632080078,
"logps/chosen": -409.23480224609375,
"logps/rejected": -441.89007568359375,
"loss": 0.4517,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0012308494187891483,
"rewards/margins": 0.024701077491044998,
"rewards/rejected": -0.023470228537917137,
"step": 32
},
{
"epoch": 1.0,
"step": 32,
"total_flos": 0.0,
"train_loss": 0.49607059359550476,
"train_runtime": 386.5625,
"train_samples_per_second": 5.161,
"train_steps_per_second": 0.083
}
],
"logging_steps": 1,
"max_steps": 32,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}