Safetensors
llama
alignment-handbook
trl
dpo
Generated from Trainer
yiran-wang3's picture
Model save
cf6a0e7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 158,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": 1.5589828491210938,
"debug/policy_chosen_logps": -258.5330810546875,
"debug/policy_rejected_logits": 1.9977812767028809,
"debug/policy_rejected_logps": -304.0617980957031,
"debug/reference_chosen_logps": -258.5330810546875,
"debug/reference_rejected_logps": -304.0617980957031,
"epoch": 0.006329113924050633,
"grad_norm": 5.915865288930895,
"learning_rate": 1e-06,
"logits/chosen": 1.5589828491210938,
"logits/rejected": 1.9977812767028809,
"logps/chosen": -258.5330810546875,
"logps/rejected": -304.0617980957031,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": 0.8514629602432251,
"debug/policy_chosen_logps": -222.75827026367188,
"debug/policy_rejected_logits": 1.458482027053833,
"debug/policy_rejected_logps": -292.2978210449219,
"debug/reference_chosen_logps": -222.56484985351562,
"debug/reference_rejected_logps": -288.334716796875,
"epoch": 0.03164556962025317,
"grad_norm": 6.379094662882782,
"learning_rate": 1e-06,
"logits/chosen": 0.8514629602432251,
"logits/rejected": 1.458482027053833,
"logps/chosen": -222.75827026367188,
"logps/rejected": -292.2978210449219,
"loss": 0.4816,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0019342182204127312,
"rewards/margins": 0.03769642859697342,
"rewards/rejected": -0.039630644023418427,
"step": 5
},
{
"debug/policy_chosen_logits": 1.1418471336364746,
"debug/policy_chosen_logps": -261.1085510253906,
"debug/policy_rejected_logits": 1.316489338874817,
"debug/policy_rejected_logps": -285.4795837402344,
"debug/reference_chosen_logps": -260.5736999511719,
"debug/reference_rejected_logps": -280.2572937011719,
"epoch": 0.06329113924050633,
"grad_norm": 12.885197123935471,
"learning_rate": 1e-06,
"logits/chosen": 1.1418471336364746,
"logits/rejected": 1.316489338874817,
"logps/chosen": -261.1085510253906,
"logps/rejected": -285.4795837402344,
"loss": 0.4629,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.005348391830921173,
"rewards/margins": 0.04687455669045448,
"rewards/rejected": -0.05222295597195625,
"step": 10
},
{
"debug/policy_chosen_logits": 1.4202030897140503,
"debug/policy_chosen_logps": -305.30096435546875,
"debug/policy_rejected_logits": 1.608795404434204,
"debug/policy_rejected_logps": -339.3628845214844,
"debug/reference_chosen_logps": -305.89739990234375,
"debug/reference_rejected_logps": -336.0830078125,
"epoch": 0.0949367088607595,
"grad_norm": 6.031873391940916,
"learning_rate": 1e-06,
"logits/chosen": 1.4202030897140503,
"logits/rejected": 1.608795404434204,
"logps/chosen": -305.30096435546875,
"logps/rejected": -339.3628845214844,
"loss": 0.462,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.005964324809610844,
"rewards/margins": 0.038763098418712616,
"rewards/rejected": -0.032798778265714645,
"step": 15
},
{
"debug/policy_chosen_logits": 1.2072508335113525,
"debug/policy_chosen_logps": -259.9560546875,
"debug/policy_rejected_logits": 1.4596980810165405,
"debug/policy_rejected_logps": -266.99896240234375,
"debug/reference_chosen_logps": -262.2249450683594,
"debug/reference_rejected_logps": -262.94488525390625,
"epoch": 0.12658227848101267,
"grad_norm": 5.929430664241562,
"learning_rate": 1e-06,
"logits/chosen": 1.2072508335113525,
"logits/rejected": 1.4596980810165405,
"logps/chosen": -259.9560546875,
"logps/rejected": -266.99896240234375,
"loss": 0.4568,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.02268880605697632,
"rewards/margins": 0.06322960555553436,
"rewards/rejected": -0.04054080322384834,
"step": 20
},
{
"debug/policy_chosen_logits": 0.9236510992050171,
"debug/policy_chosen_logps": -263.56951904296875,
"debug/policy_rejected_logits": 1.2153400182724,
"debug/policy_rejected_logps": -276.596923828125,
"debug/reference_chosen_logps": -264.62982177734375,
"debug/reference_rejected_logps": -272.1346130371094,
"epoch": 0.15822784810126583,
"grad_norm": 6.795022163630081,
"learning_rate": 1e-06,
"logits/chosen": 0.9236510992050171,
"logits/rejected": 1.2153400182724,
"logps/chosen": -263.56951904296875,
"logps/rejected": -276.596923828125,
"loss": 0.4609,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.010603101924061775,
"rewards/margins": 0.05522637441754341,
"rewards/rejected": -0.04462327063083649,
"step": 25
},
{
"debug/policy_chosen_logits": 0.8845943212509155,
"debug/policy_chosen_logps": -232.0923309326172,
"debug/policy_rejected_logits": 1.284155011177063,
"debug/policy_rejected_logps": -287.80389404296875,
"debug/reference_chosen_logps": -233.78652954101562,
"debug/reference_rejected_logps": -284.5167236328125,
"epoch": 0.189873417721519,
"grad_norm": 6.4445556777608255,
"learning_rate": 1e-06,
"logits/chosen": 0.8845943212509155,
"logits/rejected": 1.284155011177063,
"logps/chosen": -232.0923309326172,
"logps/rejected": -287.80389404296875,
"loss": 0.4609,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.016941774636507034,
"rewards/margins": 0.04981378838419914,
"rewards/rejected": -0.03287201002240181,
"step": 30
},
{
"debug/policy_chosen_logits": 1.1807546615600586,
"debug/policy_chosen_logps": -263.7032165527344,
"debug/policy_rejected_logits": 1.3615357875823975,
"debug/policy_rejected_logps": -295.0924377441406,
"debug/reference_chosen_logps": -264.52520751953125,
"debug/reference_rejected_logps": -289.96612548828125,
"epoch": 0.22151898734177214,
"grad_norm": 6.39988158389298,
"learning_rate": 1e-06,
"logits/chosen": 1.1807546615600586,
"logits/rejected": 1.3615357875823975,
"logps/chosen": -263.7032165527344,
"logps/rejected": -295.0924377441406,
"loss": 0.4495,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.008219520561397076,
"rewards/margins": 0.05948234722018242,
"rewards/rejected": -0.05126282572746277,
"step": 35
},
{
"debug/policy_chosen_logits": 0.918303370475769,
"debug/policy_chosen_logps": -224.531982421875,
"debug/policy_rejected_logits": 1.2155705690383911,
"debug/policy_rejected_logps": -266.7242431640625,
"debug/reference_chosen_logps": -227.6628875732422,
"debug/reference_rejected_logps": -259.6141052246094,
"epoch": 0.25316455696202533,
"grad_norm": 8.66786179216246,
"learning_rate": 1e-06,
"logits/chosen": 0.918303370475769,
"logits/rejected": 1.2155705690383911,
"logps/chosen": -224.531982421875,
"logps/rejected": -266.7242431640625,
"loss": 0.4495,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.031309086829423904,
"rewards/margins": 0.10241049528121948,
"rewards/rejected": -0.07110141217708588,
"step": 40
},
{
"debug/policy_chosen_logits": 0.8259471654891968,
"debug/policy_chosen_logps": -230.60250854492188,
"debug/policy_rejected_logits": 1.2626183032989502,
"debug/policy_rejected_logps": -303.4950866699219,
"debug/reference_chosen_logps": -230.0920867919922,
"debug/reference_rejected_logps": -302.10784912109375,
"epoch": 0.2848101265822785,
"grad_norm": 6.143825464676947,
"learning_rate": 1e-06,
"logits/chosen": 0.8259471654891968,
"logits/rejected": 1.2626183032989502,
"logps/chosen": -230.60250854492188,
"logps/rejected": -303.4950866699219,
"loss": 0.4802,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0051041776314377785,
"rewards/margins": 0.008768384344875813,
"rewards/rejected": -0.013872561976313591,
"step": 45
},
{
"debug/policy_chosen_logits": 0.9409104585647583,
"debug/policy_chosen_logps": -241.2617950439453,
"debug/policy_rejected_logits": 1.2857184410095215,
"debug/policy_rejected_logps": -291.4665222167969,
"debug/reference_chosen_logps": -244.69577026367188,
"debug/reference_rejected_logps": -284.1947021484375,
"epoch": 0.31645569620253167,
"grad_norm": 8.46649937885156,
"learning_rate": 1e-06,
"logits/chosen": 0.9409104585647583,
"logits/rejected": 1.2857184410095215,
"logps/chosen": -241.2617950439453,
"logps/rejected": -291.4665222167969,
"loss": 0.4411,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.034339673817157745,
"rewards/margins": 0.10705772787332535,
"rewards/rejected": -0.0727180689573288,
"step": 50
},
{
"debug/policy_chosen_logits": 0.8741863369941711,
"debug/policy_chosen_logps": -250.87057495117188,
"debug/policy_rejected_logits": 1.258837103843689,
"debug/policy_rejected_logps": -289.27069091796875,
"debug/reference_chosen_logps": -255.7415771484375,
"debug/reference_rejected_logps": -283.4430847167969,
"epoch": 0.34810126582278483,
"grad_norm": 9.716442001601763,
"learning_rate": 1e-06,
"logits/chosen": 0.8741863369941711,
"logits/rejected": 1.258837103843689,
"logps/chosen": -250.87057495117188,
"logps/rejected": -289.27069091796875,
"loss": 0.4436,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.048710085451602936,
"rewards/margins": 0.10698604583740234,
"rewards/rejected": -0.05827596038579941,
"step": 55
},
{
"debug/policy_chosen_logits": 0.6640017628669739,
"debug/policy_chosen_logps": -269.62237548828125,
"debug/policy_rejected_logits": 0.8445190191268921,
"debug/policy_rejected_logps": -291.27325439453125,
"debug/reference_chosen_logps": -269.4212951660156,
"debug/reference_rejected_logps": -285.77349853515625,
"epoch": 0.379746835443038,
"grad_norm": 7.925495242886814,
"learning_rate": 1e-06,
"logits/chosen": 0.6640017628669739,
"logits/rejected": 0.8445190191268921,
"logps/chosen": -269.62237548828125,
"logps/rejected": -291.27325439453125,
"loss": 0.438,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.002010857220739126,
"rewards/margins": 0.052986472845077515,
"rewards/rejected": -0.05499732494354248,
"step": 60
},
{
"debug/policy_chosen_logits": 1.0082881450653076,
"debug/policy_chosen_logps": -241.1085662841797,
"debug/policy_rejected_logits": 1.5921090841293335,
"debug/policy_rejected_logps": -303.08465576171875,
"debug/reference_chosen_logps": -245.0981903076172,
"debug/reference_rejected_logps": -300.36328125,
"epoch": 0.41139240506329117,
"grad_norm": 7.096776814684128,
"learning_rate": 1e-06,
"logits/chosen": 1.0082881450653076,
"logits/rejected": 1.5921090841293335,
"logps/chosen": -241.1085662841797,
"logps/rejected": -303.08465576171875,
"loss": 0.4602,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.03989603370428085,
"rewards/margins": 0.06710983067750931,
"rewards/rejected": -0.027213791385293007,
"step": 65
},
{
"debug/policy_chosen_logits": 0.7952272295951843,
"debug/policy_chosen_logps": -252.08798217773438,
"debug/policy_rejected_logits": 1.0696840286254883,
"debug/policy_rejected_logps": -287.27301025390625,
"debug/reference_chosen_logps": -253.79379272460938,
"debug/reference_rejected_logps": -279.5188903808594,
"epoch": 0.4430379746835443,
"grad_norm": 7.584678181203943,
"learning_rate": 1e-06,
"logits/chosen": 0.7952272295951843,
"logits/rejected": 1.0696840286254883,
"logps/chosen": -252.08798217773438,
"logps/rejected": -287.27301025390625,
"loss": 0.4335,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.017058206722140312,
"rewards/margins": 0.09459935128688812,
"rewards/rejected": -0.07754113525152206,
"step": 70
},
{
"debug/policy_chosen_logits": 0.9075101613998413,
"debug/policy_chosen_logps": -218.43185424804688,
"debug/policy_rejected_logits": 1.0321990251541138,
"debug/policy_rejected_logps": -245.87973022460938,
"debug/reference_chosen_logps": -221.93466186523438,
"debug/reference_rejected_logps": -243.0590057373047,
"epoch": 0.47468354430379744,
"grad_norm": 6.725884442562555,
"learning_rate": 1e-06,
"logits/chosen": 0.9075101613998413,
"logits/rejected": 1.0321990251541138,
"logps/chosen": -218.43185424804688,
"logps/rejected": -245.87973022460938,
"loss": 0.4441,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03502799943089485,
"rewards/margins": 0.06323517113924026,
"rewards/rejected": -0.028207167983055115,
"step": 75
},
{
"debug/policy_chosen_logits": 0.6510931253433228,
"debug/policy_chosen_logps": -218.7671356201172,
"debug/policy_rejected_logits": 0.8215225338935852,
"debug/policy_rejected_logps": -276.33111572265625,
"debug/reference_chosen_logps": -222.28018188476562,
"debug/reference_rejected_logps": -267.1961364746094,
"epoch": 0.5063291139240507,
"grad_norm": 7.155350358859657,
"learning_rate": 1e-06,
"logits/chosen": 0.6510931253433228,
"logits/rejected": 0.8215225338935852,
"logps/chosen": -218.7671356201172,
"logps/rejected": -276.33111572265625,
"loss": 0.4348,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.03513062372803688,
"rewards/margins": 0.1264806091785431,
"rewards/rejected": -0.09134997427463531,
"step": 80
},
{
"debug/policy_chosen_logits": 0.9534305334091187,
"debug/policy_chosen_logps": -250.000244140625,
"debug/policy_rejected_logits": 1.0431879758834839,
"debug/policy_rejected_logps": -275.9551086425781,
"debug/reference_chosen_logps": -250.7502899169922,
"debug/reference_rejected_logps": -268.43548583984375,
"epoch": 0.5379746835443038,
"grad_norm": 26.837408837144096,
"learning_rate": 1e-06,
"logits/chosen": 0.9534305334091187,
"logits/rejected": 1.0431879758834839,
"logps/chosen": -250.000244140625,
"logps/rejected": -275.9551086425781,
"loss": 0.4926,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.007500249892473221,
"rewards/margins": 0.08269646763801575,
"rewards/rejected": -0.07519622147083282,
"step": 85
},
{
"debug/policy_chosen_logits": 1.1253650188446045,
"debug/policy_chosen_logps": -240.8356475830078,
"debug/policy_rejected_logits": 1.2428481578826904,
"debug/policy_rejected_logps": -265.67266845703125,
"debug/reference_chosen_logps": -245.643798828125,
"debug/reference_rejected_logps": -261.6888122558594,
"epoch": 0.569620253164557,
"grad_norm": 8.938690009286978,
"learning_rate": 1e-06,
"logits/chosen": 1.1253650188446045,
"logits/rejected": 1.2428481578826904,
"logps/chosen": -240.8356475830078,
"logps/rejected": -265.67266845703125,
"loss": 0.4314,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.04808169603347778,
"rewards/margins": 0.08791980892419815,
"rewards/rejected": -0.03983811289072037,
"step": 90
},
{
"debug/policy_chosen_logits": 0.9913564920425415,
"debug/policy_chosen_logps": -247.68453979492188,
"debug/policy_rejected_logits": 1.167474389076233,
"debug/policy_rejected_logps": -284.51300048828125,
"debug/reference_chosen_logps": -250.7725067138672,
"debug/reference_rejected_logps": -276.8506774902344,
"epoch": 0.6012658227848101,
"grad_norm": 7.214786092625251,
"learning_rate": 1e-06,
"logits/chosen": 0.9913564920425415,
"logits/rejected": 1.167474389076233,
"logps/chosen": -247.68453979492188,
"logps/rejected": -284.51300048828125,
"loss": 0.4481,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.030879342928528786,
"rewards/margins": 0.10750222206115723,
"rewards/rejected": -0.07662288844585419,
"step": 95
},
{
"debug/policy_chosen_logits": 1.548004388809204,
"debug/policy_chosen_logps": -286.9696350097656,
"debug/policy_rejected_logits": 1.2569023370742798,
"debug/policy_rejected_logps": -255.9474639892578,
"debug/reference_chosen_logps": -288.26263427734375,
"debug/reference_rejected_logps": -252.56982421875,
"epoch": 0.6329113924050633,
"grad_norm": 7.098617456221662,
"learning_rate": 1e-06,
"logits/chosen": 1.548004388809204,
"logits/rejected": 1.2569023370742798,
"logps/chosen": -286.9696350097656,
"logps/rejected": -255.9474639892578,
"loss": 0.4429,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.012930279597640038,
"rewards/margins": 0.04670674726366997,
"rewards/rejected": -0.03377646952867508,
"step": 100
},
{
"epoch": 0.6329113924050633,
"eval_debug/policy_chosen_logits": 1.2252188920974731,
"eval_debug/policy_chosen_logps": -250.68939208984375,
"eval_debug/policy_rejected_logits": 1.4343616962432861,
"eval_debug/policy_rejected_logps": -287.45086669921875,
"eval_debug/reference_chosen_logps": -255.34970092773438,
"eval_debug/reference_rejected_logps": -283.57049560546875,
"eval_logits/chosen": 1.2252188920974731,
"eval_logits/rejected": 1.4343616962432861,
"eval_logps/chosen": -250.68939208984375,
"eval_logps/rejected": -287.45086669921875,
"eval_loss": 0.43653252720832825,
"eval_rewards/accuracies": 0.5769230723381042,
"eval_rewards/chosen": 0.04660310223698616,
"eval_rewards/margins": 0.08540700376033783,
"eval_rewards/rejected": -0.03880389407277107,
"eval_runtime": 19.8549,
"eval_samples_per_second": 20.146,
"eval_steps_per_second": 0.655,
"step": 100
},
{
"debug/policy_chosen_logits": 1.011919617652893,
"debug/policy_chosen_logps": -279.73260498046875,
"debug/policy_rejected_logits": 1.211625337600708,
"debug/policy_rejected_logps": -298.412109375,
"debug/reference_chosen_logps": -281.5310974121094,
"debug/reference_rejected_logps": -292.20550537109375,
"epoch": 0.6645569620253164,
"grad_norm": 6.340425768293679,
"learning_rate": 1e-06,
"logits/chosen": 1.011919617652893,
"logits/rejected": 1.211625337600708,
"logps/chosen": -279.73260498046875,
"logps/rejected": -298.412109375,
"loss": 0.4362,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.017984820529818535,
"rewards/margins": 0.08005066215991974,
"rewards/rejected": -0.06206584721803665,
"step": 105
},
{
"debug/policy_chosen_logits": 1.0565037727355957,
"debug/policy_chosen_logps": -251.0978546142578,
"debug/policy_rejected_logits": 1.3947855234146118,
"debug/policy_rejected_logps": -316.4710998535156,
"debug/reference_chosen_logps": -253.4007110595703,
"debug/reference_rejected_logps": -309.9458923339844,
"epoch": 0.6962025316455697,
"grad_norm": 20.34165260676491,
"learning_rate": 1e-06,
"logits/chosen": 1.0565037727355957,
"logits/rejected": 1.3947855234146118,
"logps/chosen": -251.0978546142578,
"logps/rejected": -316.4710998535156,
"loss": 0.4383,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.023028511554002762,
"rewards/margins": 0.08828048408031464,
"rewards/rejected": -0.06525196880102158,
"step": 110
},
{
"debug/policy_chosen_logits": 0.8845629692077637,
"debug/policy_chosen_logps": -241.9716339111328,
"debug/policy_rejected_logits": 1.229775071144104,
"debug/policy_rejected_logps": -321.60186767578125,
"debug/reference_chosen_logps": -246.28433227539062,
"debug/reference_rejected_logps": -314.5198974609375,
"epoch": 0.7278481012658228,
"grad_norm": 7.789166803514712,
"learning_rate": 1e-06,
"logits/chosen": 0.8845629692077637,
"logits/rejected": 1.229775071144104,
"logps/chosen": -241.9716339111328,
"logps/rejected": -321.60186767578125,
"loss": 0.4426,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.043126728385686874,
"rewards/margins": 0.11394629627466202,
"rewards/rejected": -0.07081956416368484,
"step": 115
},
{
"debug/policy_chosen_logits": 0.6471331119537354,
"debug/policy_chosen_logps": -232.4429168701172,
"debug/policy_rejected_logits": 0.9131924510002136,
"debug/policy_rejected_logps": -279.41290283203125,
"debug/reference_chosen_logps": -237.39102172851562,
"debug/reference_rejected_logps": -273.61090087890625,
"epoch": 0.759493670886076,
"grad_norm": 7.468046301754059,
"learning_rate": 1e-06,
"logits/chosen": 0.6471331119537354,
"logits/rejected": 0.9131924510002136,
"logps/chosen": -232.4429168701172,
"logps/rejected": -279.41290283203125,
"loss": 0.4131,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04948071017861366,
"rewards/margins": 0.1075005754828453,
"rewards/rejected": -0.05801987648010254,
"step": 120
},
{
"debug/policy_chosen_logits": 0.9338349103927612,
"debug/policy_chosen_logps": -260.35235595703125,
"debug/policy_rejected_logits": 1.0534359216690063,
"debug/policy_rejected_logps": -297.56683349609375,
"debug/reference_chosen_logps": -264.9391174316406,
"debug/reference_rejected_logps": -289.8217468261719,
"epoch": 0.7911392405063291,
"grad_norm": 8.935461685140815,
"learning_rate": 1e-06,
"logits/chosen": 0.9338349103927612,
"logits/rejected": 1.0534359216690063,
"logps/chosen": -260.35235595703125,
"logps/rejected": -297.56683349609375,
"loss": 0.4303,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.045867711305618286,
"rewards/margins": 0.12331867218017578,
"rewards/rejected": -0.0774509608745575,
"step": 125
},
{
"debug/policy_chosen_logits": 0.8780291676521301,
"debug/policy_chosen_logps": -284.29205322265625,
"debug/policy_rejected_logits": 0.8824840784072876,
"debug/policy_rejected_logps": -287.76690673828125,
"debug/reference_chosen_logps": -286.41943359375,
"debug/reference_rejected_logps": -283.56903076171875,
"epoch": 0.8227848101265823,
"grad_norm": 6.948216331668783,
"learning_rate": 1e-06,
"logits/chosen": 0.8780291676521301,
"logits/rejected": 0.8824840784072876,
"logps/chosen": -284.29205322265625,
"logps/rejected": -287.76690673828125,
"loss": 0.4375,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02127380482852459,
"rewards/margins": 0.06325232237577438,
"rewards/rejected": -0.041978511959314346,
"step": 130
},
{
"debug/policy_chosen_logits": 1.165907859802246,
"debug/policy_chosen_logps": -255.9198455810547,
"debug/policy_rejected_logits": 1.4020473957061768,
"debug/policy_rejected_logps": -301.6413879394531,
"debug/reference_chosen_logps": -260.84521484375,
"debug/reference_rejected_logps": -295.99700927734375,
"epoch": 0.8544303797468354,
"grad_norm": 6.0797186914906485,
"learning_rate": 1e-06,
"logits/chosen": 1.165907859802246,
"logits/rejected": 1.4020473957061768,
"logps/chosen": -255.9198455810547,
"logps/rejected": -301.6413879394531,
"loss": 0.4418,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.049253594130277634,
"rewards/margins": 0.10569741576910019,
"rewards/rejected": -0.056443821638822556,
"step": 135
},
{
"debug/policy_chosen_logits": 0.9684173464775085,
"debug/policy_chosen_logps": -240.7368927001953,
"debug/policy_rejected_logits": 1.522164225578308,
"debug/policy_rejected_logps": -300.8490295410156,
"debug/reference_chosen_logps": -244.41757202148438,
"debug/reference_rejected_logps": -289.0794372558594,
"epoch": 0.8860759493670886,
"grad_norm": 6.850074566718433,
"learning_rate": 1e-06,
"logits/chosen": 0.9684173464775085,
"logits/rejected": 1.522164225578308,
"logps/chosen": -240.7368927001953,
"logps/rejected": -300.8490295410156,
"loss": 0.43,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.03680698946118355,
"rewards/margins": 0.1545029729604721,
"rewards/rejected": -0.11769597232341766,
"step": 140
},
{
"debug/policy_chosen_logits": 1.095474123954773,
"debug/policy_chosen_logps": -281.7500305175781,
"debug/policy_rejected_logits": 1.0368950366973877,
"debug/policy_rejected_logps": -281.8016052246094,
"debug/reference_chosen_logps": -285.4373474121094,
"debug/reference_rejected_logps": -278.67181396484375,
"epoch": 0.9177215189873418,
"grad_norm": 6.330596887372699,
"learning_rate": 1e-06,
"logits/chosen": 1.095474123954773,
"logits/rejected": 1.0368950366973877,
"logps/chosen": -281.7500305175781,
"logps/rejected": -281.8016052246094,
"loss": 0.4243,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.036873430013656616,
"rewards/margins": 0.06817178428173065,
"rewards/rejected": -0.03129836544394493,
"step": 145
},
{
"debug/policy_chosen_logits": 0.9509929418563843,
"debug/policy_chosen_logps": -247.018310546875,
"debug/policy_rejected_logits": 1.1111629009246826,
"debug/policy_rejected_logps": -272.07684326171875,
"debug/reference_chosen_logps": -250.40658569335938,
"debug/reference_rejected_logps": -265.6427001953125,
"epoch": 0.9493670886075949,
"grad_norm": 8.073046871358697,
"learning_rate": 1e-06,
"logits/chosen": 0.9509929418563843,
"logits/rejected": 1.1111629009246826,
"logps/chosen": -247.018310546875,
"logps/rejected": -272.07684326171875,
"loss": 0.4234,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.03388286381959915,
"rewards/margins": 0.09822405129671097,
"rewards/rejected": -0.06434118002653122,
"step": 150
},
{
"debug/policy_chosen_logits": 0.6622827053070068,
"debug/policy_chosen_logps": -237.2403106689453,
"debug/policy_rejected_logits": 0.8520939946174622,
"debug/policy_rejected_logps": -286.5059509277344,
"debug/reference_chosen_logps": -241.94467163085938,
"debug/reference_rejected_logps": -278.73272705078125,
"epoch": 0.9810126582278481,
"grad_norm": 7.904037537559287,
"learning_rate": 1e-06,
"logits/chosen": 0.6622827053070068,
"logits/rejected": 0.8520939946174622,
"logps/chosen": -237.2403106689453,
"logps/rejected": -286.5059509277344,
"loss": 0.423,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.04704369604587555,
"rewards/margins": 0.12477605044841766,
"rewards/rejected": -0.07773236930370331,
"step": 155
},
{
"epoch": 1.0,
"step": 158,
"total_flos": 0.0,
"train_loss": 0.44511839181562013,
"train_runtime": 1281.3009,
"train_samples_per_second": 7.867,
"train_steps_per_second": 0.123
}
],
"logging_steps": 5,
"max_steps": 158,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}