{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 158, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 1.5589828491210938, "debug/policy_chosen_logps": -258.5330810546875, "debug/policy_rejected_logits": 1.9977812767028809, "debug/policy_rejected_logps": -304.0617980957031, "debug/reference_chosen_logps": -258.5330810546875, "debug/reference_rejected_logps": -304.0617980957031, "epoch": 0.006329113924050633, "grad_norm": 5.915865288930895, "learning_rate": 1e-06, "logits/chosen": 1.5589828491210938, "logits/rejected": 1.9977812767028809, "logps/chosen": -258.5330810546875, "logps/rejected": -304.0617980957031, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 0.8514629602432251, "debug/policy_chosen_logps": -222.75827026367188, "debug/policy_rejected_logits": 1.458482027053833, "debug/policy_rejected_logps": -292.2978210449219, "debug/reference_chosen_logps": -222.56484985351562, "debug/reference_rejected_logps": -288.334716796875, "epoch": 0.03164556962025317, "grad_norm": 6.379094662882782, "learning_rate": 1e-06, "logits/chosen": 0.8514629602432251, "logits/rejected": 1.458482027053833, "logps/chosen": -222.75827026367188, "logps/rejected": -292.2978210449219, "loss": 0.4816, "rewards/accuracies": 0.75, "rewards/chosen": -0.0019342182204127312, "rewards/margins": 0.03769642859697342, "rewards/rejected": -0.039630644023418427, "step": 5 }, { "debug/policy_chosen_logits": 1.1418471336364746, "debug/policy_chosen_logps": -261.1085510253906, "debug/policy_rejected_logits": 1.316489338874817, "debug/policy_rejected_logps": -285.4795837402344, "debug/reference_chosen_logps": -260.5736999511719, "debug/reference_rejected_logps": -280.2572937011719, "epoch": 0.06329113924050633, "grad_norm": 12.885197123935471, "learning_rate": 1e-06, "logits/chosen": 1.1418471336364746, "logits/rejected": 1.316489338874817, "logps/chosen": -261.1085510253906, "logps/rejected": -285.4795837402344, "loss": 0.4629, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.005348391830921173, "rewards/margins": 0.04687455669045448, "rewards/rejected": -0.05222295597195625, "step": 10 }, { "debug/policy_chosen_logits": 1.4202030897140503, "debug/policy_chosen_logps": -305.30096435546875, "debug/policy_rejected_logits": 1.608795404434204, "debug/policy_rejected_logps": -339.3628845214844, "debug/reference_chosen_logps": -305.89739990234375, "debug/reference_rejected_logps": -336.0830078125, "epoch": 0.0949367088607595, "grad_norm": 6.031873391940916, "learning_rate": 1e-06, "logits/chosen": 1.4202030897140503, "logits/rejected": 1.608795404434204, "logps/chosen": -305.30096435546875, "logps/rejected": -339.3628845214844, "loss": 0.462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.005964324809610844, "rewards/margins": 0.038763098418712616, "rewards/rejected": -0.032798778265714645, "step": 15 }, { "debug/policy_chosen_logits": 1.2072508335113525, "debug/policy_chosen_logps": -259.9560546875, "debug/policy_rejected_logits": 1.4596980810165405, "debug/policy_rejected_logps": -266.99896240234375, "debug/reference_chosen_logps": -262.2249450683594, "debug/reference_rejected_logps": -262.94488525390625, "epoch": 0.12658227848101267, "grad_norm": 5.929430664241562, "learning_rate": 1e-06, "logits/chosen": 1.2072508335113525, "logits/rejected": 1.4596980810165405, "logps/chosen": -259.9560546875, "logps/rejected": -266.99896240234375, "loss": 0.4568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02268880605697632, "rewards/margins": 0.06322960555553436, "rewards/rejected": -0.04054080322384834, "step": 20 }, { "debug/policy_chosen_logits": 0.9236510992050171, "debug/policy_chosen_logps": -263.56951904296875, "debug/policy_rejected_logits": 1.2153400182724, "debug/policy_rejected_logps": -276.596923828125, "debug/reference_chosen_logps": -264.62982177734375, "debug/reference_rejected_logps": -272.1346130371094, "epoch": 0.15822784810126583, "grad_norm": 6.795022163630081, "learning_rate": 1e-06, "logits/chosen": 0.9236510992050171, "logits/rejected": 1.2153400182724, "logps/chosen": -263.56951904296875, "logps/rejected": -276.596923828125, "loss": 0.4609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010603101924061775, "rewards/margins": 0.05522637441754341, "rewards/rejected": -0.04462327063083649, "step": 25 }, { "debug/policy_chosen_logits": 0.8845943212509155, "debug/policy_chosen_logps": -232.0923309326172, "debug/policy_rejected_logits": 1.284155011177063, "debug/policy_rejected_logps": -287.80389404296875, "debug/reference_chosen_logps": -233.78652954101562, "debug/reference_rejected_logps": -284.5167236328125, "epoch": 0.189873417721519, "grad_norm": 6.4445556777608255, "learning_rate": 1e-06, "logits/chosen": 0.8845943212509155, "logits/rejected": 1.284155011177063, "logps/chosen": -232.0923309326172, "logps/rejected": -287.80389404296875, "loss": 0.4609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.016941774636507034, "rewards/margins": 0.04981378838419914, "rewards/rejected": -0.03287201002240181, "step": 30 }, { "debug/policy_chosen_logits": 1.1807546615600586, "debug/policy_chosen_logps": -263.7032165527344, "debug/policy_rejected_logits": 1.3615357875823975, "debug/policy_rejected_logps": -295.0924377441406, "debug/reference_chosen_logps": -264.52520751953125, "debug/reference_rejected_logps": -289.96612548828125, "epoch": 0.22151898734177214, "grad_norm": 6.39988158389298, "learning_rate": 1e-06, "logits/chosen": 1.1807546615600586, "logits/rejected": 1.3615357875823975, "logps/chosen": -263.7032165527344, "logps/rejected": -295.0924377441406, "loss": 0.4495, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008219520561397076, "rewards/margins": 0.05948234722018242, "rewards/rejected": -0.05126282572746277, "step": 35 }, { "debug/policy_chosen_logits": 0.918303370475769, "debug/policy_chosen_logps": -224.531982421875, "debug/policy_rejected_logits": 1.2155705690383911, "debug/policy_rejected_logps": -266.7242431640625, "debug/reference_chosen_logps": -227.6628875732422, "debug/reference_rejected_logps": -259.6141052246094, "epoch": 0.25316455696202533, "grad_norm": 8.66786179216246, "learning_rate": 1e-06, "logits/chosen": 0.918303370475769, "logits/rejected": 1.2155705690383911, "logps/chosen": -224.531982421875, "logps/rejected": -266.7242431640625, "loss": 0.4495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.031309086829423904, "rewards/margins": 0.10241049528121948, "rewards/rejected": -0.07110141217708588, "step": 40 }, { "debug/policy_chosen_logits": 0.8259471654891968, "debug/policy_chosen_logps": -230.60250854492188, "debug/policy_rejected_logits": 1.2626183032989502, "debug/policy_rejected_logps": -303.4950866699219, "debug/reference_chosen_logps": -230.0920867919922, "debug/reference_rejected_logps": -302.10784912109375, "epoch": 0.2848101265822785, "grad_norm": 6.143825464676947, "learning_rate": 1e-06, "logits/chosen": 0.8259471654891968, "logits/rejected": 1.2626183032989502, "logps/chosen": -230.60250854492188, "logps/rejected": -303.4950866699219, "loss": 0.4802, "rewards/accuracies": 0.625, "rewards/chosen": -0.0051041776314377785, "rewards/margins": 0.008768384344875813, "rewards/rejected": -0.013872561976313591, "step": 45 }, { "debug/policy_chosen_logits": 0.9409104585647583, "debug/policy_chosen_logps": -241.2617950439453, "debug/policy_rejected_logits": 1.2857184410095215, "debug/policy_rejected_logps": -291.4665222167969, "debug/reference_chosen_logps": -244.69577026367188, "debug/reference_rejected_logps": -284.1947021484375, "epoch": 0.31645569620253167, "grad_norm": 8.46649937885156, "learning_rate": 1e-06, "logits/chosen": 0.9409104585647583, "logits/rejected": 1.2857184410095215, "logps/chosen": -241.2617950439453, "logps/rejected": -291.4665222167969, "loss": 0.4411, "rewards/accuracies": 0.75, "rewards/chosen": 0.034339673817157745, "rewards/margins": 0.10705772787332535, "rewards/rejected": -0.0727180689573288, "step": 50 }, { "debug/policy_chosen_logits": 0.8741863369941711, "debug/policy_chosen_logps": -250.87057495117188, "debug/policy_rejected_logits": 1.258837103843689, "debug/policy_rejected_logps": -289.27069091796875, "debug/reference_chosen_logps": -255.7415771484375, "debug/reference_rejected_logps": -283.4430847167969, "epoch": 0.34810126582278483, "grad_norm": 9.716442001601763, "learning_rate": 1e-06, "logits/chosen": 0.8741863369941711, "logits/rejected": 1.258837103843689, "logps/chosen": -250.87057495117188, "logps/rejected": -289.27069091796875, "loss": 0.4436, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.048710085451602936, "rewards/margins": 0.10698604583740234, "rewards/rejected": -0.05827596038579941, "step": 55 }, { "debug/policy_chosen_logits": 0.6640017628669739, "debug/policy_chosen_logps": -269.62237548828125, "debug/policy_rejected_logits": 0.8445190191268921, "debug/policy_rejected_logps": -291.27325439453125, "debug/reference_chosen_logps": -269.4212951660156, "debug/reference_rejected_logps": -285.77349853515625, "epoch": 0.379746835443038, "grad_norm": 7.925495242886814, "learning_rate": 1e-06, "logits/chosen": 0.6640017628669739, "logits/rejected": 0.8445190191268921, "logps/chosen": -269.62237548828125, "logps/rejected": -291.27325439453125, "loss": 0.438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002010857220739126, "rewards/margins": 0.052986472845077515, "rewards/rejected": -0.05499732494354248, "step": 60 }, { "debug/policy_chosen_logits": 1.0082881450653076, "debug/policy_chosen_logps": -241.1085662841797, "debug/policy_rejected_logits": 1.5921090841293335, "debug/policy_rejected_logps": -303.08465576171875, "debug/reference_chosen_logps": -245.0981903076172, "debug/reference_rejected_logps": -300.36328125, "epoch": 0.41139240506329117, "grad_norm": 7.096776814684128, "learning_rate": 1e-06, "logits/chosen": 1.0082881450653076, "logits/rejected": 1.5921090841293335, "logps/chosen": -241.1085662841797, "logps/rejected": -303.08465576171875, "loss": 0.4602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03989603370428085, "rewards/margins": 0.06710983067750931, "rewards/rejected": -0.027213791385293007, "step": 65 }, { "debug/policy_chosen_logits": 0.7952272295951843, "debug/policy_chosen_logps": -252.08798217773438, "debug/policy_rejected_logits": 1.0696840286254883, "debug/policy_rejected_logps": -287.27301025390625, "debug/reference_chosen_logps": -253.79379272460938, "debug/reference_rejected_logps": -279.5188903808594, "epoch": 0.4430379746835443, "grad_norm": 7.584678181203943, "learning_rate": 1e-06, "logits/chosen": 0.7952272295951843, "logits/rejected": 1.0696840286254883, "logps/chosen": -252.08798217773438, "logps/rejected": -287.27301025390625, "loss": 0.4335, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017058206722140312, "rewards/margins": 0.09459935128688812, "rewards/rejected": -0.07754113525152206, "step": 70 }, { "debug/policy_chosen_logits": 0.9075101613998413, "debug/policy_chosen_logps": -218.43185424804688, "debug/policy_rejected_logits": 1.0321990251541138, "debug/policy_rejected_logps": -245.87973022460938, "debug/reference_chosen_logps": -221.93466186523438, "debug/reference_rejected_logps": -243.0590057373047, "epoch": 0.47468354430379744, "grad_norm": 6.725884442562555, "learning_rate": 1e-06, "logits/chosen": 0.9075101613998413, "logits/rejected": 1.0321990251541138, "logps/chosen": -218.43185424804688, "logps/rejected": -245.87973022460938, "loss": 0.4441, "rewards/accuracies": 0.75, "rewards/chosen": 0.03502799943089485, "rewards/margins": 0.06323517113924026, "rewards/rejected": -0.028207167983055115, "step": 75 }, { "debug/policy_chosen_logits": 0.6510931253433228, "debug/policy_chosen_logps": -218.7671356201172, "debug/policy_rejected_logits": 0.8215225338935852, "debug/policy_rejected_logps": -276.33111572265625, "debug/reference_chosen_logps": -222.28018188476562, "debug/reference_rejected_logps": -267.1961364746094, "epoch": 0.5063291139240507, "grad_norm": 7.155350358859657, "learning_rate": 1e-06, "logits/chosen": 0.6510931253433228, "logits/rejected": 0.8215225338935852, "logps/chosen": -218.7671356201172, "logps/rejected": -276.33111572265625, "loss": 0.4348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03513062372803688, "rewards/margins": 0.1264806091785431, "rewards/rejected": -0.09134997427463531, "step": 80 }, { "debug/policy_chosen_logits": 0.9534305334091187, "debug/policy_chosen_logps": -250.000244140625, "debug/policy_rejected_logits": 1.0431879758834839, "debug/policy_rejected_logps": -275.9551086425781, "debug/reference_chosen_logps": -250.7502899169922, "debug/reference_rejected_logps": -268.43548583984375, "epoch": 0.5379746835443038, "grad_norm": 26.837408837144096, "learning_rate": 1e-06, "logits/chosen": 0.9534305334091187, "logits/rejected": 1.0431879758834839, "logps/chosen": -250.000244140625, "logps/rejected": -275.9551086425781, "loss": 0.4926, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.007500249892473221, "rewards/margins": 0.08269646763801575, "rewards/rejected": -0.07519622147083282, "step": 85 }, { "debug/policy_chosen_logits": 1.1253650188446045, "debug/policy_chosen_logps": -240.8356475830078, "debug/policy_rejected_logits": 1.2428481578826904, "debug/policy_rejected_logps": -265.67266845703125, "debug/reference_chosen_logps": -245.643798828125, "debug/reference_rejected_logps": -261.6888122558594, "epoch": 0.569620253164557, "grad_norm": 8.938690009286978, "learning_rate": 1e-06, "logits/chosen": 1.1253650188446045, "logits/rejected": 1.2428481578826904, "logps/chosen": -240.8356475830078, "logps/rejected": -265.67266845703125, "loss": 0.4314, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04808169603347778, "rewards/margins": 0.08791980892419815, "rewards/rejected": -0.03983811289072037, "step": 90 }, { "debug/policy_chosen_logits": 0.9913564920425415, "debug/policy_chosen_logps": -247.68453979492188, "debug/policy_rejected_logits": 1.167474389076233, "debug/policy_rejected_logps": -284.51300048828125, "debug/reference_chosen_logps": -250.7725067138672, "debug/reference_rejected_logps": -276.8506774902344, "epoch": 0.6012658227848101, "grad_norm": 7.214786092625251, "learning_rate": 1e-06, "logits/chosen": 0.9913564920425415, "logits/rejected": 1.167474389076233, "logps/chosen": -247.68453979492188, "logps/rejected": -284.51300048828125, "loss": 0.4481, "rewards/accuracies": 0.625, "rewards/chosen": 0.030879342928528786, "rewards/margins": 0.10750222206115723, "rewards/rejected": -0.07662288844585419, "step": 95 }, { "debug/policy_chosen_logits": 1.548004388809204, "debug/policy_chosen_logps": -286.9696350097656, "debug/policy_rejected_logits": 1.2569023370742798, "debug/policy_rejected_logps": -255.9474639892578, "debug/reference_chosen_logps": -288.26263427734375, "debug/reference_rejected_logps": -252.56982421875, "epoch": 0.6329113924050633, "grad_norm": 7.098617456221662, "learning_rate": 1e-06, "logits/chosen": 1.548004388809204, "logits/rejected": 1.2569023370742798, "logps/chosen": -286.9696350097656, "logps/rejected": -255.9474639892578, "loss": 0.4429, "rewards/accuracies": 0.625, "rewards/chosen": 0.012930279597640038, "rewards/margins": 0.04670674726366997, "rewards/rejected": -0.03377646952867508, "step": 100 }, { "epoch": 0.6329113924050633, "eval_debug/policy_chosen_logits": 1.2252188920974731, "eval_debug/policy_chosen_logps": -250.68939208984375, "eval_debug/policy_rejected_logits": 1.4343616962432861, "eval_debug/policy_rejected_logps": -287.45086669921875, "eval_debug/reference_chosen_logps": -255.34970092773438, "eval_debug/reference_rejected_logps": -283.57049560546875, "eval_logits/chosen": 1.2252188920974731, "eval_logits/rejected": 1.4343616962432861, "eval_logps/chosen": -250.68939208984375, "eval_logps/rejected": -287.45086669921875, "eval_loss": 0.43653252720832825, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/chosen": 0.04660310223698616, "eval_rewards/margins": 0.08540700376033783, "eval_rewards/rejected": -0.03880389407277107, "eval_runtime": 19.8549, "eval_samples_per_second": 20.146, "eval_steps_per_second": 0.655, "step": 100 }, { "debug/policy_chosen_logits": 1.011919617652893, "debug/policy_chosen_logps": -279.73260498046875, "debug/policy_rejected_logits": 1.211625337600708, "debug/policy_rejected_logps": -298.412109375, "debug/reference_chosen_logps": -281.5310974121094, "debug/reference_rejected_logps": -292.20550537109375, "epoch": 0.6645569620253164, "grad_norm": 6.340425768293679, "learning_rate": 1e-06, "logits/chosen": 1.011919617652893, "logits/rejected": 1.211625337600708, "logps/chosen": -279.73260498046875, "logps/rejected": -298.412109375, "loss": 0.4362, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017984820529818535, "rewards/margins": 0.08005066215991974, "rewards/rejected": -0.06206584721803665, "step": 105 }, { "debug/policy_chosen_logits": 1.0565037727355957, "debug/policy_chosen_logps": -251.0978546142578, "debug/policy_rejected_logits": 1.3947855234146118, "debug/policy_rejected_logps": -316.4710998535156, "debug/reference_chosen_logps": -253.4007110595703, "debug/reference_rejected_logps": -309.9458923339844, "epoch": 0.6962025316455697, "grad_norm": 20.34165260676491, "learning_rate": 1e-06, "logits/chosen": 1.0565037727355957, "logits/rejected": 1.3947855234146118, "logps/chosen": -251.0978546142578, "logps/rejected": -316.4710998535156, "loss": 0.4383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023028511554002762, "rewards/margins": 0.08828048408031464, "rewards/rejected": -0.06525196880102158, "step": 110 }, { "debug/policy_chosen_logits": 0.8845629692077637, "debug/policy_chosen_logps": -241.9716339111328, "debug/policy_rejected_logits": 1.229775071144104, "debug/policy_rejected_logps": -321.60186767578125, "debug/reference_chosen_logps": -246.28433227539062, "debug/reference_rejected_logps": -314.5198974609375, "epoch": 0.7278481012658228, "grad_norm": 7.789166803514712, "learning_rate": 1e-06, "logits/chosen": 0.8845629692077637, "logits/rejected": 1.229775071144104, "logps/chosen": -241.9716339111328, "logps/rejected": -321.60186767578125, "loss": 0.4426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.043126728385686874, "rewards/margins": 0.11394629627466202, "rewards/rejected": -0.07081956416368484, "step": 115 }, { "debug/policy_chosen_logits": 0.6471331119537354, "debug/policy_chosen_logps": -232.4429168701172, "debug/policy_rejected_logits": 0.9131924510002136, "debug/policy_rejected_logps": -279.41290283203125, "debug/reference_chosen_logps": -237.39102172851562, "debug/reference_rejected_logps": -273.61090087890625, "epoch": 0.759493670886076, "grad_norm": 7.468046301754059, "learning_rate": 1e-06, "logits/chosen": 0.6471331119537354, "logits/rejected": 0.9131924510002136, "logps/chosen": -232.4429168701172, "logps/rejected": -279.41290283203125, "loss": 0.4131, "rewards/accuracies": 0.625, "rewards/chosen": 0.04948071017861366, "rewards/margins": 0.1075005754828453, "rewards/rejected": -0.05801987648010254, "step": 120 }, { "debug/policy_chosen_logits": 0.9338349103927612, "debug/policy_chosen_logps": -260.35235595703125, "debug/policy_rejected_logits": 1.0534359216690063, "debug/policy_rejected_logps": -297.56683349609375, "debug/reference_chosen_logps": -264.9391174316406, "debug/reference_rejected_logps": -289.8217468261719, "epoch": 0.7911392405063291, "grad_norm": 8.935461685140815, "learning_rate": 1e-06, "logits/chosen": 0.9338349103927612, "logits/rejected": 1.0534359216690063, "logps/chosen": -260.35235595703125, "logps/rejected": -297.56683349609375, "loss": 0.4303, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.045867711305618286, "rewards/margins": 0.12331867218017578, "rewards/rejected": -0.0774509608745575, "step": 125 }, { "debug/policy_chosen_logits": 0.8780291676521301, "debug/policy_chosen_logps": -284.29205322265625, "debug/policy_rejected_logits": 0.8824840784072876, "debug/policy_rejected_logps": -287.76690673828125, "debug/reference_chosen_logps": -286.41943359375, "debug/reference_rejected_logps": -283.56903076171875, "epoch": 0.8227848101265823, "grad_norm": 6.948216331668783, "learning_rate": 1e-06, "logits/chosen": 0.8780291676521301, "logits/rejected": 0.8824840784072876, "logps/chosen": -284.29205322265625, "logps/rejected": -287.76690673828125, "loss": 0.4375, "rewards/accuracies": 0.625, "rewards/chosen": 0.02127380482852459, "rewards/margins": 0.06325232237577438, "rewards/rejected": -0.041978511959314346, "step": 130 }, { "debug/policy_chosen_logits": 1.165907859802246, "debug/policy_chosen_logps": -255.9198455810547, "debug/policy_rejected_logits": 1.4020473957061768, "debug/policy_rejected_logps": -301.6413879394531, "debug/reference_chosen_logps": -260.84521484375, "debug/reference_rejected_logps": -295.99700927734375, "epoch": 0.8544303797468354, "grad_norm": 6.0797186914906485, "learning_rate": 1e-06, "logits/chosen": 1.165907859802246, "logits/rejected": 1.4020473957061768, "logps/chosen": -255.9198455810547, "logps/rejected": -301.6413879394531, "loss": 0.4418, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.049253594130277634, "rewards/margins": 0.10569741576910019, "rewards/rejected": -0.056443821638822556, "step": 135 }, { "debug/policy_chosen_logits": 0.9684173464775085, "debug/policy_chosen_logps": -240.7368927001953, "debug/policy_rejected_logits": 1.522164225578308, "debug/policy_rejected_logps": -300.8490295410156, "debug/reference_chosen_logps": -244.41757202148438, "debug/reference_rejected_logps": -289.0794372558594, "epoch": 0.8860759493670886, "grad_norm": 6.850074566718433, "learning_rate": 1e-06, "logits/chosen": 0.9684173464775085, "logits/rejected": 1.522164225578308, "logps/chosen": -240.7368927001953, "logps/rejected": -300.8490295410156, "loss": 0.43, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03680698946118355, "rewards/margins": 0.1545029729604721, "rewards/rejected": -0.11769597232341766, "step": 140 }, { "debug/policy_chosen_logits": 1.095474123954773, "debug/policy_chosen_logps": -281.7500305175781, "debug/policy_rejected_logits": 1.0368950366973877, "debug/policy_rejected_logps": -281.8016052246094, "debug/reference_chosen_logps": -285.4373474121094, "debug/reference_rejected_logps": -278.67181396484375, "epoch": 0.9177215189873418, "grad_norm": 6.330596887372699, "learning_rate": 1e-06, "logits/chosen": 1.095474123954773, "logits/rejected": 1.0368950366973877, "logps/chosen": -281.7500305175781, "logps/rejected": -281.8016052246094, "loss": 0.4243, "rewards/accuracies": 0.625, "rewards/chosen": 0.036873430013656616, "rewards/margins": 0.06817178428173065, "rewards/rejected": -0.03129836544394493, "step": 145 }, { "debug/policy_chosen_logits": 0.9509929418563843, "debug/policy_chosen_logps": -247.018310546875, "debug/policy_rejected_logits": 1.1111629009246826, "debug/policy_rejected_logps": -272.07684326171875, "debug/reference_chosen_logps": -250.40658569335938, "debug/reference_rejected_logps": -265.6427001953125, "epoch": 0.9493670886075949, "grad_norm": 8.073046871358697, "learning_rate": 1e-06, "logits/chosen": 0.9509929418563843, "logits/rejected": 1.1111629009246826, "logps/chosen": -247.018310546875, "logps/rejected": -272.07684326171875, "loss": 0.4234, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03388286381959915, "rewards/margins": 0.09822405129671097, "rewards/rejected": -0.06434118002653122, "step": 150 }, { "debug/policy_chosen_logits": 0.6622827053070068, "debug/policy_chosen_logps": -237.2403106689453, "debug/policy_rejected_logits": 0.8520939946174622, "debug/policy_rejected_logps": -286.5059509277344, "debug/reference_chosen_logps": -241.94467163085938, "debug/reference_rejected_logps": -278.73272705078125, "epoch": 0.9810126582278481, "grad_norm": 7.904037537559287, "learning_rate": 1e-06, "logits/chosen": 0.6622827053070068, "logits/rejected": 0.8520939946174622, "logps/chosen": -237.2403106689453, "logps/rejected": -286.5059509277344, "loss": 0.423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.04704369604587555, "rewards/margins": 0.12477605044841766, "rewards/rejected": -0.07773236930370331, "step": 155 }, { "epoch": 1.0, "step": 158, "total_flos": 0.0, "train_loss": 0.44511839181562013, "train_runtime": 1281.3009, "train_samples_per_second": 7.867, "train_steps_per_second": 0.123 } ], "logging_steps": 5, "max_steps": 158, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }