{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 32, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 26.208711624145508, "debug/policy_chosen_logps": -419.32049560546875, "debug/policy_rejected_logits": 27.114166259765625, "debug/policy_rejected_logps": -409.2409362792969, "debug/reference_chosen_logps": -419.32049560546875, "debug/reference_rejected_logps": -409.2409362792969, "epoch": 0.03125, "grad_norm": 5.94402702532091, "learning_rate": 1e-06, "logits/chosen": 26.208711624145508, "logits/rejected": 27.114166259765625, "logps/chosen": -419.32049560546875, "logps/rejected": -409.2409362792969, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 27.364856719970703, "debug/policy_chosen_logps": -384.8145446777344, "debug/policy_rejected_logits": 30.614818572998047, "debug/policy_rejected_logps": -389.1242980957031, "debug/reference_chosen_logps": -385.05755615234375, "debug/reference_rejected_logps": -389.2853698730469, "epoch": 0.0625, "grad_norm": 5.403682848041972, "learning_rate": 1e-06, "logits/chosen": 27.364856719970703, "logits/rejected": 30.614818572998047, "logps/chosen": -384.8145446777344, "logps/rejected": -389.1242980957031, "loss": 0.5005, "rewards/accuracies": 0.5, "rewards/chosen": 0.002430190797895193, "rewards/margins": 0.0008192063542082906, "rewards/rejected": 0.001610984792932868, "step": 2 }, { "debug/policy_chosen_logits": 27.064964294433594, "debug/policy_chosen_logps": -393.77032470703125, "debug/policy_rejected_logits": 24.79046058654785, "debug/policy_rejected_logps": -380.7741394042969, "debug/reference_chosen_logps": -393.5738525390625, "debug/reference_rejected_logps": -381.0361022949219, "epoch": 0.09375, "grad_norm": 5.494546355154901, "learning_rate": 1e-06, "logits/chosen": 27.064964294433594, "logits/rejected": 24.79046058654785, "logps/chosen": -393.77032470703125, "logps/rejected": -380.7741394042969, "loss": 0.4975, "rewards/accuracies": 0.5, "rewards/chosen": -0.0019646836444735527, "rewards/margins": -0.004584388807415962, "rewards/rejected": 0.0026197051629424095, "step": 3 }, { "debug/policy_chosen_logits": 28.537710189819336, "debug/policy_chosen_logps": -393.27496337890625, "debug/policy_rejected_logits": 26.111160278320312, "debug/policy_rejected_logps": -371.4323425292969, "debug/reference_chosen_logps": -394.78082275390625, "debug/reference_rejected_logps": -371.3717041015625, "epoch": 0.125, "grad_norm": 5.6289119594347605, "learning_rate": 1e-06, "logits/chosen": 28.537710189819336, "logits/rejected": 26.111160278320312, "logps/chosen": -393.27496337890625, "logps/rejected": -371.4323425292969, "loss": 0.496, "rewards/accuracies": 0.875, "rewards/chosen": 0.015058821067214012, "rewards/margins": 0.015665167942643166, "rewards/rejected": -0.0006063459441065788, "step": 4 }, { "debug/policy_chosen_logits": 27.205202102661133, "debug/policy_chosen_logps": -372.9059753417969, "debug/policy_rejected_logits": 26.36362075805664, "debug/policy_rejected_logps": -375.8589172363281, "debug/reference_chosen_logps": -373.8547668457031, "debug/reference_rejected_logps": -376.77655029296875, "epoch": 0.15625, "grad_norm": 4.945940952646883, "learning_rate": 1e-06, "logits/chosen": 27.205202102661133, "logits/rejected": 26.36362075805664, "logps/chosen": -372.9059753417969, "logps/rejected": -375.8589172363281, "loss": 0.501, "rewards/accuracies": 0.625, "rewards/chosen": 0.009487838484346867, "rewards/margins": 0.0003112029517069459, "rewards/rejected": 0.00917663611471653, "step": 5 }, { "debug/policy_chosen_logits": 26.221603393554688, "debug/policy_chosen_logps": -389.5379638671875, "debug/policy_rejected_logits": 28.28006362915039, "debug/policy_rejected_logps": -388.9604187011719, "debug/reference_chosen_logps": -390.29022216796875, "debug/reference_rejected_logps": -389.8282775878906, "epoch": 0.1875, "grad_norm": 5.296809968342226, "learning_rate": 1e-06, "logits/chosen": 26.221603393554688, "logits/rejected": 28.28006362915039, "logps/chosen": -389.5379638671875, "logps/rejected": -388.9604187011719, "loss": 0.5017, "rewards/accuracies": 0.5, "rewards/chosen": 0.00752284936606884, "rewards/margins": -0.0011558537371456623, "rewards/rejected": 0.008678702637553215, "step": 6 }, { "debug/policy_chosen_logits": 26.66923713684082, "debug/policy_chosen_logps": -394.6316223144531, "debug/policy_rejected_logits": 25.254545211791992, "debug/policy_rejected_logps": -364.38299560546875, "debug/reference_chosen_logps": -395.24407958984375, "debug/reference_rejected_logps": -364.6360168457031, "epoch": 0.21875, "grad_norm": 5.136828721896042, "learning_rate": 1e-06, "logits/chosen": 26.66923713684082, "logits/rejected": 25.254545211791992, "logps/chosen": -394.6316223144531, "logps/rejected": -364.38299560546875, "loss": 0.4987, "rewards/accuracies": 0.5, "rewards/chosen": 0.006124534644186497, "rewards/margins": 0.0035945128183811903, "rewards/rejected": 0.0025300215929746628, "step": 7 }, { "debug/policy_chosen_logits": 23.028860092163086, "debug/policy_chosen_logps": -385.67840576171875, "debug/policy_rejected_logits": 25.49799919128418, "debug/policy_rejected_logps": -404.09539794921875, "debug/reference_chosen_logps": -386.231201171875, "debug/reference_rejected_logps": -403.6939392089844, "epoch": 0.25, "grad_norm": 5.138563586224073, "learning_rate": 1e-06, "logits/chosen": 23.028860092163086, "logits/rejected": 25.49799919128418, "logps/chosen": -385.67840576171875, "logps/rejected": -404.09539794921875, "loss": 0.4964, "rewards/accuracies": 0.625, "rewards/chosen": 0.005527915433049202, "rewards/margins": 0.009542694315314293, "rewards/rejected": -0.004014777950942516, "step": 8 }, { "debug/policy_chosen_logits": 27.734926223754883, "debug/policy_chosen_logps": -377.7676696777344, "debug/policy_rejected_logits": 26.249759674072266, "debug/policy_rejected_logps": -390.58367919921875, "debug/reference_chosen_logps": -377.9134216308594, "debug/reference_rejected_logps": -389.7505798339844, "epoch": 0.28125, "grad_norm": 5.126237673188204, "learning_rate": 1e-06, "logits/chosen": 27.734926223754883, "logits/rejected": 26.249759674072266, "logps/chosen": -377.7676696777344, "logps/rejected": -390.58367919921875, "loss": 0.4953, "rewards/accuracies": 0.75, "rewards/chosen": 0.001457519712857902, "rewards/margins": 0.009788626804947853, "rewards/rejected": -0.008331108838319778, "step": 9 }, { "debug/policy_chosen_logits": 27.97374725341797, "debug/policy_chosen_logps": -372.5227966308594, "debug/policy_rejected_logits": 26.68909454345703, "debug/policy_rejected_logps": -371.6484375, "debug/reference_chosen_logps": -371.6475830078125, "debug/reference_rejected_logps": -371.02789306640625, "epoch": 0.3125, "grad_norm": 5.826967853852101, "learning_rate": 1e-06, "logits/chosen": 27.97374725341797, "logits/rejected": 26.68909454345703, "logps/chosen": -372.5227966308594, "logps/rejected": -371.6484375, "loss": 0.5013, "rewards/accuracies": 0.5, "rewards/chosen": -0.008752173744142056, "rewards/margins": -0.002546653850004077, "rewards/rejected": -0.0062055205926299095, "step": 10 }, { "debug/policy_chosen_logits": 25.17486572265625, "debug/policy_chosen_logps": -387.5612487792969, "debug/policy_rejected_logits": 25.225900650024414, "debug/policy_rejected_logps": -394.29156494140625, "debug/reference_chosen_logps": -389.58349609375, "debug/reference_rejected_logps": -395.47021484375, "epoch": 0.34375, "grad_norm": 5.487120274982643, "learning_rate": 1e-06, "logits/chosen": 25.17486572265625, "logits/rejected": 25.225900650024414, "logps/chosen": -387.5612487792969, "logps/rejected": -394.29156494140625, "loss": 0.4966, "rewards/accuracies": 0.5, "rewards/chosen": 0.020222166553139687, "rewards/margins": 0.008435897529125214, "rewards/rejected": 0.011786269955337048, "step": 11 }, { "debug/policy_chosen_logits": 25.62350082397461, "debug/policy_chosen_logps": -391.43536376953125, "debug/policy_rejected_logits": 26.19980239868164, "debug/policy_rejected_logps": -385.09454345703125, "debug/reference_chosen_logps": -391.71038818359375, "debug/reference_rejected_logps": -385.83221435546875, "epoch": 0.375, "grad_norm": 5.260475370442769, "learning_rate": 1e-06, "logits/chosen": 25.62350082397461, "logits/rejected": 26.19980239868164, "logps/chosen": -391.43536376953125, "logps/rejected": -385.09454345703125, "loss": 0.4959, "rewards/accuracies": 0.25, "rewards/chosen": 0.0027502821758389473, "rewards/margins": -0.004626387730240822, "rewards/rejected": 0.007376670837402344, "step": 12 }, { "debug/policy_chosen_logits": 26.92571449279785, "debug/policy_chosen_logps": -368.7294006347656, "debug/policy_rejected_logits": 27.75008201599121, "debug/policy_rejected_logps": -406.4366149902344, "debug/reference_chosen_logps": -368.6622619628906, "debug/reference_rejected_logps": -406.2249450683594, "epoch": 0.40625, "grad_norm": 4.7848487788456415, "learning_rate": 1e-06, "logits/chosen": 26.92571449279785, "logits/rejected": 27.75008201599121, "logps/chosen": -368.7294006347656, "logps/rejected": -406.4366149902344, "loss": 0.4929, "rewards/accuracies": 0.625, "rewards/chosen": -0.0006715008057653904, "rewards/margins": 0.0014451986644417048, "rewards/rejected": -0.002116698771715164, "step": 13 }, { "debug/policy_chosen_logits": 30.04401969909668, "debug/policy_chosen_logps": -375.1585998535156, "debug/policy_rejected_logits": 26.233510971069336, "debug/policy_rejected_logps": -391.10772705078125, "debug/reference_chosen_logps": -374.92877197265625, "debug/reference_rejected_logps": -390.80914306640625, "epoch": 0.4375, "grad_norm": 4.836397952988449, "learning_rate": 1e-06, "logits/chosen": 30.04401969909668, "logits/rejected": 26.233510971069336, "logps/chosen": -375.1585998535156, "logps/rejected": -391.10772705078125, "loss": 0.4978, "rewards/accuracies": 0.5, "rewards/chosen": -0.0022980873472988605, "rewards/margins": 0.0006880564615130424, "rewards/rejected": -0.0029861442744731903, "step": 14 }, { "debug/policy_chosen_logits": 26.359272003173828, "debug/policy_chosen_logps": -380.72393798828125, "debug/policy_rejected_logits": 25.454103469848633, "debug/policy_rejected_logps": -372.8114013671875, "debug/reference_chosen_logps": -381.0364990234375, "debug/reference_rejected_logps": -374.25555419921875, "epoch": 0.46875, "grad_norm": 5.802453188137246, "learning_rate": 1e-06, "logits/chosen": 26.359272003173828, "logits/rejected": 25.454103469848633, "logps/chosen": -380.72393798828125, "logps/rejected": -372.8114013671875, "loss": 0.5021, "rewards/accuracies": 0.5, "rewards/chosen": 0.0031254198402166367, "rewards/margins": -0.011316032148897648, "rewards/rejected": 0.01444145105779171, "step": 15 }, { "debug/policy_chosen_logits": 27.719757080078125, "debug/policy_chosen_logps": -387.31719970703125, "debug/policy_rejected_logits": 28.422988891601562, "debug/policy_rejected_logps": -394.9139404296875, "debug/reference_chosen_logps": -387.28955078125, "debug/reference_rejected_logps": -392.607666015625, "epoch": 0.5, "grad_norm": 5.284125486448873, "learning_rate": 1e-06, "logits/chosen": 27.719757080078125, "logits/rejected": 28.422988891601562, "logps/chosen": -387.31719970703125, "logps/rejected": -394.9139404296875, "loss": 0.4931, "rewards/accuracies": 0.75, "rewards/chosen": -0.00027671828866004944, "rewards/margins": 0.022786367684602737, "rewards/rejected": -0.023063087835907936, "step": 16 }, { "debug/policy_chosen_logits": 26.237287521362305, "debug/policy_chosen_logps": -396.80377197265625, "debug/policy_rejected_logits": 26.251012802124023, "debug/policy_rejected_logps": -415.9134521484375, "debug/reference_chosen_logps": -396.8568115234375, "debug/reference_rejected_logps": -414.50372314453125, "epoch": 0.53125, "grad_norm": 5.422306895885996, "learning_rate": 1e-06, "logits/chosen": 26.237287521362305, "logits/rejected": 26.251012802124023, "logps/chosen": -396.80377197265625, "logps/rejected": -415.9134521484375, "loss": 0.4936, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005303573561832309, "rewards/margins": 0.014627190306782722, "rewards/rejected": -0.014096831902861595, "step": 17 }, { "debug/policy_chosen_logits": 27.24745750427246, "debug/policy_chosen_logps": -400.6668701171875, "debug/policy_rejected_logits": 28.665220260620117, "debug/policy_rejected_logps": -404.18115234375, "debug/reference_chosen_logps": -400.91192626953125, "debug/reference_rejected_logps": -404.140625, "epoch": 0.5625, "grad_norm": 6.154342037709508, "learning_rate": 1e-06, "logits/chosen": 27.24745750427246, "logits/rejected": 28.665220260620117, "logps/chosen": -400.6668701171875, "logps/rejected": -404.18115234375, "loss": 0.504, "rewards/accuracies": 0.75, "rewards/chosen": 0.0024504470638930798, "rewards/margins": 0.002855682745575905, "rewards/rejected": -0.0004052352160215378, "step": 18 }, { "debug/policy_chosen_logits": 26.074438095092773, "debug/policy_chosen_logps": -368.0001220703125, "debug/policy_rejected_logits": 28.75902557373047, "debug/policy_rejected_logps": -402.38555908203125, "debug/reference_chosen_logps": -369.5516357421875, "debug/reference_rejected_logps": -402.15142822265625, "epoch": 0.59375, "grad_norm": 4.942495765333941, "learning_rate": 1e-06, "logits/chosen": 26.074438095092773, "logits/rejected": 28.75902557373047, "logps/chosen": -368.0001220703125, "logps/rejected": -402.38555908203125, "loss": 0.4971, "rewards/accuracies": 0.625, "rewards/chosen": 0.015514831990003586, "rewards/margins": 0.017856139689683914, "rewards/rejected": -0.002341308631002903, "step": 19 }, { "debug/policy_chosen_logits": 22.95450782775879, "debug/policy_chosen_logps": -387.22039794921875, "debug/policy_rejected_logits": 23.25604820251465, "debug/policy_rejected_logps": -407.40460205078125, "debug/reference_chosen_logps": -387.4991760253906, "debug/reference_rejected_logps": -406.79705810546875, "epoch": 0.625, "grad_norm": 5.841637379604238, "learning_rate": 1e-06, "logits/chosen": 22.95450782775879, "logits/rejected": 23.25604820251465, "logps/chosen": -387.22039794921875, "logps/rejected": -407.40460205078125, "loss": 0.4981, "rewards/accuracies": 0.75, "rewards/chosen": 0.0027874757070094347, "rewards/margins": 0.008862762711942196, "rewards/rejected": -0.00607528630644083, "step": 20 }, { "debug/policy_chosen_logits": 25.369287490844727, "debug/policy_chosen_logps": -384.18011474609375, "debug/policy_rejected_logits": 27.09587287902832, "debug/policy_rejected_logps": -374.6014709472656, "debug/reference_chosen_logps": -384.5386962890625, "debug/reference_rejected_logps": -374.75848388671875, "epoch": 0.65625, "grad_norm": 5.35593100802686, "learning_rate": 1e-06, "logits/chosen": 25.369287490844727, "logits/rejected": 27.09587287902832, "logps/chosen": -384.18011474609375, "logps/rejected": -374.6014709472656, "loss": 0.5014, "rewards/accuracies": 0.5, "rewards/chosen": 0.003585891332477331, "rewards/margins": 0.0020158004481345415, "rewards/rejected": 0.0015700910007581115, "step": 21 }, { "debug/policy_chosen_logits": 29.95963478088379, "debug/policy_chosen_logps": -412.9902648925781, "debug/policy_rejected_logits": 29.491188049316406, "debug/policy_rejected_logps": -388.817138671875, "debug/reference_chosen_logps": -413.6742248535156, "debug/reference_rejected_logps": -388.7025146484375, "epoch": 0.6875, "grad_norm": 5.750706618647552, "learning_rate": 1e-06, "logits/chosen": 29.95963478088379, "logits/rejected": 29.491188049316406, "logps/chosen": -412.9902648925781, "logps/rejected": -388.817138671875, "loss": 0.5016, "rewards/accuracies": 0.5, "rewards/chosen": 0.0068396758288145065, "rewards/margins": 0.007985686883330345, "rewards/rejected": -0.0011460117530077696, "step": 22 }, { "debug/policy_chosen_logits": 27.35663414001465, "debug/policy_chosen_logps": -381.54901123046875, "debug/policy_rejected_logits": 27.309879302978516, "debug/policy_rejected_logps": -408.41070556640625, "debug/reference_chosen_logps": -381.82025146484375, "debug/reference_rejected_logps": -406.42999267578125, "epoch": 0.71875, "grad_norm": 5.238622917805314, "learning_rate": 1e-06, "logits/chosen": 27.35663414001465, "logits/rejected": 27.309879302978516, "logps/chosen": -381.54901123046875, "logps/rejected": -408.41070556640625, "loss": 0.4947, "rewards/accuracies": 0.5, "rewards/chosen": 0.002712326357141137, "rewards/margins": 0.022518998011946678, "rewards/rejected": -0.019806671887636185, "step": 23 }, { "debug/policy_chosen_logits": 24.55692481994629, "debug/policy_chosen_logps": -373.54644775390625, "debug/policy_rejected_logits": 22.0833740234375, "debug/policy_rejected_logps": -383.18634033203125, "debug/reference_chosen_logps": -373.8763732910156, "debug/reference_rejected_logps": -384.98699951171875, "epoch": 0.75, "grad_norm": 5.752704029646181, "learning_rate": 1e-06, "logits/chosen": 24.55692481994629, "logits/rejected": 22.0833740234375, "logps/chosen": -373.54644775390625, "logps/rejected": -383.18634033203125, "loss": 0.5047, "rewards/accuracies": 0.375, "rewards/chosen": 0.00329933175817132, "rewards/margins": -0.014707411639392376, "rewards/rejected": 0.018006745725870132, "step": 24 }, { "debug/policy_chosen_logits": 24.850391387939453, "debug/policy_chosen_logps": -386.2474365234375, "debug/policy_rejected_logits": 29.567493438720703, "debug/policy_rejected_logps": -422.58990478515625, "debug/reference_chosen_logps": -386.5673828125, "debug/reference_rejected_logps": -422.57989501953125, "epoch": 0.78125, "grad_norm": 5.457291481304808, "learning_rate": 1e-06, "logits/chosen": 24.850391387939453, "logits/rejected": 29.567493438720703, "logps/chosen": -386.2474365234375, "logps/rejected": -422.58990478515625, "loss": 0.4937, "rewards/accuracies": 0.625, "rewards/chosen": 0.003199271857738495, "rewards/margins": 0.0032989501487463713, "rewards/rejected": -9.96782910078764e-05, "step": 25 }, { "debug/policy_chosen_logits": 24.353761672973633, "debug/policy_chosen_logps": -365.70977783203125, "debug/policy_rejected_logits": 27.883697509765625, "debug/policy_rejected_logps": -387.9311828613281, "debug/reference_chosen_logps": -365.4722900390625, "debug/reference_rejected_logps": -386.3861083984375, "epoch": 0.8125, "grad_norm": 6.268906937225549, "learning_rate": 1e-06, "logits/chosen": 24.353761672973633, "logits/rejected": 27.883697509765625, "logps/chosen": -365.70977783203125, "logps/rejected": -387.9311828613281, "loss": 0.492, "rewards/accuracies": 0.625, "rewards/chosen": -0.0023751831613481045, "rewards/margins": 0.013075370341539383, "rewards/rejected": -0.0154505530372262, "step": 26 }, { "debug/policy_chosen_logits": 29.132570266723633, "debug/policy_chosen_logps": -383.54681396484375, "debug/policy_rejected_logits": 30.820039749145508, "debug/policy_rejected_logps": -391.74847412109375, "debug/reference_chosen_logps": -384.86737060546875, "debug/reference_rejected_logps": -390.72613525390625, "epoch": 0.84375, "grad_norm": 5.319109456599871, "learning_rate": 1e-06, "logits/chosen": 29.132570266723633, "logits/rejected": 30.820039749145508, "logps/chosen": -383.54681396484375, "logps/rejected": -391.74847412109375, "loss": 0.4953, "rewards/accuracies": 0.75, "rewards/chosen": 0.013205718249082565, "rewards/margins": 0.023428915068507195, "rewards/rejected": -0.010223197750747204, "step": 27 }, { "debug/policy_chosen_logits": 21.42544174194336, "debug/policy_chosen_logps": -359.3302917480469, "debug/policy_rejected_logits": 23.681625366210938, "debug/policy_rejected_logps": -420.1634826660156, "debug/reference_chosen_logps": -360.5409240722656, "debug/reference_rejected_logps": -418.49676513671875, "epoch": 0.875, "grad_norm": 5.766637040961972, "learning_rate": 1e-06, "logits/chosen": 21.42544174194336, "logits/rejected": 23.681625366210938, "logps/chosen": -359.3302917480469, "logps/rejected": -420.1634826660156, "loss": 0.4952, "rewards/accuracies": 0.625, "rewards/chosen": 0.012106247246265411, "rewards/margins": 0.028773421421647072, "rewards/rejected": -0.01666717603802681, "step": 28 }, { "debug/policy_chosen_logits": 23.576292037963867, "debug/policy_chosen_logps": -389.7811279296875, "debug/policy_rejected_logits": 25.178329467773438, "debug/policy_rejected_logps": -380.7835998535156, "debug/reference_chosen_logps": -388.56658935546875, "debug/reference_rejected_logps": -380.28936767578125, "epoch": 0.90625, "grad_norm": 5.149123243810572, "learning_rate": 1e-06, "logits/chosen": 23.576292037963867, "logits/rejected": 25.178329467773438, "logps/chosen": -389.7811279296875, "logps/rejected": -380.7835998535156, "loss": 0.5005, "rewards/accuracies": 0.375, "rewards/chosen": -0.01214546151459217, "rewards/margins": -0.007203062996268272, "rewards/rejected": -0.004942398518323898, "step": 29 }, { "debug/policy_chosen_logits": 25.0103702545166, "debug/policy_chosen_logps": -374.01544189453125, "debug/policy_rejected_logits": 24.813074111938477, "debug/policy_rejected_logps": -402.4690246582031, "debug/reference_chosen_logps": -373.6084899902344, "debug/reference_rejected_logps": -400.3970947265625, "epoch": 0.9375, "grad_norm": 5.054574585698812, "learning_rate": 1e-06, "logits/chosen": 25.0103702545166, "logits/rejected": 24.813074111938477, "logps/chosen": -374.01544189453125, "logps/rejected": -402.4690246582031, "loss": 0.4868, "rewards/accuracies": 0.75, "rewards/chosen": -0.00406951829791069, "rewards/margins": 0.016650084406137466, "rewards/rejected": -0.020719602704048157, "step": 30 }, { "debug/policy_chosen_logits": 24.653635025024414, "debug/policy_chosen_logps": -389.5201721191406, "debug/policy_rejected_logits": 26.097084045410156, "debug/policy_rejected_logps": -385.3985290527344, "debug/reference_chosen_logps": -388.95758056640625, "debug/reference_rejected_logps": -384.1873779296875, "epoch": 0.96875, "grad_norm": 5.42566873511884, "learning_rate": 1e-06, "logits/chosen": 24.653635025024414, "logits/rejected": 26.097084045410156, "logps/chosen": -389.5201721191406, "logps/rejected": -385.3985290527344, "loss": 0.4971, "rewards/accuracies": 0.5, "rewards/chosen": -0.005625876598060131, "rewards/margins": 0.006485518999397755, "rewards/rejected": -0.012111397460103035, "step": 31 }, { "debug/policy_chosen_logits": 26.803712844848633, "debug/policy_chosen_logps": -409.23480224609375, "debug/policy_rejected_logits": 27.91266632080078, "debug/policy_rejected_logps": -441.89007568359375, "debug/reference_chosen_logps": -409.35791015625, "debug/reference_rejected_logps": -439.5430908203125, "epoch": 1.0, "grad_norm": 5.122199540354154, "learning_rate": 1e-06, "logits/chosen": 26.803712844848633, "logits/rejected": 27.91266632080078, "logps/chosen": -409.23480224609375, "logps/rejected": -441.89007568359375, "loss": 0.4517, "rewards/accuracies": 0.625, "rewards/chosen": 0.0012308494187891483, "rewards/margins": 0.024701077491044998, "rewards/rejected": -0.023470228537917137, "step": 32 }, { "epoch": 1.0, "step": 32, "total_flos": 0.0, "train_loss": 0.49607059359550476, "train_runtime": 386.5625, "train_samples_per_second": 5.161, "train_steps_per_second": 0.083 } ], "logging_steps": 1, "max_steps": 32, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }