{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 47, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 31.700279235839844, "debug/policy_chosen_logps": -434.26495361328125, "debug/policy_rejected_logits": 33.99253845214844, "debug/policy_rejected_logps": -441.9063720703125, "debug/reference_chosen_logps": -434.26495361328125, "debug/reference_rejected_logps": -441.9063720703125, "epoch": 0.02127659574468085, "grad_norm": 5.407328059506411, "learning_rate": 1e-06, "logits/chosen": 31.700279235839844, "logits/rejected": 33.99253845214844, "logps/chosen": -434.26495361328125, "logps/rejected": -441.9063720703125, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 27.489120483398438, "debug/policy_chosen_logps": -410.28472900390625, "debug/policy_rejected_logits": 31.382970809936523, "debug/policy_rejected_logps": -435.17218017578125, "debug/reference_chosen_logps": -410.96771240234375, "debug/reference_rejected_logps": -436.0491638183594, "epoch": 0.0425531914893617, "grad_norm": 4.951759612240564, "learning_rate": 1e-06, "logits/chosen": 27.489120483398438, "logits/rejected": 31.382970809936523, "logps/chosen": -410.28472900390625, "logps/rejected": -435.17218017578125, "loss": 0.5009, "rewards/accuracies": 0.625, "rewards/chosen": 0.006829871796071529, "rewards/margins": -0.001939887646585703, "rewards/rejected": 0.00876975990831852, "step": 2 }, { "debug/policy_chosen_logits": 31.120014190673828, "debug/policy_chosen_logps": -401.9127197265625, "debug/policy_rejected_logits": 33.329689025878906, "debug/policy_rejected_logps": -424.90576171875, "debug/reference_chosen_logps": -402.24658203125, "debug/reference_rejected_logps": -424.7574462890625, "epoch": 0.06382978723404255, "grad_norm": 5.411851250945231, "learning_rate": 1e-06, "logits/chosen": 31.120014190673828, "logits/rejected": 33.329689025878906, "logps/chosen": -401.9127197265625, "logps/rejected": -424.90576171875, "loss": 0.5007, "rewards/accuracies": 0.5, "rewards/chosen": 0.003338394220918417, "rewards/margins": 0.004821510519832373, "rewards/rejected": -0.001483116764575243, "step": 3 }, { "debug/policy_chosen_logits": 30.066335678100586, "debug/policy_chosen_logps": -403.6931457519531, "debug/policy_rejected_logits": 33.15522384643555, "debug/policy_rejected_logps": -436.77801513671875, "debug/reference_chosen_logps": -403.52996826171875, "debug/reference_rejected_logps": -436.29296875, "epoch": 0.0851063829787234, "grad_norm": 6.440647306952527, "learning_rate": 1e-06, "logits/chosen": 30.066335678100586, "logits/rejected": 33.15522384643555, "logps/chosen": -403.6931457519531, "logps/rejected": -436.77801513671875, "loss": 0.4993, "rewards/accuracies": 0.75, "rewards/chosen": -0.0016318517737090588, "rewards/margins": 0.003218421945348382, "rewards/rejected": -0.00485027302056551, "step": 4 }, { "debug/policy_chosen_logits": 28.028703689575195, "debug/policy_chosen_logps": -391.64715576171875, "debug/policy_rejected_logits": 30.672496795654297, "debug/policy_rejected_logps": -420.91143798828125, "debug/reference_chosen_logps": -392.4825134277344, "debug/reference_rejected_logps": -422.0054931640625, "epoch": 0.10638297872340426, "grad_norm": 5.433518269383661, "learning_rate": 1e-06, "logits/chosen": 28.028703689575195, "logits/rejected": 30.672496795654297, "logps/chosen": -391.64715576171875, "logps/rejected": -420.91143798828125, "loss": 0.4987, "rewards/accuracies": 0.25, "rewards/chosen": 0.008353347890079021, "rewards/margins": -0.0025872797705233097, "rewards/rejected": 0.010940628126263618, "step": 5 }, { "debug/policy_chosen_logits": 26.0015811920166, "debug/policy_chosen_logps": -403.2935791015625, "debug/policy_rejected_logits": 25.2414608001709, "debug/policy_rejected_logps": -407.3106994628906, "debug/reference_chosen_logps": -402.87139892578125, "debug/reference_rejected_logps": -406.1798095703125, "epoch": 0.1276595744680851, "grad_norm": 5.048275333461177, "learning_rate": 1e-06, "logits/chosen": 26.0015811920166, "logits/rejected": 25.2414608001709, "logps/chosen": -403.2935791015625, "logps/rejected": -407.3106994628906, "loss": 0.496, "rewards/accuracies": 0.5, "rewards/chosen": -0.0042218780145049095, "rewards/margins": 0.007086906582117081, "rewards/rejected": -0.01130878459662199, "step": 6 }, { "debug/policy_chosen_logits": 27.019393920898438, "debug/policy_chosen_logps": -424.65643310546875, "debug/policy_rejected_logits": 29.134994506835938, "debug/policy_rejected_logps": -420.3319396972656, "debug/reference_chosen_logps": -423.5784912109375, "debug/reference_rejected_logps": -420.0720520019531, "epoch": 0.14893617021276595, "grad_norm": 6.092801689841109, "learning_rate": 1e-06, "logits/chosen": 27.019393920898438, "logits/rejected": 29.134994506835938, "logps/chosen": -424.65643310546875, "logps/rejected": -420.3319396972656, "loss": 0.499, "rewards/accuracies": 0.125, "rewards/chosen": -0.010779608972370625, "rewards/margins": -0.008180923759937286, "rewards/rejected": -0.0025986863765865564, "step": 7 }, { "debug/policy_chosen_logits": 25.41461944580078, "debug/policy_chosen_logps": -420.9305419921875, "debug/policy_rejected_logits": 25.522966384887695, "debug/policy_rejected_logps": -427.20623779296875, "debug/reference_chosen_logps": -420.0583190917969, "debug/reference_rejected_logps": -426.08453369140625, "epoch": 0.1702127659574468, "grad_norm": 5.248588154856571, "learning_rate": 1e-06, "logits/chosen": 25.41461944580078, "logits/rejected": 25.522966384887695, "logps/chosen": -420.9305419921875, "logps/rejected": -427.20623779296875, "loss": 0.4975, "rewards/accuracies": 0.5, "rewards/chosen": -0.00872222799807787, "rewards/margins": 0.0024948506616055965, "rewards/rejected": -0.011217079125344753, "step": 8 }, { "debug/policy_chosen_logits": 27.28373146057129, "debug/policy_chosen_logps": -413.857177734375, "debug/policy_rejected_logits": 29.01516342163086, "debug/policy_rejected_logps": -429.70623779296875, "debug/reference_chosen_logps": -413.364501953125, "debug/reference_rejected_logps": -429.416259765625, "epoch": 0.19148936170212766, "grad_norm": 5.836663309727503, "learning_rate": 1e-06, "logits/chosen": 27.28373146057129, "logits/rejected": 29.01516342163086, "logps/chosen": -413.857177734375, "logps/rejected": -429.70623779296875, "loss": 0.4954, "rewards/accuracies": 0.375, "rewards/chosen": -0.004926986526697874, "rewards/margins": -0.002027016133069992, "rewards/rejected": -0.002899970393627882, "step": 9 }, { "debug/policy_chosen_logits": 29.55730628967285, "debug/policy_chosen_logps": -418.648193359375, "debug/policy_rejected_logits": 30.004676818847656, "debug/policy_rejected_logps": -430.01788330078125, "debug/reference_chosen_logps": -418.4599609375, "debug/reference_rejected_logps": -429.8154296875, "epoch": 0.2127659574468085, "grad_norm": 4.978725766808406, "learning_rate": 1e-06, "logits/chosen": 29.55730628967285, "logits/rejected": 30.004676818847656, "logps/chosen": -418.648193359375, "logps/rejected": -430.01788330078125, "loss": 0.4991, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018823242280632257, "rewards/margins": 0.00014217384159564972, "rewards/rejected": -0.002024497603997588, "step": 10 }, { "debug/policy_chosen_logits": 33.0296516418457, "debug/policy_chosen_logps": -434.47308349609375, "debug/policy_rejected_logits": 31.160263061523438, "debug/policy_rejected_logps": -406.6353759765625, "debug/reference_chosen_logps": -433.95892333984375, "debug/reference_rejected_logps": -405.2964782714844, "epoch": 0.23404255319148937, "grad_norm": 5.822177618834045, "learning_rate": 1e-06, "logits/chosen": 33.0296516418457, "logits/rejected": 31.160263061523438, "logps/chosen": -434.47308349609375, "logps/rejected": -406.6353759765625, "loss": 0.493, "rewards/accuracies": 0.5, "rewards/chosen": -0.005142059177160263, "rewards/margins": 0.008247108198702335, "rewards/rejected": -0.013389168307185173, "step": 11 }, { "debug/policy_chosen_logits": 30.720827102661133, "debug/policy_chosen_logps": -455.06597900390625, "debug/policy_rejected_logits": 32.45933151245117, "debug/policy_rejected_logps": -462.2677307128906, "debug/reference_chosen_logps": -454.6126403808594, "debug/reference_rejected_logps": -459.7181396484375, "epoch": 0.2553191489361702, "grad_norm": 5.239137130887116, "learning_rate": 1e-06, "logits/chosen": 30.720827102661133, "logits/rejected": 32.45933151245117, "logps/chosen": -455.06597900390625, "logps/rejected": -462.2677307128906, "loss": 0.4991, "rewards/accuracies": 0.75, "rewards/chosen": -0.00453338585793972, "rewards/margins": 0.020962638780474663, "rewards/rejected": -0.025496024638414383, "step": 12 }, { "debug/policy_chosen_logits": 30.186174392700195, "debug/policy_chosen_logps": -412.29742431640625, "debug/policy_rejected_logits": 28.243711471557617, "debug/policy_rejected_logps": -426.9504089355469, "debug/reference_chosen_logps": -411.92120361328125, "debug/reference_rejected_logps": -425.7698974609375, "epoch": 0.2765957446808511, "grad_norm": 5.6983956081800855, "learning_rate": 1e-06, "logits/chosen": 30.186174392700195, "logits/rejected": 28.243711471557617, "logps/chosen": -412.29742431640625, "logps/rejected": -426.9504089355469, "loss": 0.4955, "rewards/accuracies": 0.75, "rewards/chosen": -0.003762359730899334, "rewards/margins": 0.008042870089411736, "rewards/rejected": -0.011805228888988495, "step": 13 }, { "debug/policy_chosen_logits": 29.8179931640625, "debug/policy_chosen_logps": -402.04205322265625, "debug/policy_rejected_logits": 27.887521743774414, "debug/policy_rejected_logps": -406.5090637207031, "debug/reference_chosen_logps": -402.81463623046875, "debug/reference_rejected_logps": -406.35760498046875, "epoch": 0.2978723404255319, "grad_norm": 5.185829515819964, "learning_rate": 1e-06, "logits/chosen": 29.8179931640625, "logits/rejected": 27.887521743774414, "logps/chosen": -402.04205322265625, "logps/rejected": -406.5090637207031, "loss": 0.4892, "rewards/accuracies": 0.625, "rewards/chosen": 0.007725906558334827, "rewards/margins": 0.009240342304110527, "rewards/rejected": -0.0015144352801144123, "step": 14 }, { "debug/policy_chosen_logits": 30.058448791503906, "debug/policy_chosen_logps": -412.29827880859375, "debug/policy_rejected_logits": 29.466854095458984, "debug/policy_rejected_logps": -412.73504638671875, "debug/reference_chosen_logps": -411.7734680175781, "debug/reference_rejected_logps": -413.09912109375, "epoch": 0.3191489361702128, "grad_norm": 5.147098230721813, "learning_rate": 1e-06, "logits/chosen": 30.058448791503906, "logits/rejected": 29.466854095458984, "logps/chosen": -412.29827880859375, "logps/rejected": -412.73504638671875, "loss": 0.4893, "rewards/accuracies": 0.25, "rewards/chosen": -0.005248222034424543, "rewards/margins": -0.008889121934771538, "rewards/rejected": 0.0036408999003469944, "step": 15 }, { "debug/policy_chosen_logits": 26.801280975341797, "debug/policy_chosen_logps": -453.10833740234375, "debug/policy_rejected_logits": 28.296146392822266, "debug/policy_rejected_logps": -433.0950927734375, "debug/reference_chosen_logps": -453.86102294921875, "debug/reference_rejected_logps": -432.42510986328125, "epoch": 0.3404255319148936, "grad_norm": 5.3650227932794765, "learning_rate": 1e-06, "logits/chosen": 26.801280975341797, "logits/rejected": 28.296146392822266, "logps/chosen": -453.10833740234375, "logps/rejected": -433.0950927734375, "loss": 0.4907, "rewards/accuracies": 0.75, "rewards/chosen": 0.007526512257754803, "rewards/margins": 0.014225959777832031, "rewards/rejected": -0.0066994475200772285, "step": 16 }, { "debug/policy_chosen_logits": 25.992467880249023, "debug/policy_chosen_logps": -436.1377258300781, "debug/policy_rejected_logits": 27.410860061645508, "debug/policy_rejected_logps": -426.43035888671875, "debug/reference_chosen_logps": -434.6832275390625, "debug/reference_rejected_logps": -424.5072021484375, "epoch": 0.3617021276595745, "grad_norm": 5.048216336945854, "learning_rate": 1e-06, "logits/chosen": 25.992467880249023, "logits/rejected": 27.410860061645508, "logps/chosen": -436.1377258300781, "logps/rejected": -426.43035888671875, "loss": 0.494, "rewards/accuracies": 0.625, "rewards/chosen": -0.014545059762895107, "rewards/margins": 0.004686659201979637, "rewards/rejected": -0.01923171989619732, "step": 17 }, { "debug/policy_chosen_logits": 27.924072265625, "debug/policy_chosen_logps": -456.6978759765625, "debug/policy_rejected_logits": 27.263843536376953, "debug/policy_rejected_logps": -411.67791748046875, "debug/reference_chosen_logps": -455.6437683105469, "debug/reference_rejected_logps": -408.3628234863281, "epoch": 0.3829787234042553, "grad_norm": 4.959644897985259, "learning_rate": 1e-06, "logits/chosen": 27.924072265625, "logits/rejected": 27.263843536376953, "logps/chosen": -456.6978759765625, "logps/rejected": -411.67791748046875, "loss": 0.4872, "rewards/accuracies": 0.75, "rewards/chosen": -0.01054123044013977, "rewards/margins": 0.022609787061810493, "rewards/rejected": -0.033151015639305115, "step": 18 }, { "debug/policy_chosen_logits": 30.296974182128906, "debug/policy_chosen_logps": -407.5791320800781, "debug/policy_rejected_logits": 29.760583877563477, "debug/policy_rejected_logps": -417.291748046875, "debug/reference_chosen_logps": -410.10662841796875, "debug/reference_rejected_logps": -418.6151123046875, "epoch": 0.40425531914893614, "grad_norm": 4.809418559441442, "learning_rate": 1e-06, "logits/chosen": 30.296974182128906, "logits/rejected": 29.760583877563477, "logps/chosen": -407.5791320800781, "logps/rejected": -417.291748046875, "loss": 0.4937, "rewards/accuracies": 0.875, "rewards/chosen": 0.02527473494410515, "rewards/margins": 0.012040939182043076, "rewards/rejected": 0.013233794830739498, "step": 19 }, { "debug/policy_chosen_logits": 30.575592041015625, "debug/policy_chosen_logps": -413.82574462890625, "debug/policy_rejected_logits": 32.98490905761719, "debug/policy_rejected_logps": -443.43548583984375, "debug/reference_chosen_logps": -414.27642822265625, "debug/reference_rejected_logps": -441.5928649902344, "epoch": 0.425531914893617, "grad_norm": 5.055368747694493, "learning_rate": 1e-06, "logits/chosen": 30.575592041015625, "logits/rejected": 32.98490905761719, "logps/chosen": -413.82574462890625, "logps/rejected": -443.43548583984375, "loss": 0.4771, "rewards/accuracies": 0.875, "rewards/chosen": 0.0045069498009979725, "rewards/margins": 0.02293361723423004, "rewards/rejected": -0.018426666036248207, "step": 20 }, { "debug/policy_chosen_logits": 28.962617874145508, "debug/policy_chosen_logps": -416.78582763671875, "debug/policy_rejected_logits": 31.380332946777344, "debug/policy_rejected_logps": -443.6494445800781, "debug/reference_chosen_logps": -419.51043701171875, "debug/reference_rejected_logps": -442.1171875, "epoch": 0.44680851063829785, "grad_norm": 5.2399641694392685, "learning_rate": 1e-06, "logits/chosen": 28.962617874145508, "logits/rejected": 31.380332946777344, "logps/chosen": -416.78582763671875, "logps/rejected": -443.6494445800781, "loss": 0.4861, "rewards/accuracies": 0.75, "rewards/chosen": 0.02724616974592209, "rewards/margins": 0.04256858676671982, "rewards/rejected": -0.015322417952120304, "step": 21 }, { "debug/policy_chosen_logits": 33.89327621459961, "debug/policy_chosen_logps": -439.533203125, "debug/policy_rejected_logits": 32.8599853515625, "debug/policy_rejected_logps": -468.5189208984375, "debug/reference_chosen_logps": -437.9319763183594, "debug/reference_rejected_logps": -460.019287109375, "epoch": 0.46808510638297873, "grad_norm": 5.600180944400873, "learning_rate": 1e-06, "logits/chosen": 33.89327621459961, "logits/rejected": 32.8599853515625, "logps/chosen": -439.533203125, "logps/rejected": -468.5189208984375, "loss": 0.4779, "rewards/accuracies": 0.75, "rewards/chosen": -0.016012268140912056, "rewards/margins": 0.06898414343595505, "rewards/rejected": -0.08499641716480255, "step": 22 }, { "debug/policy_chosen_logits": 30.022546768188477, "debug/policy_chosen_logps": -448.22540283203125, "debug/policy_rejected_logits": 30.50183868408203, "debug/policy_rejected_logps": -417.8683776855469, "debug/reference_chosen_logps": -448.73858642578125, "debug/reference_rejected_logps": -417.9635925292969, "epoch": 0.48936170212765956, "grad_norm": 5.053771531276715, "learning_rate": 1e-06, "logits/chosen": 30.022546768188477, "logits/rejected": 30.50183868408203, "logps/chosen": -448.22540283203125, "logps/rejected": -417.8683776855469, "loss": 0.4906, "rewards/accuracies": 0.5, "rewards/chosen": 0.0051317219622433186, "rewards/margins": 0.004179535433650017, "rewards/rejected": 0.0009521869942545891, "step": 23 }, { "debug/policy_chosen_logits": 31.068572998046875, "debug/policy_chosen_logps": -408.3885192871094, "debug/policy_rejected_logits": 30.79738426208496, "debug/policy_rejected_logps": -432.73651123046875, "debug/reference_chosen_logps": -406.73419189453125, "debug/reference_rejected_logps": -432.0497131347656, "epoch": 0.5106382978723404, "grad_norm": 5.120783229464688, "learning_rate": 1e-06, "logits/chosen": 31.068572998046875, "logits/rejected": 30.79738426208496, "logps/chosen": -408.3885192871094, "logps/rejected": -432.73651123046875, "loss": 0.478, "rewards/accuracies": 0.375, "rewards/chosen": -0.016543272882699966, "rewards/margins": -0.009675255045294762, "rewards/rejected": -0.006868018768727779, "step": 24 }, { "debug/policy_chosen_logits": 28.878725051879883, "debug/policy_chosen_logps": -434.04144287109375, "debug/policy_rejected_logits": 30.279621124267578, "debug/policy_rejected_logps": -457.3016357421875, "debug/reference_chosen_logps": -433.21746826171875, "debug/reference_rejected_logps": -453.06280517578125, "epoch": 0.5319148936170213, "grad_norm": 5.297697739276052, "learning_rate": 1e-06, "logits/chosen": 28.878725051879883, "logits/rejected": 30.279621124267578, "logps/chosen": -434.04144287109375, "logps/rejected": -457.3016357421875, "loss": 0.485, "rewards/accuracies": 0.875, "rewards/chosen": -0.00823978427797556, "rewards/margins": 0.03414863348007202, "rewards/rejected": -0.042388420552015305, "step": 25 }, { "debug/policy_chosen_logits": 30.609947204589844, "debug/policy_chosen_logps": -402.73504638671875, "debug/policy_rejected_logits": 29.12665367126465, "debug/policy_rejected_logps": -411.1260986328125, "debug/reference_chosen_logps": -404.96392822265625, "debug/reference_rejected_logps": -411.49969482421875, "epoch": 0.5531914893617021, "grad_norm": 5.066192133102437, "learning_rate": 1e-06, "logits/chosen": 30.609947204589844, "logits/rejected": 29.12665367126465, "logps/chosen": -402.73504638671875, "logps/rejected": -411.1260986328125, "loss": 0.4781, "rewards/accuracies": 0.75, "rewards/chosen": 0.022288817912340164, "rewards/margins": 0.01855243556201458, "rewards/rejected": 0.003736380487680435, "step": 26 }, { "debug/policy_chosen_logits": 26.225852966308594, "debug/policy_chosen_logps": -434.0633544921875, "debug/policy_rejected_logits": 27.547882080078125, "debug/policy_rejected_logps": -460.3682861328125, "debug/reference_chosen_logps": -434.0380859375, "debug/reference_rejected_logps": -457.2252197265625, "epoch": 0.574468085106383, "grad_norm": 5.199359660864606, "learning_rate": 1e-06, "logits/chosen": 26.225852966308594, "logits/rejected": 27.547882080078125, "logps/chosen": -434.0633544921875, "logps/rejected": -460.3682861328125, "loss": 0.4775, "rewards/accuracies": 0.625, "rewards/chosen": -0.0002528773620724678, "rewards/margins": 0.031177710741758347, "rewards/rejected": -0.03143058717250824, "step": 27 }, { "debug/policy_chosen_logits": 34.810482025146484, "debug/policy_chosen_logps": -432.9176025390625, "debug/policy_rejected_logits": 32.29673385620117, "debug/policy_rejected_logps": -435.6657409667969, "debug/reference_chosen_logps": -433.37603759765625, "debug/reference_rejected_logps": -432.38958740234375, "epoch": 0.5957446808510638, "grad_norm": 5.456307945174751, "learning_rate": 1e-06, "logits/chosen": 34.810482025146484, "logits/rejected": 32.29673385620117, "logps/chosen": -432.9176025390625, "logps/rejected": -435.6657409667969, "loss": 0.4635, "rewards/accuracies": 0.625, "rewards/chosen": 0.004584426060318947, "rewards/margins": 0.03734596073627472, "rewards/rejected": -0.03276153653860092, "step": 28 }, { "debug/policy_chosen_logits": 28.973360061645508, "debug/policy_chosen_logps": -432.0859375, "debug/policy_rejected_logits": 27.616941452026367, "debug/policy_rejected_logps": -419.5810546875, "debug/reference_chosen_logps": -432.6524658203125, "debug/reference_rejected_logps": -413.3448181152344, "epoch": 0.6170212765957447, "grad_norm": 4.889651674840413, "learning_rate": 1e-06, "logits/chosen": 28.973360061645508, "logits/rejected": 27.616941452026367, "logps/chosen": -432.0859375, "logps/rejected": -419.5810546875, "loss": 0.4789, "rewards/accuracies": 0.625, "rewards/chosen": 0.005664861761033535, "rewards/margins": 0.06802742183208466, "rewards/rejected": -0.062362559139728546, "step": 29 }, { "debug/policy_chosen_logits": 28.820457458496094, "debug/policy_chosen_logps": -419.7233581542969, "debug/policy_rejected_logits": 30.256000518798828, "debug/policy_rejected_logps": -422.4107971191406, "debug/reference_chosen_logps": -423.3680725097656, "debug/reference_rejected_logps": -421.3091125488281, "epoch": 0.6382978723404256, "grad_norm": 5.137030977785722, "learning_rate": 1e-06, "logits/chosen": 28.820457458496094, "logits/rejected": 30.256000518798828, "logps/chosen": -419.7233581542969, "logps/rejected": -422.4107971191406, "loss": 0.4823, "rewards/accuracies": 0.625, "rewards/chosen": 0.03644702956080437, "rewards/margins": 0.047463756054639816, "rewards/rejected": -0.011016730219125748, "step": 30 }, { "debug/policy_chosen_logits": 29.283926010131836, "debug/policy_chosen_logps": -392.496826171875, "debug/policy_rejected_logits": 31.77328109741211, "debug/policy_rejected_logps": -434.51806640625, "debug/reference_chosen_logps": -395.81146240234375, "debug/reference_rejected_logps": -434.8221435546875, "epoch": 0.6595744680851063, "grad_norm": 4.951189622094444, "learning_rate": 1e-06, "logits/chosen": 29.283926010131836, "logits/rejected": 31.77328109741211, "logps/chosen": -392.496826171875, "logps/rejected": -434.51806640625, "loss": 0.4638, "rewards/accuracies": 0.5, "rewards/chosen": 0.033146705478429794, "rewards/margins": 0.03010578267276287, "rewards/rejected": 0.003040926530957222, "step": 31 }, { "debug/policy_chosen_logits": 29.353422164916992, "debug/policy_chosen_logps": -414.32415771484375, "debug/policy_rejected_logits": 30.822248458862305, "debug/policy_rejected_logps": -430.6376037597656, "debug/reference_chosen_logps": -415.54888916015625, "debug/reference_rejected_logps": -431.1400146484375, "epoch": 0.6808510638297872, "grad_norm": 4.999575032095535, "learning_rate": 1e-06, "logits/chosen": 29.353422164916992, "logits/rejected": 30.822248458862305, "logps/chosen": -414.32415771484375, "logps/rejected": -430.6376037597656, "loss": 0.4834, "rewards/accuracies": 0.625, "rewards/chosen": 0.012247240170836449, "rewards/margins": 0.007223015185445547, "rewards/rejected": 0.00502422172576189, "step": 32 }, { "debug/policy_chosen_logits": 27.81666374206543, "debug/policy_chosen_logps": -437.5671081542969, "debug/policy_rejected_logits": 29.937236785888672, "debug/policy_rejected_logps": -429.474853515625, "debug/reference_chosen_logps": -440.85504150390625, "debug/reference_rejected_logps": -431.23309326171875, "epoch": 0.7021276595744681, "grad_norm": 5.116678809536897, "learning_rate": 1e-06, "logits/chosen": 27.81666374206543, "logits/rejected": 29.937236785888672, "logps/chosen": -437.5671081542969, "logps/rejected": -429.474853515625, "loss": 0.4783, "rewards/accuracies": 0.5, "rewards/chosen": 0.03287952393293381, "rewards/margins": 0.01529712788760662, "rewards/rejected": 0.017582397907972336, "step": 33 }, { "debug/policy_chosen_logits": 32.981014251708984, "debug/policy_chosen_logps": -450.77899169921875, "debug/policy_rejected_logits": 29.245454788208008, "debug/policy_rejected_logps": -421.23468017578125, "debug/reference_chosen_logps": -454.22735595703125, "debug/reference_rejected_logps": -425.0584716796875, "epoch": 0.723404255319149, "grad_norm": 5.485713658124459, "learning_rate": 1e-06, "logits/chosen": 32.981014251708984, "logits/rejected": 29.245454788208008, "logps/chosen": -450.77899169921875, "logps/rejected": -421.23468017578125, "loss": 0.4736, "rewards/accuracies": 0.625, "rewards/chosen": 0.03448398783802986, "rewards/margins": -0.003754120320081711, "rewards/rejected": 0.038238104432821274, "step": 34 }, { "debug/policy_chosen_logits": 30.9548397064209, "debug/policy_chosen_logps": -427.41632080078125, "debug/policy_rejected_logits": 29.430871963500977, "debug/policy_rejected_logps": -436.83050537109375, "debug/reference_chosen_logps": -430.6944580078125, "debug/reference_rejected_logps": -433.17041015625, "epoch": 0.7446808510638298, "grad_norm": 5.73163901812078, "learning_rate": 1e-06, "logits/chosen": 30.9548397064209, "logits/rejected": 29.430871963500977, "logps/chosen": -427.41632080078125, "logps/rejected": -436.83050537109375, "loss": 0.4459, "rewards/accuracies": 0.875, "rewards/chosen": 0.03278125822544098, "rewards/margins": 0.06938225030899048, "rewards/rejected": -0.0366009883582592, "step": 35 }, { "debug/policy_chosen_logits": 28.33571434020996, "debug/policy_chosen_logps": -400.690673828125, "debug/policy_rejected_logits": 24.813756942749023, "debug/policy_rejected_logps": -445.54791259765625, "debug/reference_chosen_logps": -404.2062072753906, "debug/reference_rejected_logps": -433.4603576660156, "epoch": 0.7659574468085106, "grad_norm": 5.207209058021427, "learning_rate": 1e-06, "logits/chosen": 28.33571434020996, "logits/rejected": 24.813756942749023, "logps/chosen": -400.690673828125, "logps/rejected": -445.54791259765625, "loss": 0.4546, "rewards/accuracies": 0.875, "rewards/chosen": 0.03515518084168434, "rewards/margins": 0.15603074431419373, "rewards/rejected": -0.12087554484605789, "step": 36 }, { "debug/policy_chosen_logits": 29.32408332824707, "debug/policy_chosen_logps": -427.9587707519531, "debug/policy_rejected_logits": 27.91067123413086, "debug/policy_rejected_logps": -405.4839172363281, "debug/reference_chosen_logps": -427.9952392578125, "debug/reference_rejected_logps": -407.0904846191406, "epoch": 0.7872340425531915, "grad_norm": 5.293688440117633, "learning_rate": 1e-06, "logits/chosen": 29.32408332824707, "logits/rejected": 27.91067123413086, "logps/chosen": -427.9587707519531, "logps/rejected": -405.4839172363281, "loss": 0.5106, "rewards/accuracies": 0.625, "rewards/chosen": 0.00036445818841457367, "rewards/margins": -0.01570144295692444, "rewards/rejected": 0.01606590300798416, "step": 37 }, { "debug/policy_chosen_logits": 28.01889419555664, "debug/policy_chosen_logps": -404.5608825683594, "debug/policy_rejected_logits": 28.02815055847168, "debug/policy_rejected_logps": -413.2740173339844, "debug/reference_chosen_logps": -405.4373779296875, "debug/reference_rejected_logps": -406.2366027832031, "epoch": 0.8085106382978723, "grad_norm": 4.841695423817661, "learning_rate": 1e-06, "logits/chosen": 28.01889419555664, "logits/rejected": 28.02815055847168, "logps/chosen": -404.5608825683594, "logps/rejected": -413.2740173339844, "loss": 0.4719, "rewards/accuracies": 0.875, "rewards/chosen": 0.00876510702073574, "rewards/margins": 0.07913925498723984, "rewards/rejected": -0.07037414610385895, "step": 38 }, { "debug/policy_chosen_logits": 26.369901657104492, "debug/policy_chosen_logps": -418.9932556152344, "debug/policy_rejected_logits": 24.119754791259766, "debug/policy_rejected_logps": -405.0431213378906, "debug/reference_chosen_logps": -421.83203125, "debug/reference_rejected_logps": -404.04913330078125, "epoch": 0.8297872340425532, "grad_norm": 5.464322991896784, "learning_rate": 1e-06, "logits/chosen": 26.369901657104492, "logits/rejected": 24.119754791259766, "logps/chosen": -418.9932556152344, "logps/rejected": -405.0431213378906, "loss": 0.4666, "rewards/accuracies": 0.625, "rewards/chosen": 0.028387565165758133, "rewards/margins": 0.03832760080695152, "rewards/rejected": -0.009940031915903091, "step": 39 }, { "debug/policy_chosen_logits": 31.222116470336914, "debug/policy_chosen_logps": -408.92315673828125, "debug/policy_rejected_logits": 30.565526962280273, "debug/policy_rejected_logps": -438.00384521484375, "debug/reference_chosen_logps": -410.3876037597656, "debug/reference_rejected_logps": -428.6034851074219, "epoch": 0.851063829787234, "grad_norm": 5.087286245449022, "learning_rate": 1e-06, "logits/chosen": 31.222116470336914, "logits/rejected": 30.565526962280273, "logps/chosen": -408.92315673828125, "logps/rejected": -438.00384521484375, "loss": 0.4691, "rewards/accuracies": 0.75, "rewards/chosen": 0.014644507318735123, "rewards/margins": 0.10864795744419098, "rewards/rejected": -0.09400344640016556, "step": 40 }, { "debug/policy_chosen_logits": 29.859107971191406, "debug/policy_chosen_logps": -403.02581787109375, "debug/policy_rejected_logits": 23.891035079956055, "debug/policy_rejected_logps": -395.781005859375, "debug/reference_chosen_logps": -409.97967529296875, "debug/reference_rejected_logps": -398.636474609375, "epoch": 0.8723404255319149, "grad_norm": 4.953135237737809, "learning_rate": 1e-06, "logits/chosen": 29.859107971191406, "logits/rejected": 23.891035079956055, "logps/chosen": -403.02581787109375, "logps/rejected": -395.781005859375, "loss": 0.458, "rewards/accuracies": 0.75, "rewards/chosen": 0.06953833997249603, "rewards/margins": 0.04098331928253174, "rewards/rejected": 0.02855503186583519, "step": 41 }, { "debug/policy_chosen_logits": 31.02197265625, "debug/policy_chosen_logps": -424.50299072265625, "debug/policy_rejected_logits": 29.632427215576172, "debug/policy_rejected_logps": -422.94561767578125, "debug/reference_chosen_logps": -429.08648681640625, "debug/reference_rejected_logps": -424.3019104003906, "epoch": 0.8936170212765957, "grad_norm": 5.514556663745466, "learning_rate": 1e-06, "logits/chosen": 31.02197265625, "logits/rejected": 29.632427215576172, "logps/chosen": -424.50299072265625, "logps/rejected": -422.94561767578125, "loss": 0.4735, "rewards/accuracies": 0.75, "rewards/chosen": 0.04583461582660675, "rewards/margins": 0.032271310687065125, "rewards/rejected": 0.01356330793350935, "step": 42 }, { "debug/policy_chosen_logits": 32.29981231689453, "debug/policy_chosen_logps": -437.9972229003906, "debug/policy_rejected_logits": 30.15468978881836, "debug/policy_rejected_logps": -440.30535888671875, "debug/reference_chosen_logps": -436.08892822265625, "debug/reference_rejected_logps": -434.7149353027344, "epoch": 0.9148936170212766, "grad_norm": 4.808288177536615, "learning_rate": 1e-06, "logits/chosen": 32.29981231689453, "logits/rejected": 30.15468978881836, "logps/chosen": -437.9972229003906, "logps/rejected": -440.30535888671875, "loss": 0.4598, "rewards/accuracies": 0.375, "rewards/chosen": -0.01908310130238533, "rewards/margins": 0.03682101517915726, "rewards/rejected": -0.055904120206832886, "step": 43 }, { "debug/policy_chosen_logits": 26.440486907958984, "debug/policy_chosen_logps": -391.61572265625, "debug/policy_rejected_logits": 29.678592681884766, "debug/policy_rejected_logps": -430.9735412597656, "debug/reference_chosen_logps": -396.64862060546875, "debug/reference_rejected_logps": -434.947998046875, "epoch": 0.9361702127659575, "grad_norm": 5.150941817603919, "learning_rate": 1e-06, "logits/chosen": 26.440486907958984, "logits/rejected": 29.678592681884766, "logps/chosen": -391.61572265625, "logps/rejected": -430.9735412597656, "loss": 0.4568, "rewards/accuracies": 0.75, "rewards/chosen": 0.050329361110925674, "rewards/margins": 0.010584792122244835, "rewards/rejected": 0.03974456712603569, "step": 44 }, { "debug/policy_chosen_logits": 29.451526641845703, "debug/policy_chosen_logps": -425.8896484375, "debug/policy_rejected_logits": 32.46401596069336, "debug/policy_rejected_logps": -428.1052551269531, "debug/reference_chosen_logps": -424.65936279296875, "debug/reference_rejected_logps": -427.16961669921875, "epoch": 0.9574468085106383, "grad_norm": 5.012447008649623, "learning_rate": 1e-06, "logits/chosen": 29.451526641845703, "logits/rejected": 32.46401596069336, "logps/chosen": -425.8896484375, "logps/rejected": -428.1052551269531, "loss": 0.4777, "rewards/accuracies": 0.5, "rewards/chosen": -0.012303046882152557, "rewards/margins": -0.0029468159191310406, "rewards/rejected": -0.009356231428682804, "step": 45 }, { "debug/policy_chosen_logits": 31.168346405029297, "debug/policy_chosen_logps": -426.1267395019531, "debug/policy_rejected_logits": 29.51166534423828, "debug/policy_rejected_logps": -444.91766357421875, "debug/reference_chosen_logps": -429.6617736816406, "debug/reference_rejected_logps": -439.4256591796875, "epoch": 0.9787234042553191, "grad_norm": 5.030625016447312, "learning_rate": 1e-06, "logits/chosen": 31.168346405029297, "logits/rejected": 29.51166534423828, "logps/chosen": -426.1267395019531, "logps/rejected": -444.91766357421875, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 0.03535018861293793, "rewards/margins": 0.09027023613452911, "rewards/rejected": -0.05492004379630089, "step": 46 }, { "debug/policy_chosen_logits": 28.43193244934082, "debug/policy_chosen_logps": -419.4750671386719, "debug/policy_rejected_logits": 27.273754119873047, "debug/policy_rejected_logps": -438.4751892089844, "debug/reference_chosen_logps": -421.2059326171875, "debug/reference_rejected_logps": -437.2878723144531, "epoch": 1.0, "grad_norm": 5.486914446149956, "learning_rate": 1e-06, "logits/chosen": 28.43193244934082, "logits/rejected": 27.273754119873047, "logps/chosen": -419.4750671386719, "logps/rejected": -438.4751892089844, "loss": 0.4596, "rewards/accuracies": 0.75, "rewards/chosen": 0.01730876788496971, "rewards/margins": 0.02918224036693573, "rewards/rejected": -0.011873474344611168, "step": 47 }, { "epoch": 1.0, "step": 47, "total_flos": 0.0, "train_loss": 0.48215872493196044, "train_runtime": 474.4995, "train_samples_per_second": 6.327, "train_steps_per_second": 0.099 } ], "logging_steps": 1, "max_steps": 47, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }