{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 91, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 22.537443161010742, "debug/policy_chosen_logps": -454.7864685058594, "debug/policy_rejected_logits": 24.08443260192871, "debug/policy_rejected_logps": -485.6905517578125, "debug/reference_chosen_logps": -454.7864685058594, "debug/reference_rejected_logps": -485.6905517578125, "epoch": 0.01098901098901099, "grad_norm": 6.833481499698497, "learning_rate": 1e-06, "logits/chosen": 22.537443161010742, "logits/rejected": 24.08443260192871, "logps/chosen": -454.7864685058594, "logps/rejected": -485.6905517578125, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 25.2491512298584, "debug/policy_chosen_logps": -426.63800048828125, "debug/policy_rejected_logits": 25.061065673828125, "debug/policy_rejected_logps": -446.8348083496094, "debug/reference_chosen_logps": -426.80908203125, "debug/reference_rejected_logps": -446.17877197265625, "epoch": 0.02197802197802198, "grad_norm": 8.518360012009484, "learning_rate": 1e-06, "logits/chosen": 25.2491512298584, "logits/rejected": 25.061065673828125, "logps/chosen": -426.63800048828125, "logps/rejected": -446.8348083496094, "loss": 0.5003, "rewards/accuracies": 0.625, "rewards/chosen": 0.0017108535394072533, "rewards/margins": 0.008270950056612492, "rewards/rejected": -0.006560096517205238, "step": 2 }, { "debug/policy_chosen_logits": 23.04501724243164, "debug/policy_chosen_logps": -418.5323791503906, "debug/policy_rejected_logits": 26.21137809753418, "debug/policy_rejected_logps": -445.2972412109375, "debug/reference_chosen_logps": -418.4830627441406, "debug/reference_rejected_logps": -445.174560546875, "epoch": 0.03296703296703297, "grad_norm": 8.234274066877381, "learning_rate": 1e-06, "logits/chosen": 23.04501724243164, "logits/rejected": 26.21137809753418, "logps/chosen": -418.5323791503906, "logps/rejected": -445.2972412109375, "loss": 0.497, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004932782612740993, "rewards/margins": 0.0007336426060646772, "rewards/rejected": -0.001226921333000064, "step": 3 }, { "debug/policy_chosen_logits": 22.864124298095703, "debug/policy_chosen_logps": -424.31500244140625, "debug/policy_rejected_logits": 24.804826736450195, "debug/policy_rejected_logps": -440.4050598144531, "debug/reference_chosen_logps": -424.2529296875, "debug/reference_rejected_logps": -440.0723876953125, "epoch": 0.04395604395604396, "grad_norm": 8.267600116778256, "learning_rate": 1e-06, "logits/chosen": 22.864124298095703, "logits/rejected": 24.804826736450195, "logps/chosen": -424.31500244140625, "logps/rejected": -440.4050598144531, "loss": 0.4948, "rewards/accuracies": 0.625, "rewards/chosen": -0.000620613107457757, "rewards/margins": 0.0027059551794081926, "rewards/rejected": -0.0033265682868659496, "step": 4 }, { "debug/policy_chosen_logits": 23.74032211303711, "debug/policy_chosen_logps": -425.2148132324219, "debug/policy_rejected_logits": 24.66983985900879, "debug/policy_rejected_logps": -428.9949951171875, "debug/reference_chosen_logps": -425.707763671875, "debug/reference_rejected_logps": -427.61578369140625, "epoch": 0.054945054945054944, "grad_norm": 7.1377920370536465, "learning_rate": 1e-06, "logits/chosen": 23.74032211303711, "logits/rejected": 24.66983985900879, "logps/chosen": -425.2148132324219, "logps/rejected": -428.9949951171875, "loss": 0.4915, "rewards/accuracies": 0.75, "rewards/chosen": 0.004929542541503906, "rewards/margins": 0.018722152337431908, "rewards/rejected": -0.013792609795928001, "step": 5 }, { "debug/policy_chosen_logits": 22.908384323120117, "debug/policy_chosen_logps": -436.1788635253906, "debug/policy_rejected_logits": 26.066926956176758, "debug/policy_rejected_logps": -458.21295166015625, "debug/reference_chosen_logps": -437.0372619628906, "debug/reference_rejected_logps": -455.97064208984375, "epoch": 0.06593406593406594, "grad_norm": 6.840813228978009, "learning_rate": 1e-06, "logits/chosen": 22.908384323120117, "logits/rejected": 26.066926956176758, "logps/chosen": -436.1788635253906, "logps/rejected": -458.21295166015625, "loss": 0.4897, "rewards/accuracies": 0.75, "rewards/chosen": 0.008583908900618553, "rewards/margins": 0.031006699427962303, "rewards/rejected": -0.02242279052734375, "step": 6 }, { "debug/policy_chosen_logits": 27.81578826904297, "debug/policy_chosen_logps": -456.6412353515625, "debug/policy_rejected_logits": 22.0266170501709, "debug/policy_rejected_logps": -461.65155029296875, "debug/reference_chosen_logps": -457.8715515136719, "debug/reference_rejected_logps": -459.9510498046875, "epoch": 0.07692307692307693, "grad_norm": 7.815334550310639, "learning_rate": 1e-06, "logits/chosen": 27.81578826904297, "logits/rejected": 22.0266170501709, "logps/chosen": -456.6412353515625, "logps/rejected": -461.65155029296875, "loss": 0.4758, "rewards/accuracies": 0.875, "rewards/chosen": 0.012303275987505913, "rewards/margins": 0.029308240860700607, "rewards/rejected": -0.017004966735839844, "step": 7 }, { "debug/policy_chosen_logits": 21.05910301208496, "debug/policy_chosen_logps": -420.5210876464844, "debug/policy_rejected_logits": 21.690637588500977, "debug/policy_rejected_logps": -461.24737548828125, "debug/reference_chosen_logps": -421.00885009765625, "debug/reference_rejected_logps": -459.6142578125, "epoch": 0.08791208791208792, "grad_norm": 6.940663555739121, "learning_rate": 1e-06, "logits/chosen": 21.05910301208496, "logits/rejected": 21.690637588500977, "logps/chosen": -420.5210876464844, "logps/rejected": -461.24737548828125, "loss": 0.4748, "rewards/accuracies": 0.5, "rewards/chosen": 0.004877815023064613, "rewards/margins": 0.02120864763855934, "rewards/rejected": -0.016330832615494728, "step": 8 }, { "debug/policy_chosen_logits": 25.456287384033203, "debug/policy_chosen_logps": -440.22540283203125, "debug/policy_rejected_logits": 26.438932418823242, "debug/policy_rejected_logps": -419.1558837890625, "debug/reference_chosen_logps": -439.4281921386719, "debug/reference_rejected_logps": -417.99652099609375, "epoch": 0.0989010989010989, "grad_norm": 5.7812586478558865, "learning_rate": 1e-06, "logits/chosen": 25.456287384033203, "logits/rejected": 26.438932418823242, "logps/chosen": -440.22540283203125, "logps/rejected": -419.1558837890625, "loss": 0.4769, "rewards/accuracies": 0.5, "rewards/chosen": -0.007972106337547302, "rewards/margins": 0.003621369134634733, "rewards/rejected": -0.011593475937843323, "step": 9 }, { "debug/policy_chosen_logits": 22.315250396728516, "debug/policy_chosen_logps": -425.8302001953125, "debug/policy_rejected_logits": 28.707416534423828, "debug/policy_rejected_logps": -453.5276184082031, "debug/reference_chosen_logps": -427.7785339355469, "debug/reference_rejected_logps": -450.7994079589844, "epoch": 0.10989010989010989, "grad_norm": 6.765369287722621, "learning_rate": 1e-06, "logits/chosen": 22.315250396728516, "logits/rejected": 28.707416534423828, "logps/chosen": -425.8302001953125, "logps/rejected": -453.5276184082031, "loss": 0.4488, "rewards/accuracies": 0.75, "rewards/chosen": 0.019483529031276703, "rewards/margins": 0.04676567018032074, "rewards/rejected": -0.027282143011689186, "step": 10 }, { "debug/policy_chosen_logits": 23.35513687133789, "debug/policy_chosen_logps": -409.0987548828125, "debug/policy_rejected_logits": 25.19911003112793, "debug/policy_rejected_logps": -457.9080810546875, "debug/reference_chosen_logps": -411.0341796875, "debug/reference_rejected_logps": -452.00286865234375, "epoch": 0.12087912087912088, "grad_norm": 6.194229784289876, "learning_rate": 1e-06, "logits/chosen": 23.35513687133789, "logits/rejected": 25.19911003112793, "logps/chosen": -409.0987548828125, "logps/rejected": -457.9080810546875, "loss": 0.4512, "rewards/accuracies": 1.0, "rewards/chosen": 0.01935398206114769, "rewards/margins": 0.07840606570243835, "rewards/rejected": -0.059052083641290665, "step": 11 }, { "debug/policy_chosen_logits": 24.74183464050293, "debug/policy_chosen_logps": -443.5227355957031, "debug/policy_rejected_logits": 23.557279586791992, "debug/policy_rejected_logps": -438.05023193359375, "debug/reference_chosen_logps": -444.56549072265625, "debug/reference_rejected_logps": -433.29443359375, "epoch": 0.13186813186813187, "grad_norm": 6.295588890883081, "learning_rate": 1e-06, "logits/chosen": 24.74183464050293, "logits/rejected": 23.557279586791992, "logps/chosen": -443.5227355957031, "logps/rejected": -438.05023193359375, "loss": 0.444, "rewards/accuracies": 0.875, "rewards/chosen": 0.010427666828036308, "rewards/margins": 0.057986069470644, "rewards/rejected": -0.04755840077996254, "step": 12 }, { "debug/policy_chosen_logits": 25.26060676574707, "debug/policy_chosen_logps": -401.9018249511719, "debug/policy_rejected_logits": 25.899250030517578, "debug/policy_rejected_logps": -447.271240234375, "debug/reference_chosen_logps": -407.1501159667969, "debug/reference_rejected_logps": -443.3197326660156, "epoch": 0.14285714285714285, "grad_norm": 6.651428295339092, "learning_rate": 1e-06, "logits/chosen": 25.26060676574707, "logits/rejected": 25.899250030517578, "logps/chosen": -401.9018249511719, "logps/rejected": -447.271240234375, "loss": 0.4438, "rewards/accuracies": 0.875, "rewards/chosen": 0.05248305946588516, "rewards/margins": 0.0919981375336647, "rewards/rejected": -0.03951507434248924, "step": 13 }, { "debug/policy_chosen_logits": 27.350528717041016, "debug/policy_chosen_logps": -431.6693115234375, "debug/policy_rejected_logits": 27.774742126464844, "debug/policy_rejected_logps": -437.4266052246094, "debug/reference_chosen_logps": -437.1637268066406, "debug/reference_rejected_logps": -433.93536376953125, "epoch": 0.15384615384615385, "grad_norm": 6.44432847940225, "learning_rate": 1e-06, "logits/chosen": 27.350528717041016, "logits/rejected": 27.774742126464844, "logps/chosen": -431.6693115234375, "logps/rejected": -437.4266052246094, "loss": 0.4203, "rewards/accuracies": 0.75, "rewards/chosen": 0.05494399741292, "rewards/margins": 0.08985621482133865, "rewards/rejected": -0.034912221133708954, "step": 14 }, { "debug/policy_chosen_logits": 26.464439392089844, "debug/policy_chosen_logps": -441.0960998535156, "debug/policy_rejected_logits": 22.606828689575195, "debug/policy_rejected_logps": -450.34637451171875, "debug/reference_chosen_logps": -443.2272033691406, "debug/reference_rejected_logps": -445.57098388671875, "epoch": 0.16483516483516483, "grad_norm": 6.034773405123565, "learning_rate": 1e-06, "logits/chosen": 26.464439392089844, "logits/rejected": 22.606828689575195, "logps/chosen": -441.0960998535156, "logps/rejected": -450.34637451171875, "loss": 0.4508, "rewards/accuracies": 0.5, "rewards/chosen": 0.021311109885573387, "rewards/margins": 0.06906504929065704, "rewards/rejected": -0.047753941267728806, "step": 15 }, { "debug/policy_chosen_logits": 23.063968658447266, "debug/policy_chosen_logps": -430.00677490234375, "debug/policy_rejected_logits": 30.106121063232422, "debug/policy_rejected_logps": -459.87164306640625, "debug/reference_chosen_logps": -435.9080810546875, "debug/reference_rejected_logps": -454.621337890625, "epoch": 0.17582417582417584, "grad_norm": 5.694131268125543, "learning_rate": 1e-06, "logits/chosen": 23.063968658447266, "logits/rejected": 30.106121063232422, "logps/chosen": -430.00677490234375, "logps/rejected": -459.87164306640625, "loss": 0.4059, "rewards/accuracies": 0.875, "rewards/chosen": 0.05901290848851204, "rewards/margins": 0.11151611059904099, "rewards/rejected": -0.05250319838523865, "step": 16 }, { "debug/policy_chosen_logits": 29.839580535888672, "debug/policy_chosen_logps": -436.97039794921875, "debug/policy_rejected_logits": 28.98643684387207, "debug/policy_rejected_logps": -476.59881591796875, "debug/reference_chosen_logps": -443.8123779296875, "debug/reference_rejected_logps": -466.94873046875, "epoch": 0.18681318681318682, "grad_norm": 6.738937580645961, "learning_rate": 1e-06, "logits/chosen": 29.839580535888672, "logits/rejected": 28.98643684387207, "logps/chosen": -436.97039794921875, "logps/rejected": -476.59881591796875, "loss": 0.4118, "rewards/accuracies": 1.0, "rewards/chosen": 0.06841972470283508, "rewards/margins": 0.16492080688476562, "rewards/rejected": -0.09650108218193054, "step": 17 }, { "debug/policy_chosen_logits": 29.84876823425293, "debug/policy_chosen_logps": -441.1398620605469, "debug/policy_rejected_logits": 21.603878021240234, "debug/policy_rejected_logps": -443.14556884765625, "debug/reference_chosen_logps": -448.20770263671875, "debug/reference_rejected_logps": -441.1372375488281, "epoch": 0.1978021978021978, "grad_norm": 5.137388346436403, "learning_rate": 1e-06, "logits/chosen": 29.84876823425293, "logits/rejected": 21.603878021240234, "logps/chosen": -441.1398620605469, "logps/rejected": -443.14556884765625, "loss": 0.4207, "rewards/accuracies": 0.75, "rewards/chosen": 0.07067855447530746, "rewards/margins": 0.09076160192489624, "rewards/rejected": -0.020083043724298477, "step": 18 }, { "debug/policy_chosen_logits": 26.974803924560547, "debug/policy_chosen_logps": -453.483642578125, "debug/policy_rejected_logits": 26.663101196289062, "debug/policy_rejected_logps": -489.6366271972656, "debug/reference_chosen_logps": -460.9517822265625, "debug/reference_rejected_logps": -470.0023193359375, "epoch": 0.2087912087912088, "grad_norm": 7.17158785388267, "learning_rate": 1e-06, "logits/chosen": 26.974803924560547, "logits/rejected": 26.663101196289062, "logps/chosen": -453.483642578125, "logps/rejected": -489.6366271972656, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 0.07468143105506897, "rewards/margins": 0.271024614572525, "rewards/rejected": -0.19634319841861725, "step": 19 }, { "debug/policy_chosen_logits": 25.026630401611328, "debug/policy_chosen_logps": -433.65960693359375, "debug/policy_rejected_logits": 25.77982521057129, "debug/policy_rejected_logps": -471.0029296875, "debug/reference_chosen_logps": -436.43359375, "debug/reference_rejected_logps": -460.9761962890625, "epoch": 0.21978021978021978, "grad_norm": 4.898911355829737, "learning_rate": 1e-06, "logits/chosen": 25.026630401611328, "logits/rejected": 25.77982521057129, "logps/chosen": -433.65960693359375, "logps/rejected": -471.0029296875, "loss": 0.3395, "rewards/accuracies": 0.875, "rewards/chosen": 0.027739638462662697, "rewards/margins": 0.12800678610801697, "rewards/rejected": -0.10026714205741882, "step": 20 }, { "debug/policy_chosen_logits": 25.650062561035156, "debug/policy_chosen_logps": -435.83905029296875, "debug/policy_rejected_logits": 24.800756454467773, "debug/policy_rejected_logps": -481.5117492675781, "debug/reference_chosen_logps": -443.297607421875, "debug/reference_rejected_logps": -455.4820556640625, "epoch": 0.23076923076923078, "grad_norm": 5.689326640738984, "learning_rate": 1e-06, "logits/chosen": 25.650062561035156, "logits/rejected": 24.800756454467773, "logps/chosen": -435.83905029296875, "logps/rejected": -481.5117492675781, "loss": 0.3623, "rewards/accuracies": 0.875, "rewards/chosen": 0.07458514720201492, "rewards/margins": 0.3348817229270935, "rewards/rejected": -0.2602965533733368, "step": 21 }, { "debug/policy_chosen_logits": 22.519716262817383, "debug/policy_chosen_logps": -405.88226318359375, "debug/policy_rejected_logits": 27.31670570373535, "debug/policy_rejected_logps": -470.7935791015625, "debug/reference_chosen_logps": -416.4732360839844, "debug/reference_rejected_logps": -464.93170166015625, "epoch": 0.24175824175824176, "grad_norm": 4.9841849550976765, "learning_rate": 1e-06, "logits/chosen": 22.519716262817383, "logits/rejected": 27.31670570373535, "logps/chosen": -405.88226318359375, "logps/rejected": -470.7935791015625, "loss": 0.3153, "rewards/accuracies": 0.75, "rewards/chosen": 0.10590964555740356, "rewards/margins": 0.16452865302562714, "rewards/rejected": -0.058619000017642975, "step": 22 }, { "debug/policy_chosen_logits": 25.54558753967285, "debug/policy_chosen_logps": -427.68450927734375, "debug/policy_rejected_logits": 28.938701629638672, "debug/policy_rejected_logps": -456.0726623535156, "debug/reference_chosen_logps": -445.5054626464844, "debug/reference_rejected_logps": -441.0499267578125, "epoch": 0.25274725274725274, "grad_norm": 4.453741538704258, "learning_rate": 1e-06, "logits/chosen": 25.54558753967285, "logits/rejected": 28.938701629638672, "logps/chosen": -427.68450927734375, "logps/rejected": -456.0726623535156, "loss": 0.3437, "rewards/accuracies": 0.875, "rewards/chosen": 0.17820952832698822, "rewards/margins": 0.328436940908432, "rewards/rejected": -0.1502273976802826, "step": 23 }, { "debug/policy_chosen_logits": 24.3718318939209, "debug/policy_chosen_logps": -407.55877685546875, "debug/policy_rejected_logits": 25.362825393676758, "debug/policy_rejected_logps": -428.3117370605469, "debug/reference_chosen_logps": -422.75518798828125, "debug/reference_rejected_logps": -419.3153381347656, "epoch": 0.26373626373626374, "grad_norm": 5.345678599763925, "learning_rate": 1e-06, "logits/chosen": 24.3718318939209, "logits/rejected": 25.362825393676758, "logps/chosen": -407.55877685546875, "logps/rejected": -428.3117370605469, "loss": 0.3313, "rewards/accuracies": 0.875, "rewards/chosen": 0.1519637256860733, "rewards/margins": 0.24192774295806885, "rewards/rejected": -0.08996403217315674, "step": 24 }, { "debug/policy_chosen_logits": 22.36195945739746, "debug/policy_chosen_logps": -407.60107421875, "debug/policy_rejected_logits": 21.827068328857422, "debug/policy_rejected_logps": -436.4155578613281, "debug/reference_chosen_logps": -428.3585205078125, "debug/reference_rejected_logps": -423.5442199707031, "epoch": 0.27472527472527475, "grad_norm": 6.983328118149161, "learning_rate": 1e-06, "logits/chosen": 22.36195945739746, "logits/rejected": 21.827068328857422, "logps/chosen": -407.60107421875, "logps/rejected": -436.4155578613281, "loss": 0.3456, "rewards/accuracies": 0.75, "rewards/chosen": 0.20757438242435455, "rewards/margins": 0.3362876772880554, "rewards/rejected": -0.12871329486370087, "step": 25 }, { "debug/policy_chosen_logits": 24.60111427307129, "debug/policy_chosen_logps": -424.568115234375, "debug/policy_rejected_logits": 27.44658088684082, "debug/policy_rejected_logps": -500.7226867675781, "debug/reference_chosen_logps": -442.6761779785156, "debug/reference_rejected_logps": -472.3526916503906, "epoch": 0.2857142857142857, "grad_norm": 4.621624916905314, "learning_rate": 1e-06, "logits/chosen": 24.60111427307129, "logits/rejected": 27.44658088684082, "logps/chosen": -424.568115234375, "logps/rejected": -500.7226867675781, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 0.18108057975769043, "rewards/margins": 0.46478039026260376, "rewards/rejected": -0.28369981050491333, "step": 26 }, { "debug/policy_chosen_logits": 24.04281234741211, "debug/policy_chosen_logps": -420.4295349121094, "debug/policy_rejected_logits": 24.854686737060547, "debug/policy_rejected_logps": -453.5174560546875, "debug/reference_chosen_logps": -440.0169677734375, "debug/reference_rejected_logps": -441.5909118652344, "epoch": 0.2967032967032967, "grad_norm": 4.722287417720129, "learning_rate": 1e-06, "logits/chosen": 24.04281234741211, "logits/rejected": 24.854686737060547, "logps/chosen": -420.4295349121094, "logps/rejected": -453.5174560546875, "loss": 0.3288, "rewards/accuracies": 1.0, "rewards/chosen": 0.19587397575378418, "rewards/margins": 0.3151397109031677, "rewards/rejected": -0.11926574259996414, "step": 27 }, { "debug/policy_chosen_logits": 26.117721557617188, "debug/policy_chosen_logps": -437.7601318359375, "debug/policy_rejected_logits": 28.745031356811523, "debug/policy_rejected_logps": -477.95904541015625, "debug/reference_chosen_logps": -448.17889404296875, "debug/reference_rejected_logps": -449.2839050292969, "epoch": 0.3076923076923077, "grad_norm": 4.51629611753504, "learning_rate": 1e-06, "logits/chosen": 26.117721557617188, "logits/rejected": 28.745031356811523, "logps/chosen": -437.7601318359375, "logps/rejected": -477.95904541015625, "loss": 0.3783, "rewards/accuracies": 1.0, "rewards/chosen": 0.10418765991926193, "rewards/margins": 0.3909391462802887, "rewards/rejected": -0.28675150871276855, "step": 28 }, { "debug/policy_chosen_logits": 28.643272399902344, "debug/policy_chosen_logps": -445.78875732421875, "debug/policy_rejected_logits": 30.509065628051758, "debug/policy_rejected_logps": -490.1319274902344, "debug/reference_chosen_logps": -458.77532958984375, "debug/reference_rejected_logps": -467.02667236328125, "epoch": 0.31868131868131866, "grad_norm": 5.726900857769083, "learning_rate": 1e-06, "logits/chosen": 28.643272399902344, "logits/rejected": 30.509065628051758, "logps/chosen": -445.78875732421875, "logps/rejected": -490.1319274902344, "loss": 0.3291, "rewards/accuracies": 0.875, "rewards/chosen": 0.1298656463623047, "rewards/margins": 0.3609181046485901, "rewards/rejected": -0.2310524582862854, "step": 29 }, { "debug/policy_chosen_logits": 23.9057674407959, "debug/policy_chosen_logps": -429.0777587890625, "debug/policy_rejected_logits": 27.041988372802734, "debug/policy_rejected_logps": -489.99102783203125, "debug/reference_chosen_logps": -435.3116149902344, "debug/reference_rejected_logps": -468.8659973144531, "epoch": 0.32967032967032966, "grad_norm": 5.589091335127157, "learning_rate": 1e-06, "logits/chosen": 23.9057674407959, "logits/rejected": 27.041988372802734, "logps/chosen": -429.0777587890625, "logps/rejected": -489.99102783203125, "loss": 0.3149, "rewards/accuracies": 0.75, "rewards/chosen": 0.06233878806233406, "rewards/margins": 0.2735891342163086, "rewards/rejected": -0.21125034987926483, "step": 30 }, { "debug/policy_chosen_logits": 20.52339744567871, "debug/policy_chosen_logps": -417.83367919921875, "debug/policy_rejected_logits": 25.475711822509766, "debug/policy_rejected_logps": -477.3974914550781, "debug/reference_chosen_logps": -429.0983581542969, "debug/reference_rejected_logps": -454.0728759765625, "epoch": 0.34065934065934067, "grad_norm": 5.919614811999328, "learning_rate": 1e-06, "logits/chosen": 20.52339744567871, "logits/rejected": 25.475711822509766, "logps/chosen": -417.83367919921875, "logps/rejected": -477.3974914550781, "loss": 0.2959, "rewards/accuracies": 0.625, "rewards/chosen": 0.11264674365520477, "rewards/margins": 0.34589269757270813, "rewards/rejected": -0.23324593901634216, "step": 31 }, { "debug/policy_chosen_logits": 21.121295928955078, "debug/policy_chosen_logps": -410.53643798828125, "debug/policy_rejected_logits": 27.618637084960938, "debug/policy_rejected_logps": -471.409912109375, "debug/reference_chosen_logps": -427.6445007324219, "debug/reference_rejected_logps": -452.9000244140625, "epoch": 0.3516483516483517, "grad_norm": 6.235065911009125, "learning_rate": 1e-06, "logits/chosen": 21.121295928955078, "logits/rejected": 27.618637084960938, "logps/chosen": -410.53643798828125, "logps/rejected": -471.409912109375, "loss": 0.3322, "rewards/accuracies": 0.75, "rewards/chosen": 0.1710808128118515, "rewards/margins": 0.35617977380752563, "rewards/rejected": -0.18509894609451294, "step": 32 }, { "debug/policy_chosen_logits": 25.020137786865234, "debug/policy_chosen_logps": -421.2449951171875, "debug/policy_rejected_logits": 26.06194305419922, "debug/policy_rejected_logps": -455.33154296875, "debug/reference_chosen_logps": -445.73223876953125, "debug/reference_rejected_logps": -430.2513427734375, "epoch": 0.3626373626373626, "grad_norm": 4.220132361607984, "learning_rate": 1e-06, "logits/chosen": 25.020137786865234, "logits/rejected": 26.06194305419922, "logps/chosen": -421.2449951171875, "logps/rejected": -455.33154296875, "loss": 0.2825, "rewards/accuracies": 1.0, "rewards/chosen": 0.24487201869487762, "rewards/margins": 0.4956740140914917, "rewards/rejected": -0.25080201029777527, "step": 33 }, { "debug/policy_chosen_logits": 28.49104881286621, "debug/policy_chosen_logps": -432.42578125, "debug/policy_rejected_logits": 26.045143127441406, "debug/policy_rejected_logps": -500.81494140625, "debug/reference_chosen_logps": -455.1983642578125, "debug/reference_rejected_logps": -481.32403564453125, "epoch": 0.37362637362637363, "grad_norm": 4.102855456579325, "learning_rate": 1e-06, "logits/chosen": 28.49104881286621, "logits/rejected": 26.045143127441406, "logps/chosen": -432.42578125, "logps/rejected": -500.81494140625, "loss": 0.2778, "rewards/accuracies": 0.75, "rewards/chosen": 0.22772569954395294, "rewards/margins": 0.422635018825531, "rewards/rejected": -0.19490931928157806, "step": 34 }, { "debug/policy_chosen_logits": 23.481491088867188, "debug/policy_chosen_logps": -410.7614440917969, "debug/policy_rejected_logits": 26.54258155822754, "debug/policy_rejected_logps": -461.57659912109375, "debug/reference_chosen_logps": -431.06304931640625, "debug/reference_rejected_logps": -428.4720458984375, "epoch": 0.38461538461538464, "grad_norm": 3.930918282446488, "learning_rate": 1e-06, "logits/chosen": 23.481491088867188, "logits/rejected": 26.54258155822754, "logps/chosen": -410.7614440917969, "logps/rejected": -461.57659912109375, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": 0.20301616191864014, "rewards/margins": 0.5340613126754761, "rewards/rejected": -0.33104515075683594, "step": 35 }, { "debug/policy_chosen_logits": 24.361486434936523, "debug/policy_chosen_logps": -412.67987060546875, "debug/policy_rejected_logits": 27.827251434326172, "debug/policy_rejected_logps": -452.28857421875, "debug/reference_chosen_logps": -430.6239318847656, "debug/reference_rejected_logps": -440.8873596191406, "epoch": 0.3956043956043956, "grad_norm": 5.7094820058187645, "learning_rate": 1e-06, "logits/chosen": 24.361486434936523, "logits/rejected": 27.827251434326172, "logps/chosen": -412.67987060546875, "logps/rejected": -452.28857421875, "loss": 0.3088, "rewards/accuracies": 0.625, "rewards/chosen": 0.1794406771659851, "rewards/margins": 0.29345306754112244, "rewards/rejected": -0.11401237547397614, "step": 36 }, { "debug/policy_chosen_logits": 26.789180755615234, "debug/policy_chosen_logps": -419.154296875, "debug/policy_rejected_logits": 24.381994247436523, "debug/policy_rejected_logps": -443.3766174316406, "debug/reference_chosen_logps": -440.4947509765625, "debug/reference_rejected_logps": -457.65887451171875, "epoch": 0.4065934065934066, "grad_norm": 3.962286836465319, "learning_rate": 1e-06, "logits/chosen": 26.789180755615234, "logits/rejected": 24.381994247436523, "logps/chosen": -419.154296875, "logps/rejected": -443.3766174316406, "loss": 0.2823, "rewards/accuracies": 0.625, "rewards/chosen": 0.21340444684028625, "rewards/margins": 0.07058170437812805, "rewards/rejected": 0.1428227573633194, "step": 37 }, { "debug/policy_chosen_logits": 24.922195434570312, "debug/policy_chosen_logps": -399.93096923828125, "debug/policy_rejected_logits": 27.783912658691406, "debug/policy_rejected_logps": -472.2608947753906, "debug/reference_chosen_logps": -427.5108642578125, "debug/reference_rejected_logps": -462.4599914550781, "epoch": 0.4175824175824176, "grad_norm": 4.150324748851986, "learning_rate": 1e-06, "logits/chosen": 24.922195434570312, "logits/rejected": 27.783912658691406, "logps/chosen": -399.93096923828125, "logps/rejected": -472.2608947753906, "loss": 0.2398, "rewards/accuracies": 0.875, "rewards/chosen": 0.27579906582832336, "rewards/margins": 0.37380802631378174, "rewards/rejected": -0.09800895303487778, "step": 38 }, { "debug/policy_chosen_logits": 22.96780014038086, "debug/policy_chosen_logps": -420.74261474609375, "debug/policy_rejected_logits": 23.919336318969727, "debug/policy_rejected_logps": -439.653076171875, "debug/reference_chosen_logps": -433.999755859375, "debug/reference_rejected_logps": -421.92962646484375, "epoch": 0.42857142857142855, "grad_norm": 4.365666660792002, "learning_rate": 1e-06, "logits/chosen": 22.96780014038086, "logits/rejected": 23.919336318969727, "logps/chosen": -420.74261474609375, "logps/rejected": -439.653076171875, "loss": 0.2833, "rewards/accuracies": 1.0, "rewards/chosen": 0.13257166743278503, "rewards/margins": 0.3098059296607971, "rewards/rejected": -0.17723426222801208, "step": 39 }, { "debug/policy_chosen_logits": 27.75818634033203, "debug/policy_chosen_logps": -455.0806884765625, "debug/policy_rejected_logits": 27.270263671875, "debug/policy_rejected_logps": -468.50927734375, "debug/reference_chosen_logps": -462.73138427734375, "debug/reference_rejected_logps": -473.1136474609375, "epoch": 0.43956043956043955, "grad_norm": 4.570887002489462, "learning_rate": 1e-06, "logits/chosen": 27.75818634033203, "logits/rejected": 27.270263671875, "logps/chosen": -455.0806884765625, "logps/rejected": -468.50927734375, "loss": 0.3436, "rewards/accuracies": 0.375, "rewards/chosen": 0.07650664448738098, "rewards/margins": 0.030462883412837982, "rewards/rejected": 0.04604377597570419, "step": 40 }, { "debug/policy_chosen_logits": 25.347915649414062, "debug/policy_chosen_logps": -406.87591552734375, "debug/policy_rejected_logits": 28.548446655273438, "debug/policy_rejected_logps": -464.4383544921875, "debug/reference_chosen_logps": -433.459716796875, "debug/reference_rejected_logps": -448.8690185546875, "epoch": 0.45054945054945056, "grad_norm": 3.677831065051595, "learning_rate": 1e-06, "logits/chosen": 25.347915649414062, "logits/rejected": 28.548446655273438, "logps/chosen": -406.87591552734375, "logps/rejected": -464.4383544921875, "loss": 0.2758, "rewards/accuracies": 0.875, "rewards/chosen": 0.26583802700042725, "rewards/margins": 0.4215315878391266, "rewards/rejected": -0.15569356083869934, "step": 41 }, { "debug/policy_chosen_logits": 22.77235221862793, "debug/policy_chosen_logps": -402.58416748046875, "debug/policy_rejected_logits": 26.393054962158203, "debug/policy_rejected_logps": -471.61260986328125, "debug/reference_chosen_logps": -426.26690673828125, "debug/reference_rejected_logps": -453.03125, "epoch": 0.46153846153846156, "grad_norm": 3.808742477450644, "learning_rate": 1e-06, "logits/chosen": 22.77235221862793, "logits/rejected": 26.393054962158203, "logps/chosen": -402.58416748046875, "logps/rejected": -471.61260986328125, "loss": 0.2739, "rewards/accuracies": 0.75, "rewards/chosen": 0.23682719469070435, "rewards/margins": 0.4226408302783966, "rewards/rejected": -0.18581363558769226, "step": 42 }, { "debug/policy_chosen_logits": 22.165273666381836, "debug/policy_chosen_logps": -409.06304931640625, "debug/policy_rejected_logits": 26.427730560302734, "debug/policy_rejected_logps": -448.95489501953125, "debug/reference_chosen_logps": -434.5325927734375, "debug/reference_rejected_logps": -412.31060791015625, "epoch": 0.4725274725274725, "grad_norm": 7.512415554484962, "learning_rate": 1e-06, "logits/chosen": 22.165273666381836, "logits/rejected": 26.427730560302734, "logps/chosen": -409.06304931640625, "logps/rejected": -448.95489501953125, "loss": 0.3186, "rewards/accuracies": 1.0, "rewards/chosen": 0.25469571352005005, "rewards/margins": 0.6211386322975159, "rewards/rejected": -0.36644288897514343, "step": 43 }, { "debug/policy_chosen_logits": 25.82545280456543, "debug/policy_chosen_logps": -409.12420654296875, "debug/policy_rejected_logits": 23.266063690185547, "debug/policy_rejected_logps": -493.0911865234375, "debug/reference_chosen_logps": -437.0023193359375, "debug/reference_rejected_logps": -456.59796142578125, "epoch": 0.4835164835164835, "grad_norm": 5.005310078975177, "learning_rate": 1e-06, "logits/chosen": 25.82545280456543, "logits/rejected": 23.266063690185547, "logps/chosen": -409.12420654296875, "logps/rejected": -493.0911865234375, "loss": 0.2517, "rewards/accuracies": 1.0, "rewards/chosen": 0.278780996799469, "rewards/margins": 0.6437131762504578, "rewards/rejected": -0.36493217945098877, "step": 44 }, { "debug/policy_chosen_logits": 28.779375076293945, "debug/policy_chosen_logps": -419.06585693359375, "debug/policy_rejected_logits": 25.911027908325195, "debug/policy_rejected_logps": -493.4896240234375, "debug/reference_chosen_logps": -444.6504211425781, "debug/reference_rejected_logps": -467.607177734375, "epoch": 0.4945054945054945, "grad_norm": 5.276153180435082, "learning_rate": 1e-06, "logits/chosen": 28.779375076293945, "logits/rejected": 25.911027908325195, "logps/chosen": -419.06585693359375, "logps/rejected": -493.4896240234375, "loss": 0.3159, "rewards/accuracies": 0.875, "rewards/chosen": 0.2558458745479584, "rewards/margins": 0.5146701335906982, "rewards/rejected": -0.25882428884506226, "step": 45 }, { "debug/policy_chosen_logits": 27.05695152282715, "debug/policy_chosen_logps": -428.0582275390625, "debug/policy_rejected_logits": 27.517444610595703, "debug/policy_rejected_logps": -499.21636962890625, "debug/reference_chosen_logps": -452.95599365234375, "debug/reference_rejected_logps": -468.61138916015625, "epoch": 0.5054945054945055, "grad_norm": 4.904417802927379, "learning_rate": 1e-06, "logits/chosen": 27.05695152282715, "logits/rejected": 27.517444610595703, "logps/chosen": -428.0582275390625, "logps/rejected": -499.21636962890625, "loss": 0.242, "rewards/accuracies": 1.0, "rewards/chosen": 0.2489778846502304, "rewards/margins": 0.5550275444984436, "rewards/rejected": -0.306049644947052, "step": 46 }, { "debug/policy_chosen_logits": 24.79572105407715, "debug/policy_chosen_logps": -412.3216552734375, "debug/policy_rejected_logits": 26.50978660583496, "debug/policy_rejected_logps": -466.15557861328125, "debug/reference_chosen_logps": -430.951416015625, "debug/reference_rejected_logps": -448.00433349609375, "epoch": 0.5164835164835165, "grad_norm": 5.334062642354243, "learning_rate": 1e-06, "logits/chosen": 24.79572105407715, "logits/rejected": 26.50978660583496, "logps/chosen": -412.3216552734375, "logps/rejected": -466.15557861328125, "loss": 0.3266, "rewards/accuracies": 0.875, "rewards/chosen": 0.18629732728004456, "rewards/margins": 0.3678096830844879, "rewards/rejected": -0.18151238560676575, "step": 47 }, { "debug/policy_chosen_logits": 24.24400520324707, "debug/policy_chosen_logps": -392.45751953125, "debug/policy_rejected_logits": 27.10297393798828, "debug/policy_rejected_logps": -463.1430969238281, "debug/reference_chosen_logps": -417.45147705078125, "debug/reference_rejected_logps": -438.20220947265625, "epoch": 0.5274725274725275, "grad_norm": 7.066668036628892, "learning_rate": 1e-06, "logits/chosen": 24.24400520324707, "logits/rejected": 27.10297393798828, "logps/chosen": -392.45751953125, "logps/rejected": -463.1430969238281, "loss": 0.3255, "rewards/accuracies": 0.875, "rewards/chosen": 0.24993950128555298, "rewards/margins": 0.49934816360473633, "rewards/rejected": -0.24940869212150574, "step": 48 }, { "debug/policy_chosen_logits": 22.664649963378906, "debug/policy_chosen_logps": -394.15032958984375, "debug/policy_rejected_logits": 25.70164680480957, "debug/policy_rejected_logps": -452.9449768066406, "debug/reference_chosen_logps": -416.6987609863281, "debug/reference_rejected_logps": -439.2945251464844, "epoch": 0.5384615384615384, "grad_norm": 5.135410370061012, "learning_rate": 1e-06, "logits/chosen": 22.664649963378906, "logits/rejected": 25.70164680480957, "logps/chosen": -394.15032958984375, "logps/rejected": -452.9449768066406, "loss": 0.2879, "rewards/accuracies": 1.0, "rewards/chosen": 0.22548410296440125, "rewards/margins": 0.3619886040687561, "rewards/rejected": -0.13650450110435486, "step": 49 }, { "debug/policy_chosen_logits": 22.666519165039062, "debug/policy_chosen_logps": -396.6944885253906, "debug/policy_rejected_logits": 21.13740348815918, "debug/policy_rejected_logps": -453.7715148925781, "debug/reference_chosen_logps": -424.0322265625, "debug/reference_rejected_logps": -440.89862060546875, "epoch": 0.5494505494505495, "grad_norm": 3.9756803394109825, "learning_rate": 1e-06, "logits/chosen": 22.666519165039062, "logits/rejected": 21.13740348815918, "logps/chosen": -396.6944885253906, "logps/rejected": -453.7715148925781, "loss": 0.2293, "rewards/accuracies": 0.875, "rewards/chosen": 0.27337703108787537, "rewards/margins": 0.40210601687431335, "rewards/rejected": -0.1287289708852768, "step": 50 }, { "debug/policy_chosen_logits": 23.19964027404785, "debug/policy_chosen_logps": -397.15765380859375, "debug/policy_rejected_logits": 24.41975212097168, "debug/policy_rejected_logps": -475.76104736328125, "debug/reference_chosen_logps": -424.8116760253906, "debug/reference_rejected_logps": -434.97528076171875, "epoch": 0.5604395604395604, "grad_norm": 4.600442889147922, "learning_rate": 1e-06, "logits/chosen": 23.19964027404785, "logits/rejected": 24.41975212097168, "logps/chosen": -397.15765380859375, "logps/rejected": -475.76104736328125, "loss": 0.2753, "rewards/accuracies": 1.0, "rewards/chosen": 0.2765401601791382, "rewards/margins": 0.6843976974487305, "rewards/rejected": -0.4078575372695923, "step": 51 }, { "debug/policy_chosen_logits": 27.027416229248047, "debug/policy_chosen_logps": -433.012451171875, "debug/policy_rejected_logits": 24.729970932006836, "debug/policy_rejected_logps": -465.9927978515625, "debug/reference_chosen_logps": -459.7962951660156, "debug/reference_rejected_logps": -439.83599853515625, "epoch": 0.5714285714285714, "grad_norm": 3.8308230544471087, "learning_rate": 1e-06, "logits/chosen": 27.027416229248047, "logits/rejected": 24.729970932006836, "logps/chosen": -433.012451171875, "logps/rejected": -465.9927978515625, "loss": 0.2846, "rewards/accuracies": 1.0, "rewards/chosen": 0.26783838868141174, "rewards/margins": 0.529405951499939, "rewards/rejected": -0.26156753301620483, "step": 52 }, { "debug/policy_chosen_logits": 25.013721466064453, "debug/policy_chosen_logps": -409.18426513671875, "debug/policy_rejected_logits": 29.04503059387207, "debug/policy_rejected_logps": -455.4191589355469, "debug/reference_chosen_logps": -434.70855712890625, "debug/reference_rejected_logps": -437.6434020996094, "epoch": 0.5824175824175825, "grad_norm": 5.361663918449101, "learning_rate": 1e-06, "logits/chosen": 25.013721466064453, "logits/rejected": 29.04503059387207, "logps/chosen": -409.18426513671875, "logps/rejected": -455.4191589355469, "loss": 0.3145, "rewards/accuracies": 0.75, "rewards/chosen": 0.2552429437637329, "rewards/margins": 0.433000773191452, "rewards/rejected": -0.17775781452655792, "step": 53 }, { "debug/policy_chosen_logits": 26.417736053466797, "debug/policy_chosen_logps": -419.6263427734375, "debug/policy_rejected_logits": 28.78829002380371, "debug/policy_rejected_logps": -475.8312683105469, "debug/reference_chosen_logps": -440.78143310546875, "debug/reference_rejected_logps": -462.00067138671875, "epoch": 0.5934065934065934, "grad_norm": 4.416192021518405, "learning_rate": 1e-06, "logits/chosen": 26.417736053466797, "logits/rejected": 28.78829002380371, "logps/chosen": -419.6263427734375, "logps/rejected": -475.8312683105469, "loss": 0.3234, "rewards/accuracies": 0.75, "rewards/chosen": 0.2115507870912552, "rewards/margins": 0.34985676407814026, "rewards/rejected": -0.13830597698688507, "step": 54 }, { "debug/policy_chosen_logits": 28.22123908996582, "debug/policy_chosen_logps": -429.47418212890625, "debug/policy_rejected_logits": 25.79034996032715, "debug/policy_rejected_logps": -471.1182861328125, "debug/reference_chosen_logps": -445.8591003417969, "debug/reference_rejected_logps": -455.14202880859375, "epoch": 0.6043956043956044, "grad_norm": 6.671276859718192, "learning_rate": 1e-06, "logits/chosen": 28.22123908996582, "logits/rejected": 25.79034996032715, "logps/chosen": -429.47418212890625, "logps/rejected": -471.1182861328125, "loss": 0.2846, "rewards/accuracies": 0.875, "rewards/chosen": 0.16384944319725037, "rewards/margins": 0.32361170649528503, "rewards/rejected": -0.15976226329803467, "step": 55 }, { "debug/policy_chosen_logits": 26.506807327270508, "debug/policy_chosen_logps": -420.41082763671875, "debug/policy_rejected_logits": 27.579261779785156, "debug/policy_rejected_logps": -452.91162109375, "debug/reference_chosen_logps": -448.4418029785156, "debug/reference_rejected_logps": -432.2170104980469, "epoch": 0.6153846153846154, "grad_norm": 5.601125759934195, "learning_rate": 1e-06, "logits/chosen": 26.506807327270508, "logits/rejected": 27.579261779785156, "logps/chosen": -420.41082763671875, "logps/rejected": -452.91162109375, "loss": 0.2563, "rewards/accuracies": 0.875, "rewards/chosen": 0.2803099453449249, "rewards/margins": 0.48725610971450806, "rewards/rejected": -0.20694613456726074, "step": 56 }, { "debug/policy_chosen_logits": 22.477113723754883, "debug/policy_chosen_logps": -410.53009033203125, "debug/policy_rejected_logits": 24.827415466308594, "debug/policy_rejected_logps": -463.63214111328125, "debug/reference_chosen_logps": -430.95086669921875, "debug/reference_rejected_logps": -428.8712158203125, "epoch": 0.6263736263736264, "grad_norm": 4.278326605774298, "learning_rate": 1e-06, "logits/chosen": 22.477113723754883, "logits/rejected": 24.827415466308594, "logps/chosen": -410.53009033203125, "logps/rejected": -463.63214111328125, "loss": 0.2727, "rewards/accuracies": 0.875, "rewards/chosen": 0.20420792698860168, "rewards/margins": 0.5518174171447754, "rewards/rejected": -0.3476094603538513, "step": 57 }, { "debug/policy_chosen_logits": 24.6320858001709, "debug/policy_chosen_logps": -408.2926025390625, "debug/policy_rejected_logits": 27.864049911499023, "debug/policy_rejected_logps": -461.57208251953125, "debug/reference_chosen_logps": -426.1760559082031, "debug/reference_rejected_logps": -447.0111083984375, "epoch": 0.6373626373626373, "grad_norm": 4.643648875549545, "learning_rate": 1e-06, "logits/chosen": 24.6320858001709, "logits/rejected": 27.864049911499023, "logps/chosen": -408.2926025390625, "logps/rejected": -461.57208251953125, "loss": 0.2516, "rewards/accuracies": 0.625, "rewards/chosen": 0.17883440852165222, "rewards/margins": 0.3244439363479614, "rewards/rejected": -0.1456095427274704, "step": 58 }, { "debug/policy_chosen_logits": 20.885883331298828, "debug/policy_chosen_logps": -412.9990539550781, "debug/policy_rejected_logits": 21.050661087036133, "debug/policy_rejected_logps": -522.6400756835938, "debug/reference_chosen_logps": -440.10931396484375, "debug/reference_rejected_logps": -487.7521667480469, "epoch": 0.6483516483516484, "grad_norm": 4.05657059697319, "learning_rate": 1e-06, "logits/chosen": 20.885883331298828, "logits/rejected": 21.050661087036133, "logps/chosen": -412.9990539550781, "logps/rejected": -522.6400756835938, "loss": 0.2678, "rewards/accuracies": 0.875, "rewards/chosen": 0.2711024880409241, "rewards/margins": 0.6199814081192017, "rewards/rejected": -0.3488789200782776, "step": 59 }, { "debug/policy_chosen_logits": 24.035409927368164, "debug/policy_chosen_logps": -399.28765869140625, "debug/policy_rejected_logits": 24.84585952758789, "debug/policy_rejected_logps": -453.340576171875, "debug/reference_chosen_logps": -420.63153076171875, "debug/reference_rejected_logps": -434.2669677734375, "epoch": 0.6593406593406593, "grad_norm": 3.784225958996139, "learning_rate": 1e-06, "logits/chosen": 24.035409927368164, "logits/rejected": 24.84585952758789, "logps/chosen": -399.28765869140625, "logps/rejected": -453.340576171875, "loss": 0.2552, "rewards/accuracies": 0.625, "rewards/chosen": 0.2134389877319336, "rewards/margins": 0.4041747748851776, "rewards/rejected": -0.19073577225208282, "step": 60 }, { "debug/policy_chosen_logits": 21.771154403686523, "debug/policy_chosen_logps": -403.99932861328125, "debug/policy_rejected_logits": 25.54006576538086, "debug/policy_rejected_logps": -431.0410461425781, "debug/reference_chosen_logps": -427.13372802734375, "debug/reference_rejected_logps": -426.09619140625, "epoch": 0.6703296703296703, "grad_norm": 4.4444525496723815, "learning_rate": 1e-06, "logits/chosen": 21.771154403686523, "logits/rejected": 25.54006576538086, "logps/chosen": -403.99932861328125, "logps/rejected": -431.0410461425781, "loss": 0.2411, "rewards/accuracies": 0.875, "rewards/chosen": 0.2313441038131714, "rewards/margins": 0.2807927131652832, "rewards/rejected": -0.04944861680269241, "step": 61 }, { "debug/policy_chosen_logits": 21.56890869140625, "debug/policy_chosen_logps": -418.62939453125, "debug/policy_rejected_logits": 26.892208099365234, "debug/policy_rejected_logps": -432.6386413574219, "debug/reference_chosen_logps": -446.4366149902344, "debug/reference_rejected_logps": -437.1725769042969, "epoch": 0.6813186813186813, "grad_norm": 6.207177159902563, "learning_rate": 1e-06, "logits/chosen": 21.56890869140625, "logits/rejected": 26.892208099365234, "logps/chosen": -418.62939453125, "logps/rejected": -432.6386413574219, "loss": 0.2765, "rewards/accuracies": 0.5, "rewards/chosen": 0.27807170152664185, "rewards/margins": 0.23273253440856934, "rewards/rejected": 0.04533915966749191, "step": 62 }, { "debug/policy_chosen_logits": 28.32245445251465, "debug/policy_chosen_logps": -440.50726318359375, "debug/policy_rejected_logits": 25.365680694580078, "debug/policy_rejected_logps": -464.0654296875, "debug/reference_chosen_logps": -475.3460998535156, "debug/reference_rejected_logps": -457.081787109375, "epoch": 0.6923076923076923, "grad_norm": 5.48574215281406, "learning_rate": 1e-06, "logits/chosen": 28.32245445251465, "logits/rejected": 25.365680694580078, "logps/chosen": -440.50726318359375, "logps/rejected": -464.0654296875, "loss": 0.2435, "rewards/accuracies": 1.0, "rewards/chosen": 0.34838855266571045, "rewards/margins": 0.4182246923446655, "rewards/rejected": -0.06983615458011627, "step": 63 }, { "debug/policy_chosen_logits": 25.090259552001953, "debug/policy_chosen_logps": -396.6990051269531, "debug/policy_rejected_logits": 27.255340576171875, "debug/policy_rejected_logps": -454.5365295410156, "debug/reference_chosen_logps": -436.3953552246094, "debug/reference_rejected_logps": -430.6047058105469, "epoch": 0.7032967032967034, "grad_norm": 8.801285897449953, "learning_rate": 1e-06, "logits/chosen": 25.090259552001953, "logits/rejected": 27.255340576171875, "logps/chosen": -396.6990051269531, "logps/rejected": -454.5365295410156, "loss": 0.3064, "rewards/accuracies": 0.875, "rewards/chosen": 0.3969634771347046, "rewards/margins": 0.6362816095352173, "rewards/rejected": -0.23931819200515747, "step": 64 }, { "debug/policy_chosen_logits": 28.422582626342773, "debug/policy_chosen_logps": -421.810302734375, "debug/policy_rejected_logits": 22.969980239868164, "debug/policy_rejected_logps": -464.9712219238281, "debug/reference_chosen_logps": -445.0280456542969, "debug/reference_rejected_logps": -451.85345458984375, "epoch": 0.7142857142857143, "grad_norm": 3.3417383455535807, "learning_rate": 1e-06, "logits/chosen": 28.422582626342773, "logits/rejected": 22.969980239868164, "logps/chosen": -421.810302734375, "logps/rejected": -464.9712219238281, "loss": 0.216, "rewards/accuracies": 0.875, "rewards/chosen": 0.2321772277355194, "rewards/margins": 0.36335471272468567, "rewards/rejected": -0.13117747008800507, "step": 65 }, { "debug/policy_chosen_logits": 24.602392196655273, "debug/policy_chosen_logps": -409.503662109375, "debug/policy_rejected_logits": 26.957948684692383, "debug/policy_rejected_logps": -518.37353515625, "debug/reference_chosen_logps": -443.19598388671875, "debug/reference_rejected_logps": -475.41815185546875, "epoch": 0.7252747252747253, "grad_norm": 7.382953809999285, "learning_rate": 1e-06, "logits/chosen": 24.602392196655273, "logits/rejected": 26.957948684692383, "logps/chosen": -409.503662109375, "logps/rejected": -518.37353515625, "loss": 0.1895, "rewards/accuracies": 0.875, "rewards/chosen": 0.3369232416152954, "rewards/margins": 0.7664777040481567, "rewards/rejected": -0.42955446243286133, "step": 66 }, { "debug/policy_chosen_logits": 22.386083602905273, "debug/policy_chosen_logps": -382.5001220703125, "debug/policy_rejected_logits": 16.90252113342285, "debug/policy_rejected_logps": -475.36181640625, "debug/reference_chosen_logps": -412.29132080078125, "debug/reference_rejected_logps": -449.7041015625, "epoch": 0.7362637362637363, "grad_norm": 5.308279817037437, "learning_rate": 1e-06, "logits/chosen": 22.386083602905273, "logits/rejected": 16.90252113342285, "logps/chosen": -382.5001220703125, "logps/rejected": -475.36181640625, "loss": 0.2588, "rewards/accuracies": 1.0, "rewards/chosen": 0.2979119122028351, "rewards/margins": 0.5544889569282532, "rewards/rejected": -0.2565770447254181, "step": 67 }, { "debug/policy_chosen_logits": 17.921504974365234, "debug/policy_chosen_logps": -390.6407470703125, "debug/policy_rejected_logits": 24.367341995239258, "debug/policy_rejected_logps": -484.7353515625, "debug/reference_chosen_logps": -416.8155822753906, "debug/reference_rejected_logps": -466.807861328125, "epoch": 0.7472527472527473, "grad_norm": 5.3343691933569595, "learning_rate": 1e-06, "logits/chosen": 17.921504974365234, "logits/rejected": 24.367341995239258, "logps/chosen": -390.6407470703125, "logps/rejected": -484.7353515625, "loss": 0.2477, "rewards/accuracies": 1.0, "rewards/chosen": 0.2617482841014862, "rewards/margins": 0.441023588180542, "rewards/rejected": -0.17927534878253937, "step": 68 }, { "debug/policy_chosen_logits": 22.636571884155273, "debug/policy_chosen_logps": -391.5332336425781, "debug/policy_rejected_logits": 27.57520294189453, "debug/policy_rejected_logps": -491.03448486328125, "debug/reference_chosen_logps": -418.2889709472656, "debug/reference_rejected_logps": -467.211181640625, "epoch": 0.7582417582417582, "grad_norm": 4.491987560088097, "learning_rate": 1e-06, "logits/chosen": 22.636571884155273, "logits/rejected": 27.57520294189453, "logps/chosen": -391.5332336425781, "logps/rejected": -491.03448486328125, "loss": 0.2792, "rewards/accuracies": 0.875, "rewards/chosen": 0.26755744218826294, "rewards/margins": 0.5057904124259949, "rewards/rejected": -0.23823297023773193, "step": 69 }, { "debug/policy_chosen_logits": 27.65935516357422, "debug/policy_chosen_logps": -403.2398986816406, "debug/policy_rejected_logits": 29.533405303955078, "debug/policy_rejected_logps": -492.2855224609375, "debug/reference_chosen_logps": -433.0833435058594, "debug/reference_rejected_logps": -444.8511962890625, "epoch": 0.7692307692307693, "grad_norm": 3.4461660659523656, "learning_rate": 1e-06, "logits/chosen": 27.65935516357422, "logits/rejected": 29.533405303955078, "logps/chosen": -403.2398986816406, "logps/rejected": -492.2855224609375, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 0.29843446612358093, "rewards/margins": 0.7727776169776917, "rewards/rejected": -0.47434312105178833, "step": 70 }, { "debug/policy_chosen_logits": 24.503950119018555, "debug/policy_chosen_logps": -446.45513916015625, "debug/policy_rejected_logits": 22.433277130126953, "debug/policy_rejected_logps": -470.9190979003906, "debug/reference_chosen_logps": -472.0450134277344, "debug/reference_rejected_logps": -448.1822814941406, "epoch": 0.7802197802197802, "grad_norm": 5.717761852601763, "learning_rate": 1e-06, "logits/chosen": 24.503950119018555, "logits/rejected": 22.433277130126953, "logps/chosen": -446.45513916015625, "logps/rejected": -470.9190979003906, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 0.2558988630771637, "rewards/margins": 0.4832671284675598, "rewards/rejected": -0.22736826539039612, "step": 71 }, { "debug/policy_chosen_logits": 19.840513229370117, "debug/policy_chosen_logps": -389.92559814453125, "debug/policy_rejected_logits": 24.959850311279297, "debug/policy_rejected_logps": -467.8755187988281, "debug/reference_chosen_logps": -414.42547607421875, "debug/reference_rejected_logps": -441.78948974609375, "epoch": 0.7912087912087912, "grad_norm": 5.324274056234465, "learning_rate": 1e-06, "logits/chosen": 19.840513229370117, "logits/rejected": 24.959850311279297, "logps/chosen": -389.92559814453125, "logps/rejected": -467.8755187988281, "loss": 0.225, "rewards/accuracies": 0.875, "rewards/chosen": 0.24499861896038055, "rewards/margins": 0.5058590173721313, "rewards/rejected": -0.260860413312912, "step": 72 }, { "debug/policy_chosen_logits": 24.089357376098633, "debug/policy_chosen_logps": -417.7742614746094, "debug/policy_rejected_logits": 26.09589958190918, "debug/policy_rejected_logps": -444.3390808105469, "debug/reference_chosen_logps": -446.7651672363281, "debug/reference_rejected_logps": -417.492919921875, "epoch": 0.8021978021978022, "grad_norm": 3.7830573167149204, "learning_rate": 1e-06, "logits/chosen": 24.089357376098633, "logits/rejected": 26.09589958190918, "logps/chosen": -417.7742614746094, "logps/rejected": -444.3390808105469, "loss": 0.2977, "rewards/accuracies": 0.875, "rewards/chosen": 0.2899090647697449, "rewards/margins": 0.5583702921867371, "rewards/rejected": -0.2684612572193146, "step": 73 }, { "debug/policy_chosen_logits": 21.799409866333008, "debug/policy_chosen_logps": -405.78070068359375, "debug/policy_rejected_logits": 26.489110946655273, "debug/policy_rejected_logps": -476.9659423828125, "debug/reference_chosen_logps": -435.2866516113281, "debug/reference_rejected_logps": -449.07806396484375, "epoch": 0.8131868131868132, "grad_norm": 3.635552390173908, "learning_rate": 1e-06, "logits/chosen": 21.799409866333008, "logits/rejected": 26.489110946655273, "logps/chosen": -405.78070068359375, "logps/rejected": -476.9659423828125, "loss": 0.2695, "rewards/accuracies": 0.875, "rewards/chosen": 0.2950596511363983, "rewards/margins": 0.573938250541687, "rewards/rejected": -0.2788785696029663, "step": 74 }, { "debug/policy_chosen_logits": 21.563961029052734, "debug/policy_chosen_logps": -382.45452880859375, "debug/policy_rejected_logits": 27.974708557128906, "debug/policy_rejected_logps": -446.31573486328125, "debug/reference_chosen_logps": -403.13330078125, "debug/reference_rejected_logps": -445.6566467285156, "epoch": 0.8241758241758241, "grad_norm": 6.518551807981199, "learning_rate": 1e-06, "logits/chosen": 21.563961029052734, "logits/rejected": 27.974708557128906, "logps/chosen": -382.45452880859375, "logps/rejected": -446.31573486328125, "loss": 0.339, "rewards/accuracies": 0.75, "rewards/chosen": 0.2067876011133194, "rewards/margins": 0.2133782058954239, "rewards/rejected": -0.006590619683265686, "step": 75 }, { "debug/policy_chosen_logits": 26.57479476928711, "debug/policy_chosen_logps": -421.49554443359375, "debug/policy_rejected_logits": 25.88642692565918, "debug/policy_rejected_logps": -463.5325622558594, "debug/reference_chosen_logps": -445.17974853515625, "debug/reference_rejected_logps": -448.434814453125, "epoch": 0.8351648351648352, "grad_norm": 4.642048379046934, "learning_rate": 1e-06, "logits/chosen": 26.57479476928711, "logits/rejected": 25.88642692565918, "logps/chosen": -421.49554443359375, "logps/rejected": -463.5325622558594, "loss": 0.3014, "rewards/accuracies": 0.75, "rewards/chosen": 0.23684199154376984, "rewards/margins": 0.3878192901611328, "rewards/rejected": -0.15097728371620178, "step": 76 }, { "debug/policy_chosen_logits": 27.36321258544922, "debug/policy_chosen_logps": -417.56976318359375, "debug/policy_rejected_logits": 27.49226951599121, "debug/policy_rejected_logps": -499.5318603515625, "debug/reference_chosen_logps": -444.1575927734375, "debug/reference_rejected_logps": -475.24273681640625, "epoch": 0.8461538461538461, "grad_norm": 4.237463467519612, "learning_rate": 1e-06, "logits/chosen": 27.36321258544922, "logits/rejected": 27.49226951599121, "logps/chosen": -417.56976318359375, "logps/rejected": -499.5318603515625, "loss": 0.2199, "rewards/accuracies": 0.875, "rewards/chosen": 0.2658780515193939, "rewards/margins": 0.5087695121765137, "rewards/rejected": -0.24289149045944214, "step": 77 }, { "debug/policy_chosen_logits": 24.683202743530273, "debug/policy_chosen_logps": -409.4302673339844, "debug/policy_rejected_logits": 23.952590942382812, "debug/policy_rejected_logps": -463.2736511230469, "debug/reference_chosen_logps": -432.84686279296875, "debug/reference_rejected_logps": -437.69964599609375, "epoch": 0.8571428571428571, "grad_norm": 4.166956738488574, "learning_rate": 1e-06, "logits/chosen": 24.683202743530273, "logits/rejected": 23.952590942382812, "logps/chosen": -409.4302673339844, "logps/rejected": -463.2736511230469, "loss": 0.2483, "rewards/accuracies": 0.75, "rewards/chosen": 0.2341659516096115, "rewards/margins": 0.48990583419799805, "rewards/rejected": -0.25573989748954773, "step": 78 }, { "debug/policy_chosen_logits": 26.540374755859375, "debug/policy_chosen_logps": -422.86810302734375, "debug/policy_rejected_logits": 30.67388916015625, "debug/policy_rejected_logps": -492.8260192871094, "debug/reference_chosen_logps": -438.4207763671875, "debug/reference_rejected_logps": -458.2353515625, "epoch": 0.8681318681318682, "grad_norm": 5.889314624750013, "learning_rate": 1e-06, "logits/chosen": 26.540374755859375, "logits/rejected": 30.67388916015625, "logps/chosen": -422.86810302734375, "logps/rejected": -492.8260192871094, "loss": 0.264, "rewards/accuracies": 0.875, "rewards/chosen": 0.15552687644958496, "rewards/margins": 0.5014333724975586, "rewards/rejected": -0.34590649604797363, "step": 79 }, { "debug/policy_chosen_logits": 24.57747459411621, "debug/policy_chosen_logps": -390.2710266113281, "debug/policy_rejected_logits": 27.390230178833008, "debug/policy_rejected_logps": -489.9441223144531, "debug/reference_chosen_logps": -416.788818359375, "debug/reference_rejected_logps": -440.2818298339844, "epoch": 0.8791208791208791, "grad_norm": 4.745834840744495, "learning_rate": 1e-06, "logits/chosen": 24.57747459411621, "logits/rejected": 27.390230178833008, "logps/chosen": -390.2710266113281, "logps/rejected": -489.9441223144531, "loss": 0.2894, "rewards/accuracies": 0.875, "rewards/chosen": 0.2651780843734741, "rewards/margins": 0.76180100440979, "rewards/rejected": -0.49662283062934875, "step": 80 }, { "debug/policy_chosen_logits": 25.34842300415039, "debug/policy_chosen_logps": -390.0006103515625, "debug/policy_rejected_logits": 27.055194854736328, "debug/policy_rejected_logps": -460.668701171875, "debug/reference_chosen_logps": -408.5838623046875, "debug/reference_rejected_logps": -444.5193786621094, "epoch": 0.8901098901098901, "grad_norm": 4.590683530349925, "learning_rate": 1e-06, "logits/chosen": 25.34842300415039, "logits/rejected": 27.055194854736328, "logps/chosen": -390.0006103515625, "logps/rejected": -460.668701171875, "loss": 0.2902, "rewards/accuracies": 1.0, "rewards/chosen": 0.1858326643705368, "rewards/margins": 0.3473258316516876, "rewards/rejected": -0.16149315237998962, "step": 81 }, { "debug/policy_chosen_logits": 27.74835968017578, "debug/policy_chosen_logps": -413.76629638671875, "debug/policy_rejected_logits": 27.351459503173828, "debug/policy_rejected_logps": -438.90216064453125, "debug/reference_chosen_logps": -443.37615966796875, "debug/reference_rejected_logps": -417.8377380371094, "epoch": 0.9010989010989011, "grad_norm": 4.49887483206356, "learning_rate": 1e-06, "logits/chosen": 27.74835968017578, "logits/rejected": 27.351459503173828, "logps/chosen": -413.76629638671875, "logps/rejected": -438.90216064453125, "loss": 0.252, "rewards/accuracies": 0.875, "rewards/chosen": 0.29609841108322144, "rewards/margins": 0.5067427754402161, "rewards/rejected": -0.21064436435699463, "step": 82 }, { "debug/policy_chosen_logits": 26.60724639892578, "debug/policy_chosen_logps": -380.8109130859375, "debug/policy_rejected_logits": 30.683988571166992, "debug/policy_rejected_logps": -487.4014892578125, "debug/reference_chosen_logps": -419.3782958984375, "debug/reference_rejected_logps": -462.61505126953125, "epoch": 0.9120879120879121, "grad_norm": 4.452776489719901, "learning_rate": 1e-06, "logits/chosen": 26.60724639892578, "logits/rejected": 30.683988571166992, "logps/chosen": -380.8109130859375, "logps/rejected": -487.4014892578125, "loss": 0.281, "rewards/accuracies": 1.0, "rewards/chosen": 0.38567405939102173, "rewards/margins": 0.6335387825965881, "rewards/rejected": -0.2478647232055664, "step": 83 }, { "debug/policy_chosen_logits": 23.03528594970703, "debug/policy_chosen_logps": -416.22314453125, "debug/policy_rejected_logits": 23.581472396850586, "debug/policy_rejected_logps": -434.1588439941406, "debug/reference_chosen_logps": -440.158935546875, "debug/reference_rejected_logps": -421.5128173828125, "epoch": 0.9230769230769231, "grad_norm": 4.971404649364657, "learning_rate": 1e-06, "logits/chosen": 23.03528594970703, "logits/rejected": 23.581472396850586, "logps/chosen": -416.22314453125, "logps/rejected": -434.1588439941406, "loss": 0.2359, "rewards/accuracies": 0.875, "rewards/chosen": 0.23935794830322266, "rewards/margins": 0.36581796407699585, "rewards/rejected": -0.1264600157737732, "step": 84 }, { "debug/policy_chosen_logits": 26.0145320892334, "debug/policy_chosen_logps": -407.3331298828125, "debug/policy_rejected_logits": 22.86039924621582, "debug/policy_rejected_logps": -455.66888427734375, "debug/reference_chosen_logps": -429.9805908203125, "debug/reference_rejected_logps": -431.9481201171875, "epoch": 0.9340659340659341, "grad_norm": 4.560309498305691, "learning_rate": 1e-06, "logits/chosen": 26.0145320892334, "logits/rejected": 22.86039924621582, "logps/chosen": -407.3331298828125, "logps/rejected": -455.66888427734375, "loss": 0.2926, "rewards/accuracies": 0.875, "rewards/chosen": 0.22647437453269958, "rewards/margins": 0.46368181705474854, "rewards/rejected": -0.23720744252204895, "step": 85 }, { "debug/policy_chosen_logits": 21.526865005493164, "debug/policy_chosen_logps": -391.841552734375, "debug/policy_rejected_logits": 25.782846450805664, "debug/policy_rejected_logps": -485.3880310058594, "debug/reference_chosen_logps": -410.5163269042969, "debug/reference_rejected_logps": -450.18896484375, "epoch": 0.945054945054945, "grad_norm": 3.115000092835439, "learning_rate": 1e-06, "logits/chosen": 21.526865005493164, "logits/rejected": 25.782846450805664, "logps/chosen": -391.841552734375, "logps/rejected": -485.3880310058594, "loss": 0.227, "rewards/accuracies": 0.875, "rewards/chosen": 0.1867476999759674, "rewards/margins": 0.5387383699417114, "rewards/rejected": -0.35199064016342163, "step": 86 }, { "debug/policy_chosen_logits": 24.497520446777344, "debug/policy_chosen_logps": -430.42303466796875, "debug/policy_rejected_logits": 26.349315643310547, "debug/policy_rejected_logps": -465.03375244140625, "debug/reference_chosen_logps": -443.20037841796875, "debug/reference_rejected_logps": -455.8772888183594, "epoch": 0.9560439560439561, "grad_norm": 4.513341277064094, "learning_rate": 1e-06, "logits/chosen": 24.497520446777344, "logits/rejected": 26.349315643310547, "logps/chosen": -430.42303466796875, "logps/rejected": -465.03375244140625, "loss": 0.2854, "rewards/accuracies": 0.75, "rewards/chosen": 0.12777358293533325, "rewards/margins": 0.2193382978439331, "rewards/rejected": -0.09156470745801926, "step": 87 }, { "debug/policy_chosen_logits": 18.35858917236328, "debug/policy_chosen_logps": -389.84100341796875, "debug/policy_rejected_logits": 21.731107711791992, "debug/policy_rejected_logps": -423.7160339355469, "debug/reference_chosen_logps": -406.00408935546875, "debug/reference_rejected_logps": -423.71820068359375, "epoch": 0.967032967032967, "grad_norm": 4.8398310648982354, "learning_rate": 1e-06, "logits/chosen": 18.35858917236328, "logits/rejected": 21.731107711791992, "logps/chosen": -389.84100341796875, "logps/rejected": -423.7160339355469, "loss": 0.2595, "rewards/accuracies": 0.75, "rewards/chosen": 0.16163063049316406, "rewards/margins": 0.16160908341407776, "rewards/rejected": 2.155173569917679e-05, "step": 88 }, { "debug/policy_chosen_logits": 23.2659969329834, "debug/policy_chosen_logps": -423.4161376953125, "debug/policy_rejected_logits": 28.146160125732422, "debug/policy_rejected_logps": -473.67059326171875, "debug/reference_chosen_logps": -441.9681091308594, "debug/reference_rejected_logps": -431.1749267578125, "epoch": 0.978021978021978, "grad_norm": 6.467539115961516, "learning_rate": 1e-06, "logits/chosen": 23.2659969329834, "logits/rejected": 28.146160125732422, "logps/chosen": -423.4161376953125, "logps/rejected": -473.67059326171875, "loss": 0.2244, "rewards/accuracies": 1.0, "rewards/chosen": 0.1855195164680481, "rewards/margins": 0.610476553440094, "rewards/rejected": -0.4249570369720459, "step": 89 }, { "debug/policy_chosen_logits": 27.019420623779297, "debug/policy_chosen_logps": -413.4447021484375, "debug/policy_rejected_logits": 31.003211975097656, "debug/policy_rejected_logps": -500.8101501464844, "debug/reference_chosen_logps": -434.320068359375, "debug/reference_rejected_logps": -467.6002502441406, "epoch": 0.989010989010989, "grad_norm": 4.1029333690791745, "learning_rate": 1e-06, "logits/chosen": 27.019420623779297, "logits/rejected": 31.003211975097656, "logps/chosen": -413.4447021484375, "logps/rejected": -500.8101501464844, "loss": 0.2992, "rewards/accuracies": 0.875, "rewards/chosen": 0.20875364542007446, "rewards/margins": 0.5408526062965393, "rewards/rejected": -0.33209896087646484, "step": 90 }, { "debug/policy_chosen_logits": 29.092975616455078, "debug/policy_chosen_logps": -404.4205017089844, "debug/policy_rejected_logits": 23.611309051513672, "debug/policy_rejected_logps": -429.15985107421875, "debug/reference_chosen_logps": -424.5293884277344, "debug/reference_rejected_logps": -418.74169921875, "epoch": 1.0, "grad_norm": 5.419471225413546, "learning_rate": 1e-06, "logits/chosen": 29.092975616455078, "logits/rejected": 23.611309051513672, "logps/chosen": -404.4205017089844, "logps/rejected": -429.15985107421875, "loss": 0.2937, "rewards/accuracies": 0.75, "rewards/chosen": 0.20108896493911743, "rewards/margins": 0.30527064204216003, "rewards/rejected": -0.10418166220188141, "step": 91 }, { "epoch": 1.0, "step": 91, "total_flos": 0.0, "train_loss": 0.3198396195094664, "train_runtime": 711.7161, "train_samples_per_second": 8.179, "train_steps_per_second": 0.128 } ], "logging_steps": 1, "max_steps": 91, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }