{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 16.754077911376953, "debug/policy_chosen_logps": -410.5740966796875, "debug/policy_rejected_logits": 27.903383255004883, "debug/policy_rejected_logps": -454.6379699707031, "debug/reference_chosen_logps": -410.5740966796875, "debug/reference_rejected_logps": -454.6379699707031, "epoch": 0.0125, "grad_norm": 8.96581893169365, "learning_rate": 1e-06, "logits/chosen": 16.754077911376953, "logits/rejected": 27.903383255004883, "logps/chosen": -410.5740966796875, "logps/rejected": -454.6379699707031, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 21.365768432617188, "debug/policy_chosen_logps": -414.0776062011719, "debug/policy_rejected_logits": 24.07695770263672, "debug/policy_rejected_logps": -463.8160095214844, "debug/reference_chosen_logps": -414.4972839355469, "debug/reference_rejected_logps": -464.1080322265625, "epoch": 0.025, "grad_norm": 8.150809103935691, "learning_rate": 1e-06, "logits/chosen": 21.365768432617188, "logits/rejected": 24.07695770263672, "logps/chosen": -414.0776062011719, "logps/rejected": -463.8160095214844, "loss": 0.4994, "rewards/accuracies": 0.5, "rewards/chosen": 0.004196891561150551, "rewards/margins": 0.0012764740968123078, "rewards/rejected": 0.002920417580753565, "step": 2 }, { "debug/policy_chosen_logits": 25.1464786529541, "debug/policy_chosen_logps": -451.97662353515625, "debug/policy_rejected_logits": 20.219999313354492, "debug/policy_rejected_logps": -430.0290222167969, "debug/reference_chosen_logps": -451.04254150390625, "debug/reference_rejected_logps": -429.7825927734375, "epoch": 0.0375, "grad_norm": 7.628476816083828, "learning_rate": 1e-06, "logits/chosen": 25.1464786529541, "logits/rejected": 20.219999313354492, "logps/chosen": -451.97662353515625, "logps/rejected": -430.0290222167969, "loss": 0.4977, "rewards/accuracies": 0.25, "rewards/chosen": -0.009341010823845863, "rewards/margins": -0.006876640021800995, "rewards/rejected": -0.002464370569214225, "step": 3 }, { "debug/policy_chosen_logits": 19.24092674255371, "debug/policy_chosen_logps": -438.38348388671875, "debug/policy_rejected_logits": 26.255001068115234, "debug/policy_rejected_logps": -463.3494567871094, "debug/reference_chosen_logps": -437.87542724609375, "debug/reference_rejected_logps": -461.87445068359375, "epoch": 0.05, "grad_norm": 8.325323905761818, "learning_rate": 1e-06, "logits/chosen": 19.24092674255371, "logits/rejected": 26.255001068115234, "logps/chosen": -438.38348388671875, "logps/rejected": -463.3494567871094, "loss": 0.4907, "rewards/accuracies": 0.75, "rewards/chosen": -0.0050805662758648396, "rewards/margins": 0.009669303894042969, "rewards/rejected": -0.014749869704246521, "step": 4 }, { "debug/policy_chosen_logits": 17.008541107177734, "debug/policy_chosen_logps": -407.684814453125, "debug/policy_rejected_logits": 23.88060188293457, "debug/policy_rejected_logps": -430.1419982910156, "debug/reference_chosen_logps": -407.943115234375, "debug/reference_rejected_logps": -429.351806640625, "epoch": 0.0625, "grad_norm": 8.68321847373836, "learning_rate": 1e-06, "logits/chosen": 17.008541107177734, "logits/rejected": 23.88060188293457, "logps/chosen": -407.684814453125, "logps/rejected": -430.1419982910156, "loss": 0.4865, "rewards/accuracies": 0.5, "rewards/chosen": 0.002583122346550226, "rewards/margins": 0.010485000908374786, "rewards/rejected": -0.007901879027485847, "step": 5 }, { "debug/policy_chosen_logits": 23.972543716430664, "debug/policy_chosen_logps": -452.3487548828125, "debug/policy_rejected_logits": 23.104774475097656, "debug/policy_rejected_logps": -456.2618408203125, "debug/reference_chosen_logps": -451.52825927734375, "debug/reference_rejected_logps": -454.55670166015625, "epoch": 0.075, "grad_norm": 6.132910409397078, "learning_rate": 1e-06, "logits/chosen": 23.972543716430664, "logits/rejected": 23.104774475097656, "logps/chosen": -452.3487548828125, "logps/rejected": -456.2618408203125, "loss": 0.4884, "rewards/accuracies": 0.625, "rewards/chosen": -0.00820488017052412, "rewards/margins": 0.008846473880112171, "rewards/rejected": -0.01705135405063629, "step": 6 }, { "debug/policy_chosen_logits": 18.909212112426758, "debug/policy_chosen_logps": -402.86126708984375, "debug/policy_rejected_logits": 26.987449645996094, "debug/policy_rejected_logps": -445.4053039550781, "debug/reference_chosen_logps": -403.7894287109375, "debug/reference_rejected_logps": -444.8185729980469, "epoch": 0.0875, "grad_norm": 8.214444111303614, "learning_rate": 1e-06, "logits/chosen": 18.909212112426758, "logits/rejected": 26.987449645996094, "logps/chosen": -402.86126708984375, "logps/rejected": -445.4053039550781, "loss": 0.4736, "rewards/accuracies": 0.625, "rewards/chosen": 0.00928169209510088, "rewards/margins": 0.015148963779211044, "rewards/rejected": -0.005867272149771452, "step": 7 }, { "debug/policy_chosen_logits": 22.914825439453125, "debug/policy_chosen_logps": -427.63922119140625, "debug/policy_rejected_logits": 26.372671127319336, "debug/policy_rejected_logps": -483.4498291015625, "debug/reference_chosen_logps": -427.9068603515625, "debug/reference_rejected_logps": -479.41705322265625, "epoch": 0.1, "grad_norm": 8.86676608632659, "learning_rate": 1e-06, "logits/chosen": 22.914825439453125, "logits/rejected": 26.372671127319336, "logps/chosen": -427.63922119140625, "logps/rejected": -483.4498291015625, "loss": 0.4702, "rewards/accuracies": 0.875, "rewards/chosen": 0.0026765051297843456, "rewards/margins": 0.04300403222441673, "rewards/rejected": -0.0403275266289711, "step": 8 }, { "debug/policy_chosen_logits": 20.00334930419922, "debug/policy_chosen_logps": -432.1858215332031, "debug/policy_rejected_logits": 26.359827041625977, "debug/policy_rejected_logps": -458.9487609863281, "debug/reference_chosen_logps": -433.03985595703125, "debug/reference_rejected_logps": -455.154296875, "epoch": 0.1125, "grad_norm": 7.71461629791825, "learning_rate": 1e-06, "logits/chosen": 20.00334930419922, "logits/rejected": 26.359827041625977, "logps/chosen": -432.1858215332031, "logps/rejected": -458.9487609863281, "loss": 0.4593, "rewards/accuracies": 0.875, "rewards/chosen": 0.008540572598576546, "rewards/margins": 0.04648509621620178, "rewards/rejected": -0.037944525480270386, "step": 9 }, { "debug/policy_chosen_logits": 20.777141571044922, "debug/policy_chosen_logps": -417.86517333984375, "debug/policy_rejected_logits": 20.1833553314209, "debug/policy_rejected_logps": -449.54583740234375, "debug/reference_chosen_logps": -420.6201171875, "debug/reference_rejected_logps": -447.97467041015625, "epoch": 0.125, "grad_norm": 7.863102292412372, "learning_rate": 1e-06, "logits/chosen": 20.777141571044922, "logits/rejected": 20.1833553314209, "logps/chosen": -417.86517333984375, "logps/rejected": -449.54583740234375, "loss": 0.4364, "rewards/accuracies": 0.5, "rewards/chosen": 0.027549132704734802, "rewards/margins": 0.04326057434082031, "rewards/rejected": -0.01571143977344036, "step": 10 }, { "debug/policy_chosen_logits": 20.944578170776367, "debug/policy_chosen_logps": -417.01837158203125, "debug/policy_rejected_logits": 25.846759796142578, "debug/policy_rejected_logps": -450.23468017578125, "debug/reference_chosen_logps": -422.13604736328125, "debug/reference_rejected_logps": -443.507568359375, "epoch": 0.1375, "grad_norm": 6.8158984104886, "learning_rate": 1e-06, "logits/chosen": 20.944578170776367, "logits/rejected": 25.846759796142578, "logps/chosen": -417.01837158203125, "logps/rejected": -450.23468017578125, "loss": 0.4331, "rewards/accuracies": 1.0, "rewards/chosen": 0.05117690563201904, "rewards/margins": 0.11844829469919205, "rewards/rejected": -0.067271389067173, "step": 11 }, { "debug/policy_chosen_logits": 18.338014602661133, "debug/policy_chosen_logps": -406.2593994140625, "debug/policy_rejected_logits": 27.694448471069336, "debug/policy_rejected_logps": -448.88671875, "debug/reference_chosen_logps": -411.54815673828125, "debug/reference_rejected_logps": -443.79315185546875, "epoch": 0.15, "grad_norm": 5.494973875316365, "learning_rate": 1e-06, "logits/chosen": 18.338014602661133, "logits/rejected": 27.694448471069336, "logps/chosen": -406.2593994140625, "logps/rejected": -448.88671875, "loss": 0.4392, "rewards/accuracies": 0.75, "rewards/chosen": 0.05288715288043022, "rewards/margins": 0.10382324457168579, "rewards/rejected": -0.05093608796596527, "step": 12 }, { "debug/policy_chosen_logits": 17.940452575683594, "debug/policy_chosen_logps": -409.26666259765625, "debug/policy_rejected_logits": 26.594871520996094, "debug/policy_rejected_logps": -462.8791809082031, "debug/reference_chosen_logps": -418.971923828125, "debug/reference_rejected_logps": -455.71661376953125, "epoch": 0.1625, "grad_norm": 6.839480437899935, "learning_rate": 1e-06, "logits/chosen": 17.940452575683594, "logits/rejected": 26.594871520996094, "logps/chosen": -409.26666259765625, "logps/rejected": -462.8791809082031, "loss": 0.4146, "rewards/accuracies": 0.875, "rewards/chosen": 0.09705245494842529, "rewards/margins": 0.1686779260635376, "rewards/rejected": -0.0716254711151123, "step": 13 }, { "debug/policy_chosen_logits": 21.89854621887207, "debug/policy_chosen_logps": -419.85137939453125, "debug/policy_rejected_logits": 28.44005012512207, "debug/policy_rejected_logps": -485.2733154296875, "debug/reference_chosen_logps": -422.12200927734375, "debug/reference_rejected_logps": -476.89013671875, "epoch": 0.175, "grad_norm": 6.253412161075545, "learning_rate": 1e-06, "logits/chosen": 21.89854621887207, "logits/rejected": 28.44005012512207, "logps/chosen": -419.85137939453125, "logps/rejected": -485.2733154296875, "loss": 0.4071, "rewards/accuracies": 0.75, "rewards/chosen": 0.022706221789121628, "rewards/margins": 0.10653793066740036, "rewards/rejected": -0.08383171260356903, "step": 14 }, { "debug/policy_chosen_logits": 17.64580726623535, "debug/policy_chosen_logps": -411.6397705078125, "debug/policy_rejected_logits": 21.619295120239258, "debug/policy_rejected_logps": -427.70172119140625, "debug/reference_chosen_logps": -417.8538818359375, "debug/reference_rejected_logps": -421.78173828125, "epoch": 0.1875, "grad_norm": 5.383646265219509, "learning_rate": 1e-06, "logits/chosen": 17.64580726623535, "logits/rejected": 21.619295120239258, "logps/chosen": -411.6397705078125, "logps/rejected": -427.70172119140625, "loss": 0.4213, "rewards/accuracies": 0.75, "rewards/chosen": 0.0621412992477417, "rewards/margins": 0.12134108692407608, "rewards/rejected": -0.05919978767633438, "step": 15 }, { "debug/policy_chosen_logits": 20.415315628051758, "debug/policy_chosen_logps": -423.58721923828125, "debug/policy_rejected_logits": 28.420116424560547, "debug/policy_rejected_logps": -477.6827697753906, "debug/reference_chosen_logps": -427.49700927734375, "debug/reference_rejected_logps": -467.93035888671875, "epoch": 0.2, "grad_norm": 5.208847435610174, "learning_rate": 1e-06, "logits/chosen": 20.415315628051758, "logits/rejected": 28.420116424560547, "logps/chosen": -423.58721923828125, "logps/rejected": -477.6827697753906, "loss": 0.4097, "rewards/accuracies": 0.875, "rewards/chosen": 0.03909797593951225, "rewards/margins": 0.1366221308708191, "rewards/rejected": -0.09752415120601654, "step": 16 }, { "debug/policy_chosen_logits": 17.148258209228516, "debug/policy_chosen_logps": -400.81634521484375, "debug/policy_rejected_logits": 23.263225555419922, "debug/policy_rejected_logps": -491.70166015625, "debug/reference_chosen_logps": -413.73779296875, "debug/reference_rejected_logps": -475.34991455078125, "epoch": 0.2125, "grad_norm": 6.371009448433531, "learning_rate": 1e-06, "logits/chosen": 17.148258209228516, "logits/rejected": 23.263225555419922, "logps/chosen": -400.81634521484375, "logps/rejected": -491.70166015625, "loss": 0.362, "rewards/accuracies": 0.875, "rewards/chosen": 0.12921428680419922, "rewards/margins": 0.2927318513393402, "rewards/rejected": -0.163517564535141, "step": 17 }, { "debug/policy_chosen_logits": 19.67523956298828, "debug/policy_chosen_logps": -417.3398742675781, "debug/policy_rejected_logits": 28.897300720214844, "debug/policy_rejected_logps": -495.4722900390625, "debug/reference_chosen_logps": -429.54852294921875, "debug/reference_rejected_logps": -480.121337890625, "epoch": 0.225, "grad_norm": 4.6720603095118305, "learning_rate": 1e-06, "logits/chosen": 19.67523956298828, "logits/rejected": 28.897300720214844, "logps/chosen": -417.3398742675781, "logps/rejected": -495.4722900390625, "loss": 0.3625, "rewards/accuracies": 1.0, "rewards/chosen": 0.12208674848079681, "rewards/margins": 0.2755962014198303, "rewards/rejected": -0.1535094678401947, "step": 18 }, { "debug/policy_chosen_logits": 22.884260177612305, "debug/policy_chosen_logps": -439.2691650390625, "debug/policy_rejected_logits": 24.13343620300293, "debug/policy_rejected_logps": -466.717041015625, "debug/reference_chosen_logps": -442.8848876953125, "debug/reference_rejected_logps": -443.7426452636719, "epoch": 0.2375, "grad_norm": 6.3874571813001, "learning_rate": 1e-06, "logits/chosen": 22.884260177612305, "logits/rejected": 24.13343620300293, "logps/chosen": -439.2691650390625, "logps/rejected": -466.717041015625, "loss": 0.2871, "rewards/accuracies": 0.75, "rewards/chosen": 0.03615710884332657, "rewards/margins": 0.26590147614479065, "rewards/rejected": -0.22974437475204468, "step": 19 }, { "debug/policy_chosen_logits": 18.085355758666992, "debug/policy_chosen_logps": -405.03131103515625, "debug/policy_rejected_logits": 25.65454864501953, "debug/policy_rejected_logps": -455.36871337890625, "debug/reference_chosen_logps": -411.85076904296875, "debug/reference_rejected_logps": -429.8324890136719, "epoch": 0.25, "grad_norm": 4.779696568165662, "learning_rate": 1e-06, "logits/chosen": 18.085355758666992, "logits/rejected": 25.65454864501953, "logps/chosen": -405.03131103515625, "logps/rejected": -455.36871337890625, "loss": 0.3732, "rewards/accuracies": 0.875, "rewards/chosen": 0.06819470226764679, "rewards/margins": 0.3235568404197693, "rewards/rejected": -0.2553621232509613, "step": 20 }, { "debug/policy_chosen_logits": 20.691024780273438, "debug/policy_chosen_logps": -395.2339782714844, "debug/policy_rejected_logits": 28.38985824584961, "debug/policy_rejected_logps": -447.9287109375, "debug/reference_chosen_logps": -419.5431823730469, "debug/reference_rejected_logps": -440.24517822265625, "epoch": 0.2625, "grad_norm": 4.138890165784992, "learning_rate": 1e-06, "logits/chosen": 20.691024780273438, "logits/rejected": 28.38985824584961, "logps/chosen": -395.2339782714844, "logps/rejected": -447.9287109375, "loss": 0.3061, "rewards/accuracies": 0.75, "rewards/chosen": 0.24309200048446655, "rewards/margins": 0.3199271261692047, "rewards/rejected": -0.07683513313531876, "step": 21 }, { "debug/policy_chosen_logits": 19.216005325317383, "debug/policy_chosen_logps": -398.2820129394531, "debug/policy_rejected_logits": 27.54266929626465, "debug/policy_rejected_logps": -462.150146484375, "debug/reference_chosen_logps": -406.9403076171875, "debug/reference_rejected_logps": -450.525390625, "epoch": 0.275, "grad_norm": 4.456237735720388, "learning_rate": 1e-06, "logits/chosen": 19.216005325317383, "logits/rejected": 27.54266929626465, "logps/chosen": -398.2820129394531, "logps/rejected": -462.150146484375, "loss": 0.3342, "rewards/accuracies": 0.875, "rewards/chosen": 0.08658306300640106, "rewards/margins": 0.20283077657222748, "rewards/rejected": -0.11624770611524582, "step": 22 }, { "debug/policy_chosen_logits": 18.20200538635254, "debug/policy_chosen_logps": -399.0750427246094, "debug/policy_rejected_logits": 29.885364532470703, "debug/policy_rejected_logps": -464.96728515625, "debug/reference_chosen_logps": -416.9361877441406, "debug/reference_rejected_logps": -453.67095947265625, "epoch": 0.2875, "grad_norm": 6.33340244824422, "learning_rate": 1e-06, "logits/chosen": 18.20200538635254, "logits/rejected": 29.885364532470703, "logps/chosen": -399.0750427246094, "logps/rejected": -464.96728515625, "loss": 0.3919, "rewards/accuracies": 0.75, "rewards/chosen": 0.17861154675483704, "rewards/margins": 0.2915750741958618, "rewards/rejected": -0.11296352744102478, "step": 23 }, { "debug/policy_chosen_logits": 20.623193740844727, "debug/policy_chosen_logps": -405.6262512207031, "debug/policy_rejected_logits": 30.310110092163086, "debug/policy_rejected_logps": -461.8826599121094, "debug/reference_chosen_logps": -436.78277587890625, "debug/reference_rejected_logps": -433.28759765625, "epoch": 0.3, "grad_norm": 6.894047386241825, "learning_rate": 1e-06, "logits/chosen": 20.623193740844727, "logits/rejected": 30.310110092163086, "logps/chosen": -405.6262512207031, "logps/rejected": -461.8826599121094, "loss": 0.3025, "rewards/accuracies": 1.0, "rewards/chosen": 0.3115653395652771, "rewards/margins": 0.5975160598754883, "rewards/rejected": -0.2859506905078888, "step": 24 }, { "debug/policy_chosen_logits": 21.141868591308594, "debug/policy_chosen_logps": -387.9573974609375, "debug/policy_rejected_logits": 25.790800094604492, "debug/policy_rejected_logps": -441.328369140625, "debug/reference_chosen_logps": -414.5980529785156, "debug/reference_rejected_logps": -434.82177734375, "epoch": 0.3125, "grad_norm": 4.7433049140532395, "learning_rate": 1e-06, "logits/chosen": 21.141868591308594, "logits/rejected": 25.790800094604492, "logps/chosen": -387.9573974609375, "logps/rejected": -441.328369140625, "loss": 0.3551, "rewards/accuracies": 0.875, "rewards/chosen": 0.26640671491622925, "rewards/margins": 0.33147239685058594, "rewards/rejected": -0.0650656521320343, "step": 25 }, { "debug/policy_chosen_logits": 18.596500396728516, "debug/policy_chosen_logps": -389.6454162597656, "debug/policy_rejected_logits": 27.930261611938477, "debug/policy_rejected_logps": -470.5025329589844, "debug/reference_chosen_logps": -414.76007080078125, "debug/reference_rejected_logps": -440.7433166503906, "epoch": 0.325, "grad_norm": 4.306188553423187, "learning_rate": 1e-06, "logits/chosen": 18.596500396728516, "logits/rejected": 27.930261611938477, "logps/chosen": -389.6454162597656, "logps/rejected": -470.5025329589844, "loss": 0.2747, "rewards/accuracies": 0.875, "rewards/chosen": 0.25114673376083374, "rewards/margins": 0.5487388968467712, "rewards/rejected": -0.2975921928882599, "step": 26 }, { "debug/policy_chosen_logits": 19.838258743286133, "debug/policy_chosen_logps": -396.57708740234375, "debug/policy_rejected_logits": 27.663278579711914, "debug/policy_rejected_logps": -454.4193115234375, "debug/reference_chosen_logps": -419.39898681640625, "debug/reference_rejected_logps": -444.2556457519531, "epoch": 0.3375, "grad_norm": 5.503733763414167, "learning_rate": 1e-06, "logits/chosen": 19.838258743286133, "logits/rejected": 27.663278579711914, "logps/chosen": -396.57708740234375, "logps/rejected": -454.4193115234375, "loss": 0.3097, "rewards/accuracies": 1.0, "rewards/chosen": 0.22821906208992004, "rewards/margins": 0.32985571026802063, "rewards/rejected": -0.10163664817810059, "step": 27 }, { "debug/policy_chosen_logits": 19.494264602661133, "debug/policy_chosen_logps": -386.3175048828125, "debug/policy_rejected_logits": 25.22478675842285, "debug/policy_rejected_logps": -454.5428466796875, "debug/reference_chosen_logps": -409.989501953125, "debug/reference_rejected_logps": -440.07342529296875, "epoch": 0.35, "grad_norm": 5.4619274191461775, "learning_rate": 1e-06, "logits/chosen": 19.494264602661133, "logits/rejected": 25.22478675842285, "logps/chosen": -386.3175048828125, "logps/rejected": -454.5428466796875, "loss": 0.3427, "rewards/accuracies": 0.75, "rewards/chosen": 0.23671993613243103, "rewards/margins": 0.38141441345214844, "rewards/rejected": -0.1446944773197174, "step": 28 }, { "debug/policy_chosen_logits": 20.47563362121582, "debug/policy_chosen_logps": -411.8681640625, "debug/policy_rejected_logits": 29.555112838745117, "debug/policy_rejected_logps": -495.034423828125, "debug/reference_chosen_logps": -428.219970703125, "debug/reference_rejected_logps": -462.74237060546875, "epoch": 0.3625, "grad_norm": 5.291532814248025, "learning_rate": 1e-06, "logits/chosen": 20.47563362121582, "logits/rejected": 29.555112838745117, "logps/chosen": -411.8681640625, "logps/rejected": -495.034423828125, "loss": 0.3448, "rewards/accuracies": 1.0, "rewards/chosen": 0.16351768374443054, "rewards/margins": 0.486438125371933, "rewards/rejected": -0.32292044162750244, "step": 29 }, { "debug/policy_chosen_logits": 20.484766006469727, "debug/policy_chosen_logps": -403.1236572265625, "debug/policy_rejected_logits": 28.859256744384766, "debug/policy_rejected_logps": -473.91632080078125, "debug/reference_chosen_logps": -417.843505859375, "debug/reference_rejected_logps": -439.25738525390625, "epoch": 0.375, "grad_norm": 6.796682505768599, "learning_rate": 1e-06, "logits/chosen": 20.484766006469727, "logits/rejected": 28.859256744384766, "logps/chosen": -403.1236572265625, "logps/rejected": -473.91632080078125, "loss": 0.2546, "rewards/accuracies": 0.75, "rewards/chosen": 0.14719845354557037, "rewards/margins": 0.4937880337238312, "rewards/rejected": -0.3465895652770996, "step": 30 }, { "debug/policy_chosen_logits": 22.867298126220703, "debug/policy_chosen_logps": -418.6160888671875, "debug/policy_rejected_logits": 28.045475006103516, "debug/policy_rejected_logps": -446.5901184082031, "debug/reference_chosen_logps": -429.67620849609375, "debug/reference_rejected_logps": -433.1312255859375, "epoch": 0.3875, "grad_norm": 7.032111537325665, "learning_rate": 1e-06, "logits/chosen": 22.867298126220703, "logits/rejected": 28.045475006103516, "logps/chosen": -418.6160888671875, "logps/rejected": -446.5901184082031, "loss": 0.3774, "rewards/accuracies": 0.75, "rewards/chosen": 0.11060100793838501, "rewards/margins": 0.24519018828868866, "rewards/rejected": -0.13458918035030365, "step": 31 }, { "debug/policy_chosen_logits": 21.08600425720215, "debug/policy_chosen_logps": -425.2349853515625, "debug/policy_rejected_logits": 28.975238800048828, "debug/policy_rejected_logps": -470.2121887207031, "debug/reference_chosen_logps": -433.0067443847656, "debug/reference_rejected_logps": -453.880859375, "epoch": 0.4, "grad_norm": 6.839829841567521, "learning_rate": 1e-06, "logits/chosen": 21.08600425720215, "logits/rejected": 28.975238800048828, "logps/chosen": -425.2349853515625, "logps/rejected": -470.2121887207031, "loss": 0.2944, "rewards/accuracies": 0.75, "rewards/chosen": 0.07771759480237961, "rewards/margins": 0.2410309612751007, "rewards/rejected": -0.1633133739233017, "step": 32 }, { "debug/policy_chosen_logits": 25.196168899536133, "debug/policy_chosen_logps": -411.496337890625, "debug/policy_rejected_logits": 26.846052169799805, "debug/policy_rejected_logps": -488.6064453125, "debug/reference_chosen_logps": -432.0963439941406, "debug/reference_rejected_logps": -467.6219787597656, "epoch": 0.4125, "grad_norm": 4.649169588980754, "learning_rate": 1e-06, "logits/chosen": 25.196168899536133, "logits/rejected": 26.846052169799805, "logps/chosen": -411.496337890625, "logps/rejected": -488.6064453125, "loss": 0.2653, "rewards/accuracies": 0.875, "rewards/chosen": 0.20599989593029022, "rewards/margins": 0.41584473848342896, "rewards/rejected": -0.20984479784965515, "step": 33 }, { "debug/policy_chosen_logits": 18.207822799682617, "debug/policy_chosen_logps": -383.0249328613281, "debug/policy_rejected_logits": 32.11717224121094, "debug/policy_rejected_logps": -473.04638671875, "debug/reference_chosen_logps": -408.9935302734375, "debug/reference_rejected_logps": -424.98388671875, "epoch": 0.425, "grad_norm": 5.057572035835091, "learning_rate": 1e-06, "logits/chosen": 18.207822799682617, "logits/rejected": 32.11717224121094, "logps/chosen": -383.0249328613281, "logps/rejected": -473.04638671875, "loss": 0.3145, "rewards/accuracies": 1.0, "rewards/chosen": 0.2596858739852905, "rewards/margins": 0.740310549736023, "rewards/rejected": -0.4806246757507324, "step": 34 }, { "debug/policy_chosen_logits": 20.943693161010742, "debug/policy_chosen_logps": -399.00933837890625, "debug/policy_rejected_logits": 30.121450424194336, "debug/policy_rejected_logps": -462.94781494140625, "debug/reference_chosen_logps": -414.2557067871094, "debug/reference_rejected_logps": -443.25909423828125, "epoch": 0.4375, "grad_norm": 3.7317792693885603, "learning_rate": 1e-06, "logits/chosen": 20.943693161010742, "logits/rejected": 30.121450424194336, "logps/chosen": -399.00933837890625, "logps/rejected": -462.94781494140625, "loss": 0.2713, "rewards/accuracies": 0.875, "rewards/chosen": 0.15246371924877167, "rewards/margins": 0.34935063123703003, "rewards/rejected": -0.19688692688941956, "step": 35 }, { "debug/policy_chosen_logits": 18.31939125061035, "debug/policy_chosen_logps": -400.14715576171875, "debug/policy_rejected_logits": 31.468852996826172, "debug/policy_rejected_logps": -513.3980712890625, "debug/reference_chosen_logps": -426.7197570800781, "debug/reference_rejected_logps": -475.2760925292969, "epoch": 0.45, "grad_norm": 4.146366903828521, "learning_rate": 1e-06, "logits/chosen": 18.31939125061035, "logits/rejected": 31.468852996826172, "logps/chosen": -400.14715576171875, "logps/rejected": -513.3980712890625, "loss": 0.284, "rewards/accuracies": 1.0, "rewards/chosen": 0.2657262682914734, "rewards/margins": 0.646946370601654, "rewards/rejected": -0.3812200725078583, "step": 36 }, { "debug/policy_chosen_logits": 18.51481056213379, "debug/policy_chosen_logps": -385.47406005859375, "debug/policy_rejected_logits": 30.034343719482422, "debug/policy_rejected_logps": -452.683837890625, "debug/reference_chosen_logps": -420.1163635253906, "debug/reference_rejected_logps": -444.416259765625, "epoch": 0.4625, "grad_norm": 4.656579513796896, "learning_rate": 1e-06, "logits/chosen": 18.51481056213379, "logits/rejected": 30.034343719482422, "logps/chosen": -385.47406005859375, "logps/rejected": -452.683837890625, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": 0.34642308950424194, "rewards/margins": 0.4290991425514221, "rewards/rejected": -0.08267608284950256, "step": 37 }, { "debug/policy_chosen_logits": 19.45956039428711, "debug/policy_chosen_logps": -408.93182373046875, "debug/policy_rejected_logits": 26.489351272583008, "debug/policy_rejected_logps": -452.5537414550781, "debug/reference_chosen_logps": -432.7354736328125, "debug/reference_rejected_logps": -445.1405944824219, "epoch": 0.475, "grad_norm": 6.017458719962861, "learning_rate": 1e-06, "logits/chosen": 19.45956039428711, "logits/rejected": 26.489351272583008, "logps/chosen": -408.93182373046875, "logps/rejected": -452.5537414550781, "loss": 0.2954, "rewards/accuracies": 1.0, "rewards/chosen": 0.23803669214248657, "rewards/margins": 0.3121681809425354, "rewards/rejected": -0.07413151115179062, "step": 38 }, { "debug/policy_chosen_logits": 18.04475975036621, "debug/policy_chosen_logps": -381.45330810546875, "debug/policy_rejected_logits": 31.125469207763672, "debug/policy_rejected_logps": -458.4730529785156, "debug/reference_chosen_logps": -411.2501525878906, "debug/reference_rejected_logps": -440.47845458984375, "epoch": 0.4875, "grad_norm": 7.182310738360888, "learning_rate": 1e-06, "logits/chosen": 18.04475975036621, "logits/rejected": 31.125469207763672, "logps/chosen": -381.45330810546875, "logps/rejected": -458.4730529785156, "loss": 0.2881, "rewards/accuracies": 0.875, "rewards/chosen": 0.29796817898750305, "rewards/margins": 0.4779142439365387, "rewards/rejected": -0.17994609475135803, "step": 39 }, { "debug/policy_chosen_logits": 18.85544776916504, "debug/policy_chosen_logps": -395.41802978515625, "debug/policy_rejected_logits": 23.536022186279297, "debug/policy_rejected_logps": -410.3462829589844, "debug/reference_chosen_logps": -413.8913269042969, "debug/reference_rejected_logps": -419.77679443359375, "epoch": 0.5, "grad_norm": 7.078963246488324, "learning_rate": 1e-06, "logits/chosen": 18.85544776916504, "logits/rejected": 23.536022186279297, "logps/chosen": -395.41802978515625, "logps/rejected": -410.3462829589844, "loss": 0.3245, "rewards/accuracies": 0.625, "rewards/chosen": 0.18473277986049652, "rewards/margins": 0.09042750298976898, "rewards/rejected": 0.09430526942014694, "step": 40 }, { "debug/policy_chosen_logits": 19.017606735229492, "debug/policy_chosen_logps": -385.16949462890625, "debug/policy_rejected_logits": 26.136432647705078, "debug/policy_rejected_logps": -455.77178955078125, "debug/reference_chosen_logps": -409.3904724121094, "debug/reference_rejected_logps": -446.24786376953125, "epoch": 0.5125, "grad_norm": 4.348585522756699, "learning_rate": 1e-06, "logits/chosen": 19.017606735229492, "logits/rejected": 26.136432647705078, "logps/chosen": -385.16949462890625, "logps/rejected": -455.77178955078125, "loss": 0.3493, "rewards/accuracies": 0.75, "rewards/chosen": 0.24220973253250122, "rewards/margins": 0.33744922280311584, "rewards/rejected": -0.09523948282003403, "step": 41 }, { "debug/policy_chosen_logits": 18.70126724243164, "debug/policy_chosen_logps": -393.8923645019531, "debug/policy_rejected_logits": 28.3759765625, "debug/policy_rejected_logps": -474.95550537109375, "debug/reference_chosen_logps": -413.5538330078125, "debug/reference_rejected_logps": -451.510498046875, "epoch": 0.525, "grad_norm": 6.403780077024945, "learning_rate": 1e-06, "logits/chosen": 18.70126724243164, "logits/rejected": 28.3759765625, "logps/chosen": -393.8923645019531, "logps/rejected": -474.95550537109375, "loss": 0.2725, "rewards/accuracies": 0.625, "rewards/chosen": 0.19661453366279602, "rewards/margins": 0.4310648441314697, "rewards/rejected": -0.2344503253698349, "step": 42 }, { "debug/policy_chosen_logits": 20.40342903137207, "debug/policy_chosen_logps": -410.5350341796875, "debug/policy_rejected_logits": 31.70136833190918, "debug/policy_rejected_logps": -484.47381591796875, "debug/reference_chosen_logps": -424.74542236328125, "debug/reference_rejected_logps": -446.88623046875, "epoch": 0.5375, "grad_norm": 9.04429983740781, "learning_rate": 1e-06, "logits/chosen": 20.40342903137207, "logits/rejected": 31.70136833190918, "logps/chosen": -410.5350341796875, "logps/rejected": -484.47381591796875, "loss": 0.2588, "rewards/accuracies": 0.875, "rewards/chosen": 0.14210368692874908, "rewards/margins": 0.5179791450500488, "rewards/rejected": -0.37587541341781616, "step": 43 }, { "debug/policy_chosen_logits": 19.714599609375, "debug/policy_chosen_logps": -411.0771484375, "debug/policy_rejected_logits": 22.63053321838379, "debug/policy_rejected_logps": -460.9835510253906, "debug/reference_chosen_logps": -425.8843994140625, "debug/reference_rejected_logps": -441.40985107421875, "epoch": 0.55, "grad_norm": 5.462116354601322, "learning_rate": 1e-06, "logits/chosen": 19.714599609375, "logits/rejected": 22.63053321838379, "logps/chosen": -411.0771484375, "logps/rejected": -460.9835510253906, "loss": 0.3296, "rewards/accuracies": 1.0, "rewards/chosen": 0.14807261526584625, "rewards/margins": 0.34380990266799927, "rewards/rejected": -0.19573725759983063, "step": 44 }, { "debug/policy_chosen_logits": 23.49720001220703, "debug/policy_chosen_logps": -402.09844970703125, "debug/policy_rejected_logits": 29.42737579345703, "debug/policy_rejected_logps": -443.2966003417969, "debug/reference_chosen_logps": -417.8492431640625, "debug/reference_rejected_logps": -440.06658935546875, "epoch": 0.5625, "grad_norm": 5.887141950804354, "learning_rate": 1e-06, "logits/chosen": 23.49720001220703, "logits/rejected": 29.42737579345703, "logps/chosen": -402.09844970703125, "logps/rejected": -443.2966003417969, "loss": 0.3101, "rewards/accuracies": 0.75, "rewards/chosen": 0.15750765800476074, "rewards/margins": 0.1898074597120285, "rewards/rejected": -0.03229980170726776, "step": 45 }, { "debug/policy_chosen_logits": 18.707294464111328, "debug/policy_chosen_logps": -381.8944396972656, "debug/policy_rejected_logits": 26.21320343017578, "debug/policy_rejected_logps": -449.5614318847656, "debug/reference_chosen_logps": -402.29058837890625, "debug/reference_rejected_logps": -439.96856689453125, "epoch": 0.575, "grad_norm": 7.109794252989897, "learning_rate": 1e-06, "logits/chosen": 18.707294464111328, "logits/rejected": 26.21320343017578, "logps/chosen": -381.8944396972656, "logps/rejected": -449.5614318847656, "loss": 0.2665, "rewards/accuracies": 0.875, "rewards/chosen": 0.20396174490451813, "rewards/margins": 0.2998904585838318, "rewards/rejected": -0.09592871367931366, "step": 46 }, { "debug/policy_chosen_logits": 25.76456069946289, "debug/policy_chosen_logps": -413.05450439453125, "debug/policy_rejected_logits": 29.15418815612793, "debug/policy_rejected_logps": -446.2088623046875, "debug/reference_chosen_logps": -437.90911865234375, "debug/reference_rejected_logps": -437.53997802734375, "epoch": 0.5875, "grad_norm": 4.107110423899632, "learning_rate": 1e-06, "logits/chosen": 25.76456069946289, "logits/rejected": 29.15418815612793, "logps/chosen": -413.05450439453125, "logps/rejected": -446.2088623046875, "loss": 0.2756, "rewards/accuracies": 0.75, "rewards/chosen": 0.24854573607444763, "rewards/margins": 0.3352348506450653, "rewards/rejected": -0.08668914437294006, "step": 47 }, { "debug/policy_chosen_logits": 18.897085189819336, "debug/policy_chosen_logps": -395.95501708984375, "debug/policy_rejected_logits": 24.991113662719727, "debug/policy_rejected_logps": -443.54205322265625, "debug/reference_chosen_logps": -417.3414611816406, "debug/reference_rejected_logps": -438.97320556640625, "epoch": 0.6, "grad_norm": 3.7538316619274052, "learning_rate": 1e-06, "logits/chosen": 18.897085189819336, "logits/rejected": 24.991113662719727, "logps/chosen": -395.95501708984375, "logps/rejected": -443.54205322265625, "loss": 0.3109, "rewards/accuracies": 0.625, "rewards/chosen": 0.21386432647705078, "rewards/margins": 0.25955283641815186, "rewards/rejected": -0.04568850249052048, "step": 48 }, { "debug/policy_chosen_logits": 17.76668357849121, "debug/policy_chosen_logps": -381.77435302734375, "debug/policy_rejected_logits": 28.45110511779785, "debug/policy_rejected_logps": -451.37335205078125, "debug/reference_chosen_logps": -409.0759582519531, "debug/reference_rejected_logps": -429.466552734375, "epoch": 0.6125, "grad_norm": 4.0217662693111516, "learning_rate": 1e-06, "logits/chosen": 17.76668357849121, "logits/rejected": 28.45110511779785, "logps/chosen": -381.77435302734375, "logps/rejected": -451.37335205078125, "loss": 0.2499, "rewards/accuracies": 0.875, "rewards/chosen": 0.27301597595214844, "rewards/margins": 0.49208390712738037, "rewards/rejected": -0.21906791627407074, "step": 49 }, { "debug/policy_chosen_logits": 23.378732681274414, "debug/policy_chosen_logps": -419.51385498046875, "debug/policy_rejected_logits": 25.671220779418945, "debug/policy_rejected_logps": -420.00140380859375, "debug/reference_chosen_logps": -430.8975524902344, "debug/reference_rejected_logps": -418.96746826171875, "epoch": 0.625, "grad_norm": 4.909954105878226, "learning_rate": 1e-06, "logits/chosen": 23.378732681274414, "logits/rejected": 25.671220779418945, "logps/chosen": -419.51385498046875, "logps/rejected": -420.00140380859375, "loss": 0.315, "rewards/accuracies": 0.75, "rewards/chosen": 0.11383689939975739, "rewards/margins": 0.12417624145746231, "rewards/rejected": -0.010339349508285522, "step": 50 }, { "debug/policy_chosen_logits": 23.499061584472656, "debug/policy_chosen_logps": -395.154541015625, "debug/policy_rejected_logits": 26.088699340820312, "debug/policy_rejected_logps": -437.3359375, "debug/reference_chosen_logps": -418.2652587890625, "debug/reference_rejected_logps": -433.26739501953125, "epoch": 0.6375, "grad_norm": 5.626401478627072, "learning_rate": 1e-06, "logits/chosen": 23.499061584472656, "logits/rejected": 26.088699340820312, "logps/chosen": -395.154541015625, "logps/rejected": -437.3359375, "loss": 0.2901, "rewards/accuracies": 0.75, "rewards/chosen": 0.23110699653625488, "rewards/margins": 0.27179256081581116, "rewards/rejected": -0.04068557173013687, "step": 51 }, { "debug/policy_chosen_logits": 21.639236450195312, "debug/policy_chosen_logps": -402.419189453125, "debug/policy_rejected_logits": 27.019371032714844, "debug/policy_rejected_logps": -426.9820251464844, "debug/reference_chosen_logps": -423.36669921875, "debug/reference_rejected_logps": -438.14837646484375, "epoch": 0.65, "grad_norm": 5.769823478560454, "learning_rate": 1e-06, "logits/chosen": 21.639236450195312, "logits/rejected": 27.019371032714844, "logps/chosen": -402.419189453125, "logps/rejected": -426.9820251464844, "loss": 0.2988, "rewards/accuracies": 0.375, "rewards/chosen": 0.20947498083114624, "rewards/margins": 0.09781143069267273, "rewards/rejected": 0.11166355758905411, "step": 52 }, { "debug/policy_chosen_logits": 23.30719757080078, "debug/policy_chosen_logps": -396.96221923828125, "debug/policy_rejected_logits": 32.520904541015625, "debug/policy_rejected_logps": -487.1607666015625, "debug/reference_chosen_logps": -425.7676696777344, "debug/reference_rejected_logps": -464.8005676269531, "epoch": 0.6625, "grad_norm": 3.958996117827781, "learning_rate": 1e-06, "logits/chosen": 23.30719757080078, "logits/rejected": 32.520904541015625, "logps/chosen": -396.96221923828125, "logps/rejected": -487.1607666015625, "loss": 0.2418, "rewards/accuracies": 1.0, "rewards/chosen": 0.2880541682243347, "rewards/margins": 0.5116561651229858, "rewards/rejected": -0.2236020267009735, "step": 53 }, { "debug/policy_chosen_logits": 19.69259262084961, "debug/policy_chosen_logps": -380.706298828125, "debug/policy_rejected_logits": 34.78654098510742, "debug/policy_rejected_logps": -464.5356140136719, "debug/reference_chosen_logps": -408.298583984375, "debug/reference_rejected_logps": -437.8012390136719, "epoch": 0.675, "grad_norm": 5.599167260869624, "learning_rate": 1e-06, "logits/chosen": 19.69259262084961, "logits/rejected": 34.78654098510742, "logps/chosen": -380.706298828125, "logps/rejected": -464.5356140136719, "loss": 0.2732, "rewards/accuracies": 0.875, "rewards/chosen": 0.27592289447784424, "rewards/margins": 0.5432666540145874, "rewards/rejected": -0.26734381914138794, "step": 54 }, { "debug/policy_chosen_logits": 19.60610580444336, "debug/policy_chosen_logps": -388.4259948730469, "debug/policy_rejected_logits": 28.09248161315918, "debug/policy_rejected_logps": -442.44500732421875, "debug/reference_chosen_logps": -414.5915832519531, "debug/reference_rejected_logps": -430.89666748046875, "epoch": 0.6875, "grad_norm": 3.7882949756257345, "learning_rate": 1e-06, "logits/chosen": 19.60610580444336, "logits/rejected": 28.09248161315918, "logps/chosen": -388.4259948730469, "logps/rejected": -442.44500732421875, "loss": 0.3028, "rewards/accuracies": 1.0, "rewards/chosen": 0.2616558372974396, "rewards/margins": 0.3771390914916992, "rewards/rejected": -0.11548325419425964, "step": 55 }, { "debug/policy_chosen_logits": 19.7696533203125, "debug/policy_chosen_logps": -400.4974365234375, "debug/policy_rejected_logits": 30.27635383605957, "debug/policy_rejected_logps": -492.6361999511719, "debug/reference_chosen_logps": -423.21453857421875, "debug/reference_rejected_logps": -456.6308288574219, "epoch": 0.7, "grad_norm": 3.5927401238775176, "learning_rate": 1e-06, "logits/chosen": 19.7696533203125, "logits/rejected": 30.27635383605957, "logps/chosen": -400.4974365234375, "logps/rejected": -492.6361999511719, "loss": 0.2819, "rewards/accuracies": 0.875, "rewards/chosen": 0.22717097401618958, "rewards/margins": 0.5872249007225037, "rewards/rejected": -0.3600539267063141, "step": 56 }, { "debug/policy_chosen_logits": 18.955514907836914, "debug/policy_chosen_logps": -407.9693603515625, "debug/policy_rejected_logits": 29.411741256713867, "debug/policy_rejected_logps": -474.837646484375, "debug/reference_chosen_logps": -418.5709228515625, "debug/reference_rejected_logps": -455.1912841796875, "epoch": 0.7125, "grad_norm": 5.251957841550457, "learning_rate": 1e-06, "logits/chosen": 18.955514907836914, "logits/rejected": 29.411741256713867, "logps/chosen": -407.9693603515625, "logps/rejected": -474.837646484375, "loss": 0.2251, "rewards/accuracies": 0.875, "rewards/chosen": 0.10601577907800674, "rewards/margins": 0.3024793565273285, "rewards/rejected": -0.19646358489990234, "step": 57 }, { "debug/policy_chosen_logits": 19.137195587158203, "debug/policy_chosen_logps": -394.4432678222656, "debug/policy_rejected_logits": 25.144182205200195, "debug/policy_rejected_logps": -469.2935791015625, "debug/reference_chosen_logps": -414.51580810546875, "debug/reference_rejected_logps": -443.5001220703125, "epoch": 0.725, "grad_norm": 4.675802502460992, "learning_rate": 1e-06, "logits/chosen": 19.137195587158203, "logits/rejected": 25.144182205200195, "logps/chosen": -394.4432678222656, "logps/rejected": -469.2935791015625, "loss": 0.2485, "rewards/accuracies": 0.75, "rewards/chosen": 0.20072558522224426, "rewards/margins": 0.4586600065231323, "rewards/rejected": -0.25793442130088806, "step": 58 }, { "debug/policy_chosen_logits": 28.265625, "debug/policy_chosen_logps": -439.6889953613281, "debug/policy_rejected_logits": 26.68956756591797, "debug/policy_rejected_logps": -480.45562744140625, "debug/reference_chosen_logps": -452.0250244140625, "debug/reference_rejected_logps": -461.8011474609375, "epoch": 0.7375, "grad_norm": 4.874826559571265, "learning_rate": 1e-06, "logits/chosen": 28.265625, "logits/rejected": 26.68956756591797, "logps/chosen": -439.6889953613281, "logps/rejected": -480.45562744140625, "loss": 0.3103, "rewards/accuracies": 0.75, "rewards/chosen": 0.12336035817861557, "rewards/margins": 0.30990517139434814, "rewards/rejected": -0.18654480576515198, "step": 59 }, { "debug/policy_chosen_logits": 17.890281677246094, "debug/policy_chosen_logps": -388.18328857421875, "debug/policy_rejected_logits": 29.060598373413086, "debug/policy_rejected_logps": -480.3656921386719, "debug/reference_chosen_logps": -411.6337890625, "debug/reference_rejected_logps": -446.0444641113281, "epoch": 0.75, "grad_norm": 3.8092727948167764, "learning_rate": 1e-06, "logits/chosen": 17.890281677246094, "logits/rejected": 29.060598373413086, "logps/chosen": -388.18328857421875, "logps/rejected": -480.3656921386719, "loss": 0.3023, "rewards/accuracies": 1.0, "rewards/chosen": 0.23450516164302826, "rewards/margins": 0.5777173638343811, "rewards/rejected": -0.3432122468948364, "step": 60 }, { "debug/policy_chosen_logits": 21.306364059448242, "debug/policy_chosen_logps": -417.54681396484375, "debug/policy_rejected_logits": 23.916120529174805, "debug/policy_rejected_logps": -446.38153076171875, "debug/reference_chosen_logps": -437.6956787109375, "debug/reference_rejected_logps": -429.464599609375, "epoch": 0.7625, "grad_norm": 3.732343889219544, "learning_rate": 1e-06, "logits/chosen": 21.306364059448242, "logits/rejected": 23.916120529174805, "logps/chosen": -417.54681396484375, "logps/rejected": -446.38153076171875, "loss": 0.2784, "rewards/accuracies": 0.75, "rewards/chosen": 0.2014881819486618, "rewards/margins": 0.3706575632095337, "rewards/rejected": -0.16916939616203308, "step": 61 }, { "debug/policy_chosen_logits": 19.258764266967773, "debug/policy_chosen_logps": -400.505126953125, "debug/policy_rejected_logits": 29.247554779052734, "debug/policy_rejected_logps": -488.6958923339844, "debug/reference_chosen_logps": -424.3946533203125, "debug/reference_rejected_logps": -456.62408447265625, "epoch": 0.775, "grad_norm": 3.9288590692913727, "learning_rate": 1e-06, "logits/chosen": 19.258764266967773, "logits/rejected": 29.247554779052734, "logps/chosen": -400.505126953125, "logps/rejected": -488.6958923339844, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": 0.23889553546905518, "rewards/margins": 0.5596135854721069, "rewards/rejected": -0.32071805000305176, "step": 62 }, { "debug/policy_chosen_logits": 25.917665481567383, "debug/policy_chosen_logps": -414.66156005859375, "debug/policy_rejected_logits": 27.687381744384766, "debug/policy_rejected_logps": -445.0428771972656, "debug/reference_chosen_logps": -433.19635009765625, "debug/reference_rejected_logps": -456.6318664550781, "epoch": 0.7875, "grad_norm": 5.374299437309603, "learning_rate": 1e-06, "logits/chosen": 25.917665481567383, "logits/rejected": 27.687381744384766, "logps/chosen": -414.66156005859375, "logps/rejected": -445.0428771972656, "loss": 0.3391, "rewards/accuracies": 0.25, "rewards/chosen": 0.18534794449806213, "rewards/margins": 0.06945788860321045, "rewards/rejected": 0.11589004844427109, "step": 63 }, { "debug/policy_chosen_logits": 23.454923629760742, "debug/policy_chosen_logps": -392.32305908203125, "debug/policy_rejected_logits": 28.59387969970703, "debug/policy_rejected_logps": -434.9425048828125, "debug/reference_chosen_logps": -421.68670654296875, "debug/reference_rejected_logps": -419.7337646484375, "epoch": 0.8, "grad_norm": 6.44340155168501, "learning_rate": 1e-06, "logits/chosen": 23.454923629760742, "logits/rejected": 28.59387969970703, "logps/chosen": -392.32305908203125, "logps/rejected": -434.9425048828125, "loss": 0.286, "rewards/accuracies": 0.875, "rewards/chosen": 0.29363659024238586, "rewards/margins": 0.44572439789772034, "rewards/rejected": -0.15208780765533447, "step": 64 }, { "debug/policy_chosen_logits": 24.071760177612305, "debug/policy_chosen_logps": -397.78515625, "debug/policy_rejected_logits": 24.690345764160156, "debug/policy_rejected_logps": -474.0267028808594, "debug/reference_chosen_logps": -413.3990783691406, "debug/reference_rejected_logps": -447.173095703125, "epoch": 0.8125, "grad_norm": 4.265641795749149, "learning_rate": 1e-06, "logits/chosen": 24.071760177612305, "logits/rejected": 24.690345764160156, "logps/chosen": -397.78515625, "logps/rejected": -474.0267028808594, "loss": 0.2996, "rewards/accuracies": 0.625, "rewards/chosen": 0.15613901615142822, "rewards/margins": 0.42467522621154785, "rewards/rejected": -0.26853621006011963, "step": 65 }, { "debug/policy_chosen_logits": 27.323055267333984, "debug/policy_chosen_logps": -423.77105712890625, "debug/policy_rejected_logits": 32.6217041015625, "debug/policy_rejected_logps": -454.37091064453125, "debug/reference_chosen_logps": -444.37615966796875, "debug/reference_rejected_logps": -440.6434326171875, "epoch": 0.825, "grad_norm": 4.106006750125632, "learning_rate": 1e-06, "logits/chosen": 27.323055267333984, "logits/rejected": 32.6217041015625, "logps/chosen": -423.77105712890625, "logps/rejected": -454.37091064453125, "loss": 0.3016, "rewards/accuracies": 1.0, "rewards/chosen": 0.2060512900352478, "rewards/margins": 0.3433259427547455, "rewards/rejected": -0.13727465271949768, "step": 66 }, { "debug/policy_chosen_logits": 20.215694427490234, "debug/policy_chosen_logps": -392.8240966796875, "debug/policy_rejected_logits": 27.513879776000977, "debug/policy_rejected_logps": -464.026611328125, "debug/reference_chosen_logps": -423.304931640625, "debug/reference_rejected_logps": -444.7491455078125, "epoch": 0.8375, "grad_norm": 3.4052279815687894, "learning_rate": 1e-06, "logits/chosen": 20.215694427490234, "logits/rejected": 27.513879776000977, "logps/chosen": -392.8240966796875, "logps/rejected": -464.026611328125, "loss": 0.2463, "rewards/accuracies": 1.0, "rewards/chosen": 0.3048083484172821, "rewards/margins": 0.49758297204971313, "rewards/rejected": -0.19277465343475342, "step": 67 }, { "debug/policy_chosen_logits": 20.277034759521484, "debug/policy_chosen_logps": -390.63189697265625, "debug/policy_rejected_logits": 26.955398559570312, "debug/policy_rejected_logps": -494.57330322265625, "debug/reference_chosen_logps": -411.6377868652344, "debug/reference_rejected_logps": -470.29150390625, "epoch": 0.85, "grad_norm": 4.314646861788122, "learning_rate": 1e-06, "logits/chosen": 20.277034759521484, "logits/rejected": 26.955398559570312, "logps/chosen": -390.63189697265625, "logps/rejected": -494.57330322265625, "loss": 0.2748, "rewards/accuracies": 0.875, "rewards/chosen": 0.21005921065807343, "rewards/margins": 0.4528772830963135, "rewards/rejected": -0.24281807243824005, "step": 68 }, { "debug/policy_chosen_logits": 17.920215606689453, "debug/policy_chosen_logps": -383.5913391113281, "debug/policy_rejected_logits": 22.582468032836914, "debug/policy_rejected_logps": -483.46142578125, "debug/reference_chosen_logps": -406.38525390625, "debug/reference_rejected_logps": -442.0745544433594, "epoch": 0.8625, "grad_norm": 7.009734793629414, "learning_rate": 1e-06, "logits/chosen": 17.920215606689453, "logits/rejected": 22.582468032836914, "logps/chosen": -383.5913391113281, "logps/rejected": -483.46142578125, "loss": 0.2491, "rewards/accuracies": 1.0, "rewards/chosen": 0.22793909907341003, "rewards/margins": 0.641808032989502, "rewards/rejected": -0.4138689637184143, "step": 69 }, { "debug/policy_chosen_logits": 21.91058349609375, "debug/policy_chosen_logps": -391.1068115234375, "debug/policy_rejected_logits": 28.602935791015625, "debug/policy_rejected_logps": -462.3726501464844, "debug/reference_chosen_logps": -408.93109130859375, "debug/reference_rejected_logps": -436.0927429199219, "epoch": 0.875, "grad_norm": 6.986157067049592, "learning_rate": 1e-06, "logits/chosen": 21.91058349609375, "logits/rejected": 28.602935791015625, "logps/chosen": -391.1068115234375, "logps/rejected": -462.3726501464844, "loss": 0.2593, "rewards/accuracies": 0.875, "rewards/chosen": 0.1782427579164505, "rewards/margins": 0.4410419762134552, "rewards/rejected": -0.2627992033958435, "step": 70 }, { "debug/policy_chosen_logits": 19.776662826538086, "debug/policy_chosen_logps": -398.9197082519531, "debug/policy_rejected_logits": 32.09967803955078, "debug/policy_rejected_logps": -487.4455871582031, "debug/reference_chosen_logps": -420.3005676269531, "debug/reference_rejected_logps": -452.2755126953125, "epoch": 0.8875, "grad_norm": 5.014513725137227, "learning_rate": 1e-06, "logits/chosen": 19.776662826538086, "logits/rejected": 32.09967803955078, "logps/chosen": -398.9197082519531, "logps/rejected": -487.4455871582031, "loss": 0.2544, "rewards/accuracies": 1.0, "rewards/chosen": 0.2138085961341858, "rewards/margins": 0.5655093193054199, "rewards/rejected": -0.3517007827758789, "step": 71 }, { "debug/policy_chosen_logits": 21.157052993774414, "debug/policy_chosen_logps": -391.25897216796875, "debug/policy_rejected_logits": 26.412094116210938, "debug/policy_rejected_logps": -490.5589294433594, "debug/reference_chosen_logps": -413.3541259765625, "debug/reference_rejected_logps": -471.00921630859375, "epoch": 0.9, "grad_norm": 5.328562975855576, "learning_rate": 1e-06, "logits/chosen": 21.157052993774414, "logits/rejected": 26.412094116210938, "logps/chosen": -391.25897216796875, "logps/rejected": -490.5589294433594, "loss": 0.2412, "rewards/accuracies": 0.875, "rewards/chosen": 0.2209514081478119, "rewards/margins": 0.4164482653141022, "rewards/rejected": -0.19549685716629028, "step": 72 }, { "debug/policy_chosen_logits": 24.695026397705078, "debug/policy_chosen_logps": -397.16497802734375, "debug/policy_rejected_logits": 26.946775436401367, "debug/policy_rejected_logps": -438.91900634765625, "debug/reference_chosen_logps": -418.1241455078125, "debug/reference_rejected_logps": -434.0196228027344, "epoch": 0.9125, "grad_norm": 5.591526461654656, "learning_rate": 1e-06, "logits/chosen": 24.695026397705078, "logits/rejected": 26.946775436401367, "logps/chosen": -397.16497802734375, "logps/rejected": -438.91900634765625, "loss": 0.3228, "rewards/accuracies": 0.625, "rewards/chosen": 0.20959174633026123, "rewards/margins": 0.25858569145202637, "rewards/rejected": -0.04899394512176514, "step": 73 }, { "debug/policy_chosen_logits": 22.788793563842773, "debug/policy_chosen_logps": -402.5521240234375, "debug/policy_rejected_logits": 27.343595504760742, "debug/policy_rejected_logps": -457.029052734375, "debug/reference_chosen_logps": -427.92510986328125, "debug/reference_rejected_logps": -443.33807373046875, "epoch": 0.925, "grad_norm": 4.13982614264715, "learning_rate": 1e-06, "logits/chosen": 22.788793563842773, "logits/rejected": 27.343595504760742, "logps/chosen": -402.5521240234375, "logps/rejected": -457.029052734375, "loss": 0.2539, "rewards/accuracies": 0.75, "rewards/chosen": 0.2537294626235962, "rewards/margins": 0.39063963294029236, "rewards/rejected": -0.13691017031669617, "step": 74 }, { "debug/policy_chosen_logits": 17.950855255126953, "debug/policy_chosen_logps": -386.80126953125, "debug/policy_rejected_logits": 28.871490478515625, "debug/policy_rejected_logps": -489.54559326171875, "debug/reference_chosen_logps": -418.87652587890625, "debug/reference_rejected_logps": -437.8173828125, "epoch": 0.9375, "grad_norm": 3.573253258125765, "learning_rate": 1e-06, "logits/chosen": 17.950855255126953, "logits/rejected": 28.871490478515625, "logps/chosen": -386.80126953125, "logps/rejected": -489.54559326171875, "loss": 0.235, "rewards/accuracies": 1.0, "rewards/chosen": 0.3207521438598633, "rewards/margins": 0.8380340933799744, "rewards/rejected": -0.5172819495201111, "step": 75 }, { "debug/policy_chosen_logits": 20.622098922729492, "debug/policy_chosen_logps": -384.69195556640625, "debug/policy_rejected_logits": 31.19890594482422, "debug/policy_rejected_logps": -452.1475524902344, "debug/reference_chosen_logps": -407.402099609375, "debug/reference_rejected_logps": -432.25592041015625, "epoch": 0.95, "grad_norm": 6.4154538717209695, "learning_rate": 1e-06, "logits/chosen": 20.622098922729492, "logits/rejected": 31.19890594482422, "logps/chosen": -384.69195556640625, "logps/rejected": -452.1475524902344, "loss": 0.3166, "rewards/accuracies": 0.875, "rewards/chosen": 0.22710174322128296, "rewards/margins": 0.42601796984672546, "rewards/rejected": -0.1989162415266037, "step": 76 }, { "debug/policy_chosen_logits": 21.178997039794922, "debug/policy_chosen_logps": -384.0088806152344, "debug/policy_rejected_logits": 32.716346740722656, "debug/policy_rejected_logps": -477.344482421875, "debug/reference_chosen_logps": -413.40850830078125, "debug/reference_rejected_logps": -443.79180908203125, "epoch": 0.9625, "grad_norm": 4.479488616499859, "learning_rate": 1e-06, "logits/chosen": 21.178997039794922, "logits/rejected": 32.716346740722656, "logps/chosen": -384.0088806152344, "logps/rejected": -477.344482421875, "loss": 0.2545, "rewards/accuracies": 0.875, "rewards/chosen": 0.2939964532852173, "rewards/margins": 0.6295228600502014, "rewards/rejected": -0.33552637696266174, "step": 77 }, { "debug/policy_chosen_logits": 25.3104190826416, "debug/policy_chosen_logps": -409.6578063964844, "debug/policy_rejected_logits": 31.45233917236328, "debug/policy_rejected_logps": -462.6581115722656, "debug/reference_chosen_logps": -426.9599609375, "debug/reference_rejected_logps": -430.9308166503906, "epoch": 0.975, "grad_norm": 3.3409543416332235, "learning_rate": 1e-06, "logits/chosen": 25.3104190826416, "logits/rejected": 31.45233917236328, "logps/chosen": -409.6578063964844, "logps/rejected": -462.6581115722656, "loss": 0.2304, "rewards/accuracies": 0.875, "rewards/chosen": 0.17302149534225464, "rewards/margins": 0.49029436707496643, "rewards/rejected": -0.3172728717327118, "step": 78 }, { "debug/policy_chosen_logits": 20.39267349243164, "debug/policy_chosen_logps": -391.23089599609375, "debug/policy_rejected_logits": 29.421716690063477, "debug/policy_rejected_logps": -489.40728759765625, "debug/reference_chosen_logps": -423.12115478515625, "debug/reference_rejected_logps": -447.7901611328125, "epoch": 0.9875, "grad_norm": 5.428664997337287, "learning_rate": 1e-06, "logits/chosen": 20.39267349243164, "logits/rejected": 29.421716690063477, "logps/chosen": -391.23089599609375, "logps/rejected": -489.40728759765625, "loss": 0.2057, "rewards/accuracies": 1.0, "rewards/chosen": 0.3189027011394501, "rewards/margins": 0.7350739240646362, "rewards/rejected": -0.41617119312286377, "step": 79 }, { "debug/policy_chosen_logits": 18.865188598632812, "debug/policy_chosen_logps": -384.6566162109375, "debug/policy_rejected_logits": 29.98931884765625, "debug/policy_rejected_logps": -469.95892333984375, "debug/reference_chosen_logps": -407.10784912109375, "debug/reference_rejected_logps": -433.1329345703125, "epoch": 1.0, "grad_norm": 4.751087690542253, "learning_rate": 1e-06, "logits/chosen": 18.865188598632812, "logits/rejected": 29.98931884765625, "logps/chosen": -384.6566162109375, "logps/rejected": -469.95892333984375, "loss": 0.1778, "rewards/accuracies": 0.875, "rewards/chosen": 0.22451260685920715, "rewards/margins": 0.5927724838256836, "rewards/rejected": -0.36825984716415405, "step": 80 }, { "epoch": 1.0, "step": 80, "total_flos": 0.0, "train_loss": 0.3231356654316187, "train_runtime": 754.2708, "train_samples_per_second": 6.788, "train_steps_per_second": 0.106 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }