{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eta": 0.0010000000474974513, "grad_norm": 16.074478059343143, "learning_rate": 3.125e-08, "logits/chosen": -1.9564645290374756, "logits/rejected": -2.1290814876556396, "logps/chosen": -144.1077423095703, "logps/pi_response": -268.6929931640625, "logps/ref_response": -268.6929931640625, "logps/rejected": -144.41493225097656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "eta": 0.0010000000474974513, "grad_norm": 17.576222912928348, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.241427183151245, "logits/rejected": -2.282970666885376, "logps/chosen": -171.37808227539062, "logps/pi_response": -273.0738525390625, "logps/ref_response": -271.9916687011719, "logps/rejected": -176.56832885742188, "loss": 0.6928, "rewards/accuracies": 0.3923611044883728, "rewards/chosen": -0.004230719991028309, "rewards/margins": -0.0005770567222498357, "rewards/rejected": -0.0036536632105708122, "step": 10 }, { "epoch": 0.13, "eta": 0.0010000000474974513, "grad_norm": 18.23257699755048, "learning_rate": 4.989935734988097e-07, "logits/chosen": -2.2886428833007812, "logits/rejected": -2.1147801876068115, "logps/chosen": -194.26535034179688, "logps/pi_response": -308.6405029296875, "logps/ref_response": -274.3199157714844, "logps/rejected": -196.698974609375, "loss": 0.6919, "rewards/accuracies": 0.515625, "rewards/chosen": -0.2213359773159027, "rewards/margins": 0.020678246393799782, "rewards/rejected": -0.24201424419879913, "step": 20 }, { "epoch": 0.19, "eta": 0.0010000000474974513, "grad_norm": 21.359473410005467, "learning_rate": 4.877641290737883e-07, "logits/chosen": -2.213491916656494, "logits/rejected": -2.1212565898895264, "logps/chosen": -213.91452026367188, "logps/pi_response": -317.0865783691406, "logps/ref_response": -260.5080261230469, "logps/rejected": -215.670166015625, "loss": 0.6897, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.35239773988723755, "rewards/margins": 0.03320372849702835, "rewards/rejected": -0.3856014609336853, "step": 30 }, { "epoch": 0.26, "eta": 0.0010000000474974513, "grad_norm": 19.54689711054047, "learning_rate": 4.646121984004665e-07, "logits/chosen": -2.36901593208313, "logits/rejected": -2.241117000579834, "logps/chosen": -191.35202026367188, "logps/pi_response": -293.92608642578125, "logps/ref_response": -255.9798126220703, "logps/rejected": -191.24124145507812, "loss": 0.6947, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.22946178913116455, "rewards/margins": 0.020651038736104965, "rewards/rejected": -0.2501128315925598, "step": 40 }, { "epoch": 0.32, "eta": 0.0010000000474974513, "grad_norm": 19.66181931005281, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -2.2629857063293457, "logits/rejected": -2.1153407096862793, "logps/chosen": -225.6036834716797, "logps/pi_response": -330.4422912597656, "logps/ref_response": -266.11285400390625, "logps/rejected": -226.37161254882812, "loss": 0.6836, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.4082844853401184, "rewards/margins": 0.03162597864866257, "rewards/rejected": -0.4399104118347168, "step": 50 }, { "epoch": 0.38, "eta": 0.0010000000474974513, "grad_norm": 34.081390496400246, "learning_rate": 3.877242453630256e-07, 
"logits/chosen": -2.3039848804473877, "logits/rejected": -2.3428866863250732, "logps/chosen": -220.15634155273438, "logps/pi_response": -319.7514953613281, "logps/ref_response": -254.2370147705078, "logps/rejected": -226.6223907470703, "loss": 0.6898, "rewards/accuracies": 0.546875, "rewards/chosen": -0.3762189447879791, "rewards/margins": 0.03814256191253662, "rewards/rejected": -0.41436153650283813, "step": 60 }, { "epoch": 0.45, "eta": 0.0010000000474974513, "grad_norm": 17.20872152727463, "learning_rate": 3.378437060203357e-07, "logits/chosen": -2.387434959411621, "logits/rejected": -2.2482728958129883, "logps/chosen": -199.58290100097656, "logps/pi_response": -299.43707275390625, "logps/ref_response": -256.967529296875, "logps/rejected": -197.93199157714844, "loss": 0.6856, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.21634867787361145, "rewards/margins": 0.03388797491788864, "rewards/rejected": -0.2502366304397583, "step": 70 }, { "epoch": 0.51, "eta": 0.0010000000474974513, "grad_norm": 18.125911483507668, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.2822232246398926, "logits/rejected": -2.355548620223999, "logps/chosen": -211.54409790039062, "logps/pi_response": -338.62335205078125, "logps/ref_response": -268.83172607421875, "logps/rejected": -212.7510223388672, "loss": 0.6795, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.39336004853248596, "rewards/margins": 0.008717315271496773, "rewards/rejected": -0.4020773470401764, "step": 80 }, { "epoch": 0.58, "eta": 0.0010000000474974513, "grad_norm": 18.45235135252255, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -2.3078341484069824, "logits/rejected": -2.3145835399627686, "logps/chosen": -221.66226196289062, "logps/pi_response": -324.65771484375, "logps/ref_response": -253.67257690429688, "logps/rejected": -230.7862091064453, "loss": 0.6837, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4485122263431549, "rewards/margins": 0.015239333733916283, "rewards/rejected": -0.46375155448913574, "step": 90 }, { "epoch": 0.64, "eta": 0.0010000000474974513, "grad_norm": 21.755132830081727, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -2.324589729309082, "logits/rejected": -2.312774181365967, "logps/chosen": -224.4755401611328, "logps/pi_response": -331.3367919921875, "logps/ref_response": -261.8123474121094, "logps/rejected": -226.1329345703125, "loss": 0.6807, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5234971642494202, "rewards/margins": 0.04355122521519661, "rewards/rejected": -0.5670484304428101, "step": 100 }, { "epoch": 0.7, "eta": 0.0010000000474974513, "grad_norm": 17.643769449739274, "learning_rate": 1.2177518064852348e-07, "logits/chosen": -2.396841526031494, "logits/rejected": -2.2907986640930176, "logps/chosen": -216.90243530273438, "logps/pi_response": -318.94024658203125, "logps/ref_response": -251.3756561279297, "logps/rejected": -215.78512573242188, "loss": 0.6808, "rewards/accuracies": 0.609375, "rewards/chosen": -0.42508357763290405, "rewards/margins": 0.0596102774143219, "rewards/rejected": -0.48469385504722595, "step": 110 }, { "epoch": 0.77, "eta": 0.0010000000474974513, "grad_norm": 19.96301055359274, "learning_rate": 7.723433775328384e-08, "logits/chosen": -2.3210701942443848, "logits/rejected": -2.387702465057373, "logps/chosen": -209.82119750976562, "logps/pi_response": -329.6842956542969, "logps/ref_response": -276.03692626953125, "logps/rejected": -222.03341674804688, "loss": 0.6769, "rewards/accuracies": 
0.5843750238418579, "rewards/chosen": -0.32728347182273865, "rewards/margins": 0.0734453871846199, "rewards/rejected": -0.40072885155677795, "step": 120 }, { "epoch": 0.83, "eta": 0.0010000000474974513, "grad_norm": 20.360241724578835, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -2.3041348457336426, "logits/rejected": -2.2705655097961426, "logps/chosen": -210.88119506835938, "logps/pi_response": -328.033203125, "logps/ref_response": -266.6432189941406, "logps/rejected": -211.0803680419922, "loss": 0.6748, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.3793022036552429, "rewards/margins": 0.05234457924962044, "rewards/rejected": -0.43164676427841187, "step": 130 }, { "epoch": 0.9, "eta": 0.0010000000474974513, "grad_norm": 18.16724101735529, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -2.401698350906372, "logits/rejected": -2.386355400085449, "logps/chosen": -201.68978881835938, "logps/pi_response": -315.3774719238281, "logps/ref_response": -254.541259765625, "logps/rejected": -215.79934692382812, "loss": 0.668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3424326777458191, "rewards/margins": 0.09304080158472061, "rewards/rejected": -0.4354734420776367, "step": 140 }, { "epoch": 0.96, "eta": 0.0010000000474974513, "grad_norm": 21.61601513002701, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -2.2798948287963867, "logits/rejected": -2.293689489364624, "logps/chosen": -211.88253784179688, "logps/pi_response": -325.4713439941406, "logps/ref_response": -264.48388671875, "logps/rejected": -218.7720489501953, "loss": 0.6717, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.3841695487499237, "rewards/margins": 0.059767745435237885, "rewards/rejected": -0.4439373016357422, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.6820480842620898, "train_runtime": 31897.7284, "train_samples_per_second": 0.627, "train_steps_per_second": 0.005 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }