{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 500,
"global_step": 156,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eta": 0.0010000000474974513,
"grad_norm": 16.074478059343143,
"learning_rate": 3.125e-08,
"logits/chosen": -1.9564645290374756,
"logits/rejected": -2.1290814876556396,
"logps/chosen": -144.1077423095703,
"logps/pi_response": -268.6929931640625,
"logps/ref_response": -268.6929931640625,
"logps/rejected": -144.41493225097656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"eta": 0.0010000000474974513,
"grad_norm": 17.576222912928348,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.241427183151245,
"logits/rejected": -2.282970666885376,
"logps/chosen": -171.37808227539062,
"logps/pi_response": -273.0738525390625,
"logps/ref_response": -271.9916687011719,
"logps/rejected": -176.56832885742188,
"loss": 0.6928,
"rewards/accuracies": 0.3923611044883728,
"rewards/chosen": -0.004230719991028309,
"rewards/margins": -0.0005770567222498357,
"rewards/rejected": -0.0036536632105708122,
"step": 10
},
{
"epoch": 0.13,
"eta": 0.0010000000474974513,
"grad_norm": 18.23257699755048,
"learning_rate": 4.989935734988097e-07,
"logits/chosen": -2.2886428833007812,
"logits/rejected": -2.1147801876068115,
"logps/chosen": -194.26535034179688,
"logps/pi_response": -308.6405029296875,
"logps/ref_response": -274.3199157714844,
"logps/rejected": -196.698974609375,
"loss": 0.6919,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.2213359773159027,
"rewards/margins": 0.020678246393799782,
"rewards/rejected": -0.24201424419879913,
"step": 20
},
{
"epoch": 0.19,
"eta": 0.0010000000474974513,
"grad_norm": 21.359473410005467,
"learning_rate": 4.877641290737883e-07,
"logits/chosen": -2.213491916656494,
"logits/rejected": -2.1212565898895264,
"logps/chosen": -213.91452026367188,
"logps/pi_response": -317.0865783691406,
"logps/ref_response": -260.5080261230469,
"logps/rejected": -215.670166015625,
"loss": 0.6897,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.35239773988723755,
"rewards/margins": 0.03320372849702835,
"rewards/rejected": -0.3856014609336853,
"step": 30
},
{
"epoch": 0.26,
"eta": 0.0010000000474974513,
"grad_norm": 19.54689711054047,
"learning_rate": 4.646121984004665e-07,
"logits/chosen": -2.36901593208313,
"logits/rejected": -2.241117000579834,
"logps/chosen": -191.35202026367188,
"logps/pi_response": -293.92608642578125,
"logps/ref_response": -255.9798126220703,
"logps/rejected": -191.24124145507812,
"loss": 0.6947,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.22946178913116455,
"rewards/margins": 0.020651038736104965,
"rewards/rejected": -0.2501128315925598,
"step": 40
},
{
"epoch": 0.32,
"eta": 0.0010000000474974513,
"grad_norm": 19.66181931005281,
"learning_rate": 4.3069871595684787e-07,
"logits/chosen": -2.2629857063293457,
"logits/rejected": -2.1153407096862793,
"logps/chosen": -225.6036834716797,
"logps/pi_response": -330.4422912597656,
"logps/ref_response": -266.11285400390625,
"logps/rejected": -226.37161254882812,
"loss": 0.6836,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.4082844853401184,
"rewards/margins": 0.03162597864866257,
"rewards/rejected": -0.4399104118347168,
"step": 50
},
{
"epoch": 0.38,
"eta": 0.0010000000474974513,
"grad_norm": 34.081390496400246,
"learning_rate": 3.877242453630256e-07,
"logits/chosen": -2.3039848804473877,
"logits/rejected": -2.3428866863250732,
"logps/chosen": -220.15634155273438,
"logps/pi_response": -319.7514953613281,
"logps/ref_response": -254.2370147705078,
"logps/rejected": -226.6223907470703,
"loss": 0.6898,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.3762189447879791,
"rewards/margins": 0.03814256191253662,
"rewards/rejected": -0.41436153650283813,
"step": 60
},
{
"epoch": 0.45,
"eta": 0.0010000000474974513,
"grad_norm": 17.20872152727463,
"learning_rate": 3.378437060203357e-07,
"logits/chosen": -2.387434959411621,
"logits/rejected": -2.2482728958129883,
"logps/chosen": -199.58290100097656,
"logps/pi_response": -299.43707275390625,
"logps/ref_response": -256.967529296875,
"logps/rejected": -197.93199157714844,
"loss": 0.6856,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.21634867787361145,
"rewards/margins": 0.03388797491788864,
"rewards/rejected": -0.2502366304397583,
"step": 70
},
{
"epoch": 0.51,
"eta": 0.0010000000474974513,
"grad_norm": 18.125911483507668,
"learning_rate": 2.8355831645441387e-07,
"logits/chosen": -2.2822232246398926,
"logits/rejected": -2.355548620223999,
"logps/chosen": -211.54409790039062,
"logps/pi_response": -338.62335205078125,
"logps/ref_response": -268.83172607421875,
"logps/rejected": -212.7510223388672,
"loss": 0.6795,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.39336004853248596,
"rewards/margins": 0.008717315271496773,
"rewards/rejected": -0.4020773470401764,
"step": 80
},
{
"epoch": 0.58,
"eta": 0.0010000000474974513,
"grad_norm": 18.45235135252255,
"learning_rate": 2.2759017277414164e-07,
"logits/chosen": -2.3078341484069824,
"logits/rejected": -2.3145835399627686,
"logps/chosen": -221.66226196289062,
"logps/pi_response": -324.65771484375,
"logps/ref_response": -253.67257690429688,
"logps/rejected": -230.7862091064453,
"loss": 0.6837,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4485122263431549,
"rewards/margins": 0.015239333733916283,
"rewards/rejected": -0.46375155448913574,
"step": 90
},
{
"epoch": 0.64,
"eta": 0.0010000000474974513,
"grad_norm": 21.755132830081727,
"learning_rate": 1.7274575140626315e-07,
"logits/chosen": -2.324589729309082,
"logits/rejected": -2.312774181365967,
"logps/chosen": -224.4755401611328,
"logps/pi_response": -331.3367919921875,
"logps/ref_response": -261.8123474121094,
"logps/rejected": -226.1329345703125,
"loss": 0.6807,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.5234971642494202,
"rewards/margins": 0.04355122521519661,
"rewards/rejected": -0.5670484304428101,
"step": 100
},
{
"epoch": 0.7,
"eta": 0.0010000000474974513,
"grad_norm": 17.643769449739274,
"learning_rate": 1.2177518064852348e-07,
"logits/chosen": -2.396841526031494,
"logits/rejected": -2.2907986640930176,
"logps/chosen": -216.90243530273438,
"logps/pi_response": -318.94024658203125,
"logps/ref_response": -251.3756561279297,
"logps/rejected": -215.78512573242188,
"loss": 0.6808,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.42508357763290405,
"rewards/margins": 0.0596102774143219,
"rewards/rejected": -0.48469385504722595,
"step": 110
},
{
"epoch": 0.77,
"eta": 0.0010000000474974513,
"grad_norm": 19.96301055359274,
"learning_rate": 7.723433775328384e-08,
"logits/chosen": -2.3210701942443848,
"logits/rejected": -2.387702465057373,
"logps/chosen": -209.82119750976562,
"logps/pi_response": -329.6842956542969,
"logps/ref_response": -276.03692626953125,
"logps/rejected": -222.03341674804688,
"loss": 0.6769,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -0.32728347182273865,
"rewards/margins": 0.0734453871846199,
"rewards/rejected": -0.40072885155677795,
"step": 120
},
{
"epoch": 0.83,
"eta": 0.0010000000474974513,
"grad_norm": 20.360241724578835,
"learning_rate": 4.1356686569674335e-08,
"logits/chosen": -2.3041348457336426,
"logits/rejected": -2.2705655097961426,
"logps/chosen": -210.88119506835938,
"logps/pi_response": -328.033203125,
"logps/ref_response": -266.6432189941406,
"logps/rejected": -211.0803680419922,
"loss": 0.6748,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -0.3793022036552429,
"rewards/margins": 0.05234457924962044,
"rewards/rejected": -0.43164676427841187,
"step": 130
},
{
"epoch": 0.9,
"eta": 0.0010000000474974513,
"grad_norm": 18.16724101735529,
"learning_rate": 1.5941282340065697e-08,
"logits/chosen": -2.401698350906372,
"logits/rejected": -2.386355400085449,
"logps/chosen": -201.68978881835938,
"logps/pi_response": -315.3774719238281,
"logps/ref_response": -254.541259765625,
"logps/rejected": -215.79934692382812,
"loss": 0.668,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3424326777458191,
"rewards/margins": 0.09304080158472061,
"rewards/rejected": -0.4354734420776367,
"step": 140
},
{
"epoch": 0.96,
"eta": 0.0010000000474974513,
"grad_norm": 21.61601513002701,
"learning_rate": 2.2625595580163247e-09,
"logits/chosen": -2.2798948287963867,
"logits/rejected": -2.293689489364624,
"logps/chosen": -211.88253784179688,
"logps/pi_response": -325.4713439941406,
"logps/ref_response": -264.48388671875,
"logps/rejected": -218.7720489501953,
"loss": 0.6717,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -0.3841695487499237,
"rewards/margins": 0.059767745435237885,
"rewards/rejected": -0.4439373016357422,
"step": 150
},
{
"epoch": 1.0,
"step": 156,
"total_flos": 0.0,
"train_loss": 0.6820480842620898,
"train_runtime": 31897.7284,
"train_samples_per_second": 0.627,
"train_steps_per_second": 0.005
}
],
"logging_steps": 10,
"max_steps": 156,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}