{ "best_metric": 0.057673219591379166, "best_model_checkpoint": "./mistral/21-04-24-Weni-WeniGPT-Agents-Mistral-1.0.0-SFT-1.0.17-DPO_Experiment on DPO with other hyperparameters and best SFT model of WeniGPT-2_max_steps-180_batch_8_2024-04-21_ppid_9/checkpoint-90", "epoch": 2.903225806451613, "eval_steps": 30, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3225806451612903, "grad_norm": 69.4647445678711, "learning_rate": 4.166666666666667e-06, "logits/chosen": -1.8231691122055054, "logits/rejected": -1.828621506690979, "logps/chosen": -158.80148315429688, "logps/rejected": -171.9478302001953, "loss": 0.6797, "rewards/accuracies": 0.25, "rewards/chosen": 0.01820509135723114, "rewards/margins": 0.018905125558376312, "rewards/rejected": -0.0007000354235060513, "step": 10 }, { "epoch": 0.6451612903225806, "grad_norm": 55.0553092956543, "learning_rate": 4.741379310344828e-06, "logits/chosen": -1.8079488277435303, "logits/rejected": -1.8343168497085571, "logps/chosen": -183.0499725341797, "logps/rejected": -252.92544555664062, "loss": 0.4325, "rewards/accuracies": 0.75, "rewards/chosen": 0.6964864134788513, "rewards/margins": 0.7227998375892639, "rewards/rejected": -0.026313429698348045, "step": 20 }, { "epoch": 0.967741935483871, "grad_norm": 12.128168106079102, "learning_rate": 4.454022988505747e-06, "logits/chosen": -1.8517696857452393, "logits/rejected": -1.885725736618042, "logps/chosen": -192.10862731933594, "logps/rejected": -237.21206665039062, "loss": 0.2533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7528797388076782, "rewards/margins": 2.4615116119384766, "rewards/rejected": -0.7086319327354431, "step": 30 }, { "epoch": 0.967741935483871, "eval_logits/chosen": -1.8228849172592163, "eval_logits/rejected": -1.8303192853927612, "eval_logps/chosen": -143.72242736816406, "eval_logps/rejected": -179.08694458007812, "eval_loss": 0.18642649054527283, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 2.223684072494507, "eval_rewards/margins": 2.5237960815429688, "eval_rewards/rejected": -0.3001118302345276, "eval_runtime": 12.9786, "eval_samples_per_second": 2.157, "eval_steps_per_second": 0.539, "step": 30 }, { "epoch": 1.2903225806451613, "grad_norm": 14.146989822387695, "learning_rate": 4.166666666666667e-06, "logits/chosen": -1.8532211780548096, "logits/rejected": -1.8761682510375977, "logps/chosen": -176.05001831054688, "logps/rejected": -235.13314819335938, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": 2.285470485687256, "rewards/margins": 3.9319252967834473, "rewards/rejected": -1.64645516872406, "step": 40 }, { "epoch": 1.6129032258064515, "grad_norm": 11.002118110656738, "learning_rate": 3.8793103448275865e-06, "logits/chosen": -1.770261526107788, "logits/rejected": -1.7761493921279907, "logps/chosen": -149.0377197265625, "logps/rejected": -169.15806579589844, "loss": 0.1134, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.4034476280212402, "rewards/margins": 4.229793071746826, "rewards/rejected": -0.8263454437255859, "step": 50 }, { "epoch": 1.935483870967742, "grad_norm": 16.15618896484375, "learning_rate": 3.5919540229885056e-06, "logits/chosen": -1.8206300735473633, "logits/rejected": -1.845362663269043, "logps/chosen": -148.51126098632812, "logps/rejected": -217.98989868164062, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 3.102309226989746, "rewards/margins": 4.121461391448975, "rewards/rejected": -1.0191524028778076, "step": 60 }, { "epoch": 1.935483870967742, "eval_logits/chosen": -1.8270312547683716, "eval_logits/rejected": -1.8343855142593384, "eval_logps/chosen": -139.5704803466797, "eval_logps/rejected": -181.44911193847656, "eval_loss": 0.0837571769952774, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 3.4692695140838623, "eval_rewards/margins": 4.478025436401367, "eval_rewards/rejected": -1.0087559223175049, "eval_runtime": 12.9762, "eval_samples_per_second": 2.158, "eval_steps_per_second": 0.539, "step": 60 }, { "epoch": 2.258064516129032, "grad_norm": 3.0225539207458496, "learning_rate": 3.3045977011494256e-06, "logits/chosen": -1.8196799755096436, "logits/rejected": -1.8384376764297485, "logps/chosen": -155.3164825439453, "logps/rejected": -224.11764526367188, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": 3.732748508453369, "rewards/margins": 5.434271335601807, "rewards/rejected": -1.7015235424041748, "step": 70 }, { "epoch": 2.5806451612903225, "grad_norm": 0.4993923008441925, "learning_rate": 3.017241379310345e-06, "logits/chosen": -1.7981538772583008, "logits/rejected": -1.8249304294586182, "logps/chosen": -103.8951187133789, "logps/rejected": -169.70159912109375, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 3.9167656898498535, "rewards/margins": 5.2651543617248535, "rewards/rejected": -1.3483887910842896, "step": 80 }, { "epoch": 2.903225806451613, "grad_norm": 1.028433084487915, "learning_rate": 2.729885057471265e-06, "logits/chosen": -1.7703297138214111, "logits/rejected": -1.7897396087646484, "logps/chosen": -105.6802749633789, "logps/rejected": -199.07763671875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 4.359316825866699, "rewards/margins": 6.259133338928223, "rewards/rejected": -1.8998162746429443, "step": 90 }, { "epoch": 2.903225806451613, "eval_logits/chosen": -1.8372517824172974, "eval_logits/rejected": -1.8450374603271484, "eval_logps/chosen": -136.87937927246094, "eval_logps/rejected": -183.79039001464844, "eval_loss": 0.057673219591379166, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 4.276595115661621, "eval_rewards/margins": 5.987736225128174, "eval_rewards/rejected": -1.7111417055130005, "eval_runtime": 12.9682, "eval_samples_per_second": 2.159, "eval_steps_per_second": 0.54, "step": 90 } ], "logging_steps": 10, "max_steps": 180, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 90, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }