{ "best_metric": 0.529485821723938, "best_model_checkpoint": "./mistral/20-04-24-Weni-WeniGPT-Agents-Mistral-1.0.6-SFT-1.0.5-DPO_Experiment on DPO with other hyperparameters and best SFT model of WeniGPT-2_max_steps-366_batch_4_2024-04-20_ppid_9/checkpoint-90", "epoch": 1.4634146341463414, "eval_steps": 30, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 8.378021240234375, "learning_rate": 4.0909090909090915e-06, "logits/chosen": -1.830958604812622, "logits/rejected": -1.8507845401763916, "logps/chosen": -28.701984405517578, "logps/rejected": -54.28569793701172, "loss": 0.6924, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.0008967495523393154, "rewards/margins": 0.0014666033675894141, "rewards/rejected": -0.0005698538152500987, "step": 10 }, { "epoch": 0.33, "grad_norm": 5.193418502807617, "learning_rate": 4.887323943661972e-06, "logits/chosen": -1.7550897598266602, "logits/rejected": -1.770708680152893, "logps/chosen": -47.344207763671875, "logps/rejected": -64.0368423461914, "loss": 0.6852, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.017231885343790054, "rewards/margins": 0.01606021076440811, "rewards/rejected": 0.0011716745793819427, "step": 20 }, { "epoch": 0.49, "grad_norm": 7.308932304382324, "learning_rate": 4.746478873239437e-06, "logits/chosen": -1.781267762184143, "logits/rejected": -1.8114898204803467, "logps/chosen": -54.274559020996094, "logps/rejected": -95.20500183105469, "loss": 0.6635, "rewards/accuracies": 0.5, "rewards/chosen": 0.0641159638762474, "rewards/margins": 0.061691801995038986, "rewards/rejected": 0.0024241588544100523, "step": 30 }, { "epoch": 0.49, "eval_logits/chosen": -1.7831767797470093, "eval_logits/rejected": -1.8043663501739502, "eval_logps/chosen": -55.16960906982422, "eval_logps/rejected": -97.32585144042969, "eval_loss": 0.6523757576942444, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.09036973863840103, "eval_rewards/margins": 0.08673857897520065, "eval_rewards/rejected": 0.0036311547737568617, "eval_runtime": 8.141, "eval_samples_per_second": 3.439, "eval_steps_per_second": 1.72, "step": 30 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 4.6056338028169015e-06, "logits/chosen": -1.889905333518982, "logits/rejected": -1.9024461507797241, "logps/chosen": -27.918941497802734, "logps/rejected": -42.093284606933594, "loss": 0.668, "rewards/accuracies": 0.25, "rewards/chosen": 0.054457180202007294, "rewards/margins": 0.0539846234023571, "rewards/rejected": 0.0004725646285805851, "step": 40 }, { "epoch": 0.81, "grad_norm": 8.53225326538086, "learning_rate": 4.464788732394367e-06, "logits/chosen": -1.8278567790985107, "logits/rejected": -1.849957823753357, "logps/chosen": -43.8238639831543, "logps/rejected": -68.02179718017578, "loss": 0.6358, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.13941256701946259, "rewards/margins": 0.13133978843688965, "rewards/rejected": 0.008072790689766407, "step": 50 }, { "epoch": 0.98, "grad_norm": 9.436968803405762, "learning_rate": 4.3239436619718315e-06, "logits/chosen": -1.805991768836975, "logits/rejected": -1.8437427282333374, "logps/chosen": -43.8873291015625, "logps/rejected": -95.2943115234375, "loss": 0.6026, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.18793432414531708, "rewards/margins": 0.21308371424674988, "rewards/rejected": -0.025149401277303696, "step": 60 }, { "epoch": 0.98, "eval_logits/chosen": -1.7877694368362427, "eval_logits/rejected": -1.8098936080932617, "eval_logps/chosen": -53.567203521728516, "eval_logps/rejected": -97.33795928955078, "eval_loss": 0.5890871286392212, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.25061002373695374, "eval_rewards/margins": 0.2481890469789505, "eval_rewards/rejected": 0.002420984674245119, "eval_runtime": 8.1404, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.72, "step": 60 }, { "epoch": 1.14, "grad_norm": 0.0, "learning_rate": 4.183098591549296e-06, "logits/chosen": -1.8344879150390625, "logits/rejected": -1.8489716053009033, "logps/chosen": -40.38930892944336, "logps/rejected": -60.9084358215332, "loss": 0.6031, "rewards/accuracies": 0.375, "rewards/chosen": 0.19739331305027008, "rewards/margins": 0.22638121247291565, "rewards/rejected": -0.028987903147935867, "step": 70 }, { "epoch": 1.3, "grad_norm": 5.49536657333374, "learning_rate": 4.042253521126761e-06, "logits/chosen": -1.7903095483779907, "logits/rejected": -1.8362411260604858, "logps/chosen": -44.288116455078125, "logps/rejected": -90.21073913574219, "loss": 0.5357, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.34061312675476074, "rewards/margins": 0.40679749846458435, "rewards/rejected": -0.06618441641330719, "step": 80 }, { "epoch": 1.46, "grad_norm": 13.401692390441895, "learning_rate": 3.901408450704225e-06, "logits/chosen": -1.8004281520843506, "logits/rejected": -1.8247934579849243, "logps/chosen": -42.32465362548828, "logps/rejected": -70.9749984741211, "loss": 0.5387, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.3678433299064636, "rewards/margins": 0.4186524450778961, "rewards/rejected": -0.05080908536911011, "step": 90 }, { "epoch": 1.46, "eval_logits/chosen": -1.7943389415740967, "eval_logits/rejected": -1.8181126117706299, "eval_logps/chosen": -51.677486419677734, "eval_logps/rejected": -97.63689422607422, "eval_loss": 0.529485821723938, "eval_rewards/accuracies": 0.4642857015132904, "eval_rewards/chosen": 0.4395819306373596, "eval_rewards/margins": 0.4670555889606476, "eval_rewards/rejected": -0.027473628520965576, "eval_runtime": 8.1412, "eval_samples_per_second": 3.439, "eval_steps_per_second": 1.72, "step": 90 } ], "logging_steps": 10, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 90, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }