{ "best_metric": 0.15727286040782928, "best_model_checkpoint": "./mistral/23-04-24-Weni-WeniGPT-Agents-Mistral-1.0.11-SFT-1.0.27-DPO_Experiment on DPO with other hyperparameters and best SFT model of WeniGPT-2_max_steps-180_batch_8_2024-04-23_ppid_9/checkpoint-90", "epoch": 2.903225806451613, "eval_steps": 30, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3225806451612903, "grad_norm": 8.21396541595459, "learning_rate": 4.971264367816092e-06, "logits/chosen": -1.8720277547836304, "logits/rejected": -1.9025541543960571, "logps/chosen": -185.8878631591797, "logps/rejected": -211.33914184570312, "loss": 0.6877, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.004850864410400391, "rewards/margins": 0.005139083601534367, "rewards/rejected": -0.00028821948217228055, "step": 10 }, { "epoch": 0.6451612903225806, "grad_norm": 8.390437126159668, "learning_rate": 4.683908045977012e-06, "logits/chosen": -1.911663293838501, "logits/rejected": -1.95529305934906, "logps/chosen": -208.3522186279297, "logps/rejected": -255.2483673095703, "loss": 0.6173, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.12306723743677139, "rewards/margins": 0.19074788689613342, "rewards/rejected": -0.06768067181110382, "step": 20 }, { "epoch": 0.967741935483871, "grad_norm": 5.7153449058532715, "learning_rate": 4.396551724137931e-06, "logits/chosen": -1.8318233489990234, "logits/rejected": -1.8505750894546509, "logps/chosen": -198.19886779785156, "logps/rejected": -201.53128051757812, "loss": 0.5138, "rewards/accuracies": 1.0, "rewards/chosen": 0.39280813932418823, "rewards/margins": 0.48059898614883423, "rewards/rejected": -0.0877908319234848, "step": 30 }, { "epoch": 0.967741935483871, "eval_logits/chosen": -1.8199779987335205, "eval_logits/rejected": -1.8497353792190552, "eval_logps/chosen": -141.1519775390625, "eval_logps/rejected": -179.952880859375, "eval_loss": 0.4535991847515106, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.5539799928665161, "eval_rewards/margins": 0.6171248555183411, "eval_rewards/rejected": -0.06314478814601898, "eval_runtime": 11.0404, "eval_samples_per_second": 2.536, "eval_steps_per_second": 0.634, "step": 30 }, { "epoch": 1.2903225806451613, "grad_norm": 5.215389251708984, "learning_rate": 4.1091954022988515e-06, "logits/chosen": -1.8099329471588135, "logits/rejected": -1.827679991722107, "logps/chosen": -173.8700408935547, "logps/rejected": -178.54608154296875, "loss": 0.4012, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.590423047542572, "rewards/margins": 0.5694462060928345, "rewards/rejected": 0.02097688615322113, "step": 40 }, { "epoch": 1.6129032258064515, "grad_norm": 5.5629096031188965, "learning_rate": 3.82183908045977e-06, "logits/chosen": -1.7727949619293213, "logits/rejected": -1.7900581359863281, "logps/chosen": -140.95156860351562, "logps/rejected": -164.9313507080078, "loss": 0.3905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5463284850120544, "rewards/margins": 0.7675722241401672, "rewards/rejected": -0.2212437391281128, "step": 50 }, { "epoch": 1.935483870967742, "grad_norm": 4.645016193389893, "learning_rate": 3.5344827586206898e-06, "logits/chosen": -1.7856403589248657, "logits/rejected": -1.8158725500106812, "logps/chosen": -143.9577178955078, "logps/rejected": -168.88681030273438, "loss": 0.2915, "rewards/accuracies": 1.0, "rewards/chosen": 0.9448509216308594, "rewards/margins": 1.046811819076538, 
"rewards/rejected": -0.10196097940206528, "step": 60 }, { "epoch": 1.935483870967742, "eval_logits/chosen": -1.8333207368850708, "eval_logits/rejected": -1.863599181175232, "eval_logps/chosen": -134.95201110839844, "eval_logps/rejected": -181.9961395263672, "eval_loss": 0.24886395037174225, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.1739768981933594, "eval_rewards/margins": 1.441449522972107, "eval_rewards/rejected": -0.2674725353717804, "eval_runtime": 11.0448, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.634, "step": 60 }, { "epoch": 2.258064516129032, "grad_norm": 5.072994709014893, "learning_rate": 3.24712643678161e-06, "logits/chosen": -1.8422324657440186, "logits/rejected": -1.8538700342178345, "logps/chosen": -148.91033935546875, "logps/rejected": -174.97677612304688, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0643397569656372, "rewards/margins": 1.4705278873443604, "rewards/rejected": -0.40618810057640076, "step": 70 }, { "epoch": 2.5806451612903225, "grad_norm": 4.727698802947998, "learning_rate": 2.959770114942529e-06, "logits/chosen": -1.8584480285644531, "logits/rejected": -1.9063705205917358, "logps/chosen": -120.79048156738281, "logps/rejected": -250.99142456054688, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 1.022344946861267, "rewards/margins": 1.722174882888794, "rewards/rejected": -0.6998298764228821, "step": 80 }, { "epoch": 2.903225806451613, "grad_norm": 3.443195104598999, "learning_rate": 2.672413793103448e-06, "logits/chosen": -1.8592571020126343, "logits/rejected": -1.8985626697540283, "logps/chosen": -137.69444274902344, "logps/rejected": -232.9921112060547, "loss": 0.1995, "rewards/accuracies": 1.0, "rewards/chosen": 1.2343705892562866, "rewards/margins": 2.051001787185669, "rewards/rejected": -0.8166313171386719, "step": 90 }, { "epoch": 2.903225806451613, "eval_logits/chosen": -1.8481289148330688, "eval_logits/rejected": -1.878524661064148, "eval_logps/chosen": -131.11766052246094, "eval_logps/rejected": -185.3928680419922, "eval_loss": 0.15727286040782928, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.5574114322662354, "eval_rewards/margins": 2.1645569801330566, "eval_rewards/rejected": -0.6071456074714661, "eval_runtime": 11.0454, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.634, "step": 90 } ], "logging_steps": 10, "max_steps": 180, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 90, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }