{ "best_metric": 0.08551714569330215, "best_model_checkpoint": "./mistral/23-04-24-Weni-WeniGPT-Agents-Mistral-1.0.11-SFT-1.0.27-DPO_Experiment on DPO with other hyperparameters and best SFT model of WeniGPT-2_max_steps-180_batch_8_2024-04-23_ppid_9/checkpoint-180", "epoch": 5.806451612903226, "eval_steps": 30, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3225806451612903, "grad_norm": 8.21396541595459, "learning_rate": 4.971264367816092e-06, "logits/chosen": -1.8720277547836304, "logits/rejected": -1.9025541543960571, "logps/chosen": -185.8878631591797, "logps/rejected": -211.33914184570312, "loss": 0.6877, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.004850864410400391, "rewards/margins": 0.005139083601534367, "rewards/rejected": -0.00028821948217228055, "step": 10 }, { "epoch": 0.6451612903225806, "grad_norm": 8.390437126159668, "learning_rate": 4.683908045977012e-06, "logits/chosen": -1.911663293838501, "logits/rejected": -1.95529305934906, "logps/chosen": -208.3522186279297, "logps/rejected": -255.2483673095703, "loss": 0.6173, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.12306723743677139, "rewards/margins": 0.19074788689613342, "rewards/rejected": -0.06768067181110382, "step": 20 }, { "epoch": 0.967741935483871, "grad_norm": 5.7153449058532715, "learning_rate": 4.396551724137931e-06, "logits/chosen": -1.8318233489990234, "logits/rejected": -1.8505750894546509, "logps/chosen": -198.19886779785156, "logps/rejected": -201.53128051757812, "loss": 0.5138, "rewards/accuracies": 1.0, "rewards/chosen": 0.39280813932418823, "rewards/margins": 0.48059898614883423, "rewards/rejected": -0.0877908319234848, "step": 30 }, { "epoch": 0.967741935483871, "eval_logits/chosen": -1.8199779987335205, "eval_logits/rejected": -1.8497353792190552, "eval_logps/chosen": -141.1519775390625, "eval_logps/rejected": -179.952880859375, "eval_loss": 0.4535991847515106, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 0.5539799928665161, "eval_rewards/margins": 0.6171248555183411, "eval_rewards/rejected": -0.06314478814601898, "eval_runtime": 11.0404, "eval_samples_per_second": 2.536, "eval_steps_per_second": 0.634, "step": 30 }, { "epoch": 1.2903225806451613, "grad_norm": 5.215389251708984, "learning_rate": 4.1091954022988515e-06, "logits/chosen": -1.8099329471588135, "logits/rejected": -1.827679991722107, "logps/chosen": -173.8700408935547, "logps/rejected": -178.54608154296875, "loss": 0.4012, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.590423047542572, "rewards/margins": 0.5694462060928345, "rewards/rejected": 0.02097688615322113, "step": 40 }, { "epoch": 1.6129032258064515, "grad_norm": 5.5629096031188965, "learning_rate": 3.82183908045977e-06, "logits/chosen": -1.7727949619293213, "logits/rejected": -1.7900581359863281, "logps/chosen": -140.95156860351562, "logps/rejected": -164.9313507080078, "loss": 0.3905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5463284850120544, "rewards/margins": 0.7675722241401672, "rewards/rejected": -0.2212437391281128, "step": 50 }, { "epoch": 1.935483870967742, "grad_norm": 4.645016193389893, "learning_rate": 3.5344827586206898e-06, "logits/chosen": -1.7856403589248657, "logits/rejected": -1.8158725500106812, "logps/chosen": -143.9577178955078, "logps/rejected": -168.88681030273438, "loss": 0.2915, "rewards/accuracies": 1.0, "rewards/chosen": 0.9448509216308594, "rewards/margins": 1.046811819076538, 
"rewards/rejected": -0.10196097940206528, "step": 60 }, { "epoch": 1.935483870967742, "eval_logits/chosen": -1.8333207368850708, "eval_logits/rejected": -1.863599181175232, "eval_logps/chosen": -134.95201110839844, "eval_logps/rejected": -181.9961395263672, "eval_loss": 0.24886395037174225, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.1739768981933594, "eval_rewards/margins": 1.441449522972107, "eval_rewards/rejected": -0.2674725353717804, "eval_runtime": 11.0448, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.634, "step": 60 }, { "epoch": 2.258064516129032, "grad_norm": 5.072994709014893, "learning_rate": 3.24712643678161e-06, "logits/chosen": -1.8422324657440186, "logits/rejected": -1.8538700342178345, "logps/chosen": -148.91033935546875, "logps/rejected": -174.97677612304688, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0643397569656372, "rewards/margins": 1.4705278873443604, "rewards/rejected": -0.40618810057640076, "step": 70 }, { "epoch": 2.5806451612903225, "grad_norm": 4.727698802947998, "learning_rate": 2.959770114942529e-06, "logits/chosen": -1.8584480285644531, "logits/rejected": -1.9063705205917358, "logps/chosen": -120.79048156738281, "logps/rejected": -250.99142456054688, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 1.022344946861267, "rewards/margins": 1.722174882888794, "rewards/rejected": -0.6998298764228821, "step": 80 }, { "epoch": 2.903225806451613, "grad_norm": 3.443195104598999, "learning_rate": 2.672413793103448e-06, "logits/chosen": -1.8592571020126343, "logits/rejected": -1.8985626697540283, "logps/chosen": -137.69444274902344, "logps/rejected": -232.9921112060547, "loss": 0.1995, "rewards/accuracies": 1.0, "rewards/chosen": 1.2343705892562866, "rewards/margins": 2.051001787185669, "rewards/rejected": -0.8166313171386719, "step": 90 }, { "epoch": 2.903225806451613, "eval_logits/chosen": -1.8481289148330688, "eval_logits/rejected": -1.878524661064148, "eval_logps/chosen": -131.11766052246094, "eval_logps/rejected": -185.3928680419922, "eval_loss": 0.15727286040782928, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.5574114322662354, "eval_rewards/margins": 2.1645569801330566, "eval_rewards/rejected": -0.6071456074714661, "eval_runtime": 11.0454, "eval_samples_per_second": 2.535, "eval_steps_per_second": 0.634, "step": 90 }, { "epoch": 3.225806451612903, "grad_norm": 2.092071533203125, "learning_rate": 2.3850574712643677e-06, "logits/chosen": -1.8506457805633545, "logits/rejected": -1.8858827352523804, "logps/chosen": -151.75924682617188, "logps/rejected": -217.5481719970703, "loss": 0.1523, "rewards/accuracies": 1.0, "rewards/chosen": 1.423624038696289, "rewards/margins": 2.1922757625579834, "rewards/rejected": -0.7686518430709839, "step": 100 }, { "epoch": 3.5483870967741935, "grad_norm": 4.031655311584473, "learning_rate": 2.0977011494252873e-06, "logits/chosen": -1.93359375, "logits/rejected": -1.946930170059204, "logps/chosen": -177.41221618652344, "logps/rejected": -190.0054931640625, "loss": 0.1671, "rewards/accuracies": 1.0, "rewards/chosen": 1.1838829517364502, "rewards/margins": 2.2103819847106934, "rewards/rejected": -1.0264989137649536, "step": 110 }, { "epoch": 3.870967741935484, "grad_norm": 1.9795210361480713, "learning_rate": 1.810344827586207e-06, "logits/chosen": -1.8957767486572266, "logits/rejected": -1.9097095727920532, "logps/chosen": -169.95924377441406, "logps/rejected": -243.188232421875, "loss": 0.1172, "rewards/accuracies": 1.0, "rewards/chosen": 1.3196746110916138, 
"rewards/margins": 2.5982871055603027, "rewards/rejected": -1.2786123752593994, "step": 120 }, { "epoch": 3.870967741935484, "eval_logits/chosen": -1.8605867624282837, "eval_logits/rejected": -1.8898895978927612, "eval_logps/chosen": -128.9481658935547, "eval_logps/rejected": -188.56149291992188, "eval_loss": 0.11003036051988602, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.7743602991104126, "eval_rewards/margins": 2.6983656883239746, "eval_rewards/rejected": -0.9240055084228516, "eval_runtime": 11.043, "eval_samples_per_second": 2.536, "eval_steps_per_second": 0.634, "step": 120 }, { "epoch": 4.193548387096774, "grad_norm": 2.9046499729156494, "learning_rate": 1.5229885057471267e-06, "logits/chosen": -1.8014376163482666, "logits/rejected": -1.8344871997833252, "logps/chosen": -109.79362487792969, "logps/rejected": -197.57846069335938, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": 1.6577131748199463, "rewards/margins": 2.7808759212493896, "rewards/rejected": -1.1231629848480225, "step": 130 }, { "epoch": 4.516129032258064, "grad_norm": 2.7017457485198975, "learning_rate": 1.235632183908046e-06, "logits/chosen": -1.855467438697815, "logits/rejected": -1.8879632949829102, "logps/chosen": -153.61981201171875, "logps/rejected": -275.4436950683594, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 1.142747163772583, "rewards/margins": 2.762828826904297, "rewards/rejected": -1.6200816631317139, "step": 140 }, { "epoch": 4.838709677419355, "grad_norm": 2.4724111557006836, "learning_rate": 9.482758620689655e-07, "logits/chosen": -1.8847665786743164, "logits/rejected": -1.9286410808563232, "logps/chosen": -129.7620391845703, "logps/rejected": -280.47564697265625, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 1.9071842432022095, "rewards/margins": 3.235039472579956, "rewards/rejected": -1.3278553485870361, "step": 150 }, { "epoch": 4.838709677419355, "eval_logits/chosen": -1.8677905797958374, "eval_logits/rejected": -1.896652102470398, "eval_logps/chosen": -128.13369750976562, "eval_logps/rejected": -190.9434356689453, "eval_loss": 0.09096517413854599, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.8558064699172974, "eval_rewards/margins": 3.018005609512329, "eval_rewards/rejected": -1.1621991395950317, "eval_runtime": 11.0419, "eval_samples_per_second": 2.536, "eval_steps_per_second": 0.634, "step": 150 }, { "epoch": 5.161290322580645, "grad_norm": 3.0030722618103027, "learning_rate": 6.609195402298851e-07, "logits/chosen": -1.9072681665420532, "logits/rejected": -1.9386742115020752, "logps/chosen": -137.89524841308594, "logps/rejected": -212.41598510742188, "loss": 0.1126, "rewards/accuracies": 1.0, "rewards/chosen": 1.144260287284851, "rewards/margins": 2.634052038192749, "rewards/rejected": -1.4897918701171875, "step": 160 }, { "epoch": 5.483870967741936, "grad_norm": 1.655219554901123, "learning_rate": 3.7356321839080463e-07, "logits/chosen": -1.9006898403167725, "logits/rejected": -1.936577558517456, "logps/chosen": -159.63670349121094, "logps/rejected": -260.5877685546875, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": 1.5920008420944214, "rewards/margins": 3.147128105163574, "rewards/rejected": -1.5551271438598633, "step": 170 }, { "epoch": 5.806451612903226, "grad_norm": 1.4150224924087524, "learning_rate": 8.620689655172414e-08, "logits/chosen": -1.8560463190078735, "logits/rejected": -1.9170904159545898, "logps/chosen": -154.82614135742188, "logps/rejected": -270.3697509765625, "loss": 0.1057, 
"rewards/accuracies": 1.0, "rewards/chosen": 1.297466516494751, "rewards/margins": 3.326411724090576, "rewards/rejected": -2.0289454460144043, "step": 180 }, { "epoch": 5.806451612903226, "eval_logits/chosen": -1.8695993423461914, "eval_logits/rejected": -1.8981773853302002, "eval_logps/chosen": -127.97382354736328, "eval_logps/rejected": -191.76492309570312, "eval_loss": 0.08551714569330215, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 1.8717962503433228, "eval_rewards/margins": 3.116144895553589, "eval_rewards/rejected": -1.2443491220474243, "eval_runtime": 11.0413, "eval_samples_per_second": 2.536, "eval_steps_per_second": 0.634, "step": 180 } ], "logging_steps": 10, "max_steps": 180, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 90, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }