{ "best_metric": 0.5, "best_model_checkpoint": "./zephyr/10-04-24-Weni-WeniGPT-Agents-Zephyr-1.0.25-KTO_Experiment with a new tokenizer configuration for chat template of zephyr-2_max_steps-1470_batch_16_2024-04-10_ppid_9/checkpoint-300", "epoch": 7.260726072607261, "eval_steps": 50, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13, "grad_norm": 57.293792724609375, "kl": 0.03853478282690048, "learning_rate": 6.222222222222222e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7078, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 20 }, { "epoch": 0.26, "grad_norm": 112.50944519042969, "kl": 3.2648494243621826, "learning_rate": 0.00014666666666666666, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6966, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 40 }, { "epoch": 0.33, "eval_kl": 0.0, "eval_logps/chosen": -413.6161193847656, "eval_logps/rejected": -362.2559509277344, "eval_loss": 0.5063381791114807, "eval_rewards/chosen": -13.412939071655273, "eval_rewards/margins": -1.0048810243606567, "eval_rewards/rejected": -12.408059120178223, "eval_runtime": 170.1826, "eval_samples_per_second": 2.057, "eval_steps_per_second": 0.517, "step": 50 }, { "epoch": 0.4, "grad_norm": 19.94582748413086, "kl": 0.45922356843948364, "learning_rate": 0.00019887719298245616, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.5743, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 60 }, { "epoch": 0.53, "grad_norm": 79.92957305908203, "kl": 0.0, "learning_rate": 0.0001960701754385965, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6108, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 80 }, { "epoch": 0.66, "grad_norm": 0.06103940308094025, "kl": 0.0, "learning_rate": 0.00019326315789473686, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.754, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 100 }, { "epoch": 0.66, "eval_kl": 0.0, "eval_logps/chosen": -2027.0018310546875, "eval_logps/rejected": -1697.82177734375, "eval_loss": 0.5000000596046448, "eval_rewards/chosen": -174.75149536132812, "eval_rewards/margins": -28.786863327026367, "eval_rewards/rejected": -145.96463012695312, "eval_runtime": 170.0562, "eval_samples_per_second": 2.058, "eval_steps_per_second": 0.517, "step": 100 }, { "epoch": 0.79, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001904561403508772, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.95, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 120 }, { "epoch": 0.92, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00018764912280701756, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6274, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 140 }, { "epoch": 0.99, "eval_kl": 0.0, "eval_logps/chosen": -2237.81494140625, "eval_logps/rejected": -1889.774169921875, "eval_loss": 0.5, "eval_rewards/chosen": -195.83285522460938, "eval_rewards/margins": -30.672954559326172, "eval_rewards/rejected": -165.15989685058594, "eval_runtime": 169.8795, "eval_samples_per_second": 2.06, "eval_steps_per_second": 0.518, "step": 150 }, { "epoch": 1.06, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001848421052631579, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6387, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 160 }, { "epoch": 1.19, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00018203508771929826, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8327, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 180 }, { "epoch": 1.32, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00017922807017543862, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.642, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 200 }, { "epoch": 1.32, "eval_kl": 0.0, "eval_logps/chosen": -2230.916259765625, "eval_logps/rejected": -1884.9520263671875, "eval_loss": 0.5000000596046448, "eval_rewards/chosen": -195.14297485351562, "eval_rewards/margins": -30.465293884277344, "eval_rewards/rejected": -164.67767333984375, "eval_runtime": 170.1489, "eval_samples_per_second": 2.057, "eval_steps_per_second": 0.517, "step": 200 }, { "epoch": 1.45, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00017642105263157896, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7493, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 220 }, { "epoch": 1.58, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001736140350877193, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6241, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 240 }, { "epoch": 1.65, "eval_kl": 0.0, "eval_logps/chosen": -2230.957275390625, "eval_logps/rejected": -1885.0225830078125, "eval_loss": 0.5000000596046448, "eval_rewards/chosen": -195.14706420898438, "eval_rewards/margins": -30.462318420410156, "eval_rewards/rejected": -164.68475341796875, "eval_runtime": 170.1092, "eval_samples_per_second": 2.058, "eval_steps_per_second": 0.517, "step": 250 }, { "epoch": 1.72, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00017080701754385965, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.9621, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 260 }, { "epoch": 1.85, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.000168, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7279, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 280 }, { "epoch": 1.98, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00016519298245614035, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7477, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 300 }, { "epoch": 1.98, "eval_kl": 0.0, "eval_logps/chosen": -2238.164306640625, "eval_logps/rejected": -1890.7996826171875, "eval_loss": 0.5, "eval_rewards/chosen": -195.86773681640625, "eval_rewards/margins": -30.605329513549805, "eval_rewards/rejected": -165.26242065429688, "eval_runtime": 170.0647, "eval_samples_per_second": 2.058, "eval_steps_per_second": 0.517, "step": 300 }, { "epoch": 2.11, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00016238596491228072, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7111, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 320 }, { "epoch": 2.24, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00015957894736842105, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8685, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 340 }, { "epoch": 2.31, "eval_kl": 0.0, "eval_logps/chosen": -2238.054931640625, "eval_logps/rejected": -1890.694580078125, "eval_loss": 0.5, "eval_rewards/chosen": -195.85682678222656, "eval_rewards/margins": -30.604921340942383, "eval_rewards/rejected": -165.2519073486328, "eval_runtime": 170.0528, "eval_samples_per_second": 2.058, "eval_steps_per_second": 0.517, "step": 350 }, { "epoch": 2.38, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00015677192982456142, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6905, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 360 }, { "epoch": 2.51, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00015396491228070175, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.736, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 380 }, { "epoch": 2.64, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00015115789473684211, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.693, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 400 }, { "epoch": 2.64, "eval_kl": 0.0, "eval_logps/chosen": -2237.827392578125, "eval_logps/rejected": -1890.5028076171875, "eval_loss": 0.5, "eval_rewards/chosen": -195.83407592773438, "eval_rewards/margins": -30.601318359375, "eval_rewards/rejected": -165.23275756835938, "eval_runtime": 170.2445, "eval_samples_per_second": 2.056, "eval_steps_per_second": 0.517, "step": 400 }, { "epoch": 2.77, "grad_norm": 8.788210266175156e-07, "kl": 0.0, "learning_rate": 0.00014835087719298245, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8652, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 420 }, { "epoch": 2.9, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001455438596491228, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.686, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 440 }, { "epoch": 2.97, "eval_kl": 0.0, "eval_logps/chosen": -2237.722412109375, "eval_logps/rejected": -1890.4027099609375, "eval_loss": 0.5, "eval_rewards/chosen": -195.82354736328125, "eval_rewards/margins": -30.600812911987305, "eval_rewards/rejected": -165.22274780273438, "eval_runtime": 170.3429, "eval_samples_per_second": 2.055, "eval_steps_per_second": 0.517, "step": 450 }, { "epoch": 3.04, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00014273684210526318, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6858, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 460 }, { "epoch": 3.17, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001399298245614035, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8479, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 480 }, { "epoch": 3.3, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00013712280701754388, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6119, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 500 }, { "epoch": 3.3, "eval_kl": 0.0, "eval_logps/chosen": -2237.6083984375, "eval_logps/rejected": -1890.3140869140625, "eval_loss": 0.5, "eval_rewards/chosen": -195.81216430664062, "eval_rewards/margins": -30.598268508911133, "eval_rewards/rejected": -165.21388244628906, "eval_runtime": 169.9488, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 500 }, { "epoch": 3.43, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001343157894736842, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7107, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 520 }, { "epoch": 3.56, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00013150877192982455, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.5902, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 540 }, { "epoch": 3.63, "eval_kl": 0.0, "eval_logps/chosen": -2237.56494140625, "eval_logps/rejected": -1890.3043212890625, "eval_loss": 0.5, "eval_rewards/chosen": -195.80784606933594, "eval_rewards/margins": -30.59491539001465, "eval_rewards/rejected": -165.21290588378906, "eval_runtime": 169.9756, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 550 }, { "epoch": 3.7, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0001287017543859649, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.9042, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 560 }, { "epoch": 3.83, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00012589473684210527, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7268, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 580 }, { "epoch": 3.96, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00012308771929824564, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7106, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 600 }, { "epoch": 3.96, "eval_kl": 0.0, "eval_logps/chosen": -2241.97509765625, "eval_logps/rejected": -1893.87646484375, "eval_loss": 0.5, "eval_rewards/chosen": -196.24884033203125, "eval_rewards/margins": -30.67871856689453, "eval_rewards/rejected": -165.57012939453125, "eval_runtime": 169.9427, "eval_samples_per_second": 2.06, "eval_steps_per_second": 0.518, "step": 600 }, { "epoch": 4.09, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00012028070175438597, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6829, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 620 }, { "epoch": 4.22, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00011747368421052631, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8232, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 640 }, { "epoch": 4.29, "eval_kl": 0.0, "eval_logps/chosen": -2241.91552734375, "eval_logps/rejected": -1893.757080078125, "eval_loss": 0.5, "eval_rewards/chosen": -196.24290466308594, "eval_rewards/margins": -30.684709548950195, "eval_rewards/rejected": -165.55816650390625, "eval_runtime": 169.9605, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 650 }, { "epoch": 4.36, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00011466666666666667, "logps/chosen": -2123.240234375, "logps/rejected": NaN, "loss": 0.6315, "rewards/chosen": -188.09486389160156, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 660 }, { "epoch": 4.49, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00011185964912280702, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7998, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 680 }, { "epoch": 4.62, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00010905263157894738, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.5881, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 700 }, { "epoch": 4.62, "eval_kl": 0.0, "eval_logps/chosen": -2251.134033203125, "eval_logps/rejected": -1901.2047119140625, "eval_loss": 0.5, "eval_rewards/chosen": -197.1647491455078, "eval_rewards/margins": -30.8618221282959, "eval_rewards/rejected": -166.30291748046875, "eval_runtime": 169.9704, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 700 }, { "epoch": 4.75, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00010624561403508772, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8756, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 720 }, { "epoch": 4.88, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00010343859649122807, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6156, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 740 }, { "epoch": 4.95, "eval_kl": 0.0, "eval_logps/chosen": -2250.90234375, "eval_logps/rejected": -1901.0179443359375, "eval_loss": 0.5, "eval_rewards/chosen": -197.1415557861328, "eval_rewards/margins": -30.857322692871094, "eval_rewards/rejected": -166.28424072265625, "eval_runtime": 169.9616, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 750 }, { "epoch": 5.02, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.00010063157894736843, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7376, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 760 }, { "epoch": 5.15, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.782456140350877e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7998, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 780 }, { "epoch": 5.28, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.501754385964913e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6291, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 800 }, { "epoch": 5.28, "eval_kl": 0.0, "eval_logps/chosen": -2250.995849609375, "eval_logps/rejected": -1901.1036376953125, "eval_loss": 0.5, "eval_rewards/chosen": -197.15087890625, "eval_rewards/margins": -30.858049392700195, "eval_rewards/rejected": -166.29283142089844, "eval_runtime": 169.941, "eval_samples_per_second": 2.06, "eval_steps_per_second": 0.518, "step": 800 }, { "epoch": 5.41, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.221052631578948e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7167, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 820 }, { "epoch": 5.54, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.940350877192983e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6285, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 840 }, { "epoch": 5.61, "eval_kl": 0.0, "eval_logps/chosen": -2251.08837890625, "eval_logps/rejected": -1901.1571044921875, "eval_loss": 0.5, "eval_rewards/chosen": -197.16017150878906, "eval_rewards/margins": -30.86201286315918, "eval_rewards/rejected": -166.2981719970703, "eval_runtime": 169.9583, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 850 }, { "epoch": 5.68, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.659649122807018e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7898, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 860 }, { "epoch": 5.81, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.378947368421053e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8174, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 880 }, { "epoch": 5.94, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.098245614035088e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6918, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 900 }, { "epoch": 5.94, "eval_kl": 0.0, "eval_logps/chosen": -2251.1103515625, "eval_logps/rejected": -1901.1773681640625, "eval_loss": 0.5, "eval_rewards/chosen": -197.16233825683594, "eval_rewards/margins": -30.86213493347168, "eval_rewards/rejected": -166.30018615722656, "eval_runtime": 170.0642, "eval_samples_per_second": 2.058, "eval_steps_per_second": 0.517, "step": 900 }, { "epoch": 6.07, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.817543859649124e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6965, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 920 }, { "epoch": 6.2, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.536842105263158e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7869, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 940 }, { "epoch": 6.27, "eval_kl": 0.0, "eval_logps/chosen": -2251.116943359375, "eval_logps/rejected": -1901.21484375, "eval_loss": 0.5, "eval_rewards/chosen": -197.16302490234375, "eval_rewards/margins": -30.85906982421875, "eval_rewards/rejected": -166.303955078125, "eval_runtime": 169.9373, "eval_samples_per_second": 2.06, "eval_steps_per_second": 0.518, "step": 950 }, { "epoch": 6.34, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.256140350877193e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.6402, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 960 }, { "epoch": 6.47, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.975438596491229e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.8122, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 980 }, { "epoch": 6.6, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.694736842105264e-05, "logps/chosen": -2150.89111328125, "logps/rejected": NaN, "loss": 0.5483, "rewards/chosen": -190.25399780273438, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1000 }, { "epoch": 6.6, "eval_kl": 0.0, "eval_logps/chosen": -2251.134521484375, "eval_logps/rejected": -1901.1729736328125, "eval_loss": 0.5, "eval_rewards/chosen": -197.16481018066406, "eval_rewards/margins": -30.865028381347656, "eval_rewards/rejected": -166.29977416992188, "eval_runtime": 169.9607, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 1000 }, { "epoch": 6.73, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.414035087719299e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 1.0998, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1020 }, { "epoch": 6.86, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.133333333333334e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7744, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1040 }, { "epoch": 6.93, "eval_kl": 0.0, "eval_logps/chosen": -2254.820068359375, "eval_logps/rejected": -1904.1441650390625, "eval_loss": 0.5, "eval_rewards/chosen": -197.53334045410156, "eval_rewards/margins": -30.936431884765625, "eval_rewards/rejected": -166.59690856933594, "eval_runtime": 169.9328, "eval_samples_per_second": 2.06, "eval_steps_per_second": 0.518, "step": 1050 }, { "epoch": 7.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.852631578947369e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7891, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1060 }, { "epoch": 7.13, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.571929824561404e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.7203, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1080 }, { "epoch": 7.26, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.291228070175439e-05, "logps/chosen": NaN, "logps/rejected": NaN, "loss": 0.9077, "rewards/chosen": NaN, "rewards/margins": NaN, "rewards/rejected": NaN, "step": 1100 }, { "epoch": 7.26, "eval_kl": 0.0, "eval_logps/chosen": -2254.888427734375, "eval_logps/rejected": -1904.1827392578125, "eval_loss": 0.5, "eval_rewards/chosen": -197.54017639160156, "eval_rewards/margins": -30.939420700073242, "eval_rewards/rejected": -166.6007537841797, "eval_runtime": 169.9818, "eval_samples_per_second": 2.059, "eval_steps_per_second": 0.518, "step": 1100 } ], "logging_steps": 20, "max_steps": 1470, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }