diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5952 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 26.0, + "global_step": 466570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002, + "loss": 1.9055, + "step": 1000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002, + "loss": 1.8958, + "step": 1500 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002, + "loss": 1.8887, + "step": 2000 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002, + "loss": 1.8921, + "step": 2500 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002, + "loss": 1.8743, + "step": 3000 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002, + "loss": 1.8678, + "step": 3500 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002, + "loss": 1.8695, + "step": 4000 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002, + "loss": 1.8759, + "step": 4500 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002, + "loss": 1.8489, + "step": 5000 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002, + "loss": 1.8643, + "step": 5500 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 6000 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 6500 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002, + "loss": 1.8667, + "step": 7000 + }, + { + "epoch": 0.42, + "learning_rate": 0.0002, + "loss": 1.8646, + "step": 7500 + }, + { + "epoch": 0.45, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 8000 + }, + { + "epoch": 0.47, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 8500 + }, + { + "epoch": 0.5, + "learning_rate": 0.0002, + "loss": 1.8569, + "step": 9000 + }, + { + "epoch": 0.53, + "learning_rate": 0.0002, + "loss": 1.8536, + "step": 9500 + }, + { + "epoch": 0.56, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 10000 + }, + { + "epoch": 0.59, + "learning_rate": 0.0002, + "loss": 1.8589, + "step": 10500 + }, + { + "epoch": 0.61, + "learning_rate": 0.0002, + "loss": 1.8462, + "step": 11000 + }, + { + "epoch": 0.64, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 11500 + }, + { + "epoch": 0.67, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 12000 + }, + { + "epoch": 0.7, + "learning_rate": 0.0002, + "loss": 1.8458, + "step": 12500 + }, + { + "epoch": 0.72, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 13000 + }, + { + "epoch": 0.75, + "learning_rate": 0.0002, + "loss": 1.8577, + "step": 13500 + }, + { + "epoch": 0.78, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 14000 + }, + { + "epoch": 0.81, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 14500 + }, + { + "epoch": 0.84, + "learning_rate": 0.0002, + "loss": 1.8373, + "step": 15000 + }, + { + "epoch": 0.86, + "learning_rate": 0.0002, + "loss": 1.8447, + "step": 15500 + }, + { + "epoch": 0.89, + "learning_rate": 0.0002, + "loss": 1.8508, + "step": 16000 + }, + { + "epoch": 0.92, + "learning_rate": 0.0002, + "loss": 1.8333, + "step": 16500 + }, + { + "epoch": 0.95, + "learning_rate": 0.0002, + "loss": 1.8298, + "step": 17000 + }, + { + "epoch": 0.98, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 17500 + }, + { + "epoch": 1.0, + "eval_gen_len": 19.0, + "eval_loss": 1.6631494760513306, + "eval_rouge1": 24.5532, + "eval_rouge2": 11.7093, + "eval_rougeL": 20.2667, + "eval_rougeLsum": 23.1638, + "eval_runtime": 448.1964, + "eval_samples_per_second": 29.826, + "eval_steps_per_second": 1.865, + "step": 17945 + }, + { + "epoch": 1.0, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 18000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 18500 + }, + { + "epoch": 1.06, + "learning_rate": 0.0002, + "loss": 1.8007, + "step": 19000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0002, + "loss": 1.8053, + "step": 19500 + }, + { + "epoch": 1.11, + "learning_rate": 0.0002, + "loss": 1.7934, + "step": 20000 + }, + { + "epoch": 1.14, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 20500 + }, + { + "epoch": 1.17, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 21000 + }, + { + "epoch": 1.2, + "learning_rate": 0.0002, + "loss": 1.8153, + "step": 21500 + }, + { + "epoch": 1.23, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 22000 + }, + { + "epoch": 1.25, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 22500 + }, + { + "epoch": 1.28, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 23000 + }, + { + "epoch": 1.31, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 23500 + }, + { + "epoch": 1.34, + "learning_rate": 0.0002, + "loss": 1.7973, + "step": 24000 + }, + { + "epoch": 1.37, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 24500 + }, + { + "epoch": 1.39, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 25000 + }, + { + "epoch": 1.42, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 25500 + }, + { + "epoch": 1.45, + "learning_rate": 0.0002, + "loss": 1.7966, + "step": 26000 + }, + { + "epoch": 1.48, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 26500 + }, + { + "epoch": 1.5, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 27000 + }, + { + "epoch": 1.53, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 27500 + }, + { + "epoch": 1.56, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 28000 + }, + { + "epoch": 1.59, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 28500 + }, + { + "epoch": 1.62, + "learning_rate": 0.0002, + "loss": 1.7915, + "step": 29000 + }, + { + "epoch": 1.64, + "learning_rate": 0.0002, + "loss": 1.8024, + "step": 29500 + }, + { + "epoch": 1.67, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 30000 + }, + { + "epoch": 1.7, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 30500 + }, + { + "epoch": 1.73, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 31000 + }, + { + "epoch": 1.76, + "learning_rate": 0.0002, + "loss": 1.7937, + "step": 31500 + }, + { + "epoch": 1.78, + "learning_rate": 0.0002, + "loss": 1.8043, + "step": 32000 + }, + { + "epoch": 1.81, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 32500 + }, + { + "epoch": 1.84, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 33000 + }, + { + "epoch": 1.87, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 33500 + }, + { + "epoch": 1.89, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 34000 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 34500 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 35000 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002, + "loss": 1.7898, + "step": 35500 + }, + { + "epoch": 2.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.6393862962722778, + "eval_rouge1": 24.5891, + "eval_rouge2": 11.7919, + "eval_rougeL": 20.3168, + "eval_rougeLsum": 23.1912, + "eval_runtime": 447.7546, + "eval_samples_per_second": 29.856, + "eval_steps_per_second": 1.867, + "step": 35890 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 36000 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002, + "loss": 1.7595, + "step": 36500 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002, + "loss": 1.7526, + "step": 37000 + }, + { + "epoch": 2.09, + "learning_rate": 0.0002, + "loss": 1.7677, + "step": 37500 + }, + { + "epoch": 2.12, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 38000 + }, + { + "epoch": 2.15, + "learning_rate": 0.0002, + "loss": 1.759, + "step": 38500 + }, + { + "epoch": 2.17, + "learning_rate": 0.0002, + "loss": 1.7633, + "step": 39000 + }, + { + "epoch": 2.2, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 39500 + }, + { + "epoch": 2.23, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 40000 + }, + { + "epoch": 2.26, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 40500 + }, + { + "epoch": 2.28, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 41000 + }, + { + "epoch": 2.31, + "learning_rate": 0.0002, + "loss": 1.7758, + "step": 41500 + }, + { + "epoch": 2.34, + "learning_rate": 0.0002, + "loss": 1.7547, + "step": 42000 + }, + { + "epoch": 2.37, + "learning_rate": 0.0002, + "loss": 1.755, + "step": 42500 + }, + { + "epoch": 2.4, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 43000 + }, + { + "epoch": 2.42, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 43500 + }, + { + "epoch": 2.45, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 44000 + }, + { + "epoch": 2.48, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 44500 + }, + { + "epoch": 2.51, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 45000 + }, + { + "epoch": 2.54, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 45500 + }, + { + "epoch": 2.56, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 46000 + }, + { + "epoch": 2.59, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 46500 + }, + { + "epoch": 2.62, + "learning_rate": 0.0002, + "loss": 1.7576, + "step": 47000 + }, + { + "epoch": 2.65, + "learning_rate": 0.0002, + "loss": 1.7618, + "step": 47500 + }, + { + "epoch": 2.67, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 48000 + }, + { + "epoch": 2.7, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 48500 + }, + { + "epoch": 2.73, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 49000 + }, + { + "epoch": 2.76, + "learning_rate": 0.0002, + "loss": 1.7632, + "step": 49500 + }, + { + "epoch": 2.79, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 50000 + }, + { + "epoch": 2.81, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 50500 + }, + { + "epoch": 2.84, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 51000 + }, + { + "epoch": 2.87, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 51500 + }, + { + "epoch": 2.9, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 52000 + }, + { + "epoch": 2.93, + "learning_rate": 0.0002, + "loss": 1.7626, + "step": 52500 + }, + { + "epoch": 2.95, + "learning_rate": 0.0002, + "loss": 1.7534, + "step": 53000 + }, + { + "epoch": 2.98, + "learning_rate": 0.0002, + "loss": 1.7617, + "step": 53500 + }, + { + "epoch": 3.0, + "eval_gen_len": 18.999401555954517, + "eval_loss": 1.6363353729248047, + "eval_rouge1": 24.6169, + "eval_rouge2": 11.9038, + "eval_rougeL": 20.3703, + "eval_rougeLsum": 23.2215, + "eval_runtime": 448.243, + "eval_samples_per_second": 29.823, + "eval_steps_per_second": 1.865, + "step": 53835 + }, + { + "epoch": 3.01, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 54000 + }, + { + "epoch": 3.04, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 54500 + }, + { + "epoch": 3.06, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 55000 + }, + { + "epoch": 3.09, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 55500 + }, + { + "epoch": 3.12, + "learning_rate": 0.0002, + "loss": 1.7415, + "step": 56000 + }, + { + "epoch": 3.15, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 56500 + }, + { + "epoch": 3.18, + "learning_rate": 0.0002, + "loss": 1.7194, + "step": 57000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 57500 + }, + { + "epoch": 3.23, + "learning_rate": 0.0002, + "loss": 1.7415, + "step": 58000 + }, + { + "epoch": 3.26, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 58500 + }, + { + "epoch": 3.29, + "learning_rate": 0.0002, + "loss": 1.74, + "step": 59000 + }, + { + "epoch": 3.32, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 59500 + }, + { + "epoch": 3.34, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 60000 + }, + { + "epoch": 3.37, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 60500 + }, + { + "epoch": 3.4, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 61000 + }, + { + "epoch": 3.43, + "learning_rate": 0.0002, + "loss": 1.7284, + "step": 61500 + }, + { + "epoch": 3.46, + "learning_rate": 0.0002, + "loss": 1.7494, + "step": 62000 + }, + { + "epoch": 3.48, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 62500 + }, + { + "epoch": 3.51, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 63000 + }, + { + "epoch": 3.54, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 63500 + }, + { + "epoch": 3.57, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 64000 + }, + { + "epoch": 3.59, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 64500 + }, + { + "epoch": 3.62, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 65000 + }, + { + "epoch": 3.65, + "learning_rate": 0.0002, + "loss": 1.7392, + "step": 65500 + }, + { + "epoch": 3.68, + "learning_rate": 0.0002, + "loss": 1.7414, + "step": 66000 + }, + { + "epoch": 3.71, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 66500 + }, + { + "epoch": 3.73, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 67000 + }, + { + "epoch": 3.76, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 67500 + }, + { + "epoch": 3.79, + "learning_rate": 0.0002, + "loss": 1.7235, + "step": 68000 + }, + { + "epoch": 3.82, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 68500 + }, + { + "epoch": 3.85, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 69000 + }, + { + "epoch": 3.87, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 69500 + }, + { + "epoch": 3.9, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 70000 + }, + { + "epoch": 3.93, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 70500 + }, + { + "epoch": 3.96, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 71000 + }, + { + "epoch": 3.98, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 71500 + }, + { + "epoch": 4.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.618168830871582, + "eval_rouge1": 24.7438, + "eval_rouge2": 11.9392, + "eval_rougeL": 20.4669, + "eval_rougeLsum": 23.3646, + "eval_runtime": 447.3678, + "eval_samples_per_second": 29.881, + "eval_steps_per_second": 1.869, + "step": 71780 + }, + { + "epoch": 4.01, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 72000 + }, + { + "epoch": 4.04, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 72500 + }, + { + "epoch": 4.07, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 73000 + }, + { + "epoch": 4.1, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 73500 + }, + { + "epoch": 4.12, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 74000 + }, + { + "epoch": 4.15, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 74500 + }, + { + "epoch": 4.18, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 75000 + }, + { + "epoch": 4.21, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 75500 + }, + { + "epoch": 4.24, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 76000 + }, + { + "epoch": 4.26, + "learning_rate": 0.0002, + "loss": 1.7165, + "step": 76500 + }, + { + "epoch": 4.29, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 77000 + }, + { + "epoch": 4.32, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 77500 + }, + { + "epoch": 4.35, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 78000 + }, + { + "epoch": 4.37, + "learning_rate": 0.0002, + "loss": 1.7172, + "step": 78500 + }, + { + "epoch": 4.4, + "learning_rate": 0.0002, + "loss": 1.7097, + "step": 79000 + }, + { + "epoch": 4.43, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 79500 + }, + { + "epoch": 4.46, + "learning_rate": 0.0002, + "loss": 1.7164, + "step": 80000 + }, + { + "epoch": 4.49, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 80500 + }, + { + "epoch": 4.51, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 81000 + }, + { + "epoch": 4.54, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 81500 + }, + { + "epoch": 4.57, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 82000 + }, + { + "epoch": 4.6, + "learning_rate": 0.0002, + "loss": 1.7183, + "step": 82500 + }, + { + "epoch": 4.63, + "learning_rate": 0.0002, + "loss": 1.7236, + "step": 83000 + }, + { + "epoch": 4.65, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 83500 + }, + { + "epoch": 4.68, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 84000 + }, + { + "epoch": 4.71, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 84500 + }, + { + "epoch": 4.74, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 85000 + }, + { + "epoch": 4.76, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 85500 + }, + { + "epoch": 4.79, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 86000 + }, + { + "epoch": 4.82, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 86500 + }, + { + "epoch": 4.85, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 87000 + }, + { + "epoch": 4.88, + "learning_rate": 0.0002, + "loss": 1.7238, + "step": 87500 + }, + { + "epoch": 4.9, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 88000 + }, + { + "epoch": 4.93, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 88500 + }, + { + "epoch": 4.96, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 89000 + }, + { + "epoch": 4.99, + "learning_rate": 0.0002, + "loss": 1.7278, + "step": 89500 + }, + { + "epoch": 5.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.6149933338165283, + "eval_rouge1": 24.8219, + "eval_rouge2": 12.0279, + "eval_rougeL": 20.5184, + "eval_rougeLsum": 23.4296, + "eval_runtime": 448.4677, + "eval_samples_per_second": 29.808, + "eval_steps_per_second": 1.864, + "step": 89725 + }, + { + "epoch": 5.02, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 90000 + }, + { + "epoch": 5.04, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 90500 + }, + { + "epoch": 5.07, + "learning_rate": 0.0002, + "loss": 1.6936, + "step": 91000 + }, + { + "epoch": 5.1, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 91500 + }, + { + "epoch": 5.13, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 92000 + }, + { + "epoch": 5.15, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 92500 + }, + { + "epoch": 5.18, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 93000 + }, + { + "epoch": 5.21, + "learning_rate": 0.0002, + "loss": 1.6954, + "step": 93500 + }, + { + "epoch": 5.24, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 94000 + }, + { + "epoch": 5.27, + "learning_rate": 0.0002, + "loss": 1.6941, + "step": 94500 + }, + { + "epoch": 5.29, + "learning_rate": 0.0002, + "loss": 1.6915, + "step": 95000 + }, + { + "epoch": 5.32, + "learning_rate": 0.0002, + "loss": 1.7156, + "step": 95500 + }, + { + "epoch": 5.35, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 96000 + }, + { + "epoch": 5.38, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 96500 + }, + { + "epoch": 5.41, + "learning_rate": 0.0002, + "loss": 1.7045, + "step": 97000 + }, + { + "epoch": 5.43, + "learning_rate": 0.0002, + "loss": 1.7061, + "step": 97500 + }, + { + "epoch": 5.46, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 98000 + }, + { + "epoch": 5.49, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 98500 + }, + { + "epoch": 5.52, + "learning_rate": 0.0002, + "loss": 1.6952, + "step": 99000 + }, + { + "epoch": 5.54, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 99500 + }, + { + "epoch": 5.57, + "learning_rate": 0.0002, + "loss": 1.6924, + "step": 100000 + }, + { + "epoch": 5.6, + "learning_rate": 0.0002, + "loss": 1.6975, + "step": 100500 + }, + { + "epoch": 5.63, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 101000 + }, + { + "epoch": 5.66, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 101500 + }, + { + "epoch": 5.68, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 102000 + }, + { + "epoch": 5.71, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 102500 + }, + { + "epoch": 5.74, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 103000 + }, + { + "epoch": 5.77, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 103500 + }, + { + "epoch": 5.8, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 104000 + }, + { + "epoch": 5.82, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 104500 + }, + { + "epoch": 5.85, + "learning_rate": 0.0002, + "loss": 1.6959, + "step": 105000 + }, + { + "epoch": 5.88, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 105500 + }, + { + "epoch": 5.91, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 106000 + }, + { + "epoch": 5.93, + "learning_rate": 0.0002, + "loss": 1.6956, + "step": 106500 + }, + { + "epoch": 5.96, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 107000 + }, + { + "epoch": 5.99, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 107500 + }, + { + "epoch": 6.0, + "eval_gen_len": 18.999476361460204, + "eval_loss": 1.6058918237686157, + "eval_rouge1": 24.805, + "eval_rouge2": 11.8889, + "eval_rougeL": 20.4365, + "eval_rougeLsum": 23.3773, + "eval_runtime": 447.2273, + "eval_samples_per_second": 29.891, + "eval_steps_per_second": 1.869, + "step": 107670 + }, + { + "epoch": 6.02, + "learning_rate": 0.0002, + "loss": 1.6862, + "step": 108000 + }, + { + "epoch": 6.05, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 108500 + }, + { + "epoch": 6.07, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 109000 + }, + { + "epoch": 6.1, + "learning_rate": 0.0002, + "loss": 1.656, + "step": 109500 + }, + { + "epoch": 6.13, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 110000 + }, + { + "epoch": 6.16, + "learning_rate": 0.0002, + "loss": 1.6902, + "step": 110500 + }, + { + "epoch": 6.19, + "learning_rate": 0.0002, + "loss": 1.6914, + "step": 111000 + }, + { + "epoch": 6.21, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 111500 + }, + { + "epoch": 6.24, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 112000 + }, + { + "epoch": 6.27, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 112500 + }, + { + "epoch": 6.3, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 113000 + }, + { + "epoch": 6.32, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 113500 + }, + { + "epoch": 6.35, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 114000 + }, + { + "epoch": 6.38, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 114500 + }, + { + "epoch": 6.41, + "learning_rate": 0.0002, + "loss": 1.6769, + "step": 115000 + }, + { + "epoch": 6.44, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 115500 + }, + { + "epoch": 6.46, + "learning_rate": 0.0002, + "loss": 1.6842, + "step": 116000 + }, + { + "epoch": 6.49, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 116500 + }, + { + "epoch": 6.52, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 117000 + }, + { + "epoch": 6.55, + "learning_rate": 0.0002, + "loss": 1.6801, + "step": 117500 + }, + { + "epoch": 6.58, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 118000 + }, + { + "epoch": 6.6, + "learning_rate": 0.0002, + "loss": 1.6888, + "step": 118500 + }, + { + "epoch": 6.63, + "learning_rate": 0.0002, + "loss": 1.7039, + "step": 119000 + }, + { + "epoch": 6.66, + "learning_rate": 0.0002, + "loss": 1.6778, + "step": 119500 + }, + { + "epoch": 6.69, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 120000 + }, + { + "epoch": 6.71, + "learning_rate": 0.0002, + "loss": 1.6817, + "step": 120500 + }, + { + "epoch": 6.74, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 121000 + }, + { + "epoch": 6.77, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 121500 + }, + { + "epoch": 6.8, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 122000 + }, + { + "epoch": 6.83, + "learning_rate": 0.0002, + "loss": 1.6883, + "step": 122500 + }, + { + "epoch": 6.85, + "learning_rate": 0.0002, + "loss": 1.6898, + "step": 123000 + }, + { + "epoch": 6.88, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 123500 + }, + { + "epoch": 6.91, + "learning_rate": 0.0002, + "loss": 1.6961, + "step": 124000 + }, + { + "epoch": 6.94, + "learning_rate": 0.0002, + "loss": 1.6935, + "step": 124500 + }, + { + "epoch": 6.97, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 125000 + }, + { + "epoch": 6.99, + "learning_rate": 0.0002, + "loss": 1.6958, + "step": 125500 + }, + { + "epoch": 7.0, + "eval_gen_len": 19.0, + "eval_loss": 1.6058346033096313, + "eval_rouge1": 24.681, + "eval_rouge2": 11.9144, + "eval_rougeL": 20.3968, + "eval_rougeLsum": 23.2841, + "eval_runtime": 449.9498, + "eval_samples_per_second": 29.71, + "eval_steps_per_second": 1.858, + "step": 125615 + }, + { + "epoch": 7.02, + "learning_rate": 0.0002, + "loss": 1.6566, + "step": 126000 + }, + { + "epoch": 7.05, + "learning_rate": 0.0002, + "loss": 1.6548, + "step": 126500 + }, + { + "epoch": 7.08, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 127000 + }, + { + "epoch": 7.11, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 127500 + }, + { + "epoch": 7.13, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 128000 + }, + { + "epoch": 7.16, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 128500 + }, + { + "epoch": 7.19, + "learning_rate": 0.0002, + "loss": 1.655, + "step": 129000 + }, + { + "epoch": 7.22, + "learning_rate": 0.0002, + "loss": 1.6619, + "step": 129500 + }, + { + "epoch": 7.24, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 130000 + }, + { + "epoch": 7.27, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 130500 + }, + { + "epoch": 7.3, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 131000 + }, + { + "epoch": 7.33, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 131500 + }, + { + "epoch": 7.36, + "learning_rate": 0.0002, + "loss": 1.6588, + "step": 132000 + }, + { + "epoch": 7.38, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 132500 + }, + { + "epoch": 7.41, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 133000 + }, + { + "epoch": 7.44, + "learning_rate": 0.0002, + "loss": 1.6628, + "step": 133500 + }, + { + "epoch": 7.47, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 134000 + }, + { + "epoch": 7.5, + "learning_rate": 0.0002, + "loss": 1.672, + "step": 134500 + }, + { + "epoch": 7.52, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 135000 + }, + { + "epoch": 7.55, + "learning_rate": 0.0002, + "loss": 1.6617, + "step": 135500 + }, + { + "epoch": 7.58, + "learning_rate": 0.0002, + "loss": 1.6611, + "step": 136000 + }, + { + "epoch": 7.61, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 136500 + }, + { + "epoch": 7.63, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 137000 + }, + { + "epoch": 7.66, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 137500 + }, + { + "epoch": 7.69, + "learning_rate": 0.0002, + "loss": 1.6813, + "step": 138000 + }, + { + "epoch": 7.72, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 138500 + }, + { + "epoch": 7.75, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 139000 + }, + { + "epoch": 7.77, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 139500 + }, + { + "epoch": 7.8, + "learning_rate": 0.0002, + "loss": 1.6691, + "step": 140000 + }, + { + "epoch": 7.83, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 140500 + }, + { + "epoch": 7.86, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 141000 + }, + { + "epoch": 7.89, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 141500 + }, + { + "epoch": 7.91, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 142000 + }, + { + "epoch": 7.94, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 142500 + }, + { + "epoch": 7.97, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 143000 + }, + { + "epoch": 8.0, + "learning_rate": 0.0002, + "loss": 1.6775, + "step": 143500 + }, + { + "epoch": 8.0, + "eval_gen_len": 19.0, + "eval_loss": 1.6005198955535889, + "eval_rouge1": 24.7596, + "eval_rouge2": 11.9899, + "eval_rougeL": 20.4974, + "eval_rougeLsum": 23.348, + "eval_runtime": 449.6575, + "eval_samples_per_second": 29.729, + "eval_steps_per_second": 1.859, + "step": 143560 + }, + { + "epoch": 8.02, + "learning_rate": 0.0002, + "loss": 1.6455, + "step": 144000 + }, + { + "epoch": 8.05, + "learning_rate": 0.0002, + "loss": 1.6342, + "step": 144500 + }, + { + "epoch": 8.08, + "learning_rate": 0.0002, + "loss": 1.6436, + "step": 145000 + }, + { + "epoch": 8.11, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 145500 + }, + { + "epoch": 8.14, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 146000 + }, + { + "epoch": 8.16, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 146500 + }, + { + "epoch": 8.19, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 147000 + }, + { + "epoch": 8.22, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 147500 + }, + { + "epoch": 8.25, + "learning_rate": 0.0002, + "loss": 1.6508, + "step": 148000 + }, + { + "epoch": 8.28, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 148500 + }, + { + "epoch": 8.3, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 149000 + }, + { + "epoch": 8.33, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 149500 + }, + { + "epoch": 8.36, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 150000 + }, + { + "epoch": 8.39, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 150500 + }, + { + "epoch": 8.41, + "learning_rate": 0.0002, + "loss": 1.6575, + "step": 151000 + }, + { + "epoch": 8.44, + "learning_rate": 0.0002, + "loss": 1.6509, + "step": 151500 + }, + { + "epoch": 8.47, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 152000 + }, + { + "epoch": 8.5, + "learning_rate": 0.0002, + "loss": 1.6571, + "step": 152500 + }, + { + "epoch": 8.53, + "learning_rate": 0.0002, + "loss": 1.6618, + "step": 153000 + }, + { + "epoch": 8.55, + "learning_rate": 0.0002, + "loss": 1.6592, + "step": 153500 + }, + { + "epoch": 8.58, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 154000 + }, + { + "epoch": 8.61, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 154500 + }, + { + "epoch": 8.64, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 155000 + }, + { + "epoch": 8.67, + "learning_rate": 0.0002, + "loss": 1.6682, + "step": 155500 + }, + { + "epoch": 8.69, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 156000 + }, + { + "epoch": 8.72, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 156500 + }, + { + "epoch": 8.75, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 157000 + }, + { + "epoch": 8.78, + "learning_rate": 0.0002, + "loss": 1.6466, + "step": 157500 + }, + { + "epoch": 8.8, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 158000 + }, + { + "epoch": 8.83, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 158500 + }, + { + "epoch": 8.86, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 159000 + }, + { + "epoch": 8.89, + "learning_rate": 0.0002, + "loss": 1.6732, + "step": 159500 + }, + { + "epoch": 8.92, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 160000 + }, + { + "epoch": 8.94, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 160500 + }, + { + "epoch": 8.97, + "learning_rate": 0.0002, + "loss": 1.6611, + "step": 161000 + }, + { + "epoch": 9.0, + "learning_rate": 0.0002, + "loss": 1.6645, + "step": 161500 + }, + { + "epoch": 9.0, + "eval_gen_len": 18.99970077797726, + "eval_loss": 1.6074172258377075, + "eval_rouge1": 24.6775, + "eval_rouge2": 11.9459, + "eval_rougeL": 20.399, + "eval_rougeLsum": 23.2846, + "eval_runtime": 456.3741, + "eval_samples_per_second": 29.292, + "eval_steps_per_second": 1.832, + "step": 161505 + }, + { + "epoch": 9.03, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 162000 + }, + { + "epoch": 9.06, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 162500 + }, + { + "epoch": 9.08, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 163000 + }, + { + "epoch": 9.11, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 163500 + }, + { + "epoch": 9.14, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 164000 + }, + { + "epoch": 9.17, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 164500 + }, + { + "epoch": 9.19, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 165000 + }, + { + "epoch": 9.22, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 165500 + }, + { + "epoch": 9.25, + "learning_rate": 0.0002, + "loss": 1.6443, + "step": 166000 + }, + { + "epoch": 9.28, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 166500 + }, + { + "epoch": 9.31, + "learning_rate": 0.0002, + "loss": 1.627, + "step": 167000 + }, + { + "epoch": 9.33, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 167500 + }, + { + "epoch": 9.36, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 168000 + }, + { + "epoch": 9.39, + "learning_rate": 0.0002, + "loss": 1.6435, + "step": 168500 + }, + { + "epoch": 9.42, + "learning_rate": 0.0002, + "loss": 1.6358, + "step": 169000 + }, + { + "epoch": 9.45, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 169500 + }, + { + "epoch": 9.47, + "learning_rate": 0.0002, + "loss": 1.6505, + "step": 170000 + }, + { + "epoch": 9.5, + "learning_rate": 0.0002, + "loss": 1.6379, + "step": 170500 + }, + { + "epoch": 9.53, + "learning_rate": 0.0002, + "loss": 1.6499, + "step": 171000 + }, + { + "epoch": 9.56, + "learning_rate": 0.0002, + "loss": 1.6483, + "step": 171500 + }, + { + "epoch": 9.58, + "learning_rate": 0.0002, + "loss": 1.6383, + "step": 172000 + }, + { + "epoch": 9.61, + "learning_rate": 0.0002, + "loss": 1.6454, + "step": 172500 + }, + { + "epoch": 9.64, + "learning_rate": 0.0002, + "loss": 1.6448, + "step": 173000 + }, + { + "epoch": 9.67, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 173500 + }, + { + "epoch": 9.7, + "learning_rate": 0.0002, + "loss": 1.6443, + "step": 174000 + }, + { + "epoch": 9.72, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 174500 + }, + { + "epoch": 9.75, + "learning_rate": 0.0002, + "loss": 1.6498, + "step": 175000 + }, + { + "epoch": 9.78, + "learning_rate": 0.0002, + "loss": 1.6509, + "step": 175500 + }, + { + "epoch": 9.81, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 176000 + }, + { + "epoch": 9.84, + "learning_rate": 0.0002, + "loss": 1.6608, + "step": 176500 + }, + { + "epoch": 9.86, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 177000 + }, + { + "epoch": 9.89, + "learning_rate": 0.0002, + "loss": 1.6448, + "step": 177500 + }, + { + "epoch": 9.92, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 178000 + }, + { + "epoch": 9.95, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 178500 + }, + { + "epoch": 9.97, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 179000 + }, + { + "epoch": 10.0, + "eval_gen_len": 18.999476361460204, + "eval_loss": 1.5967729091644287, + "eval_rouge1": 24.656, + "eval_rouge2": 11.8517, + "eval_rougeL": 20.3594, + "eval_rougeLsum": 23.2389, + "eval_runtime": 456.5734, + "eval_samples_per_second": 29.279, + "eval_steps_per_second": 1.831, + "step": 179450 + }, + { + "epoch": 10.0, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 179500 + }, + { + "epoch": 10.03, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 180000 + }, + { + "epoch": 10.06, + "learning_rate": 0.0002, + "loss": 1.6252, + "step": 180500 + }, + { + "epoch": 10.09, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 181000 + }, + { + "epoch": 10.11, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 181500 + }, + { + "epoch": 10.14, + "learning_rate": 0.0002, + "loss": 1.6226, + "step": 182000 + }, + { + "epoch": 10.17, + "learning_rate": 0.0002, + "loss": 1.6254, + "step": 182500 + }, + { + "epoch": 10.2, + "learning_rate": 0.0002, + "loss": 1.6194, + "step": 183000 + }, + { + "epoch": 10.23, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 183500 + }, + { + "epoch": 10.25, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 184000 + }, + { + "epoch": 10.28, + "learning_rate": 0.0002, + "loss": 1.6196, + "step": 184500 + }, + { + "epoch": 10.31, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 185000 + }, + { + "epoch": 10.34, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 185500 + }, + { + "epoch": 10.37, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 186000 + }, + { + "epoch": 10.39, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 186500 + }, + { + "epoch": 10.42, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 187000 + }, + { + "epoch": 10.45, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 187500 + }, + { + "epoch": 10.48, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 188000 + }, + { + "epoch": 10.5, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 188500 + }, + { + "epoch": 10.53, + "learning_rate": 0.0002, + "loss": 1.631, + "step": 189000 + }, + { + "epoch": 10.56, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 189500 + }, + { + "epoch": 10.59, + "learning_rate": 0.0002, + "loss": 1.6298, + "step": 190000 + }, + { + "epoch": 10.62, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 190500 + }, + { + "epoch": 10.64, + "learning_rate": 0.0002, + "loss": 1.6416, + "step": 191000 + }, + { + "epoch": 10.67, + "learning_rate": 0.0002, + "loss": 1.6289, + "step": 191500 + }, + { + "epoch": 10.7, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 192000 + }, + { + "epoch": 10.73, + "learning_rate": 0.0002, + "loss": 1.6303, + "step": 192500 + }, + { + "epoch": 10.76, + "learning_rate": 0.0002, + "loss": 1.6393, + "step": 193000 + }, + { + "epoch": 10.78, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 193500 + }, + { + "epoch": 10.81, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 194000 + }, + { + "epoch": 10.84, + "learning_rate": 0.0002, + "loss": 1.6289, + "step": 194500 + }, + { + "epoch": 10.87, + "learning_rate": 0.0002, + "loss": 1.6388, + "step": 195000 + }, + { + "epoch": 10.89, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 195500 + }, + { + "epoch": 10.92, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 196000 + }, + { + "epoch": 10.95, + "learning_rate": 0.0002, + "loss": 1.6496, + "step": 196500 + }, + { + "epoch": 10.98, + "learning_rate": 0.0002, + "loss": 1.6374, + "step": 197000 + }, + { + "epoch": 11.0, + "eval_gen_len": 19.0, + "eval_loss": 1.5931986570358276, + "eval_rouge1": 24.7416, + "eval_rouge2": 11.9481, + "eval_rougeL": 20.4574, + "eval_rougeLsum": 23.3066, + "eval_runtime": 457.3631, + "eval_samples_per_second": 29.228, + "eval_steps_per_second": 1.828, + "step": 197395 + }, + { + "epoch": 11.01, + "learning_rate": 0.0002, + "loss": 1.6378, + "step": 197500 + }, + { + "epoch": 11.03, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 198000 + }, + { + "epoch": 11.06, + "learning_rate": 0.0002, + "loss": 1.6045, + "step": 198500 + }, + { + "epoch": 11.09, + "learning_rate": 0.0002, + "loss": 1.62, + "step": 199000 + }, + { + "epoch": 11.12, + "learning_rate": 0.0002, + "loss": 1.6175, + "step": 199500 + }, + { + "epoch": 11.15, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 200000 + }, + { + "epoch": 11.17, + "learning_rate": 0.0002, + "loss": 1.621, + "step": 200500 + }, + { + "epoch": 11.2, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 201000 + }, + { + "epoch": 11.23, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 201500 + }, + { + "epoch": 11.26, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 202000 + }, + { + "epoch": 11.28, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 202500 + }, + { + "epoch": 11.31, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 203000 + }, + { + "epoch": 11.34, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 203500 + }, + { + "epoch": 11.37, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 204000 + }, + { + "epoch": 11.4, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 204500 + }, + { + "epoch": 11.42, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 205000 + }, + { + "epoch": 11.45, + "learning_rate": 0.0002, + "loss": 1.6206, + "step": 205500 + }, + { + "epoch": 11.48, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 206000 + }, + { + "epoch": 11.51, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 206500 + }, + { + "epoch": 11.54, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 207000 + }, + { + "epoch": 11.56, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 207500 + }, + { + "epoch": 11.59, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 208000 + }, + { + "epoch": 11.62, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 208500 + }, + { + "epoch": 11.65, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 209000 + }, + { + "epoch": 11.67, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 209500 + }, + { + "epoch": 11.7, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 210000 + }, + { + "epoch": 11.73, + "learning_rate": 0.0002, + "loss": 1.6244, + "step": 210500 + }, + { + "epoch": 11.76, + "learning_rate": 0.0002, + "loss": 1.629, + "step": 211000 + }, + { + "epoch": 11.79, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 211500 + }, + { + "epoch": 11.81, + "learning_rate": 0.0002, + "loss": 1.628, + "step": 212000 + }, + { + "epoch": 11.84, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 212500 + }, + { + "epoch": 11.87, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 213000 + }, + { + "epoch": 11.9, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 213500 + }, + { + "epoch": 11.93, + "learning_rate": 0.0002, + "loss": 1.6352, + "step": 214000 + }, + { + "epoch": 11.95, + "learning_rate": 0.0002, + "loss": 1.6115, + "step": 214500 + }, + { + "epoch": 11.98, + "learning_rate": 0.0002, + "loss": 1.6271, + "step": 215000 + }, + { + "epoch": 12.0, + "eval_gen_len": 18.99985038898863, + "eval_loss": 1.5888069868087769, + "eval_rouge1": 24.8074, + "eval_rouge2": 11.9491, + "eval_rougeL": 20.4824, + "eval_rougeLsum": 23.3506, + "eval_runtime": 456.7914, + "eval_samples_per_second": 29.265, + "eval_steps_per_second": 1.83, + "step": 215340 + }, + { + "epoch": 12.01, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 215500 + }, + { + "epoch": 12.04, + "learning_rate": 0.0002, + "loss": 1.5915, + "step": 216000 + }, + { + "epoch": 12.06, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 216500 + }, + { + "epoch": 12.09, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 217000 + }, + { + "epoch": 12.12, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 217500 + }, + { + "epoch": 12.15, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 218000 + }, + { + "epoch": 12.18, + "learning_rate": 0.0002, + "loss": 1.6177, + "step": 218500 + }, + { + "epoch": 12.2, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 219000 + }, + { + "epoch": 12.23, + "learning_rate": 0.0002, + "loss": 1.6066, + "step": 219500 + }, + { + "epoch": 12.26, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 220000 + }, + { + "epoch": 12.29, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 220500 + }, + { + "epoch": 12.32, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 221000 + }, + { + "epoch": 12.34, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 221500 + }, + { + "epoch": 12.37, + "learning_rate": 0.0002, + "loss": 1.6054, + "step": 222000 + }, + { + "epoch": 12.4, + "learning_rate": 0.0002, + "loss": 1.6027, + "step": 222500 + }, + { + "epoch": 12.43, + "learning_rate": 0.0002, + "loss": 1.6046, + "step": 223000 + }, + { + "epoch": 12.45, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 223500 + }, + { + "epoch": 12.48, + "learning_rate": 0.0002, + "loss": 1.6248, + "step": 224000 + }, + { + "epoch": 12.51, + "learning_rate": 0.0002, + "loss": 1.6108, + "step": 224500 + }, + { + "epoch": 12.54, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 225000 + }, + { + "epoch": 12.57, + "learning_rate": 0.0002, + "loss": 1.6064, + "step": 225500 + }, + { + "epoch": 12.59, + "learning_rate": 0.0002, + "loss": 1.6167, + "step": 226000 + }, + { + "epoch": 12.62, + "learning_rate": 0.0002, + "loss": 1.6142, + "step": 226500 + }, + { + "epoch": 12.65, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 227000 + }, + { + "epoch": 12.68, + "learning_rate": 0.0002, + "loss": 1.6158, + "step": 227500 + }, + { + "epoch": 12.71, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 228000 + }, + { + "epoch": 12.73, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 228500 + }, + { + "epoch": 12.76, + "learning_rate": 0.0002, + "loss": 1.6165, + "step": 229000 + }, + { + "epoch": 12.79, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 229500 + }, + { + "epoch": 12.82, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 230000 + }, + { + "epoch": 12.84, + "learning_rate": 0.0002, + "loss": 1.6028, + "step": 230500 + }, + { + "epoch": 12.87, + "learning_rate": 0.0002, + "loss": 1.6147, + "step": 231000 + }, + { + "epoch": 12.9, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 231500 + }, + { + "epoch": 12.93, + "learning_rate": 0.0002, + "loss": 1.6281, + "step": 232000 + }, + { + "epoch": 12.96, + "learning_rate": 0.0002, + "loss": 1.6259, + "step": 232500 + }, + { + "epoch": 12.98, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 233000 + }, + { + "epoch": 13.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.5894633531570435, + "eval_rouge1": 24.7893, + "eval_rouge2": 11.8992, + "eval_rougeL": 20.4586, + "eval_rougeLsum": 23.3777, + "eval_runtime": 454.974, + "eval_samples_per_second": 29.382, + "eval_steps_per_second": 1.837, + "step": 233285 + }, + { + "epoch": 13.01, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 233500 + }, + { + "epoch": 13.04, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 234000 + }, + { + "epoch": 13.07, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 234500 + }, + { + "epoch": 13.1, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 235000 + }, + { + "epoch": 13.12, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 235500 + }, + { + "epoch": 13.15, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 236000 + }, + { + "epoch": 13.18, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 236500 + }, + { + "epoch": 13.21, + "learning_rate": 0.0002, + "loss": 1.5971, + "step": 237000 + }, + { + "epoch": 13.23, + "learning_rate": 0.0002, + "loss": 1.602, + "step": 237500 + }, + { + "epoch": 13.26, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 238000 + }, + { + "epoch": 13.29, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 238500 + }, + { + "epoch": 13.32, + "learning_rate": 0.0002, + "loss": 1.6018, + "step": 239000 + }, + { + "epoch": 13.35, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 239500 + }, + { + "epoch": 13.37, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 240000 + }, + { + "epoch": 13.4, + "learning_rate": 0.0002, + "loss": 1.5977, + "step": 240500 + }, + { + "epoch": 13.43, + "learning_rate": 0.0002, + "loss": 1.6096, + "step": 241000 + }, + { + "epoch": 13.46, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 241500 + }, + { + "epoch": 13.49, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 242000 + }, + { + "epoch": 13.51, + "learning_rate": 0.0002, + "loss": 1.5945, + "step": 242500 + }, + { + "epoch": 13.54, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 243000 + }, + { + "epoch": 13.57, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 243500 + }, + { + "epoch": 13.6, + "learning_rate": 0.0002, + "loss": 1.6116, + "step": 244000 + }, + { + "epoch": 13.62, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 244500 + }, + { + "epoch": 13.65, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 245000 + }, + { + "epoch": 13.68, + "learning_rate": 0.0002, + "loss": 1.6051, + "step": 245500 + }, + { + "epoch": 13.71, + "learning_rate": 0.0002, + "loss": 1.6178, + "step": 246000 + }, + { + "epoch": 13.74, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 246500 + }, + { + "epoch": 13.76, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 247000 + }, + { + "epoch": 13.79, + "learning_rate": 0.0002, + "loss": 1.5952, + "step": 247500 + }, + { + "epoch": 13.82, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 248000 + }, + { + "epoch": 13.85, + "learning_rate": 0.0002, + "loss": 1.6135, + "step": 248500 + }, + { + "epoch": 13.88, + "learning_rate": 0.0002, + "loss": 1.607, + "step": 249000 + }, + { + "epoch": 13.9, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 249500 + }, + { + "epoch": 13.93, + "learning_rate": 0.0002, + "loss": 1.6, + "step": 250000 + }, + { + "epoch": 13.96, + "learning_rate": 0.0002, + "loss": 1.6118, + "step": 250500 + }, + { + "epoch": 13.99, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 251000 + }, + { + "epoch": 14.0, + "eval_gen_len": 18.99985038898863, + "eval_loss": 1.595921516418457, + "eval_rouge1": 24.7781, + "eval_rouge2": 11.9972, + "eval_rougeL": 20.528, + "eval_rougeLsum": 23.3812, + "eval_runtime": 455.5258, + "eval_samples_per_second": 29.346, + "eval_steps_per_second": 1.835, + "step": 251230 + }, + { + "epoch": 14.02, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 251500 + }, + { + "epoch": 14.04, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 252000 + }, + { + "epoch": 14.07, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 252500 + }, + { + "epoch": 14.1, + "learning_rate": 0.0002, + "loss": 1.5895, + "step": 253000 + }, + { + "epoch": 14.13, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 253500 + }, + { + "epoch": 14.15, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 254000 + }, + { + "epoch": 14.18, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 254500 + }, + { + "epoch": 14.21, + "learning_rate": 0.0002, + "loss": 1.6008, + "step": 255000 + }, + { + "epoch": 14.24, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 255500 + }, + { + "epoch": 14.27, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 256000 + }, + { + "epoch": 14.29, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 256500 + }, + { + "epoch": 14.32, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 257000 + }, + { + "epoch": 14.35, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 257500 + }, + { + "epoch": 14.38, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 258000 + }, + { + "epoch": 14.41, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 258500 + }, + { + "epoch": 14.43, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 259000 + }, + { + "epoch": 14.46, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 259500 + }, + { + "epoch": 14.49, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 260000 + }, + { + "epoch": 14.52, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 260500 + }, + { + "epoch": 14.54, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 261000 + }, + { + "epoch": 14.57, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 261500 + }, + { + "epoch": 14.6, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 262000 + }, + { + "epoch": 14.63, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 262500 + }, + { + "epoch": 14.66, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 263000 + }, + { + "epoch": 14.68, + "learning_rate": 0.0002, + "loss": 1.6035, + "step": 263500 + }, + { + "epoch": 14.71, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 264000 + }, + { + "epoch": 14.74, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 264500 + }, + { + "epoch": 14.77, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 265000 + }, + { + "epoch": 14.8, + "learning_rate": 0.0002, + "loss": 1.6016, + "step": 265500 + }, + { + "epoch": 14.82, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 266000 + }, + { + "epoch": 14.85, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 266500 + }, + { + "epoch": 14.88, + "learning_rate": 0.0002, + "loss": 1.6083, + "step": 267000 + }, + { + "epoch": 14.91, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 267500 + }, + { + "epoch": 14.93, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 268000 + }, + { + "epoch": 14.96, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 268500 + }, + { + "epoch": 14.99, + "learning_rate": 0.0002, + "loss": 1.6085, + "step": 269000 + }, + { + "epoch": 15.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.5895360708236694, + "eval_rouge1": 24.7245, + "eval_rouge2": 12.0165, + "eval_rougeL": 20.4687, + "eval_rougeLsum": 23.3264, + "eval_runtime": 455.5133, + "eval_samples_per_second": 29.347, + "eval_steps_per_second": 1.835, + "step": 269175 + }, + { + "epoch": 15.02, + "learning_rate": 0.0002, + "loss": 1.5819, + "step": 269500 + }, + { + "epoch": 15.05, + "learning_rate": 0.0002, + "loss": 1.5829, + "step": 270000 + }, + { + "epoch": 15.07, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 270500 + }, + { + "epoch": 15.1, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 271000 + }, + { + "epoch": 15.13, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 271500 + }, + { + "epoch": 15.16, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 272000 + }, + { + "epoch": 15.19, + "learning_rate": 0.0002, + "loss": 1.5754, + "step": 272500 + }, + { + "epoch": 15.21, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 273000 + }, + { + "epoch": 15.24, + "learning_rate": 0.0002, + "loss": 1.5754, + "step": 273500 + }, + { + "epoch": 15.27, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 274000 + }, + { + "epoch": 15.3, + "learning_rate": 0.0002, + "loss": 1.5771, + "step": 274500 + }, + { + "epoch": 15.32, + "learning_rate": 0.0002, + "loss": 1.5822, + "step": 275000 + }, + { + "epoch": 15.35, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 275500 + }, + { + "epoch": 15.38, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 276000 + }, + { + "epoch": 15.41, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 276500 + }, + { + "epoch": 15.44, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 277000 + }, + { + "epoch": 15.46, + "learning_rate": 0.0002, + "loss": 1.582, + "step": 277500 + }, + { + "epoch": 15.49, + "learning_rate": 0.0002, + "loss": 1.5845, + "step": 278000 + }, + { + "epoch": 15.52, + "learning_rate": 0.0002, + "loss": 1.5911, + "step": 278500 + }, + { + "epoch": 15.55, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 279000 + }, + { + "epoch": 15.58, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 279500 + }, + { + "epoch": 15.6, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 280000 + }, + { + "epoch": 15.63, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 280500 + }, + { + "epoch": 15.66, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 281000 + }, + { + "epoch": 15.69, + "learning_rate": 0.0002, + "loss": 1.5856, + "step": 281500 + }, + { + "epoch": 15.71, + "learning_rate": 0.0002, + "loss": 1.5882, + "step": 282000 + }, + { + "epoch": 15.74, + "learning_rate": 0.0002, + "loss": 1.5849, + "step": 282500 + }, + { + "epoch": 15.77, + "learning_rate": 0.0002, + "loss": 1.5887, + "step": 283000 + }, + { + "epoch": 15.8, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 283500 + }, + { + "epoch": 15.83, + "learning_rate": 0.0002, + "loss": 1.5852, + "step": 284000 + }, + { + "epoch": 15.85, + "learning_rate": 0.0002, + "loss": 1.5997, + "step": 284500 + }, + { + "epoch": 15.88, + "learning_rate": 0.0002, + "loss": 1.596, + "step": 285000 + }, + { + "epoch": 15.91, + "learning_rate": 0.0002, + "loss": 1.5869, + "step": 285500 + }, + { + "epoch": 15.94, + "learning_rate": 0.0002, + "loss": 1.5904, + "step": 286000 + }, + { + "epoch": 15.97, + "learning_rate": 0.0002, + "loss": 1.586, + "step": 286500 + }, + { + "epoch": 15.99, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 287000 + }, + { + "epoch": 16.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.5896196365356445, + "eval_rouge1": 24.6479, + "eval_rouge2": 11.9044, + "eval_rougeL": 20.357, + "eval_rougeLsum": 23.2246, + "eval_runtime": 457.4929, + "eval_samples_per_second": 29.22, + "eval_steps_per_second": 1.827, + "step": 287120 + }, + { + "epoch": 16.02, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 287500 + }, + { + "epoch": 16.05, + "learning_rate": 0.0002, + "loss": 1.5579, + "step": 288000 + }, + { + "epoch": 16.08, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 288500 + }, + { + "epoch": 16.1, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 289000 + }, + { + "epoch": 16.13, + "learning_rate": 0.0002, + "loss": 1.5732, + "step": 289500 + }, + { + "epoch": 16.16, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 290000 + }, + { + "epoch": 16.19, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 290500 + }, + { + "epoch": 16.22, + "learning_rate": 0.0002, + "loss": 1.5858, + "step": 291000 + }, + { + "epoch": 16.24, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 291500 + }, + { + "epoch": 16.27, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 292000 + }, + { + "epoch": 16.3, + "learning_rate": 0.0002, + "loss": 1.5711, + "step": 292500 + }, + { + "epoch": 16.33, + "learning_rate": 0.0002, + "loss": 1.5695, + "step": 293000 + }, + { + "epoch": 16.36, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 293500 + }, + { + "epoch": 16.38, + "learning_rate": 0.0002, + "loss": 1.58, + "step": 294000 + }, + { + "epoch": 16.41, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 294500 + }, + { + "epoch": 16.44, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 295000 + }, + { + "epoch": 16.47, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 295500 + }, + { + "epoch": 16.49, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 296000 + }, + { + "epoch": 16.52, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 296500 + }, + { + "epoch": 16.55, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 297000 + }, + { + "epoch": 16.58, + "learning_rate": 0.0002, + "loss": 1.5735, + "step": 297500 + }, + { + "epoch": 16.61, + "learning_rate": 0.0002, + "loss": 1.5772, + "step": 298000 + }, + { + "epoch": 16.63, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 298500 + }, + { + "epoch": 16.66, + "learning_rate": 0.0002, + "loss": 1.5749, + "step": 299000 + }, + { + "epoch": 16.69, + "learning_rate": 0.0002, + "loss": 1.5837, + "step": 299500 + }, + { + "epoch": 16.72, + "learning_rate": 0.0002, + "loss": 1.5848, + "step": 300000 + }, + { + "epoch": 16.75, + "learning_rate": 0.0002, + "loss": 1.5859, + "step": 300500 + }, + { + "epoch": 16.77, + "learning_rate": 0.0002, + "loss": 1.5755, + "step": 301000 + }, + { + "epoch": 16.8, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 301500 + }, + { + "epoch": 16.83, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 302000 + }, + { + "epoch": 16.86, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 302500 + }, + { + "epoch": 16.88, + "learning_rate": 0.0002, + "loss": 1.5935, + "step": 303000 + }, + { + "epoch": 16.91, + "learning_rate": 0.0002, + "loss": 1.5891, + "step": 303500 + }, + { + "epoch": 16.94, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 304000 + }, + { + "epoch": 16.97, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 304500 + }, + { + "epoch": 17.0, + "learning_rate": 0.0002, + "loss": 1.5963, + "step": 305000 + }, + { + "epoch": 17.0, + "eval_gen_len": 18.99985038898863, + "eval_loss": 1.5875639915466309, + "eval_rouge1": 24.8499, + "eval_rouge2": 12.0533, + "eval_rougeL": 20.5578, + "eval_rougeLsum": 23.4369, + "eval_runtime": 454.7479, + "eval_samples_per_second": 29.397, + "eval_steps_per_second": 1.838, + "step": 305065 + }, + { + "epoch": 17.02, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 305500 + }, + { + "epoch": 17.05, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 306000 + }, + { + "epoch": 17.08, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 306500 + }, + { + "epoch": 17.11, + "learning_rate": 0.0002, + "loss": 1.5575, + "step": 307000 + }, + { + "epoch": 17.14, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 307500 + }, + { + "epoch": 17.16, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 308000 + }, + { + "epoch": 17.19, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 308500 + }, + { + "epoch": 17.22, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 309000 + }, + { + "epoch": 17.25, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 309500 + }, + { + "epoch": 17.28, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 310000 + }, + { + "epoch": 17.3, + "learning_rate": 0.0002, + "loss": 1.566, + "step": 310500 + }, + { + "epoch": 17.33, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 311000 + }, + { + "epoch": 17.36, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 311500 + }, + { + "epoch": 17.39, + "learning_rate": 0.0002, + "loss": 1.5685, + "step": 312000 + }, + { + "epoch": 17.41, + "learning_rate": 0.0002, + "loss": 1.5675, + "step": 312500 + }, + { + "epoch": 17.44, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 313000 + }, + { + "epoch": 17.47, + "learning_rate": 0.0002, + "loss": 1.5784, + "step": 313500 + }, + { + "epoch": 17.5, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 314000 + }, + { + "epoch": 17.53, + "learning_rate": 0.0002, + "loss": 1.5636, + "step": 314500 + }, + { + "epoch": 17.55, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 315000 + }, + { + "epoch": 17.58, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 315500 + }, + { + "epoch": 17.61, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 316000 + }, + { + "epoch": 17.64, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 316500 + }, + { + "epoch": 17.67, + "learning_rate": 0.0002, + "loss": 1.5717, + "step": 317000 + }, + { + "epoch": 17.69, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 317500 + }, + { + "epoch": 17.72, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 318000 + }, + { + "epoch": 17.75, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 318500 + }, + { + "epoch": 17.78, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 319000 + }, + { + "epoch": 17.8, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 319500 + }, + { + "epoch": 17.83, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 320000 + }, + { + "epoch": 17.86, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 320500 + }, + { + "epoch": 17.89, + "learning_rate": 0.0002, + "loss": 1.5791, + "step": 321000 + }, + { + "epoch": 17.92, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 321500 + }, + { + "epoch": 17.94, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 322000 + }, + { + "epoch": 17.97, + "learning_rate": 0.0002, + "loss": 1.5777, + "step": 322500 + }, + { + "epoch": 18.0, + "learning_rate": 0.0002, + "loss": 1.5884, + "step": 323000 + }, + { + "epoch": 18.0, + "eval_gen_len": 18.999625972471573, + "eval_loss": 1.5868136882781982, + "eval_rouge1": 24.8052, + "eval_rouge2": 12.0432, + "eval_rougeL": 20.5093, + "eval_rougeLsum": 23.3853, + "eval_runtime": 453.6378, + "eval_samples_per_second": 29.468, + "eval_steps_per_second": 1.843, + "step": 323010 + }, + { + "epoch": 18.03, + "learning_rate": 0.0002, + "loss": 1.5496, + "step": 323500 + }, + { + "epoch": 18.06, + "learning_rate": 0.0002, + "loss": 1.5482, + "step": 324000 + }, + { + "epoch": 18.08, + "learning_rate": 0.0002, + "loss": 1.5439, + "step": 324500 + }, + { + "epoch": 18.11, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 325000 + }, + { + "epoch": 18.14, + "learning_rate": 0.0002, + "loss": 1.5523, + "step": 325500 + }, + { + "epoch": 18.17, + "learning_rate": 0.0002, + "loss": 1.5683, + "step": 326000 + }, + { + "epoch": 18.19, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 326500 + }, + { + "epoch": 18.22, + "learning_rate": 0.0002, + "loss": 1.5485, + "step": 327000 + }, + { + "epoch": 18.25, + "learning_rate": 0.0002, + "loss": 1.5628, + "step": 327500 + }, + { + "epoch": 18.28, + "learning_rate": 0.0002, + "loss": 1.5538, + "step": 328000 + }, + { + "epoch": 18.31, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 328500 + }, + { + "epoch": 18.33, + "learning_rate": 0.0002, + "loss": 1.5614, + "step": 329000 + }, + { + "epoch": 18.36, + "learning_rate": 0.0002, + "loss": 1.5634, + "step": 329500 + }, + { + "epoch": 18.39, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 330000 + }, + { + "epoch": 18.42, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 330500 + }, + { + "epoch": 18.45, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 331000 + }, + { + "epoch": 18.47, + "learning_rate": 0.0002, + "loss": 1.5603, + "step": 331500 + }, + { + "epoch": 18.5, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 332000 + }, + { + "epoch": 18.53, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 332500 + }, + { + "epoch": 18.56, + "learning_rate": 0.0002, + "loss": 1.5603, + "step": 333000 + }, + { + "epoch": 18.58, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 333500 + }, + { + "epoch": 18.61, + "learning_rate": 0.0002, + "loss": 1.572, + "step": 334000 + }, + { + "epoch": 18.64, + "learning_rate": 0.0002, + "loss": 1.5637, + "step": 334500 + }, + { + "epoch": 18.67, + "learning_rate": 0.0002, + "loss": 1.5703, + "step": 335000 + }, + { + "epoch": 18.7, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 335500 + }, + { + "epoch": 18.72, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 336000 + }, + { + "epoch": 18.75, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 336500 + }, + { + "epoch": 18.78, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 337000 + }, + { + "epoch": 18.81, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 337500 + }, + { + "epoch": 18.84, + "learning_rate": 0.0002, + "loss": 1.556, + "step": 338000 + }, + { + "epoch": 18.86, + "learning_rate": 0.0002, + "loss": 1.5865, + "step": 338500 + }, + { + "epoch": 18.89, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 339000 + }, + { + "epoch": 18.92, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 339500 + }, + { + "epoch": 18.95, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 340000 + }, + { + "epoch": 18.97, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 340500 + }, + { + "epoch": 19.0, + "eval_gen_len": 18.999625972471573, + "eval_loss": 1.6005514860153198, + "eval_rouge1": 24.7356, + "eval_rouge2": 12.0038, + "eval_rougeL": 20.4853, + "eval_rougeLsum": 23.353, + "eval_runtime": 454.2953, + "eval_samples_per_second": 29.426, + "eval_steps_per_second": 1.84, + "step": 340955 + }, + { + "epoch": 19.0, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 341000 + }, + { + "epoch": 19.03, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 341500 + }, + { + "epoch": 19.06, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 342000 + }, + { + "epoch": 19.09, + "learning_rate": 0.0002, + "loss": 1.5496, + "step": 342500 + }, + { + "epoch": 19.11, + "learning_rate": 0.0002, + "loss": 1.5376, + "step": 343000 + }, + { + "epoch": 19.14, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 343500 + }, + { + "epoch": 19.17, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 344000 + }, + { + "epoch": 19.2, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 344500 + }, + { + "epoch": 19.23, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 345000 + }, + { + "epoch": 19.25, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 345500 + }, + { + "epoch": 19.28, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 346000 + }, + { + "epoch": 19.31, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 346500 + }, + { + "epoch": 19.34, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 347000 + }, + { + "epoch": 19.36, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 347500 + }, + { + "epoch": 19.39, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 348000 + }, + { + "epoch": 19.42, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 348500 + }, + { + "epoch": 19.45, + "learning_rate": 0.0002, + "loss": 1.5558, + "step": 349000 + }, + { + "epoch": 19.48, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 349500 + }, + { + "epoch": 19.5, + "learning_rate": 0.0002, + "loss": 1.5482, + "step": 350000 + }, + { + "epoch": 19.53, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 350500 + }, + { + "epoch": 19.56, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 351000 + }, + { + "epoch": 19.59, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 351500 + }, + { + "epoch": 19.62, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 352000 + }, + { + "epoch": 19.64, + "learning_rate": 0.0002, + "loss": 1.5504, + "step": 352500 + }, + { + "epoch": 19.67, + "learning_rate": 0.0002, + "loss": 1.5645, + "step": 353000 + }, + { + "epoch": 19.7, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 353500 + }, + { + "epoch": 19.73, + "learning_rate": 0.0002, + "loss": 1.5597, + "step": 354000 + }, + { + "epoch": 19.75, + "learning_rate": 0.0002, + "loss": 1.5644, + "step": 354500 + }, + { + "epoch": 19.78, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 355000 + }, + { + "epoch": 19.81, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 355500 + }, + { + "epoch": 19.84, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 356000 + }, + { + "epoch": 19.87, + "learning_rate": 0.0002, + "loss": 1.5699, + "step": 356500 + }, + { + "epoch": 19.89, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 357000 + }, + { + "epoch": 19.92, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 357500 + }, + { + "epoch": 19.95, + "learning_rate": 0.0002, + "loss": 1.57, + "step": 358000 + }, + { + "epoch": 19.98, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 358500 + }, + { + "epoch": 20.0, + "eval_gen_len": 18.999925194494313, + "eval_loss": 1.5914615392684937, + "eval_rouge1": 24.7644, + "eval_rouge2": 12.0064, + "eval_rougeL": 20.5076, + "eval_rougeLsum": 23.3719, + "eval_runtime": 453.4878, + "eval_samples_per_second": 29.478, + "eval_steps_per_second": 1.843, + "step": 358900 + }, + { + "epoch": 20.01, + "learning_rate": 0.0002, + "loss": 1.5598, + "step": 359000 + }, + { + "epoch": 20.03, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 359500 + }, + { + "epoch": 20.06, + "learning_rate": 0.0002, + "loss": 1.5369, + "step": 360000 + }, + { + "epoch": 20.09, + "learning_rate": 0.0002, + "loss": 1.5283, + "step": 360500 + }, + { + "epoch": 20.12, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 361000 + }, + { + "epoch": 20.14, + "learning_rate": 0.0002, + "loss": 1.5405, + "step": 361500 + }, + { + "epoch": 20.17, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 362000 + }, + { + "epoch": 20.2, + "learning_rate": 0.0002, + "loss": 1.5469, + "step": 362500 + }, + { + "epoch": 20.23, + "learning_rate": 0.0002, + "loss": 1.5402, + "step": 363000 + }, + { + "epoch": 20.26, + "learning_rate": 0.0002, + "loss": 1.5235, + "step": 363500 + }, + { + "epoch": 20.28, + "learning_rate": 0.0002, + "loss": 1.5473, + "step": 364000 + }, + { + "epoch": 20.31, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 364500 + }, + { + "epoch": 20.34, + "learning_rate": 0.0002, + "loss": 1.5541, + "step": 365000 + }, + { + "epoch": 20.37, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 365500 + }, + { + "epoch": 20.4, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 366000 + }, + { + "epoch": 20.42, + "learning_rate": 0.0002, + "loss": 1.5525, + "step": 366500 + }, + { + "epoch": 20.45, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 367000 + }, + { + "epoch": 20.48, + "learning_rate": 0.0002, + "loss": 1.5486, + "step": 367500 + }, + { + "epoch": 20.51, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 368000 + }, + { + "epoch": 20.53, + "learning_rate": 0.0002, + "loss": 1.5496, + "step": 368500 + }, + { + "epoch": 20.56, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 369000 + }, + { + "epoch": 20.59, + "learning_rate": 0.0002, + "loss": 1.5437, + "step": 369500 + }, + { + "epoch": 20.62, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 370000 + }, + { + "epoch": 20.65, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 370500 + }, + { + "epoch": 20.67, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 371000 + }, + { + "epoch": 20.7, + "learning_rate": 0.0002, + "loss": 1.5604, + "step": 371500 + }, + { + "epoch": 20.73, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 372000 + }, + { + "epoch": 20.76, + "learning_rate": 0.0002, + "loss": 1.5557, + "step": 372500 + }, + { + "epoch": 20.79, + "learning_rate": 0.0002, + "loss": 1.5617, + "step": 373000 + }, + { + "epoch": 20.81, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 373500 + }, + { + "epoch": 20.84, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 374000 + }, + { + "epoch": 20.87, + "learning_rate": 0.0002, + "loss": 1.5514, + "step": 374500 + }, + { + "epoch": 20.9, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 375000 + }, + { + "epoch": 20.93, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 375500 + }, + { + "epoch": 20.95, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 376000 + }, + { + "epoch": 20.98, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 376500 + }, + { + "epoch": 21.0, + "eval_gen_len": 18.999625972471573, + "eval_loss": 1.5916765928268433, + "eval_rouge1": 24.7552, + "eval_rouge2": 11.9691, + "eval_rougeL": 20.4797, + "eval_rougeLsum": 23.3411, + "eval_runtime": 454.0277, + "eval_samples_per_second": 29.443, + "eval_steps_per_second": 1.841, + "step": 376845 + }, + { + "epoch": 21.01, + "learning_rate": 0.0002, + "loss": 1.546, + "step": 377000 + }, + { + "epoch": 21.04, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 377500 + }, + { + "epoch": 21.06, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 378000 + }, + { + "epoch": 21.09, + "learning_rate": 0.0002, + "loss": 1.5358, + "step": 378500 + }, + { + "epoch": 21.12, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 379000 + }, + { + "epoch": 21.15, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 379500 + }, + { + "epoch": 21.18, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 380000 + }, + { + "epoch": 21.2, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 380500 + }, + { + "epoch": 21.23, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 381000 + }, + { + "epoch": 21.26, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 381500 + }, + { + "epoch": 21.29, + "learning_rate": 0.0002, + "loss": 1.5272, + "step": 382000 + }, + { + "epoch": 21.32, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 382500 + }, + { + "epoch": 21.34, + "learning_rate": 0.0002, + "loss": 1.5336, + "step": 383000 + }, + { + "epoch": 21.37, + "learning_rate": 0.0002, + "loss": 1.5346, + "step": 383500 + }, + { + "epoch": 21.4, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 384000 + }, + { + "epoch": 21.43, + "learning_rate": 0.0002, + "loss": 1.5468, + "step": 384500 + }, + { + "epoch": 21.45, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 385000 + }, + { + "epoch": 21.48, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 385500 + }, + { + "epoch": 21.51, + "learning_rate": 0.0002, + "loss": 1.5502, + "step": 386000 + }, + { + "epoch": 21.54, + "learning_rate": 0.0002, + "loss": 1.5437, + "step": 386500 + }, + { + "epoch": 21.57, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 387000 + }, + { + "epoch": 21.59, + "learning_rate": 0.0002, + "loss": 1.5439, + "step": 387500 + }, + { + "epoch": 21.62, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 388000 + }, + { + "epoch": 21.65, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 388500 + }, + { + "epoch": 21.68, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 389000 + }, + { + "epoch": 21.71, + "learning_rate": 0.0002, + "loss": 1.5509, + "step": 389500 + }, + { + "epoch": 21.73, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 390000 + }, + { + "epoch": 21.76, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 390500 + }, + { + "epoch": 21.79, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 391000 + }, + { + "epoch": 21.82, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 391500 + }, + { + "epoch": 21.84, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 392000 + }, + { + "epoch": 21.87, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 392500 + }, + { + "epoch": 21.9, + "learning_rate": 0.0002, + "loss": 1.5479, + "step": 393000 + }, + { + "epoch": 21.93, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 393500 + }, + { + "epoch": 21.96, + "learning_rate": 0.0002, + "loss": 1.5565, + "step": 394000 + }, + { + "epoch": 21.98, + "learning_rate": 0.0002, + "loss": 1.5635, + "step": 394500 + }, + { + "epoch": 22.0, + "eval_gen_len": 18.99985038898863, + "eval_loss": 1.5907750129699707, + "eval_rouge1": 24.8457, + "eval_rouge2": 12.0955, + "eval_rougeL": 20.5316, + "eval_rougeLsum": 23.433, + "eval_runtime": 454.9481, + "eval_samples_per_second": 29.384, + "eval_steps_per_second": 1.838, + "step": 394790 + }, + { + "epoch": 22.01, + "learning_rate": 0.0002, + "loss": 1.5393, + "step": 395000 + }, + { + "epoch": 22.04, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 395500 + }, + { + "epoch": 22.07, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 396000 + }, + { + "epoch": 22.1, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 396500 + }, + { + "epoch": 22.12, + "learning_rate": 0.0002, + "loss": 1.5217, + "step": 397000 + }, + { + "epoch": 22.15, + "learning_rate": 0.0002, + "loss": 1.5239, + "step": 397500 + }, + { + "epoch": 22.18, + "learning_rate": 0.0002, + "loss": 1.5382, + "step": 398000 + }, + { + "epoch": 22.21, + "learning_rate": 0.0002, + "loss": 1.5273, + "step": 398500 + }, + { + "epoch": 22.23, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 399000 + }, + { + "epoch": 22.26, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 399500 + }, + { + "epoch": 22.29, + "learning_rate": 0.0002, + "loss": 1.5326, + "step": 400000 + }, + { + "epoch": 22.32, + "learning_rate": 0.0002, + "loss": 1.5331, + "step": 400500 + }, + { + "epoch": 22.35, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 401000 + }, + { + "epoch": 22.37, + "learning_rate": 0.0002, + "loss": 1.5401, + "step": 401500 + }, + { + "epoch": 22.4, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 402000 + }, + { + "epoch": 22.43, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 402500 + }, + { + "epoch": 22.46, + "learning_rate": 0.0002, + "loss": 1.533, + "step": 403000 + }, + { + "epoch": 22.49, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 403500 + }, + { + "epoch": 22.51, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 404000 + }, + { + "epoch": 22.54, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 404500 + }, + { + "epoch": 22.57, + "learning_rate": 0.0002, + "loss": 1.5302, + "step": 405000 + }, + { + "epoch": 22.6, + "learning_rate": 0.0002, + "loss": 1.532, + "step": 405500 + }, + { + "epoch": 22.62, + "learning_rate": 0.0002, + "loss": 1.5374, + "step": 406000 + }, + { + "epoch": 22.65, + "learning_rate": 0.0002, + "loss": 1.5357, + "step": 406500 + }, + { + "epoch": 22.68, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 407000 + }, + { + "epoch": 22.71, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 407500 + }, + { + "epoch": 22.74, + "learning_rate": 0.0002, + "loss": 1.5396, + "step": 408000 + }, + { + "epoch": 22.76, + "learning_rate": 0.0002, + "loss": 1.5419, + "step": 408500 + }, + { + "epoch": 22.79, + "learning_rate": 0.0002, + "loss": 1.54, + "step": 409000 + }, + { + "epoch": 22.82, + "learning_rate": 0.0002, + "loss": 1.5392, + "step": 409500 + }, + { + "epoch": 22.85, + "learning_rate": 0.0002, + "loss": 1.5498, + "step": 410000 + }, + { + "epoch": 22.88, + "learning_rate": 0.0002, + "loss": 1.5403, + "step": 410500 + }, + { + "epoch": 22.9, + "learning_rate": 0.0002, + "loss": 1.5558, + "step": 411000 + }, + { + "epoch": 22.93, + "learning_rate": 0.0002, + "loss": 1.542, + "step": 411500 + }, + { + "epoch": 22.96, + "learning_rate": 0.0002, + "loss": 1.5365, + "step": 412000 + }, + { + "epoch": 22.99, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 412500 + }, + { + "epoch": 23.0, + "eval_gen_len": 18.99985038898863, + "eval_loss": 1.6024245023727417, + "eval_rouge1": 24.8366, + "eval_rouge2": 12.0534, + "eval_rougeL": 20.5687, + "eval_rougeLsum": 23.4081, + "eval_runtime": 454.875, + "eval_samples_per_second": 29.388, + "eval_steps_per_second": 1.838, + "step": 412735 + }, + { + "epoch": 23.01, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 413000 + }, + { + "epoch": 23.04, + "learning_rate": 0.0002, + "loss": 1.522, + "step": 413500 + }, + { + "epoch": 23.07, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 414000 + }, + { + "epoch": 23.1, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 414500 + }, + { + "epoch": 23.13, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 415000 + }, + { + "epoch": 23.15, + "learning_rate": 0.0002, + "loss": 1.5331, + "step": 415500 + }, + { + "epoch": 23.18, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 416000 + }, + { + "epoch": 23.21, + "learning_rate": 0.0002, + "loss": 1.5195, + "step": 416500 + }, + { + "epoch": 23.24, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 417000 + }, + { + "epoch": 23.27, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 417500 + }, + { + "epoch": 23.29, + "learning_rate": 0.0002, + "loss": 1.5315, + "step": 418000 + }, + { + "epoch": 23.32, + "learning_rate": 0.0002, + "loss": 1.5217, + "step": 418500 + }, + { + "epoch": 23.35, + "learning_rate": 0.0002, + "loss": 1.5133, + "step": 419000 + }, + { + "epoch": 23.38, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 419500 + }, + { + "epoch": 23.4, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 420000 + }, + { + "epoch": 23.43, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 420500 + }, + { + "epoch": 23.46, + "learning_rate": 0.0002, + "loss": 1.5287, + "step": 421000 + }, + { + "epoch": 23.49, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 421500 + }, + { + "epoch": 23.52, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 422000 + }, + { + "epoch": 23.54, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 422500 + }, + { + "epoch": 23.57, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 423000 + }, + { + "epoch": 23.6, + "learning_rate": 0.0002, + "loss": 1.5316, + "step": 423500 + }, + { + "epoch": 23.63, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 424000 + }, + { + "epoch": 23.66, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 424500 + }, + { + "epoch": 23.68, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 425000 + }, + { + "epoch": 23.71, + "learning_rate": 0.0002, + "loss": 1.5291, + "step": 425500 + }, + { + "epoch": 23.74, + "learning_rate": 0.0002, + "loss": 1.5288, + "step": 426000 + }, + { + "epoch": 23.77, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 426500 + }, + { + "epoch": 23.79, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 427000 + }, + { + "epoch": 23.82, + "learning_rate": 0.0002, + "loss": 1.5279, + "step": 427500 + }, + { + "epoch": 23.85, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 428000 + }, + { + "epoch": 23.88, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 428500 + }, + { + "epoch": 23.91, + "learning_rate": 0.0002, + "loss": 1.5385, + "step": 429000 + }, + { + "epoch": 23.93, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 429500 + }, + { + "epoch": 23.96, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 430000 + }, + { + "epoch": 23.99, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 430500 + }, + { + "epoch": 24.0, + "eval_gen_len": 18.999251944943147, + "eval_loss": 1.5907658338546753, + "eval_rouge1": 24.8365, + "eval_rouge2": 12.0916, + "eval_rougeL": 20.5302, + "eval_rougeLsum": 23.4212, + "eval_runtime": 455.0271, + "eval_samples_per_second": 29.378, + "eval_steps_per_second": 1.837, + "step": 430680 + }, + { + "epoch": 24.02, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 431000 + }, + { + "epoch": 24.05, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 431500 + }, + { + "epoch": 24.07, + "learning_rate": 0.0002, + "loss": 1.5128, + "step": 432000 + }, + { + "epoch": 24.1, + "learning_rate": 0.0002, + "loss": 1.5011, + "step": 432500 + }, + { + "epoch": 24.13, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 433000 + }, + { + "epoch": 24.16, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 433500 + }, + { + "epoch": 24.19, + "learning_rate": 0.0002, + "loss": 1.5026, + "step": 434000 + }, + { + "epoch": 24.21, + "learning_rate": 0.0002, + "loss": 1.5027, + "step": 434500 + }, + { + "epoch": 24.24, + "learning_rate": 0.0002, + "loss": 1.5328, + "step": 435000 + }, + { + "epoch": 24.27, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 435500 + }, + { + "epoch": 24.3, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 436000 + }, + { + "epoch": 24.32, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 436500 + }, + { + "epoch": 24.35, + "learning_rate": 0.0002, + "loss": 1.5198, + "step": 437000 + }, + { + "epoch": 24.38, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 437500 + }, + { + "epoch": 24.41, + "learning_rate": 0.0002, + "loss": 1.5192, + "step": 438000 + }, + { + "epoch": 24.44, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 438500 + }, + { + "epoch": 24.46, + "learning_rate": 0.0002, + "loss": 1.5263, + "step": 439000 + }, + { + "epoch": 24.49, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 439500 + }, + { + "epoch": 24.52, + "learning_rate": 0.0002, + "loss": 1.5164, + "step": 440000 + }, + { + "epoch": 24.55, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 440500 + }, + { + "epoch": 24.58, + "learning_rate": 0.0002, + "loss": 1.5256, + "step": 441000 + }, + { + "epoch": 24.6, + "learning_rate": 0.0002, + "loss": 1.5432, + "step": 441500 + }, + { + "epoch": 24.63, + "learning_rate": 0.0002, + "loss": 1.5136, + "step": 442000 + }, + { + "epoch": 24.66, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 442500 + }, + { + "epoch": 24.69, + "learning_rate": 0.0002, + "loss": 1.5325, + "step": 443000 + }, + { + "epoch": 24.71, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 443500 + }, + { + "epoch": 24.74, + "learning_rate": 0.0002, + "loss": 1.521, + "step": 444000 + }, + { + "epoch": 24.77, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 444500 + }, + { + "epoch": 24.8, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 445000 + }, + { + "epoch": 24.83, + "learning_rate": 0.0002, + "loss": 1.5413, + "step": 445500 + }, + { + "epoch": 24.85, + "learning_rate": 0.0002, + "loss": 1.5301, + "step": 446000 + }, + { + "epoch": 24.88, + "learning_rate": 0.0002, + "loss": 1.5415, + "step": 446500 + }, + { + "epoch": 24.91, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 447000 + }, + { + "epoch": 24.94, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 447500 + }, + { + "epoch": 24.97, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 448000 + }, + { + "epoch": 24.99, + "learning_rate": 0.0002, + "loss": 1.531, + "step": 448500 + }, + { + "epoch": 25.0, + "eval_gen_len": 19.0, + "eval_loss": 1.6025395393371582, + "eval_rouge1": 24.7463, + "eval_rouge2": 11.985, + "eval_rougeL": 20.4706, + "eval_rougeLsum": 23.2953, + "eval_runtime": 452.1354, + "eval_samples_per_second": 29.566, + "eval_steps_per_second": 1.849, + "step": 448625 + }, + { + "epoch": 25.02, + "learning_rate": 0.0002, + "loss": 1.5156, + "step": 449000 + }, + { + "epoch": 25.05, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 449500 + }, + { + "epoch": 25.08, + "learning_rate": 0.0002, + "loss": 1.5078, + "step": 450000 + }, + { + "epoch": 25.1, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 450500 + }, + { + "epoch": 25.13, + "learning_rate": 0.0002, + "loss": 1.5121, + "step": 451000 + }, + { + "epoch": 25.16, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 451500 + }, + { + "epoch": 25.19, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 452000 + }, + { + "epoch": 25.22, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 452500 + }, + { + "epoch": 25.24, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 453000 + }, + { + "epoch": 25.27, + "learning_rate": 0.0002, + "loss": 1.5154, + "step": 453500 + }, + { + "epoch": 25.3, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 454000 + }, + { + "epoch": 25.33, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 454500 + }, + { + "epoch": 25.36, + "learning_rate": 0.0002, + "loss": 1.5261, + "step": 455000 + }, + { + "epoch": 25.38, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 455500 + }, + { + "epoch": 25.41, + "learning_rate": 0.0002, + "loss": 1.5124, + "step": 456000 + }, + { + "epoch": 25.44, + "learning_rate": 0.0002, + "loss": 1.517, + "step": 456500 + }, + { + "epoch": 25.47, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 457000 + }, + { + "epoch": 25.49, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 457500 + }, + { + "epoch": 25.52, + "learning_rate": 0.0002, + "loss": 1.5125, + "step": 458000 + }, + { + "epoch": 25.55, + "learning_rate": 0.0002, + "loss": 1.5133, + "step": 458500 + }, + { + "epoch": 25.58, + "learning_rate": 0.0002, + "loss": 1.5173, + "step": 459000 + }, + { + "epoch": 25.61, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 459500 + }, + { + "epoch": 25.63, + "learning_rate": 0.0002, + "loss": 1.5171, + "step": 460000 + }, + { + "epoch": 25.66, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 460500 + }, + { + "epoch": 25.69, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 461000 + }, + { + "epoch": 25.72, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 461500 + }, + { + "epoch": 25.75, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 462000 + }, + { + "epoch": 25.77, + "learning_rate": 0.0002, + "loss": 1.5194, + "step": 462500 + }, + { + "epoch": 25.8, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 463000 + }, + { + "epoch": 25.83, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 463500 + }, + { + "epoch": 25.86, + "learning_rate": 0.0002, + "loss": 1.5206, + "step": 464000 + }, + { + "epoch": 25.88, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 464500 + }, + { + "epoch": 25.91, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 465000 + }, + { + "epoch": 25.94, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 465500 + }, + { + "epoch": 25.97, + "learning_rate": 0.0002, + "loss": 1.5248, + "step": 466000 + }, + { + "epoch": 26.0, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 466500 + }, + { + "epoch": 26.0, + "eval_gen_len": 19.0, + "eval_loss": 1.5949018001556396, + "eval_rouge1": 24.9535, + "eval_rouge2": 12.0763, + "eval_rougeL": 20.6432, + "eval_rougeLsum": 23.5531, + "eval_runtime": 451.3411, + "eval_samples_per_second": 29.618, + "eval_steps_per_second": 1.852, + "step": 466570 + } + ], + "max_steps": 538350, + "num_train_epochs": 30, + "total_flos": 2.0206105478981222e+18, + "trial_name": null, + "trial_params": null +}