diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31279 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 39.993726474278546, + "global_step": 510000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0005, + "loss": 2.9557, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0005, + "loss": 2.8658, + "step": 200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0005, + "loss": 2.8174, + "step": 300 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 2.7738, + "step": 400 + }, + { + "epoch": 0.04, + "learning_rate": 0.0005, + "loss": 2.8055, + "step": 500 + }, + { + "epoch": 0.05, + "learning_rate": 0.0005, + "loss": 2.7804, + "step": 600 + }, + { + "epoch": 0.05, + "learning_rate": 0.0005, + "loss": 2.7898, + "step": 700 + }, + { + "epoch": 0.06, + "learning_rate": 0.0005, + "loss": 2.7531, + "step": 800 + }, + { + "epoch": 0.07, + "learning_rate": 0.0005, + "loss": 2.7081, + "step": 900 + }, + { + "epoch": 0.08, + "learning_rate": 0.0005, + "loss": 2.7108, + "step": 1000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0005, + "loss": 2.7372, + "step": 1100 + }, + { + "epoch": 0.09, + "learning_rate": 0.0005, + "loss": 2.7245, + "step": 1200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0005, + "loss": 2.718, + "step": 1300 + }, + { + "epoch": 0.11, + "learning_rate": 0.0005, + "loss": 2.71, + "step": 1400 + }, + { + "epoch": 0.12, + "learning_rate": 0.0005, + "loss": 2.6816, + "step": 1500 + }, + { + "epoch": 0.13, + "learning_rate": 0.0005, + "loss": 2.7044, + "step": 1600 + }, + { + "epoch": 0.13, + "learning_rate": 0.0005, + "loss": 2.6867, + "step": 1700 + }, + { + "epoch": 0.14, + "learning_rate": 0.0005, + "loss": 2.7035, + "step": 1800 + }, + { + "epoch": 0.15, + "learning_rate": 0.0005, + "loss": 2.6853, + "step": 1900 + }, + { + "epoch": 0.16, + "learning_rate": 0.0005, + "loss": 2.6772, + "step": 2000 + }, + { + "epoch": 0.16, + "learning_rate": 0.0005, + "loss": 2.6647, + "step": 2100 + }, + { + "epoch": 0.17, + "learning_rate": 0.0005, + "loss": 2.6923, + "step": 2200 + }, + { + "epoch": 0.18, + "learning_rate": 0.0005, + "loss": 2.6514, + "step": 2300 + }, + { + "epoch": 0.19, + "learning_rate": 0.0005, + "loss": 2.6542, + "step": 2400 + }, + { + "epoch": 0.2, + "learning_rate": 0.0005, + "loss": 2.6391, + "step": 2500 + }, + { + "epoch": 0.2, + "learning_rate": 0.0005, + "loss": 2.6527, + "step": 2600 + }, + { + "epoch": 0.21, + "learning_rate": 0.0005, + "loss": 2.6658, + "step": 2700 + }, + { + "epoch": 0.22, + "learning_rate": 0.0005, + "loss": 2.6537, + "step": 2800 + }, + { + "epoch": 0.23, + "learning_rate": 0.0005, + "loss": 2.67, + "step": 2900 + }, + { + "epoch": 0.24, + "learning_rate": 0.0005, + "loss": 2.6605, + "step": 3000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0005, + "loss": 2.6516, + "step": 3100 + }, + { + "epoch": 0.25, + "learning_rate": 0.0005, + "loss": 2.6358, + "step": 3200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0005, + "loss": 2.6302, + "step": 3300 + }, + { + "epoch": 0.27, + "learning_rate": 0.0005, + "loss": 2.6007, + "step": 3400 + }, + { + "epoch": 0.27, + "learning_rate": 0.0005, + "loss": 2.6063, + "step": 3500 + }, + { + "epoch": 0.28, + "learning_rate": 0.0005, + "loss": 2.6016, + "step": 3600 + }, + { + "epoch": 0.29, + "learning_rate": 0.0005, + "loss": 2.6013, + "step": 3700 + }, + { + "epoch": 0.3, + "learning_rate": 0.0005, + "loss": 2.5946, + "step": 3800 + }, + { + "epoch": 0.31, + "learning_rate": 0.0005, + "loss": 2.6054, + "step": 3900 + }, + { + "epoch": 0.31, + "learning_rate": 0.0005, + "loss": 2.6073, + "step": 4000 + }, + { + "epoch": 0.32, + "learning_rate": 0.0005, + "loss": 2.6101, + "step": 4100 + }, + { + "epoch": 0.33, + "learning_rate": 0.0005, + "loss": 2.6027, + "step": 4200 + }, + { + "epoch": 0.34, + "learning_rate": 0.0005, + "loss": 2.5753, + "step": 4300 + }, + { + "epoch": 0.35, + "learning_rate": 0.0005, + "loss": 2.5974, + "step": 4400 + }, + { + "epoch": 0.35, + "learning_rate": 0.0005, + "loss": 2.6356, + "step": 4500 + }, + { + "epoch": 0.36, + "learning_rate": 0.0005, + "loss": 2.5751, + "step": 4600 + }, + { + "epoch": 0.37, + "learning_rate": 0.0005, + "loss": 2.5813, + "step": 4700 + }, + { + "epoch": 0.38, + "learning_rate": 0.0005, + "loss": 2.5925, + "step": 4800 + }, + { + "epoch": 0.38, + "learning_rate": 0.0005, + "loss": 2.5726, + "step": 4900 + }, + { + "epoch": 0.39, + "learning_rate": 0.0005, + "loss": 2.593, + "step": 5000 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005, + "loss": 2.5702, + "step": 5100 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005, + "loss": 2.5746, + "step": 5200 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005, + "loss": 2.5799, + "step": 5300 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005, + "loss": 2.5502, + "step": 5400 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005, + "loss": 2.5812, + "step": 5500 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005, + "loss": 2.5866, + "step": 5600 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005, + "loss": 2.5586, + "step": 5700 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005, + "loss": 2.568, + "step": 5800 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005, + "loss": 2.5671, + "step": 5900 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005, + "loss": 2.5502, + "step": 6000 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005, + "loss": 2.5526, + "step": 6100 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005, + "loss": 2.5421, + "step": 6200 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005, + "loss": 2.5462, + "step": 6300 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005, + "loss": 2.5394, + "step": 6400 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005, + "loss": 2.5563, + "step": 6500 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005, + "loss": 2.5762, + "step": 6600 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005, + "loss": 2.5232, + "step": 6700 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005, + "loss": 2.5681, + "step": 6800 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005, + "loss": 2.565, + "step": 6900 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005, + "loss": 2.516, + "step": 7000 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005, + "loss": 2.5506, + "step": 7100 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005, + "loss": 2.5251, + "step": 7200 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005, + "loss": 2.4942, + "step": 7300 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005, + "loss": 2.5034, + "step": 7400 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005, + "loss": 2.5232, + "step": 7500 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005, + "loss": 2.523, + "step": 7600 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005, + "loss": 2.5113, + "step": 7700 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005, + "loss": 2.5103, + "step": 7800 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005, + "loss": 2.512, + "step": 7900 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005, + "loss": 2.5053, + "step": 8000 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005, + "loss": 2.5327, + "step": 8100 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005, + "loss": 2.5199, + "step": 8200 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005, + "loss": 2.5373, + "step": 8300 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005, + "loss": 2.484, + "step": 8400 + }, + { + "epoch": 0.67, + "learning_rate": 0.0005, + "loss": 2.5555, + "step": 8500 + }, + { + "epoch": 0.67, + "learning_rate": 0.0005, + "loss": 2.4938, + "step": 8600 + }, + { + "epoch": 0.68, + "learning_rate": 0.0005, + "loss": 2.5342, + "step": 8700 + }, + { + "epoch": 0.69, + "learning_rate": 0.0005, + "loss": 2.4963, + "step": 8800 + }, + { + "epoch": 0.7, + "learning_rate": 0.0005, + "loss": 2.5022, + "step": 8900 + }, + { + "epoch": 0.71, + "learning_rate": 0.0005, + "loss": 2.5201, + "step": 9000 + }, + { + "epoch": 0.71, + "learning_rate": 0.0005, + "loss": 2.4868, + "step": 9100 + }, + { + "epoch": 0.72, + "learning_rate": 0.0005, + "loss": 2.4804, + "step": 9200 + }, + { + "epoch": 0.73, + "learning_rate": 0.0005, + "loss": 2.4936, + "step": 9300 + }, + { + "epoch": 0.74, + "learning_rate": 0.0005, + "loss": 2.5081, + "step": 9400 + }, + { + "epoch": 0.74, + "learning_rate": 0.0005, + "loss": 2.4851, + "step": 9500 + }, + { + "epoch": 0.75, + "learning_rate": 0.0005, + "loss": 2.498, + "step": 9600 + }, + { + "epoch": 0.76, + "learning_rate": 0.0005, + "loss": 2.4868, + "step": 9700 + }, + { + "epoch": 0.77, + "learning_rate": 0.0005, + "loss": 2.5126, + "step": 9800 + }, + { + "epoch": 0.78, + "learning_rate": 0.0005, + "loss": 2.5031, + "step": 9900 + }, + { + "epoch": 0.78, + "learning_rate": 0.0005, + "loss": 2.4989, + "step": 10000 + }, + { + "epoch": 0.78, + "eval_gen_len": 18.83164121126512, + "eval_loss": 2.2748022079467773, + "eval_rouge1": 31.1136, + "eval_rouge2": 10.0478, + "eval_rougeL": 24.9492, + "eval_rougeLsum": 24.9433, + "eval_runtime": 361.4284, + "eval_samples_per_second": 31.34, + "eval_steps_per_second": 1.959, + "step": 10000 + }, + { + "epoch": 0.79, + "learning_rate": 0.0005, + "loss": 2.4972, + "step": 10100 + }, + { + "epoch": 0.8, + "learning_rate": 0.0005, + "loss": 2.4781, + "step": 10200 + }, + { + "epoch": 0.81, + "learning_rate": 0.0005, + "loss": 2.4716, + "step": 10300 + }, + { + "epoch": 0.82, + "learning_rate": 0.0005, + "loss": 2.47, + "step": 10400 + }, + { + "epoch": 0.82, + "learning_rate": 0.0005, + "loss": 2.5001, + "step": 10500 + }, + { + "epoch": 0.83, + "learning_rate": 0.0005, + "loss": 2.5084, + "step": 10600 + }, + { + "epoch": 0.84, + "learning_rate": 0.0005, + "loss": 2.4845, + "step": 10700 + }, + { + "epoch": 0.85, + "learning_rate": 0.0005, + "loss": 2.469, + "step": 10800 + }, + { + "epoch": 0.85, + "learning_rate": 0.0005, + "loss": 2.4909, + "step": 10900 + }, + { + "epoch": 0.86, + "learning_rate": 0.0005, + "loss": 2.4785, + "step": 11000 + }, + { + "epoch": 0.87, + "learning_rate": 0.0005, + "loss": 2.4664, + "step": 11100 + }, + { + "epoch": 0.88, + "learning_rate": 0.0005, + "loss": 2.4913, + "step": 11200 + }, + { + "epoch": 0.89, + "learning_rate": 0.0005, + "loss": 2.4588, + "step": 11300 + }, + { + "epoch": 0.89, + "learning_rate": 0.0005, + "loss": 2.4712, + "step": 11400 + }, + { + "epoch": 0.9, + "learning_rate": 0.0005, + "loss": 2.4857, + "step": 11500 + }, + { + "epoch": 0.91, + "learning_rate": 0.0005, + "loss": 2.4542, + "step": 11600 + }, + { + "epoch": 0.92, + "learning_rate": 0.0005, + "loss": 2.4724, + "step": 11700 + }, + { + "epoch": 0.93, + "learning_rate": 0.0005, + "loss": 2.48, + "step": 11800 + }, + { + "epoch": 0.93, + "learning_rate": 0.0005, + "loss": 2.4975, + "step": 11900 + }, + { + "epoch": 0.94, + "learning_rate": 0.0005, + "loss": 2.4908, + "step": 12000 + }, + { + "epoch": 0.95, + "learning_rate": 0.0005, + "loss": 2.4845, + "step": 12100 + }, + { + "epoch": 0.96, + "learning_rate": 0.0005, + "loss": 2.474, + "step": 12200 + }, + { + "epoch": 0.96, + "learning_rate": 0.0005, + "loss": 2.4674, + "step": 12300 + }, + { + "epoch": 0.97, + "learning_rate": 0.0005, + "loss": 2.4571, + "step": 12400 + }, + { + "epoch": 0.98, + "learning_rate": 0.0005, + "loss": 2.4618, + "step": 12500 + }, + { + "epoch": 0.99, + "learning_rate": 0.0005, + "loss": 2.5003, + "step": 12600 + }, + { + "epoch": 1.0, + "learning_rate": 0.0005, + "loss": 2.4793, + "step": 12700 + }, + { + "epoch": 1.0, + "learning_rate": 0.0005, + "loss": 2.4271, + "step": 12800 + }, + { + "epoch": 1.01, + "learning_rate": 0.0005, + "loss": 2.372, + "step": 12900 + }, + { + "epoch": 1.02, + "learning_rate": 0.0005, + "loss": 2.3714, + "step": 13000 + }, + { + "epoch": 1.03, + "learning_rate": 0.0005, + "loss": 2.3498, + "step": 13100 + }, + { + "epoch": 1.04, + "learning_rate": 0.0005, + "loss": 2.3581, + "step": 13200 + }, + { + "epoch": 1.04, + "learning_rate": 0.0005, + "loss": 2.3774, + "step": 13300 + }, + { + "epoch": 1.05, + "learning_rate": 0.0005, + "loss": 2.3658, + "step": 13400 + }, + { + "epoch": 1.06, + "learning_rate": 0.0005, + "loss": 2.4074, + "step": 13500 + }, + { + "epoch": 1.07, + "learning_rate": 0.0005, + "loss": 2.4005, + "step": 13600 + }, + { + "epoch": 1.07, + "learning_rate": 0.0005, + "loss": 2.3915, + "step": 13700 + }, + { + "epoch": 1.08, + "learning_rate": 0.0005, + "loss": 2.3894, + "step": 13800 + }, + { + "epoch": 1.09, + "learning_rate": 0.0005, + "loss": 2.3981, + "step": 13900 + }, + { + "epoch": 1.1, + "learning_rate": 0.0005, + "loss": 2.3875, + "step": 14000 + }, + { + "epoch": 1.11, + "learning_rate": 0.0005, + "loss": 2.3823, + "step": 14100 + }, + { + "epoch": 1.11, + "learning_rate": 0.0005, + "loss": 2.3678, + "step": 14200 + }, + { + "epoch": 1.12, + "learning_rate": 0.0005, + "loss": 2.3891, + "step": 14300 + }, + { + "epoch": 1.13, + "learning_rate": 0.0005, + "loss": 2.3688, + "step": 14400 + }, + { + "epoch": 1.14, + "learning_rate": 0.0005, + "loss": 2.3524, + "step": 14500 + }, + { + "epoch": 1.14, + "learning_rate": 0.0005, + "loss": 2.3719, + "step": 14600 + }, + { + "epoch": 1.15, + "learning_rate": 0.0005, + "loss": 2.3646, + "step": 14700 + }, + { + "epoch": 1.16, + "learning_rate": 0.0005, + "loss": 2.3875, + "step": 14800 + }, + { + "epoch": 1.17, + "learning_rate": 0.0005, + "loss": 2.3717, + "step": 14900 + }, + { + "epoch": 1.18, + "learning_rate": 0.0005, + "loss": 2.3585, + "step": 15000 + }, + { + "epoch": 1.18, + "learning_rate": 0.0005, + "loss": 2.348, + "step": 15100 + }, + { + "epoch": 1.19, + "learning_rate": 0.0005, + "loss": 2.3713, + "step": 15200 + }, + { + "epoch": 1.2, + "learning_rate": 0.0005, + "loss": 2.392, + "step": 15300 + }, + { + "epoch": 1.21, + "learning_rate": 0.0005, + "loss": 2.3611, + "step": 15400 + }, + { + "epoch": 1.22, + "learning_rate": 0.0005, + "loss": 2.3781, + "step": 15500 + }, + { + "epoch": 1.22, + "learning_rate": 0.0005, + "loss": 2.3851, + "step": 15600 + }, + { + "epoch": 1.23, + "learning_rate": 0.0005, + "loss": 2.3868, + "step": 15700 + }, + { + "epoch": 1.24, + "learning_rate": 0.0005, + "loss": 2.3603, + "step": 15800 + }, + { + "epoch": 1.25, + "learning_rate": 0.0005, + "loss": 2.3947, + "step": 15900 + }, + { + "epoch": 1.25, + "learning_rate": 0.0005, + "loss": 2.3965, + "step": 16000 + }, + { + "epoch": 1.26, + "learning_rate": 0.0005, + "loss": 2.3836, + "step": 16100 + }, + { + "epoch": 1.27, + "learning_rate": 0.0005, + "loss": 2.3423, + "step": 16200 + }, + { + "epoch": 1.28, + "learning_rate": 0.0005, + "loss": 2.3669, + "step": 16300 + }, + { + "epoch": 1.29, + "learning_rate": 0.0005, + "loss": 2.3748, + "step": 16400 + }, + { + "epoch": 1.29, + "learning_rate": 0.0005, + "loss": 2.3593, + "step": 16500 + }, + { + "epoch": 1.3, + "learning_rate": 0.0005, + "loss": 2.3559, + "step": 16600 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005, + "loss": 2.3652, + "step": 16700 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005, + "loss": 2.3841, + "step": 16800 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005, + "loss": 2.3874, + "step": 16900 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005, + "loss": 2.3607, + "step": 17000 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005, + "loss": 2.3849, + "step": 17100 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005, + "loss": 2.3809, + "step": 17200 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005, + "loss": 2.359, + "step": 17300 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005, + "loss": 2.3618, + "step": 17400 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005, + "loss": 2.3758, + "step": 17500 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005, + "loss": 2.3538, + "step": 17600 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005, + "loss": 2.3733, + "step": 17700 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005, + "loss": 2.3735, + "step": 17800 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005, + "loss": 2.3859, + "step": 17900 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005, + "loss": 2.3508, + "step": 18000 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005, + "loss": 2.3859, + "step": 18100 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005, + "loss": 2.3567, + "step": 18200 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005, + "loss": 2.3292, + "step": 18300 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005, + "loss": 2.3882, + "step": 18400 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005, + "loss": 2.3592, + "step": 18500 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005, + "loss": 2.3594, + "step": 18600 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005, + "loss": 2.359, + "step": 18700 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005, + "loss": 2.3549, + "step": 18800 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005, + "loss": 2.3962, + "step": 18900 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005, + "loss": 2.3413, + "step": 19000 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005, + "loss": 2.3636, + "step": 19100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0005, + "loss": 2.3381, + "step": 19200 + }, + { + "epoch": 1.51, + "learning_rate": 0.0005, + "loss": 2.363, + "step": 19300 + }, + { + "epoch": 1.52, + "learning_rate": 0.0005, + "loss": 2.3687, + "step": 19400 + }, + { + "epoch": 1.53, + "learning_rate": 0.0005, + "loss": 2.3359, + "step": 19500 + }, + { + "epoch": 1.54, + "learning_rate": 0.0005, + "loss": 2.361, + "step": 19600 + }, + { + "epoch": 1.54, + "learning_rate": 0.0005, + "loss": 2.3808, + "step": 19700 + }, + { + "epoch": 1.55, + "learning_rate": 0.0005, + "loss": 2.347, + "step": 19800 + }, + { + "epoch": 1.56, + "learning_rate": 0.0005, + "loss": 2.3695, + "step": 19900 + }, + { + "epoch": 1.57, + "learning_rate": 0.0005, + "loss": 2.3456, + "step": 20000 + }, + { + "epoch": 1.57, + "eval_gen_len": 18.73682351902534, + "eval_loss": 2.202566385269165, + "eval_rouge1": 31.9461, + "eval_rouge2": 10.8723, + "eval_rougeL": 25.7822, + "eval_rougeLsum": 25.7746, + "eval_runtime": 358.9411, + "eval_samples_per_second": 31.557, + "eval_steps_per_second": 1.972, + "step": 20000 + }, + { + "epoch": 1.58, + "learning_rate": 0.0005, + "loss": 2.3445, + "step": 20100 + }, + { + "epoch": 1.58, + "learning_rate": 0.0005, + "loss": 2.36, + "step": 20200 + }, + { + "epoch": 1.59, + "learning_rate": 0.0005, + "loss": 2.3749, + "step": 20300 + }, + { + "epoch": 1.6, + "learning_rate": 0.0005, + "loss": 2.3606, + "step": 20400 + }, + { + "epoch": 1.61, + "learning_rate": 0.0005, + "loss": 2.3375, + "step": 20500 + }, + { + "epoch": 1.62, + "learning_rate": 0.0005, + "loss": 2.3448, + "step": 20600 + }, + { + "epoch": 1.62, + "learning_rate": 0.0005, + "loss": 2.3698, + "step": 20700 + }, + { + "epoch": 1.63, + "learning_rate": 0.0005, + "loss": 2.3403, + "step": 20800 + }, + { + "epoch": 1.64, + "learning_rate": 0.0005, + "loss": 2.354, + "step": 20900 + }, + { + "epoch": 1.65, + "learning_rate": 0.0005, + "loss": 2.3578, + "step": 21000 + }, + { + "epoch": 1.65, + "learning_rate": 0.0005, + "loss": 2.3598, + "step": 21100 + }, + { + "epoch": 1.66, + "learning_rate": 0.0005, + "loss": 2.3262, + "step": 21200 + }, + { + "epoch": 1.67, + "learning_rate": 0.0005, + "loss": 2.3667, + "step": 21300 + }, + { + "epoch": 1.68, + "learning_rate": 0.0005, + "loss": 2.3801, + "step": 21400 + }, + { + "epoch": 1.69, + "learning_rate": 0.0005, + "loss": 2.3638, + "step": 21500 + }, + { + "epoch": 1.69, + "learning_rate": 0.0005, + "loss": 2.3559, + "step": 21600 + }, + { + "epoch": 1.7, + "learning_rate": 0.0005, + "loss": 2.3595, + "step": 21700 + }, + { + "epoch": 1.71, + "learning_rate": 0.0005, + "loss": 2.3472, + "step": 21800 + }, + { + "epoch": 1.72, + "learning_rate": 0.0005, + "loss": 2.3415, + "step": 21900 + }, + { + "epoch": 1.73, + "learning_rate": 0.0005, + "loss": 2.3427, + "step": 22000 + }, + { + "epoch": 1.73, + "learning_rate": 0.0005, + "loss": 2.3306, + "step": 22100 + }, + { + "epoch": 1.74, + "learning_rate": 0.0005, + "loss": 2.3147, + "step": 22200 + }, + { + "epoch": 1.75, + "learning_rate": 0.0005, + "loss": 2.3402, + "step": 22300 + }, + { + "epoch": 1.76, + "learning_rate": 0.0005, + "loss": 2.3294, + "step": 22400 + }, + { + "epoch": 1.76, + "learning_rate": 0.0005, + "loss": 2.383, + "step": 22500 + }, + { + "epoch": 1.77, + "learning_rate": 0.0005, + "loss": 2.3562, + "step": 22600 + }, + { + "epoch": 1.78, + "learning_rate": 0.0005, + "loss": 2.3583, + "step": 22700 + }, + { + "epoch": 1.79, + "learning_rate": 0.0005, + "loss": 2.353, + "step": 22800 + }, + { + "epoch": 1.8, + "learning_rate": 0.0005, + "loss": 2.3428, + "step": 22900 + }, + { + "epoch": 1.8, + "learning_rate": 0.0005, + "loss": 2.3461, + "step": 23000 + }, + { + "epoch": 1.81, + "learning_rate": 0.0005, + "loss": 2.3351, + "step": 23100 + }, + { + "epoch": 1.82, + "learning_rate": 0.0005, + "loss": 2.3735, + "step": 23200 + }, + { + "epoch": 1.83, + "learning_rate": 0.0005, + "loss": 2.3722, + "step": 23300 + }, + { + "epoch": 1.84, + "learning_rate": 0.0005, + "loss": 2.3445, + "step": 23400 + }, + { + "epoch": 1.84, + "learning_rate": 0.0005, + "loss": 2.2993, + "step": 23500 + }, + { + "epoch": 1.85, + "learning_rate": 0.0005, + "loss": 2.3267, + "step": 23600 + }, + { + "epoch": 1.86, + "learning_rate": 0.0005, + "loss": 2.3075, + "step": 23700 + }, + { + "epoch": 1.87, + "learning_rate": 0.0005, + "loss": 2.3993, + "step": 23800 + }, + { + "epoch": 1.87, + "learning_rate": 0.0005, + "loss": 2.3256, + "step": 23900 + }, + { + "epoch": 1.88, + "learning_rate": 0.0005, + "loss": 2.3395, + "step": 24000 + }, + { + "epoch": 1.89, + "learning_rate": 0.0005, + "loss": 2.3395, + "step": 24100 + }, + { + "epoch": 1.9, + "learning_rate": 0.0005, + "loss": 2.3619, + "step": 24200 + }, + { + "epoch": 1.91, + "learning_rate": 0.0005, + "loss": 2.3502, + "step": 24300 + }, + { + "epoch": 1.91, + "learning_rate": 0.0005, + "loss": 2.3278, + "step": 24400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0005, + "loss": 2.3603, + "step": 24500 + }, + { + "epoch": 1.93, + "learning_rate": 0.0005, + "loss": 2.3323, + "step": 24600 + }, + { + "epoch": 1.94, + "learning_rate": 0.0005, + "loss": 2.374, + "step": 24700 + }, + { + "epoch": 1.94, + "learning_rate": 0.0005, + "loss": 2.3298, + "step": 24800 + }, + { + "epoch": 1.95, + "learning_rate": 0.0005, + "loss": 2.3274, + "step": 24900 + }, + { + "epoch": 1.96, + "learning_rate": 0.0005, + "loss": 2.3451, + "step": 25000 + }, + { + "epoch": 1.97, + "learning_rate": 0.0005, + "loss": 2.2976, + "step": 25100 + }, + { + "epoch": 1.98, + "learning_rate": 0.0005, + "loss": 2.3222, + "step": 25200 + }, + { + "epoch": 1.98, + "learning_rate": 0.0005, + "loss": 2.3013, + "step": 25300 + }, + { + "epoch": 1.99, + "learning_rate": 0.0005, + "loss": 2.3352, + "step": 25400 + }, + { + "epoch": 2.0, + "learning_rate": 0.0005, + "loss": 2.3124, + "step": 25500 + }, + { + "epoch": 2.01, + "learning_rate": 0.0005, + "loss": 2.2476, + "step": 25600 + }, + { + "epoch": 2.02, + "learning_rate": 0.0005, + "loss": 2.2515, + "step": 25700 + }, + { + "epoch": 2.02, + "learning_rate": 0.0005, + "loss": 2.2521, + "step": 25800 + }, + { + "epoch": 2.03, + "learning_rate": 0.0005, + "loss": 2.2177, + "step": 25900 + }, + { + "epoch": 2.04, + "learning_rate": 0.0005, + "loss": 2.2554, + "step": 26000 + }, + { + "epoch": 2.05, + "learning_rate": 0.0005, + "loss": 2.2345, + "step": 26100 + }, + { + "epoch": 2.05, + "learning_rate": 0.0005, + "loss": 2.2548, + "step": 26200 + }, + { + "epoch": 2.06, + "learning_rate": 0.0005, + "loss": 2.2591, + "step": 26300 + }, + { + "epoch": 2.07, + "learning_rate": 0.0005, + "loss": 2.2414, + "step": 26400 + }, + { + "epoch": 2.08, + "learning_rate": 0.0005, + "loss": 2.2562, + "step": 26500 + }, + { + "epoch": 2.09, + "learning_rate": 0.0005, + "loss": 2.2722, + "step": 26600 + }, + { + "epoch": 2.09, + "learning_rate": 0.0005, + "loss": 2.2945, + "step": 26700 + }, + { + "epoch": 2.1, + "learning_rate": 0.0005, + "loss": 2.2473, + "step": 26800 + }, + { + "epoch": 2.11, + "learning_rate": 0.0005, + "loss": 2.2406, + "step": 26900 + }, + { + "epoch": 2.12, + "learning_rate": 0.0005, + "loss": 2.2683, + "step": 27000 + }, + { + "epoch": 2.13, + "learning_rate": 0.0005, + "loss": 2.2277, + "step": 27100 + }, + { + "epoch": 2.13, + "learning_rate": 0.0005, + "loss": 2.2773, + "step": 27200 + }, + { + "epoch": 2.14, + "learning_rate": 0.0005, + "loss": 2.2379, + "step": 27300 + }, + { + "epoch": 2.15, + "learning_rate": 0.0005, + "loss": 2.2632, + "step": 27400 + }, + { + "epoch": 2.16, + "learning_rate": 0.0005, + "loss": 2.2494, + "step": 27500 + }, + { + "epoch": 2.16, + "learning_rate": 0.0005, + "loss": 2.2149, + "step": 27600 + }, + { + "epoch": 2.17, + "learning_rate": 0.0005, + "loss": 2.2357, + "step": 27700 + }, + { + "epoch": 2.18, + "learning_rate": 0.0005, + "loss": 2.2573, + "step": 27800 + }, + { + "epoch": 2.19, + "learning_rate": 0.0005, + "loss": 2.2577, + "step": 27900 + }, + { + "epoch": 2.2, + "learning_rate": 0.0005, + "loss": 2.2526, + "step": 28000 + }, + { + "epoch": 2.2, + "learning_rate": 0.0005, + "loss": 2.2573, + "step": 28100 + }, + { + "epoch": 2.21, + "learning_rate": 0.0005, + "loss": 2.2741, + "step": 28200 + }, + { + "epoch": 2.22, + "learning_rate": 0.0005, + "loss": 2.2442, + "step": 28300 + }, + { + "epoch": 2.23, + "learning_rate": 0.0005, + "loss": 2.2483, + "step": 28400 + }, + { + "epoch": 2.23, + "learning_rate": 0.0005, + "loss": 2.2582, + "step": 28500 + }, + { + "epoch": 2.24, + "learning_rate": 0.0005, + "loss": 2.275, + "step": 28600 + }, + { + "epoch": 2.25, + "learning_rate": 0.0005, + "loss": 2.2972, + "step": 28700 + }, + { + "epoch": 2.26, + "learning_rate": 0.0005, + "loss": 2.2356, + "step": 28800 + }, + { + "epoch": 2.27, + "learning_rate": 0.0005, + "loss": 2.2517, + "step": 28900 + }, + { + "epoch": 2.27, + "learning_rate": 0.0005, + "loss": 2.3137, + "step": 29000 + }, + { + "epoch": 2.28, + "learning_rate": 0.0005, + "loss": 2.2788, + "step": 29100 + }, + { + "epoch": 2.29, + "learning_rate": 0.0005, + "loss": 2.2562, + "step": 29200 + }, + { + "epoch": 2.3, + "learning_rate": 0.0005, + "loss": 2.2825, + "step": 29300 + }, + { + "epoch": 2.31, + "learning_rate": 0.0005, + "loss": 2.2082, + "step": 29400 + }, + { + "epoch": 2.31, + "learning_rate": 0.0005, + "loss": 2.2491, + "step": 29500 + }, + { + "epoch": 2.32, + "learning_rate": 0.0005, + "loss": 2.2299, + "step": 29600 + }, + { + "epoch": 2.33, + "learning_rate": 0.0005, + "loss": 2.2678, + "step": 29700 + }, + { + "epoch": 2.34, + "learning_rate": 0.0005, + "loss": 2.2961, + "step": 29800 + }, + { + "epoch": 2.34, + "learning_rate": 0.0005, + "loss": 2.2751, + "step": 29900 + }, + { + "epoch": 2.35, + "learning_rate": 0.0005, + "loss": 2.2444, + "step": 30000 + }, + { + "epoch": 2.35, + "eval_gen_len": 18.769223978105412, + "eval_loss": 2.1571497917175293, + "eval_rouge1": 32.5123, + "eval_rouge2": 11.4387, + "eval_rougeL": 26.3083, + "eval_rougeLsum": 26.2953, + "eval_runtime": 360.0544, + "eval_samples_per_second": 31.459, + "eval_steps_per_second": 1.966, + "step": 30000 + }, + { + "epoch": 2.36, + "learning_rate": 0.0005, + "loss": 2.2231, + "step": 30100 + }, + { + "epoch": 2.37, + "learning_rate": 0.0005, + "loss": 2.2138, + "step": 30200 + }, + { + "epoch": 2.38, + "learning_rate": 0.0005, + "loss": 2.2555, + "step": 30300 + }, + { + "epoch": 2.38, + "learning_rate": 0.0005, + "loss": 2.255, + "step": 30400 + }, + { + "epoch": 2.39, + "learning_rate": 0.0005, + "loss": 2.2482, + "step": 30500 + }, + { + "epoch": 2.4, + "learning_rate": 0.0005, + "loss": 2.2546, + "step": 30600 + }, + { + "epoch": 2.41, + "learning_rate": 0.0005, + "loss": 2.2404, + "step": 30700 + }, + { + "epoch": 2.42, + "learning_rate": 0.0005, + "loss": 2.2568, + "step": 30800 + }, + { + "epoch": 2.42, + "learning_rate": 0.0005, + "loss": 2.2331, + "step": 30900 + }, + { + "epoch": 2.43, + "learning_rate": 0.0005, + "loss": 2.2175, + "step": 31000 + }, + { + "epoch": 2.44, + "learning_rate": 0.0005, + "loss": 2.2343, + "step": 31100 + }, + { + "epoch": 2.45, + "learning_rate": 0.0005, + "loss": 2.2838, + "step": 31200 + }, + { + "epoch": 2.45, + "learning_rate": 0.0005, + "loss": 2.2531, + "step": 31300 + }, + { + "epoch": 2.46, + "learning_rate": 0.0005, + "loss": 2.2971, + "step": 31400 + }, + { + "epoch": 2.47, + "learning_rate": 0.0005, + "loss": 2.2527, + "step": 31500 + }, + { + "epoch": 2.48, + "learning_rate": 0.0005, + "loss": 2.2508, + "step": 31600 + }, + { + "epoch": 2.49, + "learning_rate": 0.0005, + "loss": 2.2392, + "step": 31700 + }, + { + "epoch": 2.49, + "learning_rate": 0.0005, + "loss": 2.2263, + "step": 31800 + }, + { + "epoch": 2.5, + "learning_rate": 0.0005, + "loss": 2.2048, + "step": 31900 + }, + { + "epoch": 2.51, + "learning_rate": 0.0005, + "loss": 2.2687, + "step": 32000 + }, + { + "epoch": 2.52, + "learning_rate": 0.0005, + "loss": 2.2202, + "step": 32100 + }, + { + "epoch": 2.53, + "learning_rate": 0.0005, + "loss": 2.2439, + "step": 32200 + }, + { + "epoch": 2.53, + "learning_rate": 0.0005, + "loss": 2.2705, + "step": 32300 + }, + { + "epoch": 2.54, + "learning_rate": 0.0005, + "loss": 2.2384, + "step": 32400 + }, + { + "epoch": 2.55, + "learning_rate": 0.0005, + "loss": 2.2517, + "step": 32500 + }, + { + "epoch": 2.56, + "learning_rate": 0.0005, + "loss": 2.2336, + "step": 32600 + }, + { + "epoch": 2.56, + "learning_rate": 0.0005, + "loss": 2.2587, + "step": 32700 + }, + { + "epoch": 2.57, + "learning_rate": 0.0005, + "loss": 2.2716, + "step": 32800 + }, + { + "epoch": 2.58, + "learning_rate": 0.0005, + "loss": 2.2294, + "step": 32900 + }, + { + "epoch": 2.59, + "learning_rate": 0.0005, + "loss": 2.2819, + "step": 33000 + }, + { + "epoch": 2.6, + "learning_rate": 0.0005, + "loss": 2.2461, + "step": 33100 + }, + { + "epoch": 2.6, + "learning_rate": 0.0005, + "loss": 2.2399, + "step": 33200 + }, + { + "epoch": 2.61, + "learning_rate": 0.0005, + "loss": 2.2518, + "step": 33300 + }, + { + "epoch": 2.62, + "learning_rate": 0.0005, + "loss": 2.2797, + "step": 33400 + }, + { + "epoch": 2.63, + "learning_rate": 0.0005, + "loss": 2.2575, + "step": 33500 + }, + { + "epoch": 2.63, + "learning_rate": 0.0005, + "loss": 2.2288, + "step": 33600 + }, + { + "epoch": 2.64, + "learning_rate": 0.0005, + "loss": 2.2504, + "step": 33700 + }, + { + "epoch": 2.65, + "learning_rate": 0.0005, + "loss": 2.229, + "step": 33800 + }, + { + "epoch": 2.66, + "learning_rate": 0.0005, + "loss": 2.2625, + "step": 33900 + }, + { + "epoch": 2.67, + "learning_rate": 0.0005, + "loss": 2.2648, + "step": 34000 + }, + { + "epoch": 2.67, + "learning_rate": 0.0005, + "loss": 2.2677, + "step": 34100 + }, + { + "epoch": 2.68, + "learning_rate": 0.0005, + "loss": 2.2825, + "step": 34200 + }, + { + "epoch": 2.69, + "learning_rate": 0.0005, + "loss": 2.3043, + "step": 34300 + }, + { + "epoch": 2.7, + "learning_rate": 0.0005, + "loss": 2.2503, + "step": 34400 + }, + { + "epoch": 2.71, + "learning_rate": 0.0005, + "loss": 2.2582, + "step": 34500 + }, + { + "epoch": 2.71, + "learning_rate": 0.0005, + "loss": 2.2516, + "step": 34600 + }, + { + "epoch": 2.72, + "learning_rate": 0.0005, + "loss": 2.2583, + "step": 34700 + }, + { + "epoch": 2.73, + "learning_rate": 0.0005, + "loss": 2.2899, + "step": 34800 + }, + { + "epoch": 2.74, + "learning_rate": 0.0005, + "loss": 2.2794, + "step": 34900 + }, + { + "epoch": 2.74, + "learning_rate": 0.0005, + "loss": 2.2469, + "step": 35000 + }, + { + "epoch": 2.75, + "learning_rate": 0.0005, + "loss": 2.2274, + "step": 35100 + }, + { + "epoch": 2.76, + "learning_rate": 0.0005, + "loss": 2.2566, + "step": 35200 + }, + { + "epoch": 2.77, + "learning_rate": 0.0005, + "loss": 2.2657, + "step": 35300 + }, + { + "epoch": 2.78, + "learning_rate": 0.0005, + "loss": 2.2718, + "step": 35400 + }, + { + "epoch": 2.78, + "learning_rate": 0.0005, + "loss": 2.2226, + "step": 35500 + }, + { + "epoch": 2.79, + "learning_rate": 0.0005, + "loss": 2.2495, + "step": 35600 + }, + { + "epoch": 2.8, + "learning_rate": 0.0005, + "loss": 2.2368, + "step": 35700 + }, + { + "epoch": 2.81, + "learning_rate": 0.0005, + "loss": 2.2553, + "step": 35800 + }, + { + "epoch": 2.82, + "learning_rate": 0.0005, + "loss": 2.2305, + "step": 35900 + }, + { + "epoch": 2.82, + "learning_rate": 0.0005, + "loss": 2.2716, + "step": 36000 + }, + { + "epoch": 2.83, + "learning_rate": 0.0005, + "loss": 2.2758, + "step": 36100 + }, + { + "epoch": 2.84, + "learning_rate": 0.0005, + "loss": 2.2743, + "step": 36200 + }, + { + "epoch": 2.85, + "learning_rate": 0.0005, + "loss": 2.2621, + "step": 36300 + }, + { + "epoch": 2.85, + "learning_rate": 0.0005, + "loss": 2.2489, + "step": 36400 + }, + { + "epoch": 2.86, + "learning_rate": 0.0005, + "loss": 2.2363, + "step": 36500 + }, + { + "epoch": 2.87, + "learning_rate": 0.0005, + "loss": 2.257, + "step": 36600 + }, + { + "epoch": 2.88, + "learning_rate": 0.0005, + "loss": 2.2738, + "step": 36700 + }, + { + "epoch": 2.89, + "learning_rate": 0.0005, + "loss": 2.2153, + "step": 36800 + }, + { + "epoch": 2.89, + "learning_rate": 0.0005, + "loss": 2.2587, + "step": 36900 + }, + { + "epoch": 2.9, + "learning_rate": 0.0005, + "loss": 2.2536, + "step": 37000 + }, + { + "epoch": 2.91, + "learning_rate": 0.0005, + "loss": 2.2509, + "step": 37100 + }, + { + "epoch": 2.92, + "learning_rate": 0.0005, + "loss": 2.2362, + "step": 37200 + }, + { + "epoch": 2.93, + "learning_rate": 0.0005, + "loss": 2.2259, + "step": 37300 + }, + { + "epoch": 2.93, + "learning_rate": 0.0005, + "loss": 2.2235, + "step": 37400 + }, + { + "epoch": 2.94, + "learning_rate": 0.0005, + "loss": 2.2439, + "step": 37500 + }, + { + "epoch": 2.95, + "learning_rate": 0.0005, + "loss": 2.2646, + "step": 37600 + }, + { + "epoch": 2.96, + "learning_rate": 0.0005, + "loss": 2.2301, + "step": 37700 + }, + { + "epoch": 2.96, + "learning_rate": 0.0005, + "loss": 2.2284, + "step": 37800 + }, + { + "epoch": 2.97, + "learning_rate": 0.0005, + "loss": 2.2703, + "step": 37900 + }, + { + "epoch": 2.98, + "learning_rate": 0.0005, + "loss": 2.2647, + "step": 38000 + }, + { + "epoch": 2.99, + "learning_rate": 0.0005, + "loss": 2.2456, + "step": 38100 + }, + { + "epoch": 3.0, + "learning_rate": 0.0005, + "loss": 2.2619, + "step": 38200 + }, + { + "epoch": 3.0, + "learning_rate": 0.0005, + "loss": 2.21, + "step": 38300 + }, + { + "epoch": 3.01, + "learning_rate": 0.0005, + "loss": 2.1604, + "step": 38400 + }, + { + "epoch": 3.02, + "learning_rate": 0.0005, + "loss": 2.165, + "step": 38500 + }, + { + "epoch": 3.03, + "learning_rate": 0.0005, + "loss": 2.1502, + "step": 38600 + }, + { + "epoch": 3.03, + "learning_rate": 0.0005, + "loss": 2.1564, + "step": 38700 + }, + { + "epoch": 3.04, + "learning_rate": 0.0005, + "loss": 2.147, + "step": 38800 + }, + { + "epoch": 3.05, + "learning_rate": 0.0005, + "loss": 2.171, + "step": 38900 + }, + { + "epoch": 3.06, + "learning_rate": 0.0005, + "loss": 2.1522, + "step": 39000 + }, + { + "epoch": 3.07, + "learning_rate": 0.0005, + "loss": 2.19, + "step": 39100 + }, + { + "epoch": 3.07, + "learning_rate": 0.0005, + "loss": 2.1632, + "step": 39200 + }, + { + "epoch": 3.08, + "learning_rate": 0.0005, + "loss": 2.1739, + "step": 39300 + }, + { + "epoch": 3.09, + "learning_rate": 0.0005, + "loss": 2.1466, + "step": 39400 + }, + { + "epoch": 3.1, + "learning_rate": 0.0005, + "loss": 2.1726, + "step": 39500 + }, + { + "epoch": 3.11, + "learning_rate": 0.0005, + "loss": 2.1659, + "step": 39600 + }, + { + "epoch": 3.11, + "learning_rate": 0.0005, + "loss": 2.1573, + "step": 39700 + }, + { + "epoch": 3.12, + "learning_rate": 0.0005, + "loss": 2.1853, + "step": 39800 + }, + { + "epoch": 3.13, + "learning_rate": 0.0005, + "loss": 2.1446, + "step": 39900 + }, + { + "epoch": 3.14, + "learning_rate": 0.0005, + "loss": 2.1901, + "step": 40000 + }, + { + "epoch": 3.14, + "eval_gen_len": 18.773461640328417, + "eval_loss": 2.1257381439208984, + "eval_rouge1": 32.8553, + "eval_rouge2": 11.7404, + "eval_rougeL": 26.6114, + "eval_rougeLsum": 26.6102, + "eval_runtime": 359.175, + "eval_samples_per_second": 31.536, + "eval_steps_per_second": 1.971, + "step": 40000 + }, + { + "epoch": 3.14, + "learning_rate": 0.0005, + "loss": 2.1934, + "step": 40100 + }, + { + "epoch": 3.15, + "learning_rate": 0.0005, + "loss": 2.1582, + "step": 40200 + }, + { + "epoch": 3.16, + "learning_rate": 0.0005, + "loss": 2.1633, + "step": 40300 + }, + { + "epoch": 3.17, + "learning_rate": 0.0005, + "loss": 2.1623, + "step": 40400 + }, + { + "epoch": 3.18, + "learning_rate": 0.0005, + "loss": 2.1895, + "step": 40500 + }, + { + "epoch": 3.18, + "learning_rate": 0.0005, + "loss": 2.1656, + "step": 40600 + }, + { + "epoch": 3.19, + "learning_rate": 0.0005, + "loss": 2.1944, + "step": 40700 + }, + { + "epoch": 3.2, + "learning_rate": 0.0005, + "loss": 2.1575, + "step": 40800 + }, + { + "epoch": 3.21, + "learning_rate": 0.0005, + "loss": 2.1717, + "step": 40900 + }, + { + "epoch": 3.22, + "learning_rate": 0.0005, + "loss": 2.1541, + "step": 41000 + }, + { + "epoch": 3.22, + "learning_rate": 0.0005, + "loss": 2.1976, + "step": 41100 + }, + { + "epoch": 3.23, + "learning_rate": 0.0005, + "loss": 2.1578, + "step": 41200 + }, + { + "epoch": 3.24, + "learning_rate": 0.0005, + "loss": 2.1661, + "step": 41300 + }, + { + "epoch": 3.25, + "learning_rate": 0.0005, + "loss": 2.2012, + "step": 41400 + }, + { + "epoch": 3.25, + "learning_rate": 0.0005, + "loss": 2.1878, + "step": 41500 + }, + { + "epoch": 3.26, + "learning_rate": 0.0005, + "loss": 2.144, + "step": 41600 + }, + { + "epoch": 3.27, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 41700 + }, + { + "epoch": 3.28, + "learning_rate": 0.0005, + "loss": 2.1741, + "step": 41800 + }, + { + "epoch": 3.29, + "learning_rate": 0.0005, + "loss": 2.1908, + "step": 41900 + }, + { + "epoch": 3.29, + "learning_rate": 0.0005, + "loss": 2.1943, + "step": 42000 + }, + { + "epoch": 3.3, + "learning_rate": 0.0005, + "loss": 2.1714, + "step": 42100 + }, + { + "epoch": 3.31, + "learning_rate": 0.0005, + "loss": 2.1638, + "step": 42200 + }, + { + "epoch": 3.32, + "learning_rate": 0.0005, + "loss": 2.1751, + "step": 42300 + }, + { + "epoch": 3.32, + "learning_rate": 0.0005, + "loss": 2.1649, + "step": 42400 + }, + { + "epoch": 3.33, + "learning_rate": 0.0005, + "loss": 2.2036, + "step": 42500 + }, + { + "epoch": 3.34, + "learning_rate": 0.0005, + "loss": 2.1772, + "step": 42600 + }, + { + "epoch": 3.35, + "learning_rate": 0.0005, + "loss": 2.16, + "step": 42700 + }, + { + "epoch": 3.36, + "learning_rate": 0.0005, + "loss": 2.1918, + "step": 42800 + }, + { + "epoch": 3.36, + "learning_rate": 0.0005, + "loss": 2.1737, + "step": 42900 + }, + { + "epoch": 3.37, + "learning_rate": 0.0005, + "loss": 2.1684, + "step": 43000 + }, + { + "epoch": 3.38, + "learning_rate": 0.0005, + "loss": 2.1722, + "step": 43100 + }, + { + "epoch": 3.39, + "learning_rate": 0.0005, + "loss": 2.1881, + "step": 43200 + }, + { + "epoch": 3.4, + "learning_rate": 0.0005, + "loss": 2.1944, + "step": 43300 + }, + { + "epoch": 3.4, + "learning_rate": 0.0005, + "loss": 2.192, + "step": 43400 + }, + { + "epoch": 3.41, + "learning_rate": 0.0005, + "loss": 2.1617, + "step": 43500 + }, + { + "epoch": 3.42, + "learning_rate": 0.0005, + "loss": 2.2029, + "step": 43600 + }, + { + "epoch": 3.43, + "learning_rate": 0.0005, + "loss": 2.1596, + "step": 43700 + }, + { + "epoch": 3.43, + "learning_rate": 0.0005, + "loss": 2.1793, + "step": 43800 + }, + { + "epoch": 3.44, + "learning_rate": 0.0005, + "loss": 2.1792, + "step": 43900 + }, + { + "epoch": 3.45, + "learning_rate": 0.0005, + "loss": 2.1892, + "step": 44000 + }, + { + "epoch": 3.46, + "learning_rate": 0.0005, + "loss": 2.1759, + "step": 44100 + }, + { + "epoch": 3.47, + "learning_rate": 0.0005, + "loss": 2.1724, + "step": 44200 + }, + { + "epoch": 3.47, + "learning_rate": 0.0005, + "loss": 2.1689, + "step": 44300 + }, + { + "epoch": 3.48, + "learning_rate": 0.0005, + "loss": 2.1707, + "step": 44400 + }, + { + "epoch": 3.49, + "learning_rate": 0.0005, + "loss": 2.1845, + "step": 44500 + }, + { + "epoch": 3.5, + "learning_rate": 0.0005, + "loss": 2.1765, + "step": 44600 + }, + { + "epoch": 3.51, + "learning_rate": 0.0005, + "loss": 2.1788, + "step": 44700 + }, + { + "epoch": 3.51, + "learning_rate": 0.0005, + "loss": 2.1824, + "step": 44800 + }, + { + "epoch": 3.52, + "learning_rate": 0.0005, + "loss": 2.1826, + "step": 44900 + }, + { + "epoch": 3.53, + "learning_rate": 0.0005, + "loss": 2.1299, + "step": 45000 + }, + { + "epoch": 3.54, + "learning_rate": 0.0005, + "loss": 2.1819, + "step": 45100 + }, + { + "epoch": 3.54, + "learning_rate": 0.0005, + "loss": 2.1741, + "step": 45200 + }, + { + "epoch": 3.55, + "learning_rate": 0.0005, + "loss": 2.1896, + "step": 45300 + }, + { + "epoch": 3.56, + "learning_rate": 0.0005, + "loss": 2.1772, + "step": 45400 + }, + { + "epoch": 3.57, + "learning_rate": 0.0005, + "loss": 2.2326, + "step": 45500 + }, + { + "epoch": 3.58, + "learning_rate": 0.0005, + "loss": 2.1733, + "step": 45600 + }, + { + "epoch": 3.58, + "learning_rate": 0.0005, + "loss": 2.2147, + "step": 45700 + }, + { + "epoch": 3.59, + "learning_rate": 0.0005, + "loss": 2.1753, + "step": 45800 + }, + { + "epoch": 3.6, + "learning_rate": 0.0005, + "loss": 2.187, + "step": 45900 + }, + { + "epoch": 3.61, + "learning_rate": 0.0005, + "loss": 2.1863, + "step": 46000 + }, + { + "epoch": 3.62, + "learning_rate": 0.0005, + "loss": 2.1605, + "step": 46100 + }, + { + "epoch": 3.62, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 46200 + }, + { + "epoch": 3.63, + "learning_rate": 0.0005, + "loss": 2.2143, + "step": 46300 + }, + { + "epoch": 3.64, + "learning_rate": 0.0005, + "loss": 2.1812, + "step": 46400 + }, + { + "epoch": 3.65, + "learning_rate": 0.0005, + "loss": 2.1725, + "step": 46500 + }, + { + "epoch": 3.65, + "learning_rate": 0.0005, + "loss": 2.2017, + "step": 46600 + }, + { + "epoch": 3.66, + "learning_rate": 0.0005, + "loss": 2.225, + "step": 46700 + }, + { + "epoch": 3.67, + "learning_rate": 0.0005, + "loss": 2.1981, + "step": 46800 + }, + { + "epoch": 3.68, + "learning_rate": 0.0005, + "loss": 2.1845, + "step": 46900 + }, + { + "epoch": 3.69, + "learning_rate": 0.0005, + "loss": 2.1595, + "step": 47000 + }, + { + "epoch": 3.69, + "learning_rate": 0.0005, + "loss": 2.1557, + "step": 47100 + }, + { + "epoch": 3.7, + "learning_rate": 0.0005, + "loss": 2.1729, + "step": 47200 + }, + { + "epoch": 3.71, + "learning_rate": 0.0005, + "loss": 2.1649, + "step": 47300 + }, + { + "epoch": 3.72, + "learning_rate": 0.0005, + "loss": 2.1793, + "step": 47400 + }, + { + "epoch": 3.72, + "learning_rate": 0.0005, + "loss": 2.1388, + "step": 47500 + }, + { + "epoch": 3.73, + "learning_rate": 0.0005, + "loss": 2.1773, + "step": 47600 + }, + { + "epoch": 3.74, + "learning_rate": 0.0005, + "loss": 2.1915, + "step": 47700 + }, + { + "epoch": 3.75, + "learning_rate": 0.0005, + "loss": 2.1809, + "step": 47800 + }, + { + "epoch": 3.76, + "learning_rate": 0.0005, + "loss": 2.1909, + "step": 47900 + }, + { + "epoch": 3.76, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 48000 + }, + { + "epoch": 3.77, + "learning_rate": 0.0005, + "loss": 2.1878, + "step": 48100 + }, + { + "epoch": 3.78, + "learning_rate": 0.0005, + "loss": 2.1892, + "step": 48200 + }, + { + "epoch": 3.79, + "learning_rate": 0.0005, + "loss": 2.1804, + "step": 48300 + }, + { + "epoch": 3.8, + "learning_rate": 0.0005, + "loss": 2.1694, + "step": 48400 + }, + { + "epoch": 3.8, + "learning_rate": 0.0005, + "loss": 2.208, + "step": 48500 + }, + { + "epoch": 3.81, + "learning_rate": 0.0005, + "loss": 2.153, + "step": 48600 + }, + { + "epoch": 3.82, + "learning_rate": 0.0005, + "loss": 2.215, + "step": 48700 + }, + { + "epoch": 3.83, + "learning_rate": 0.0005, + "loss": 2.1499, + "step": 48800 + }, + { + "epoch": 3.83, + "learning_rate": 0.0005, + "loss": 2.1766, + "step": 48900 + }, + { + "epoch": 3.84, + "learning_rate": 0.0005, + "loss": 2.1973, + "step": 49000 + }, + { + "epoch": 3.85, + "learning_rate": 0.0005, + "loss": 2.2039, + "step": 49100 + }, + { + "epoch": 3.86, + "learning_rate": 0.0005, + "loss": 2.1866, + "step": 49200 + }, + { + "epoch": 3.87, + "learning_rate": 0.0005, + "loss": 2.1763, + "step": 49300 + }, + { + "epoch": 3.87, + "learning_rate": 0.0005, + "loss": 2.1737, + "step": 49400 + }, + { + "epoch": 3.88, + "learning_rate": 0.0005, + "loss": 2.2036, + "step": 49500 + }, + { + "epoch": 3.89, + "learning_rate": 0.0005, + "loss": 2.21, + "step": 49600 + }, + { + "epoch": 3.9, + "learning_rate": 0.0005, + "loss": 2.137, + "step": 49700 + }, + { + "epoch": 3.91, + "learning_rate": 0.0005, + "loss": 2.1908, + "step": 49800 + }, + { + "epoch": 3.91, + "learning_rate": 0.0005, + "loss": 2.1764, + "step": 49900 + }, + { + "epoch": 3.92, + "learning_rate": 0.0005, + "loss": 2.2058, + "step": 50000 + }, + { + "epoch": 3.92, + "eval_gen_len": 18.717577469762514, + "eval_loss": 2.0990633964538574, + "eval_rouge1": 33.2448, + "eval_rouge2": 12.1995, + "eval_rougeL": 27.0821, + "eval_rougeLsum": 27.086, + "eval_runtime": 360.4822, + "eval_samples_per_second": 31.422, + "eval_steps_per_second": 1.964, + "step": 50000 + }, + { + "epoch": 3.93, + "learning_rate": 0.0005, + "loss": 2.1585, + "step": 50100 + }, + { + "epoch": 3.94, + "learning_rate": 0.0005, + "loss": 2.1888, + "step": 50200 + }, + { + "epoch": 3.94, + "learning_rate": 0.0005, + "loss": 2.2, + "step": 50300 + }, + { + "epoch": 3.95, + "learning_rate": 0.0005, + "loss": 2.1772, + "step": 50400 + }, + { + "epoch": 3.96, + "learning_rate": 0.0005, + "loss": 2.2016, + "step": 50500 + }, + { + "epoch": 3.97, + "learning_rate": 0.0005, + "loss": 2.1783, + "step": 50600 + }, + { + "epoch": 3.98, + "learning_rate": 0.0005, + "loss": 2.1639, + "step": 50700 + }, + { + "epoch": 3.98, + "learning_rate": 0.0005, + "loss": 2.1585, + "step": 50800 + }, + { + "epoch": 3.99, + "learning_rate": 0.0005, + "loss": 2.1983, + "step": 50900 + }, + { + "epoch": 4.0, + "learning_rate": 0.0005, + "loss": 2.1695, + "step": 51000 + }, + { + "epoch": 4.01, + "learning_rate": 0.0005, + "loss": 2.0872, + "step": 51100 + }, + { + "epoch": 4.02, + "learning_rate": 0.0005, + "loss": 2.1085, + "step": 51200 + }, + { + "epoch": 4.02, + "learning_rate": 0.0005, + "loss": 2.1441, + "step": 51300 + }, + { + "epoch": 4.03, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 51400 + }, + { + "epoch": 4.04, + "learning_rate": 0.0005, + "loss": 2.1041, + "step": 51500 + }, + { + "epoch": 4.05, + "learning_rate": 0.0005, + "loss": 2.0745, + "step": 51600 + }, + { + "epoch": 4.05, + "learning_rate": 0.0005, + "loss": 2.097, + "step": 51700 + }, + { + "epoch": 4.06, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 51800 + }, + { + "epoch": 4.07, + "learning_rate": 0.0005, + "loss": 2.0559, + "step": 51900 + }, + { + "epoch": 4.08, + "learning_rate": 0.0005, + "loss": 2.1543, + "step": 52000 + }, + { + "epoch": 4.09, + "learning_rate": 0.0005, + "loss": 2.1138, + "step": 52100 + }, + { + "epoch": 4.09, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 52200 + }, + { + "epoch": 4.1, + "learning_rate": 0.0005, + "loss": 2.1135, + "step": 52300 + }, + { + "epoch": 4.11, + "learning_rate": 0.0005, + "loss": 2.0917, + "step": 52400 + }, + { + "epoch": 4.12, + "learning_rate": 0.0005, + "loss": 2.1246, + "step": 52500 + }, + { + "epoch": 4.12, + "learning_rate": 0.0005, + "loss": 2.1137, + "step": 52600 + }, + { + "epoch": 4.13, + "learning_rate": 0.0005, + "loss": 2.1029, + "step": 52700 + }, + { + "epoch": 4.14, + "learning_rate": 0.0005, + "loss": 2.103, + "step": 52800 + }, + { + "epoch": 4.15, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 52900 + }, + { + "epoch": 4.16, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 53000 + }, + { + "epoch": 4.16, + "learning_rate": 0.0005, + "loss": 2.1083, + "step": 53100 + }, + { + "epoch": 4.17, + "learning_rate": 0.0005, + "loss": 2.1142, + "step": 53200 + }, + { + "epoch": 4.18, + "learning_rate": 0.0005, + "loss": 2.1066, + "step": 53300 + }, + { + "epoch": 4.19, + "learning_rate": 0.0005, + "loss": 2.1003, + "step": 53400 + }, + { + "epoch": 4.2, + "learning_rate": 0.0005, + "loss": 2.0934, + "step": 53500 + }, + { + "epoch": 4.2, + "learning_rate": 0.0005, + "loss": 2.0904, + "step": 53600 + }, + { + "epoch": 4.21, + "learning_rate": 0.0005, + "loss": 2.141, + "step": 53700 + }, + { + "epoch": 4.22, + "learning_rate": 0.0005, + "loss": 2.0869, + "step": 53800 + }, + { + "epoch": 4.23, + "learning_rate": 0.0005, + "loss": 2.1202, + "step": 53900 + }, + { + "epoch": 4.23, + "learning_rate": 0.0005, + "loss": 2.1131, + "step": 54000 + }, + { + "epoch": 4.24, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 54100 + }, + { + "epoch": 4.25, + "learning_rate": 0.0005, + "loss": 2.1484, + "step": 54200 + }, + { + "epoch": 4.26, + "learning_rate": 0.0005, + "loss": 2.1127, + "step": 54300 + }, + { + "epoch": 4.27, + "learning_rate": 0.0005, + "loss": 2.1079, + "step": 54400 + }, + { + "epoch": 4.27, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 54500 + }, + { + "epoch": 4.28, + "learning_rate": 0.0005, + "loss": 2.1231, + "step": 54600 + }, + { + "epoch": 4.29, + "learning_rate": 0.0005, + "loss": 2.1139, + "step": 54700 + }, + { + "epoch": 4.3, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 54800 + }, + { + "epoch": 4.31, + "learning_rate": 0.0005, + "loss": 2.1174, + "step": 54900 + }, + { + "epoch": 4.31, + "learning_rate": 0.0005, + "loss": 2.1045, + "step": 55000 + }, + { + "epoch": 4.32, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 55100 + }, + { + "epoch": 4.33, + "learning_rate": 0.0005, + "loss": 2.0954, + "step": 55200 + }, + { + "epoch": 4.34, + "learning_rate": 0.0005, + "loss": 2.0964, + "step": 55300 + }, + { + "epoch": 4.34, + "learning_rate": 0.0005, + "loss": 2.1128, + "step": 55400 + }, + { + "epoch": 4.35, + "learning_rate": 0.0005, + "loss": 2.0874, + "step": 55500 + }, + { + "epoch": 4.36, + "learning_rate": 0.0005, + "loss": 2.1303, + "step": 55600 + }, + { + "epoch": 4.37, + "learning_rate": 0.0005, + "loss": 2.1261, + "step": 55700 + }, + { + "epoch": 4.38, + "learning_rate": 0.0005, + "loss": 2.0916, + "step": 55800 + }, + { + "epoch": 4.38, + "learning_rate": 0.0005, + "loss": 2.0894, + "step": 55900 + }, + { + "epoch": 4.39, + "learning_rate": 0.0005, + "loss": 2.1365, + "step": 56000 + }, + { + "epoch": 4.4, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 56100 + }, + { + "epoch": 4.41, + "learning_rate": 0.0005, + "loss": 2.143, + "step": 56200 + }, + { + "epoch": 4.41, + "learning_rate": 0.0005, + "loss": 2.1147, + "step": 56300 + }, + { + "epoch": 4.42, + "learning_rate": 0.0005, + "loss": 2.1072, + "step": 56400 + }, + { + "epoch": 4.43, + "learning_rate": 0.0005, + "loss": 2.1, + "step": 56500 + }, + { + "epoch": 4.44, + "learning_rate": 0.0005, + "loss": 2.1373, + "step": 56600 + }, + { + "epoch": 4.45, + "learning_rate": 0.0005, + "loss": 2.1385, + "step": 56700 + }, + { + "epoch": 4.45, + "learning_rate": 0.0005, + "loss": 2.1063, + "step": 56800 + }, + { + "epoch": 4.46, + "learning_rate": 0.0005, + "loss": 2.1563, + "step": 56900 + }, + { + "epoch": 4.47, + "learning_rate": 0.0005, + "loss": 2.1238, + "step": 57000 + }, + { + "epoch": 4.48, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 57100 + }, + { + "epoch": 4.49, + "learning_rate": 0.0005, + "loss": 2.1064, + "step": 57200 + }, + { + "epoch": 4.49, + "learning_rate": 0.0005, + "loss": 2.0979, + "step": 57300 + }, + { + "epoch": 4.5, + "learning_rate": 0.0005, + "loss": 2.0988, + "step": 57400 + }, + { + "epoch": 4.51, + "learning_rate": 0.0005, + "loss": 2.115, + "step": 57500 + }, + { + "epoch": 4.52, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 57600 + }, + { + "epoch": 4.52, + "learning_rate": 0.0005, + "loss": 2.098, + "step": 57700 + }, + { + "epoch": 4.53, + "learning_rate": 0.0005, + "loss": 2.1081, + "step": 57800 + }, + { + "epoch": 4.54, + "learning_rate": 0.0005, + "loss": 2.134, + "step": 57900 + }, + { + "epoch": 4.55, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 58000 + }, + { + "epoch": 4.56, + "learning_rate": 0.0005, + "loss": 2.1114, + "step": 58100 + }, + { + "epoch": 4.56, + "learning_rate": 0.0005, + "loss": 2.1201, + "step": 58200 + }, + { + "epoch": 4.57, + "learning_rate": 0.0005, + "loss": 2.1435, + "step": 58300 + }, + { + "epoch": 4.58, + "learning_rate": 0.0005, + "loss": 2.1254, + "step": 58400 + }, + { + "epoch": 4.59, + "learning_rate": 0.0005, + "loss": 2.1204, + "step": 58500 + }, + { + "epoch": 4.6, + "learning_rate": 0.0005, + "loss": 2.1461, + "step": 58600 + }, + { + "epoch": 4.6, + "learning_rate": 0.0005, + "loss": 2.1181, + "step": 58700 + }, + { + "epoch": 4.61, + "learning_rate": 0.0005, + "loss": 2.1405, + "step": 58800 + }, + { + "epoch": 4.62, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 58900 + }, + { + "epoch": 4.63, + "learning_rate": 0.0005, + "loss": 2.1087, + "step": 59000 + }, + { + "epoch": 4.63, + "learning_rate": 0.0005, + "loss": 2.1094, + "step": 59100 + }, + { + "epoch": 4.64, + "learning_rate": 0.0005, + "loss": 2.12, + "step": 59200 + }, + { + "epoch": 4.65, + "learning_rate": 0.0005, + "loss": 2.0994, + "step": 59300 + }, + { + "epoch": 4.66, + "learning_rate": 0.0005, + "loss": 2.1193, + "step": 59400 + }, + { + "epoch": 4.67, + "learning_rate": 0.0005, + "loss": 2.1288, + "step": 59500 + }, + { + "epoch": 4.67, + "learning_rate": 0.0005, + "loss": 2.1091, + "step": 59600 + }, + { + "epoch": 4.68, + "learning_rate": 0.0005, + "loss": 2.1191, + "step": 59700 + }, + { + "epoch": 4.69, + "learning_rate": 0.0005, + "loss": 2.1305, + "step": 59800 + }, + { + "epoch": 4.7, + "learning_rate": 0.0005, + "loss": 2.1271, + "step": 59900 + }, + { + "epoch": 4.71, + "learning_rate": 0.0005, + "loss": 2.1531, + "step": 60000 + }, + { + "epoch": 4.71, + "eval_gen_len": 18.730025602542597, + "eval_loss": 2.0838534832000732, + "eval_rouge1": 33.6123, + "eval_rouge2": 12.46, + "eval_rougeL": 27.3966, + "eval_rougeLsum": 27.3902, + "eval_runtime": 366.4271, + "eval_samples_per_second": 30.912, + "eval_steps_per_second": 1.932, + "step": 60000 + }, + { + "epoch": 4.71, + "learning_rate": 0.0005, + "loss": 2.1047, + "step": 60100 + }, + { + "epoch": 4.72, + "learning_rate": 0.0005, + "loss": 2.1382, + "step": 60200 + }, + { + "epoch": 4.73, + "learning_rate": 0.0005, + "loss": 2.1807, + "step": 60300 + }, + { + "epoch": 4.74, + "learning_rate": 0.0005, + "loss": 2.1061, + "step": 60400 + }, + { + "epoch": 4.74, + "learning_rate": 0.0005, + "loss": 2.1272, + "step": 60500 + }, + { + "epoch": 4.75, + "learning_rate": 0.0005, + "loss": 2.1286, + "step": 60600 + }, + { + "epoch": 4.76, + "learning_rate": 0.0005, + "loss": 2.1149, + "step": 60700 + }, + { + "epoch": 4.77, + "learning_rate": 0.0005, + "loss": 2.1097, + "step": 60800 + }, + { + "epoch": 4.78, + "learning_rate": 0.0005, + "loss": 2.1103, + "step": 60900 + }, + { + "epoch": 4.78, + "learning_rate": 0.0005, + "loss": 2.121, + "step": 61000 + }, + { + "epoch": 4.79, + "learning_rate": 0.0005, + "loss": 2.1304, + "step": 61100 + }, + { + "epoch": 4.8, + "learning_rate": 0.0005, + "loss": 2.1219, + "step": 61200 + }, + { + "epoch": 4.81, + "learning_rate": 0.0005, + "loss": 2.1391, + "step": 61300 + }, + { + "epoch": 4.81, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 61400 + }, + { + "epoch": 4.82, + "learning_rate": 0.0005, + "loss": 2.1213, + "step": 61500 + }, + { + "epoch": 4.83, + "learning_rate": 0.0005, + "loss": 2.0743, + "step": 61600 + }, + { + "epoch": 4.84, + "learning_rate": 0.0005, + "loss": 2.101, + "step": 61700 + }, + { + "epoch": 4.85, + "learning_rate": 0.0005, + "loss": 2.1325, + "step": 61800 + }, + { + "epoch": 4.85, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 61900 + }, + { + "epoch": 4.86, + "learning_rate": 0.0005, + "loss": 2.1176, + "step": 62000 + }, + { + "epoch": 4.87, + "learning_rate": 0.0005, + "loss": 2.1379, + "step": 62100 + }, + { + "epoch": 4.88, + "learning_rate": 0.0005, + "loss": 2.1102, + "step": 62200 + }, + { + "epoch": 4.89, + "learning_rate": 0.0005, + "loss": 2.149, + "step": 62300 + }, + { + "epoch": 4.89, + "learning_rate": 0.0005, + "loss": 2.1386, + "step": 62400 + }, + { + "epoch": 4.9, + "learning_rate": 0.0005, + "loss": 2.1165, + "step": 62500 + }, + { + "epoch": 4.91, + "learning_rate": 0.0005, + "loss": 2.1297, + "step": 62600 + }, + { + "epoch": 4.92, + "learning_rate": 0.0005, + "loss": 2.1164, + "step": 62700 + }, + { + "epoch": 4.92, + "learning_rate": 0.0005, + "loss": 2.1092, + "step": 62800 + }, + { + "epoch": 4.93, + "learning_rate": 0.0005, + "loss": 2.1188, + "step": 62900 + }, + { + "epoch": 4.94, + "learning_rate": 0.0005, + "loss": 2.128, + "step": 63000 + }, + { + "epoch": 4.95, + "learning_rate": 0.0005, + "loss": 2.1438, + "step": 63100 + }, + { + "epoch": 4.96, + "learning_rate": 0.0005, + "loss": 2.119, + "step": 63200 + }, + { + "epoch": 4.96, + "learning_rate": 0.0005, + "loss": 2.1456, + "step": 63300 + }, + { + "epoch": 4.97, + "learning_rate": 0.0005, + "loss": 2.1314, + "step": 63400 + }, + { + "epoch": 4.98, + "learning_rate": 0.0005, + "loss": 2.1293, + "step": 63500 + }, + { + "epoch": 4.99, + "learning_rate": 0.0005, + "loss": 2.1111, + "step": 63600 + }, + { + "epoch": 5.0, + "learning_rate": 0.0005, + "loss": 2.1195, + "step": 63700 + }, + { + "epoch": 5.0, + "learning_rate": 0.0005, + "loss": 2.1054, + "step": 63800 + }, + { + "epoch": 5.01, + "learning_rate": 0.0005, + "loss": 2.0384, + "step": 63900 + }, + { + "epoch": 5.02, + "learning_rate": 0.0005, + "loss": 2.034, + "step": 64000 + }, + { + "epoch": 5.03, + "learning_rate": 0.0005, + "loss": 2.0256, + "step": 64100 + }, + { + "epoch": 5.03, + "learning_rate": 0.0005, + "loss": 2.0358, + "step": 64200 + }, + { + "epoch": 5.04, + "learning_rate": 0.0005, + "loss": 2.0021, + "step": 64300 + }, + { + "epoch": 5.05, + "learning_rate": 0.0005, + "loss": 2.0582, + "step": 64400 + }, + { + "epoch": 5.06, + "learning_rate": 0.0005, + "loss": 2.0526, + "step": 64500 + }, + { + "epoch": 5.07, + "learning_rate": 0.0005, + "loss": 2.0456, + "step": 64600 + }, + { + "epoch": 5.07, + "learning_rate": 0.0005, + "loss": 2.0602, + "step": 64700 + }, + { + "epoch": 5.08, + "learning_rate": 0.0005, + "loss": 2.0504, + "step": 64800 + }, + { + "epoch": 5.09, + "learning_rate": 0.0005, + "loss": 2.0517, + "step": 64900 + }, + { + "epoch": 5.1, + "learning_rate": 0.0005, + "loss": 2.0499, + "step": 65000 + }, + { + "epoch": 5.11, + "learning_rate": 0.0005, + "loss": 2.0296, + "step": 65100 + }, + { + "epoch": 5.11, + "learning_rate": 0.0005, + "loss": 2.054, + "step": 65200 + }, + { + "epoch": 5.12, + "learning_rate": 0.0005, + "loss": 2.0454, + "step": 65300 + }, + { + "epoch": 5.13, + "learning_rate": 0.0005, + "loss": 2.0635, + "step": 65400 + }, + { + "epoch": 5.14, + "learning_rate": 0.0005, + "loss": 2.0474, + "step": 65500 + }, + { + "epoch": 5.14, + "learning_rate": 0.0005, + "loss": 2.0691, + "step": 65600 + }, + { + "epoch": 5.15, + "learning_rate": 0.0005, + "loss": 2.0333, + "step": 65700 + }, + { + "epoch": 5.16, + "learning_rate": 0.0005, + "loss": 2.0033, + "step": 65800 + }, + { + "epoch": 5.17, + "learning_rate": 0.0005, + "loss": 2.0756, + "step": 65900 + }, + { + "epoch": 5.18, + "learning_rate": 0.0005, + "loss": 2.0386, + "step": 66000 + }, + { + "epoch": 5.18, + "learning_rate": 0.0005, + "loss": 2.0326, + "step": 66100 + }, + { + "epoch": 5.19, + "learning_rate": 0.0005, + "loss": 2.0672, + "step": 66200 + }, + { + "epoch": 5.2, + "learning_rate": 0.0005, + "loss": 2.0822, + "step": 66300 + }, + { + "epoch": 5.21, + "learning_rate": 0.0005, + "loss": 2.0693, + "step": 66400 + }, + { + "epoch": 5.21, + "learning_rate": 0.0005, + "loss": 2.0563, + "step": 66500 + }, + { + "epoch": 5.22, + "learning_rate": 0.0005, + "loss": 2.0756, + "step": 66600 + }, + { + "epoch": 5.23, + "learning_rate": 0.0005, + "loss": 2.0132, + "step": 66700 + }, + { + "epoch": 5.24, + "learning_rate": 0.0005, + "loss": 2.0786, + "step": 66800 + }, + { + "epoch": 5.25, + "learning_rate": 0.0005, + "loss": 2.067, + "step": 66900 + }, + { + "epoch": 5.25, + "learning_rate": 0.0005, + "loss": 2.059, + "step": 67000 + }, + { + "epoch": 5.26, + "learning_rate": 0.0005, + "loss": 2.0199, + "step": 67100 + }, + { + "epoch": 5.27, + "learning_rate": 0.0005, + "loss": 2.0458, + "step": 67200 + }, + { + "epoch": 5.28, + "learning_rate": 0.0005, + "loss": 2.0783, + "step": 67300 + }, + { + "epoch": 5.29, + "learning_rate": 0.0005, + "loss": 2.076, + "step": 67400 + }, + { + "epoch": 5.29, + "learning_rate": 0.0005, + "loss": 2.0714, + "step": 67500 + }, + { + "epoch": 5.3, + "learning_rate": 0.0005, + "loss": 2.0723, + "step": 67600 + }, + { + "epoch": 5.31, + "learning_rate": 0.0005, + "loss": 2.0727, + "step": 67700 + }, + { + "epoch": 5.32, + "learning_rate": 0.0005, + "loss": 2.0356, + "step": 67800 + }, + { + "epoch": 5.32, + "learning_rate": 0.0005, + "loss": 2.0634, + "step": 67900 + }, + { + "epoch": 5.33, + "learning_rate": 0.0005, + "loss": 2.0605, + "step": 68000 + }, + { + "epoch": 5.34, + "learning_rate": 0.0005, + "loss": 2.0342, + "step": 68100 + }, + { + "epoch": 5.35, + "learning_rate": 0.0005, + "loss": 2.0354, + "step": 68200 + }, + { + "epoch": 5.36, + "learning_rate": 0.0005, + "loss": 2.0479, + "step": 68300 + }, + { + "epoch": 5.36, + "learning_rate": 0.0005, + "loss": 2.0752, + "step": 68400 + }, + { + "epoch": 5.37, + "learning_rate": 0.0005, + "loss": 2.0633, + "step": 68500 + }, + { + "epoch": 5.38, + "learning_rate": 0.0005, + "loss": 2.0621, + "step": 68600 + }, + { + "epoch": 5.39, + "learning_rate": 0.0005, + "loss": 2.0963, + "step": 68700 + }, + { + "epoch": 5.4, + "learning_rate": 0.0005, + "loss": 2.0484, + "step": 68800 + }, + { + "epoch": 5.4, + "learning_rate": 0.0005, + "loss": 2.0551, + "step": 68900 + }, + { + "epoch": 5.41, + "learning_rate": 0.0005, + "loss": 2.0968, + "step": 69000 + }, + { + "epoch": 5.42, + "learning_rate": 0.0005, + "loss": 2.0809, + "step": 69100 + }, + { + "epoch": 5.43, + "learning_rate": 0.0005, + "loss": 2.0783, + "step": 69200 + }, + { + "epoch": 5.43, + "learning_rate": 0.0005, + "loss": 2.0593, + "step": 69300 + }, + { + "epoch": 5.44, + "learning_rate": 0.0005, + "loss": 2.0443, + "step": 69400 + }, + { + "epoch": 5.45, + "learning_rate": 0.0005, + "loss": 2.0586, + "step": 69500 + }, + { + "epoch": 5.46, + "learning_rate": 0.0005, + "loss": 2.0708, + "step": 69600 + }, + { + "epoch": 5.47, + "learning_rate": 0.0005, + "loss": 2.079, + "step": 69700 + }, + { + "epoch": 5.47, + "learning_rate": 0.0005, + "loss": 2.0888, + "step": 69800 + }, + { + "epoch": 5.48, + "learning_rate": 0.0005, + "loss": 2.0972, + "step": 69900 + }, + { + "epoch": 5.49, + "learning_rate": 0.0005, + "loss": 2.063, + "step": 70000 + }, + { + "epoch": 5.49, + "eval_gen_len": 18.784497219034165, + "eval_loss": 2.068556785583496, + "eval_rouge1": 33.6877, + "eval_rouge2": 12.6196, + "eval_rougeL": 27.5291, + "eval_rougeLsum": 27.5307, + "eval_runtime": 359.5783, + "eval_samples_per_second": 31.501, + "eval_steps_per_second": 1.969, + "step": 70000 + }, + { + "epoch": 5.5, + "learning_rate": 0.0005, + "loss": 2.0793, + "step": 70100 + }, + { + "epoch": 5.51, + "learning_rate": 0.0005, + "loss": 2.0636, + "step": 70200 + }, + { + "epoch": 5.51, + "learning_rate": 0.0005, + "loss": 2.0758, + "step": 70300 + }, + { + "epoch": 5.52, + "learning_rate": 0.0005, + "loss": 2.0628, + "step": 70400 + }, + { + "epoch": 5.53, + "learning_rate": 0.0005, + "loss": 2.0622, + "step": 70500 + }, + { + "epoch": 5.54, + "learning_rate": 0.0005, + "loss": 2.0957, + "step": 70600 + }, + { + "epoch": 5.54, + "learning_rate": 0.0005, + "loss": 2.096, + "step": 70700 + }, + { + "epoch": 5.55, + "learning_rate": 0.0005, + "loss": 2.0606, + "step": 70800 + }, + { + "epoch": 5.56, + "learning_rate": 0.0005, + "loss": 2.1034, + "step": 70900 + }, + { + "epoch": 5.57, + "learning_rate": 0.0005, + "loss": 2.0591, + "step": 71000 + }, + { + "epoch": 5.58, + "learning_rate": 0.0005, + "loss": 2.0884, + "step": 71100 + }, + { + "epoch": 5.58, + "learning_rate": 0.0005, + "loss": 2.0633, + "step": 71200 + }, + { + "epoch": 5.59, + "learning_rate": 0.0005, + "loss": 2.0562, + "step": 71300 + }, + { + "epoch": 5.6, + "learning_rate": 0.0005, + "loss": 2.0772, + "step": 71400 + }, + { + "epoch": 5.61, + "learning_rate": 0.0005, + "loss": 2.126, + "step": 71500 + }, + { + "epoch": 5.61, + "learning_rate": 0.0005, + "loss": 2.0717, + "step": 71600 + }, + { + "epoch": 5.62, + "learning_rate": 0.0005, + "loss": 2.081, + "step": 71700 + }, + { + "epoch": 5.63, + "learning_rate": 0.0005, + "loss": 2.0639, + "step": 71800 + }, + { + "epoch": 5.64, + "learning_rate": 0.0005, + "loss": 2.1004, + "step": 71900 + }, + { + "epoch": 5.65, + "learning_rate": 0.0005, + "loss": 2.0877, + "step": 72000 + }, + { + "epoch": 5.65, + "learning_rate": 0.0005, + "loss": 2.0836, + "step": 72100 + }, + { + "epoch": 5.66, + "learning_rate": 0.0005, + "loss": 2.0649, + "step": 72200 + }, + { + "epoch": 5.67, + "learning_rate": 0.0005, + "loss": 2.0571, + "step": 72300 + }, + { + "epoch": 5.68, + "learning_rate": 0.0005, + "loss": 2.0496, + "step": 72400 + }, + { + "epoch": 5.69, + "learning_rate": 0.0005, + "loss": 2.0761, + "step": 72500 + }, + { + "epoch": 5.69, + "learning_rate": 0.0005, + "loss": 2.0765, + "step": 72600 + }, + { + "epoch": 5.7, + "learning_rate": 0.0005, + "loss": 2.0764, + "step": 72700 + }, + { + "epoch": 5.71, + "learning_rate": 0.0005, + "loss": 2.0524, + "step": 72800 + }, + { + "epoch": 5.72, + "learning_rate": 0.0005, + "loss": 2.0802, + "step": 72900 + }, + { + "epoch": 5.72, + "learning_rate": 0.0005, + "loss": 2.0551, + "step": 73000 + }, + { + "epoch": 5.73, + "learning_rate": 0.0005, + "loss": 2.0552, + "step": 73100 + }, + { + "epoch": 5.74, + "learning_rate": 0.0005, + "loss": 2.0579, + "step": 73200 + }, + { + "epoch": 5.75, + "learning_rate": 0.0005, + "loss": 2.0506, + "step": 73300 + }, + { + "epoch": 5.76, + "learning_rate": 0.0005, + "loss": 2.0771, + "step": 73400 + }, + { + "epoch": 5.76, + "learning_rate": 0.0005, + "loss": 2.0851, + "step": 73500 + }, + { + "epoch": 5.77, + "learning_rate": 0.0005, + "loss": 2.0828, + "step": 73600 + }, + { + "epoch": 5.78, + "learning_rate": 0.0005, + "loss": 2.0987, + "step": 73700 + }, + { + "epoch": 5.79, + "learning_rate": 0.0005, + "loss": 2.1023, + "step": 73800 + }, + { + "epoch": 5.8, + "learning_rate": 0.0005, + "loss": 2.0703, + "step": 73900 + }, + { + "epoch": 5.8, + "learning_rate": 0.0005, + "loss": 2.0784, + "step": 74000 + }, + { + "epoch": 5.81, + "learning_rate": 0.0005, + "loss": 2.0518, + "step": 74100 + }, + { + "epoch": 5.82, + "learning_rate": 0.0005, + "loss": 2.0901, + "step": 74200 + }, + { + "epoch": 5.83, + "learning_rate": 0.0005, + "loss": 2.0442, + "step": 74300 + }, + { + "epoch": 5.83, + "learning_rate": 0.0005, + "loss": 2.0926, + "step": 74400 + }, + { + "epoch": 5.84, + "learning_rate": 0.0005, + "loss": 2.0789, + "step": 74500 + }, + { + "epoch": 5.85, + "learning_rate": 0.0005, + "loss": 2.0929, + "step": 74600 + }, + { + "epoch": 5.86, + "learning_rate": 0.0005, + "loss": 2.0689, + "step": 74700 + }, + { + "epoch": 5.87, + "learning_rate": 0.0005, + "loss": 2.1078, + "step": 74800 + }, + { + "epoch": 5.87, + "learning_rate": 0.0005, + "loss": 2.0821, + "step": 74900 + }, + { + "epoch": 5.88, + "learning_rate": 0.0005, + "loss": 2.0605, + "step": 75000 + }, + { + "epoch": 5.89, + "learning_rate": 0.0005, + "loss": 2.0762, + "step": 75100 + }, + { + "epoch": 5.9, + "learning_rate": 0.0005, + "loss": 2.0827, + "step": 75200 + }, + { + "epoch": 5.9, + "learning_rate": 0.0005, + "loss": 2.0828, + "step": 75300 + }, + { + "epoch": 5.91, + "learning_rate": 0.0005, + "loss": 2.0767, + "step": 75400 + }, + { + "epoch": 5.92, + "learning_rate": 0.0005, + "loss": 2.0732, + "step": 75500 + }, + { + "epoch": 5.93, + "learning_rate": 0.0005, + "loss": 2.0692, + "step": 75600 + }, + { + "epoch": 5.94, + "learning_rate": 0.0005, + "loss": 2.0544, + "step": 75700 + }, + { + "epoch": 5.94, + "learning_rate": 0.0005, + "loss": 2.0736, + "step": 75800 + }, + { + "epoch": 5.95, + "learning_rate": 0.0005, + "loss": 2.0803, + "step": 75900 + }, + { + "epoch": 5.96, + "learning_rate": 0.0005, + "loss": 2.0889, + "step": 76000 + }, + { + "epoch": 5.97, + "learning_rate": 0.0005, + "loss": 2.0841, + "step": 76100 + }, + { + "epoch": 5.98, + "learning_rate": 0.0005, + "loss": 2.064, + "step": 76200 + }, + { + "epoch": 5.98, + "learning_rate": 0.0005, + "loss": 2.0575, + "step": 76300 + }, + { + "epoch": 5.99, + "learning_rate": 0.0005, + "loss": 2.0763, + "step": 76400 + }, + { + "epoch": 6.0, + "learning_rate": 0.0005, + "loss": 2.0605, + "step": 76500 + }, + { + "epoch": 6.01, + "learning_rate": 0.0005, + "loss": 2.0113, + "step": 76600 + }, + { + "epoch": 6.01, + "learning_rate": 0.0005, + "loss": 1.9809, + "step": 76700 + }, + { + "epoch": 6.02, + "learning_rate": 0.0005, + "loss": 2.0006, + "step": 76800 + }, + { + "epoch": 6.03, + "learning_rate": 0.0005, + "loss": 1.9569, + "step": 76900 + }, + { + "epoch": 6.04, + "learning_rate": 0.0005, + "loss": 1.985, + "step": 77000 + }, + { + "epoch": 6.05, + "learning_rate": 0.0005, + "loss": 1.9736, + "step": 77100 + }, + { + "epoch": 6.05, + "learning_rate": 0.0005, + "loss": 1.9773, + "step": 77200 + }, + { + "epoch": 6.06, + "learning_rate": 0.0005, + "loss": 2.0067, + "step": 77300 + }, + { + "epoch": 6.07, + "learning_rate": 0.0005, + "loss": 2.0069, + "step": 77400 + }, + { + "epoch": 6.08, + "learning_rate": 0.0005, + "loss": 2.0018, + "step": 77500 + }, + { + "epoch": 6.09, + "learning_rate": 0.0005, + "loss": 1.9895, + "step": 77600 + }, + { + "epoch": 6.09, + "learning_rate": 0.0005, + "loss": 1.9927, + "step": 77700 + }, + { + "epoch": 6.1, + "learning_rate": 0.0005, + "loss": 1.9861, + "step": 77800 + }, + { + "epoch": 6.11, + "learning_rate": 0.0005, + "loss": 1.9965, + "step": 77900 + }, + { + "epoch": 6.12, + "learning_rate": 0.0005, + "loss": 2.0095, + "step": 78000 + }, + { + "epoch": 6.12, + "learning_rate": 0.0005, + "loss": 2.0137, + "step": 78100 + }, + { + "epoch": 6.13, + "learning_rate": 0.0005, + "loss": 1.9971, + "step": 78200 + }, + { + "epoch": 6.14, + "learning_rate": 0.0005, + "loss": 2.0021, + "step": 78300 + }, + { + "epoch": 6.15, + "learning_rate": 0.0005, + "loss": 2.0098, + "step": 78400 + }, + { + "epoch": 6.16, + "learning_rate": 0.0005, + "loss": 2.029, + "step": 78500 + }, + { + "epoch": 6.16, + "learning_rate": 0.0005, + "loss": 2.0294, + "step": 78600 + }, + { + "epoch": 6.17, + "learning_rate": 0.0005, + "loss": 2.0184, + "step": 78700 + }, + { + "epoch": 6.18, + "learning_rate": 0.0005, + "loss": 2.0426, + "step": 78800 + }, + { + "epoch": 6.19, + "learning_rate": 0.0005, + "loss": 2.0107, + "step": 78900 + }, + { + "epoch": 6.2, + "learning_rate": 0.0005, + "loss": 2.0161, + "step": 79000 + }, + { + "epoch": 6.2, + "learning_rate": 0.0005, + "loss": 2.0044, + "step": 79100 + }, + { + "epoch": 6.21, + "learning_rate": 0.0005, + "loss": 2.0025, + "step": 79200 + }, + { + "epoch": 6.22, + "learning_rate": 0.0005, + "loss": 2.0023, + "step": 79300 + }, + { + "epoch": 6.23, + "learning_rate": 0.0005, + "loss": 2.0455, + "step": 79400 + }, + { + "epoch": 6.23, + "learning_rate": 0.0005, + "loss": 2.0443, + "step": 79500 + }, + { + "epoch": 6.24, + "learning_rate": 0.0005, + "loss": 2.0365, + "step": 79600 + }, + { + "epoch": 6.25, + "learning_rate": 0.0005, + "loss": 2.0225, + "step": 79700 + }, + { + "epoch": 6.26, + "learning_rate": 0.0005, + "loss": 2.0021, + "step": 79800 + }, + { + "epoch": 6.27, + "learning_rate": 0.0005, + "loss": 2.0114, + "step": 79900 + }, + { + "epoch": 6.27, + "learning_rate": 0.0005, + "loss": 1.982, + "step": 80000 + }, + { + "epoch": 6.27, + "eval_gen_len": 18.763926900326652, + "eval_loss": 2.057493209838867, + "eval_rouge1": 33.9585, + "eval_rouge2": 12.853, + "eval_rougeL": 27.7139, + "eval_rougeLsum": 27.7071, + "eval_runtime": 362.3051, + "eval_samples_per_second": 31.264, + "eval_steps_per_second": 1.954, + "step": 80000 + }, + { + "epoch": 6.28, + "learning_rate": 0.0005, + "loss": 2.0384, + "step": 80100 + }, + { + "epoch": 6.29, + "learning_rate": 0.0005, + "loss": 2.0101, + "step": 80200 + }, + { + "epoch": 6.3, + "learning_rate": 0.0005, + "loss": 1.9886, + "step": 80300 + }, + { + "epoch": 6.3, + "learning_rate": 0.0005, + "loss": 2.0217, + "step": 80400 + }, + { + "epoch": 6.31, + "learning_rate": 0.0005, + "loss": 1.9968, + "step": 80500 + }, + { + "epoch": 6.32, + "learning_rate": 0.0005, + "loss": 2.0091, + "step": 80600 + }, + { + "epoch": 6.33, + "learning_rate": 0.0005, + "loss": 2.0001, + "step": 80700 + }, + { + "epoch": 6.34, + "learning_rate": 0.0005, + "loss": 2.0355, + "step": 80800 + }, + { + "epoch": 6.34, + "learning_rate": 0.0005, + "loss": 2.0076, + "step": 80900 + }, + { + "epoch": 6.35, + "learning_rate": 0.0005, + "loss": 2.0297, + "step": 81000 + }, + { + "epoch": 6.36, + "learning_rate": 0.0005, + "loss": 1.9883, + "step": 81100 + }, + { + "epoch": 6.37, + "learning_rate": 0.0005, + "loss": 1.9958, + "step": 81200 + }, + { + "epoch": 6.38, + "learning_rate": 0.0005, + "loss": 2.0411, + "step": 81300 + }, + { + "epoch": 6.38, + "learning_rate": 0.0005, + "loss": 1.9781, + "step": 81400 + }, + { + "epoch": 6.39, + "learning_rate": 0.0005, + "loss": 2.047, + "step": 81500 + }, + { + "epoch": 6.4, + "learning_rate": 0.0005, + "loss": 2.0289, + "step": 81600 + }, + { + "epoch": 6.41, + "learning_rate": 0.0005, + "loss": 2.0189, + "step": 81700 + }, + { + "epoch": 6.41, + "learning_rate": 0.0005, + "loss": 2.086, + "step": 81800 + }, + { + "epoch": 6.42, + "learning_rate": 0.0005, + "loss": 2.0131, + "step": 81900 + }, + { + "epoch": 6.43, + "learning_rate": 0.0005, + "loss": 2.0036, + "step": 82000 + }, + { + "epoch": 6.44, + "learning_rate": 0.0005, + "loss": 2.0146, + "step": 82100 + }, + { + "epoch": 6.45, + "learning_rate": 0.0005, + "loss": 2.0362, + "step": 82200 + }, + { + "epoch": 6.45, + "learning_rate": 0.0005, + "loss": 2.0268, + "step": 82300 + }, + { + "epoch": 6.46, + "learning_rate": 0.0005, + "loss": 2.0321, + "step": 82400 + }, + { + "epoch": 6.47, + "learning_rate": 0.0005, + "loss": 2.0105, + "step": 82500 + }, + { + "epoch": 6.48, + "learning_rate": 0.0005, + "loss": 2.0522, + "step": 82600 + }, + { + "epoch": 6.49, + "learning_rate": 0.0005, + "loss": 2.0456, + "step": 82700 + }, + { + "epoch": 6.49, + "learning_rate": 0.0005, + "loss": 1.9935, + "step": 82800 + }, + { + "epoch": 6.5, + "learning_rate": 0.0005, + "loss": 2.041, + "step": 82900 + }, + { + "epoch": 6.51, + "learning_rate": 0.0005, + "loss": 1.9851, + "step": 83000 + }, + { + "epoch": 6.52, + "learning_rate": 0.0005, + "loss": 2.0574, + "step": 83100 + }, + { + "epoch": 6.52, + "learning_rate": 0.0005, + "loss": 2.0324, + "step": 83200 + }, + { + "epoch": 6.53, + "learning_rate": 0.0005, + "loss": 2.0367, + "step": 83300 + }, + { + "epoch": 6.54, + "learning_rate": 0.0005, + "loss": 2.0264, + "step": 83400 + }, + { + "epoch": 6.55, + "learning_rate": 0.0005, + "loss": 2.0314, + "step": 83500 + }, + { + "epoch": 6.56, + "learning_rate": 0.0005, + "loss": 1.9554, + "step": 83600 + }, + { + "epoch": 6.56, + "learning_rate": 0.0005, + "loss": 2.0183, + "step": 83700 + }, + { + "epoch": 6.57, + "learning_rate": 0.0005, + "loss": 2.0161, + "step": 83800 + }, + { + "epoch": 6.58, + "learning_rate": 0.0005, + "loss": 2.0337, + "step": 83900 + }, + { + "epoch": 6.59, + "learning_rate": 0.0005, + "loss": 2.0075, + "step": 84000 + }, + { + "epoch": 6.6, + "learning_rate": 0.0005, + "loss": 2.0191, + "step": 84100 + }, + { + "epoch": 6.6, + "learning_rate": 0.0005, + "loss": 2.0136, + "step": 84200 + }, + { + "epoch": 6.61, + "learning_rate": 0.0005, + "loss": 2.0535, + "step": 84300 + }, + { + "epoch": 6.62, + "learning_rate": 0.0005, + "loss": 2.0413, + "step": 84400 + }, + { + "epoch": 6.63, + "learning_rate": 0.0005, + "loss": 2.0472, + "step": 84500 + }, + { + "epoch": 6.63, + "learning_rate": 0.0005, + "loss": 2.0352, + "step": 84600 + }, + { + "epoch": 6.64, + "learning_rate": 0.0005, + "loss": 2.0219, + "step": 84700 + }, + { + "epoch": 6.65, + "learning_rate": 0.0005, + "loss": 2.0327, + "step": 84800 + }, + { + "epoch": 6.66, + "learning_rate": 0.0005, + "loss": 2.0169, + "step": 84900 + }, + { + "epoch": 6.67, + "learning_rate": 0.0005, + "loss": 2.0091, + "step": 85000 + }, + { + "epoch": 6.67, + "learning_rate": 0.0005, + "loss": 2.0307, + "step": 85100 + }, + { + "epoch": 6.68, + "learning_rate": 0.0005, + "loss": 2.0415, + "step": 85200 + }, + { + "epoch": 6.69, + "learning_rate": 0.0005, + "loss": 1.9875, + "step": 85300 + }, + { + "epoch": 6.7, + "learning_rate": 0.0005, + "loss": 2.0242, + "step": 85400 + }, + { + "epoch": 6.7, + "learning_rate": 0.0005, + "loss": 2.0333, + "step": 85500 + }, + { + "epoch": 6.71, + "learning_rate": 0.0005, + "loss": 2.0341, + "step": 85600 + }, + { + "epoch": 6.72, + "learning_rate": 0.0005, + "loss": 2.0467, + "step": 85700 + }, + { + "epoch": 6.73, + "learning_rate": 0.0005, + "loss": 2.0223, + "step": 85800 + }, + { + "epoch": 6.74, + "learning_rate": 0.0005, + "loss": 2.0483, + "step": 85900 + }, + { + "epoch": 6.74, + "learning_rate": 0.0005, + "loss": 2.0412, + "step": 86000 + }, + { + "epoch": 6.75, + "learning_rate": 0.0005, + "loss": 2.0299, + "step": 86100 + }, + { + "epoch": 6.76, + "learning_rate": 0.0005, + "loss": 2.0148, + "step": 86200 + }, + { + "epoch": 6.77, + "learning_rate": 0.0005, + "loss": 2.0199, + "step": 86300 + }, + { + "epoch": 6.78, + "learning_rate": 0.0005, + "loss": 2.0394, + "step": 86400 + }, + { + "epoch": 6.78, + "learning_rate": 0.0005, + "loss": 2.0118, + "step": 86500 + }, + { + "epoch": 6.79, + "learning_rate": 0.0005, + "loss": 2.0217, + "step": 86600 + }, + { + "epoch": 6.8, + "learning_rate": 0.0005, + "loss": 2.063, + "step": 86700 + }, + { + "epoch": 6.81, + "learning_rate": 0.0005, + "loss": 2.0358, + "step": 86800 + }, + { + "epoch": 6.81, + "learning_rate": 0.0005, + "loss": 2.0223, + "step": 86900 + }, + { + "epoch": 6.82, + "learning_rate": 0.0005, + "loss": 2.0308, + "step": 87000 + }, + { + "epoch": 6.83, + "learning_rate": 0.0005, + "loss": 2.0555, + "step": 87100 + }, + { + "epoch": 6.84, + "learning_rate": 0.0005, + "loss": 2.0664, + "step": 87200 + }, + { + "epoch": 6.85, + "learning_rate": 0.0005, + "loss": 2.0429, + "step": 87300 + }, + { + "epoch": 6.85, + "learning_rate": 0.0005, + "loss": 2.0329, + "step": 87400 + }, + { + "epoch": 6.86, + "learning_rate": 0.0005, + "loss": 2.0086, + "step": 87500 + }, + { + "epoch": 6.87, + "learning_rate": 0.0005, + "loss": 2.0284, + "step": 87600 + }, + { + "epoch": 6.88, + "learning_rate": 0.0005, + "loss": 2.0535, + "step": 87700 + }, + { + "epoch": 6.89, + "learning_rate": 0.0005, + "loss": 2.0376, + "step": 87800 + }, + { + "epoch": 6.89, + "learning_rate": 0.0005, + "loss": 2.0191, + "step": 87900 + }, + { + "epoch": 6.9, + "learning_rate": 0.0005, + "loss": 2.0417, + "step": 88000 + }, + { + "epoch": 6.91, + "learning_rate": 0.0005, + "loss": 2.0254, + "step": 88100 + }, + { + "epoch": 6.92, + "learning_rate": 0.0005, + "loss": 2.0469, + "step": 88200 + }, + { + "epoch": 6.92, + "learning_rate": 0.0005, + "loss": 2.0422, + "step": 88300 + }, + { + "epoch": 6.93, + "learning_rate": 0.0005, + "loss": 2.0291, + "step": 88400 + }, + { + "epoch": 6.94, + "learning_rate": 0.0005, + "loss": 2.0549, + "step": 88500 + }, + { + "epoch": 6.95, + "learning_rate": 0.0005, + "loss": 2.0494, + "step": 88600 + }, + { + "epoch": 6.96, + "learning_rate": 0.0005, + "loss": 2.0522, + "step": 88700 + }, + { + "epoch": 6.96, + "learning_rate": 0.0005, + "loss": 2.0315, + "step": 88800 + }, + { + "epoch": 6.97, + "learning_rate": 0.0005, + "loss": 2.0284, + "step": 88900 + }, + { + "epoch": 6.98, + "learning_rate": 0.0005, + "loss": 2.0619, + "step": 89000 + }, + { + "epoch": 6.99, + "learning_rate": 0.0005, + "loss": 2.0335, + "step": 89100 + }, + { + "epoch": 6.99, + "learning_rate": 0.0005, + "loss": 2.0259, + "step": 89200 + }, + { + "epoch": 7.0, + "learning_rate": 0.0005, + "loss": 2.0142, + "step": 89300 + }, + { + "epoch": 7.01, + "learning_rate": 0.0005, + "loss": 1.9777, + "step": 89400 + }, + { + "epoch": 7.02, + "learning_rate": 0.0005, + "loss": 1.9655, + "step": 89500 + }, + { + "epoch": 7.03, + "learning_rate": 0.0005, + "loss": 1.9457, + "step": 89600 + }, + { + "epoch": 7.03, + "learning_rate": 0.0005, + "loss": 1.9775, + "step": 89700 + }, + { + "epoch": 7.04, + "learning_rate": 0.0005, + "loss": 1.9603, + "step": 89800 + }, + { + "epoch": 7.05, + "learning_rate": 0.0005, + "loss": 1.9705, + "step": 89900 + }, + { + "epoch": 7.06, + "learning_rate": 0.0005, + "loss": 1.9568, + "step": 90000 + }, + { + "epoch": 7.06, + "eval_gen_len": 18.745916835878873, + "eval_loss": 2.060124635696411, + "eval_rouge1": 34.3703, + "eval_rouge2": 13.056, + "eval_rougeL": 28.0273, + "eval_rougeLsum": 28.0164, + "eval_runtime": 360.3535, + "eval_samples_per_second": 31.433, + "eval_steps_per_second": 1.965, + "step": 90000 + }, + { + "epoch": 7.07, + "learning_rate": 0.0005, + "loss": 1.95, + "step": 90100 + }, + { + "epoch": 7.07, + "learning_rate": 0.0005, + "loss": 1.9641, + "step": 90200 + }, + { + "epoch": 7.08, + "learning_rate": 0.0005, + "loss": 1.9718, + "step": 90300 + }, + { + "epoch": 7.09, + "learning_rate": 0.0005, + "loss": 1.9298, + "step": 90400 + }, + { + "epoch": 7.1, + "learning_rate": 0.0005, + "loss": 1.926, + "step": 90500 + }, + { + "epoch": 7.1, + "learning_rate": 0.0005, + "loss": 1.9711, + "step": 90600 + }, + { + "epoch": 7.11, + "learning_rate": 0.0005, + "loss": 1.955, + "step": 90700 + }, + { + "epoch": 7.12, + "learning_rate": 0.0005, + "loss": 1.9411, + "step": 90800 + }, + { + "epoch": 7.13, + "learning_rate": 0.0005, + "loss": 1.9471, + "step": 90900 + }, + { + "epoch": 7.14, + "learning_rate": 0.0005, + "loss": 1.9949, + "step": 91000 + }, + { + "epoch": 7.14, + "learning_rate": 0.0005, + "loss": 1.9662, + "step": 91100 + }, + { + "epoch": 7.15, + "learning_rate": 0.0005, + "loss": 1.9512, + "step": 91200 + }, + { + "epoch": 7.16, + "learning_rate": 0.0005, + "loss": 1.9485, + "step": 91300 + }, + { + "epoch": 7.17, + "learning_rate": 0.0005, + "loss": 1.9587, + "step": 91400 + }, + { + "epoch": 7.18, + "learning_rate": 0.0005, + "loss": 2.0031, + "step": 91500 + }, + { + "epoch": 7.18, + "learning_rate": 0.0005, + "loss": 1.9903, + "step": 91600 + }, + { + "epoch": 7.19, + "learning_rate": 0.0005, + "loss": 1.9852, + "step": 91700 + }, + { + "epoch": 7.2, + "learning_rate": 0.0005, + "loss": 1.9856, + "step": 91800 + }, + { + "epoch": 7.21, + "learning_rate": 0.0005, + "loss": 1.9691, + "step": 91900 + }, + { + "epoch": 7.21, + "learning_rate": 0.0005, + "loss": 1.9728, + "step": 92000 + }, + { + "epoch": 7.22, + "learning_rate": 0.0005, + "loss": 1.9831, + "step": 92100 + }, + { + "epoch": 7.23, + "learning_rate": 0.0005, + "loss": 1.9617, + "step": 92200 + }, + { + "epoch": 7.24, + "learning_rate": 0.0005, + "loss": 1.9783, + "step": 92300 + }, + { + "epoch": 7.25, + "learning_rate": 0.0005, + "loss": 1.9817, + "step": 92400 + }, + { + "epoch": 7.25, + "learning_rate": 0.0005, + "loss": 1.9759, + "step": 92500 + }, + { + "epoch": 7.26, + "learning_rate": 0.0005, + "loss": 1.9912, + "step": 92600 + }, + { + "epoch": 7.27, + "learning_rate": 0.0005, + "loss": 1.9836, + "step": 92700 + }, + { + "epoch": 7.28, + "learning_rate": 0.0005, + "loss": 1.9792, + "step": 92800 + }, + { + "epoch": 7.29, + "learning_rate": 0.0005, + "loss": 1.9728, + "step": 92900 + }, + { + "epoch": 7.29, + "learning_rate": 0.0005, + "loss": 2.0051, + "step": 93000 + }, + { + "epoch": 7.3, + "learning_rate": 0.0005, + "loss": 1.9884, + "step": 93100 + }, + { + "epoch": 7.31, + "learning_rate": 0.0005, + "loss": 1.9679, + "step": 93200 + }, + { + "epoch": 7.32, + "learning_rate": 0.0005, + "loss": 1.9732, + "step": 93300 + }, + { + "epoch": 7.32, + "learning_rate": 0.0005, + "loss": 1.9627, + "step": 93400 + }, + { + "epoch": 7.33, + "learning_rate": 0.0005, + "loss": 1.9745, + "step": 93500 + }, + { + "epoch": 7.34, + "learning_rate": 0.0005, + "loss": 1.9982, + "step": 93600 + }, + { + "epoch": 7.35, + "learning_rate": 0.0005, + "loss": 1.9901, + "step": 93700 + }, + { + "epoch": 7.36, + "learning_rate": 0.0005, + "loss": 1.9544, + "step": 93800 + }, + { + "epoch": 7.36, + "learning_rate": 0.0005, + "loss": 1.9727, + "step": 93900 + }, + { + "epoch": 7.37, + "learning_rate": 0.0005, + "loss": 1.9721, + "step": 94000 + }, + { + "epoch": 7.38, + "learning_rate": 0.0005, + "loss": 1.9807, + "step": 94100 + }, + { + "epoch": 7.39, + "learning_rate": 0.0005, + "loss": 1.937, + "step": 94200 + }, + { + "epoch": 7.39, + "learning_rate": 0.0005, + "loss": 1.9661, + "step": 94300 + }, + { + "epoch": 7.4, + "learning_rate": 0.0005, + "loss": 1.9868, + "step": 94400 + }, + { + "epoch": 7.41, + "learning_rate": 0.0005, + "loss": 1.9727, + "step": 94500 + }, + { + "epoch": 7.42, + "learning_rate": 0.0005, + "loss": 1.9726, + "step": 94600 + }, + { + "epoch": 7.43, + "learning_rate": 0.0005, + "loss": 1.9914, + "step": 94700 + }, + { + "epoch": 7.43, + "learning_rate": 0.0005, + "loss": 1.9804, + "step": 94800 + }, + { + "epoch": 7.44, + "learning_rate": 0.0005, + "loss": 1.9533, + "step": 94900 + }, + { + "epoch": 7.45, + "learning_rate": 0.0005, + "loss": 1.9625, + "step": 95000 + }, + { + "epoch": 7.46, + "learning_rate": 0.0005, + "loss": 1.986, + "step": 95100 + }, + { + "epoch": 7.47, + "learning_rate": 0.0005, + "loss": 1.9745, + "step": 95200 + }, + { + "epoch": 7.47, + "learning_rate": 0.0005, + "loss": 2.0023, + "step": 95300 + }, + { + "epoch": 7.48, + "learning_rate": 0.0005, + "loss": 1.9857, + "step": 95400 + }, + { + "epoch": 7.49, + "learning_rate": 0.0005, + "loss": 1.9837, + "step": 95500 + }, + { + "epoch": 7.5, + "learning_rate": 0.0005, + "loss": 1.9924, + "step": 95600 + }, + { + "epoch": 7.5, + "learning_rate": 0.0005, + "loss": 1.9984, + "step": 95700 + }, + { + "epoch": 7.51, + "learning_rate": 0.0005, + "loss": 1.9832, + "step": 95800 + }, + { + "epoch": 7.52, + "learning_rate": 0.0005, + "loss": 1.9741, + "step": 95900 + }, + { + "epoch": 7.53, + "learning_rate": 0.0005, + "loss": 1.9719, + "step": 96000 + }, + { + "epoch": 7.54, + "learning_rate": 0.0005, + "loss": 1.9789, + "step": 96100 + }, + { + "epoch": 7.54, + "learning_rate": 0.0005, + "loss": 1.9968, + "step": 96200 + }, + { + "epoch": 7.55, + "learning_rate": 0.0005, + "loss": 1.9551, + "step": 96300 + }, + { + "epoch": 7.56, + "learning_rate": 0.0005, + "loss": 2.0159, + "step": 96400 + }, + { + "epoch": 7.57, + "learning_rate": 0.0005, + "loss": 1.9721, + "step": 96500 + }, + { + "epoch": 7.58, + "learning_rate": 0.0005, + "loss": 1.9896, + "step": 96600 + }, + { + "epoch": 7.58, + "learning_rate": 0.0005, + "loss": 2.004, + "step": 96700 + }, + { + "epoch": 7.59, + "learning_rate": 0.0005, + "loss": 1.9564, + "step": 96800 + }, + { + "epoch": 7.6, + "learning_rate": 0.0005, + "loss": 1.9491, + "step": 96900 + }, + { + "epoch": 7.61, + "learning_rate": 0.0005, + "loss": 1.9866, + "step": 97000 + }, + { + "epoch": 7.61, + "learning_rate": 0.0005, + "loss": 1.9746, + "step": 97100 + }, + { + "epoch": 7.62, + "learning_rate": 0.0005, + "loss": 1.9724, + "step": 97200 + }, + { + "epoch": 7.63, + "learning_rate": 0.0005, + "loss": 1.9936, + "step": 97300 + }, + { + "epoch": 7.64, + "learning_rate": 0.0005, + "loss": 1.9812, + "step": 97400 + }, + { + "epoch": 7.65, + "learning_rate": 0.0005, + "loss": 2.0067, + "step": 97500 + }, + { + "epoch": 7.65, + "learning_rate": 0.0005, + "loss": 1.9797, + "step": 97600 + }, + { + "epoch": 7.66, + "learning_rate": 0.0005, + "loss": 2.0005, + "step": 97700 + }, + { + "epoch": 7.67, + "learning_rate": 0.0005, + "loss": 1.9963, + "step": 97800 + }, + { + "epoch": 7.68, + "learning_rate": 0.0005, + "loss": 2.0044, + "step": 97900 + }, + { + "epoch": 7.69, + "learning_rate": 0.0005, + "loss": 1.9896, + "step": 98000 + }, + { + "epoch": 7.69, + "learning_rate": 0.0005, + "loss": 1.9853, + "step": 98100 + }, + { + "epoch": 7.7, + "learning_rate": 0.0005, + "loss": 1.9957, + "step": 98200 + }, + { + "epoch": 7.71, + "learning_rate": 0.0005, + "loss": 2.0128, + "step": 98300 + }, + { + "epoch": 7.72, + "learning_rate": 0.0005, + "loss": 1.9685, + "step": 98400 + }, + { + "epoch": 7.72, + "learning_rate": 0.0005, + "loss": 2.0051, + "step": 98500 + }, + { + "epoch": 7.73, + "learning_rate": 0.0005, + "loss": 1.9703, + "step": 98600 + }, + { + "epoch": 7.74, + "learning_rate": 0.0005, + "loss": 2.0022, + "step": 98700 + }, + { + "epoch": 7.75, + "learning_rate": 0.0005, + "loss": 2.0205, + "step": 98800 + }, + { + "epoch": 7.76, + "learning_rate": 0.0005, + "loss": 2.0008, + "step": 98900 + }, + { + "epoch": 7.76, + "learning_rate": 0.0005, + "loss": 2.0121, + "step": 99000 + }, + { + "epoch": 7.77, + "learning_rate": 0.0005, + "loss": 2.0063, + "step": 99100 + }, + { + "epoch": 7.78, + "learning_rate": 0.0005, + "loss": 1.9981, + "step": 99200 + }, + { + "epoch": 7.79, + "learning_rate": 0.0005, + "loss": 1.9838, + "step": 99300 + }, + { + "epoch": 7.79, + "learning_rate": 0.0005, + "loss": 1.9923, + "step": 99400 + }, + { + "epoch": 7.8, + "learning_rate": 0.0005, + "loss": 1.9959, + "step": 99500 + }, + { + "epoch": 7.81, + "learning_rate": 0.0005, + "loss": 1.9924, + "step": 99600 + }, + { + "epoch": 7.82, + "learning_rate": 0.0005, + "loss": 2.0133, + "step": 99700 + }, + { + "epoch": 7.83, + "learning_rate": 0.0005, + "loss": 1.9995, + "step": 99800 + }, + { + "epoch": 7.83, + "learning_rate": 0.0005, + "loss": 2.0026, + "step": 99900 + }, + { + "epoch": 7.84, + "learning_rate": 0.0005, + "loss": 1.9736, + "step": 100000 + }, + { + "epoch": 7.84, + "eval_gen_len": 18.74485742032312, + "eval_loss": 2.0384180545806885, + "eval_rouge1": 34.1724, + "eval_rouge2": 13.072, + "eval_rougeL": 27.9429, + "eval_rougeLsum": 27.9294, + "eval_runtime": 358.3007, + "eval_samples_per_second": 31.613, + "eval_steps_per_second": 1.976, + "step": 100000 + }, + { + "epoch": 7.85, + "learning_rate": 0.0005, + "loss": 2.0139, + "step": 100100 + }, + { + "epoch": 7.86, + "learning_rate": 0.0005, + "loss": 2.0064, + "step": 100200 + }, + { + "epoch": 7.87, + "learning_rate": 0.0005, + "loss": 1.9845, + "step": 100300 + }, + { + "epoch": 7.87, + "learning_rate": 0.0005, + "loss": 1.9921, + "step": 100400 + }, + { + "epoch": 7.88, + "learning_rate": 0.0005, + "loss": 1.9755, + "step": 100500 + }, + { + "epoch": 7.89, + "learning_rate": 0.0005, + "loss": 2.0093, + "step": 100600 + }, + { + "epoch": 7.9, + "learning_rate": 0.0005, + "loss": 2.0127, + "step": 100700 + }, + { + "epoch": 7.9, + "learning_rate": 0.0005, + "loss": 1.9878, + "step": 100800 + }, + { + "epoch": 7.91, + "learning_rate": 0.0005, + "loss": 1.9925, + "step": 100900 + }, + { + "epoch": 7.92, + "learning_rate": 0.0005, + "loss": 1.9798, + "step": 101000 + }, + { + "epoch": 7.93, + "learning_rate": 0.0005, + "loss": 2.0125, + "step": 101100 + }, + { + "epoch": 7.94, + "learning_rate": 0.0005, + "loss": 2.012, + "step": 101200 + }, + { + "epoch": 7.94, + "learning_rate": 0.0005, + "loss": 1.9951, + "step": 101300 + }, + { + "epoch": 7.95, + "learning_rate": 0.0005, + "loss": 2.0181, + "step": 101400 + }, + { + "epoch": 7.96, + "learning_rate": 0.0005, + "loss": 2.0121, + "step": 101500 + }, + { + "epoch": 7.97, + "learning_rate": 0.0005, + "loss": 2.0009, + "step": 101600 + }, + { + "epoch": 7.98, + "learning_rate": 0.0005, + "loss": 1.9855, + "step": 101700 + }, + { + "epoch": 7.98, + "learning_rate": 0.0005, + "loss": 1.9993, + "step": 101800 + }, + { + "epoch": 7.99, + "learning_rate": 0.0005, + "loss": 1.9784, + "step": 101900 + }, + { + "epoch": 8.0, + "learning_rate": 0.0005, + "loss": 1.9794, + "step": 102000 + }, + { + "epoch": 8.01, + "learning_rate": 0.0005, + "loss": 1.9454, + "step": 102100 + }, + { + "epoch": 8.01, + "learning_rate": 0.0005, + "loss": 1.9057, + "step": 102200 + }, + { + "epoch": 8.02, + "learning_rate": 0.0005, + "loss": 1.912, + "step": 102300 + }, + { + "epoch": 8.03, + "learning_rate": 0.0005, + "loss": 1.9005, + "step": 102400 + }, + { + "epoch": 8.04, + "learning_rate": 0.0005, + "loss": 1.9037, + "step": 102500 + }, + { + "epoch": 8.05, + "learning_rate": 0.0005, + "loss": 1.9229, + "step": 102600 + }, + { + "epoch": 8.05, + "learning_rate": 0.0005, + "loss": 1.9096, + "step": 102700 + }, + { + "epoch": 8.06, + "learning_rate": 0.0005, + "loss": 1.9353, + "step": 102800 + }, + { + "epoch": 8.07, + "learning_rate": 0.0005, + "loss": 1.9332, + "step": 102900 + }, + { + "epoch": 8.08, + "learning_rate": 0.0005, + "loss": 1.9093, + "step": 103000 + }, + { + "epoch": 8.09, + "learning_rate": 0.0005, + "loss": 1.9085, + "step": 103100 + }, + { + "epoch": 8.09, + "learning_rate": 0.0005, + "loss": 1.9351, + "step": 103200 + }, + { + "epoch": 8.1, + "learning_rate": 0.0005, + "loss": 1.9295, + "step": 103300 + }, + { + "epoch": 8.11, + "learning_rate": 0.0005, + "loss": 1.919, + "step": 103400 + }, + { + "epoch": 8.12, + "learning_rate": 0.0005, + "loss": 1.9265, + "step": 103500 + }, + { + "epoch": 8.12, + "learning_rate": 0.0005, + "loss": 1.923, + "step": 103600 + }, + { + "epoch": 8.13, + "learning_rate": 0.0005, + "loss": 1.9284, + "step": 103700 + }, + { + "epoch": 8.14, + "learning_rate": 0.0005, + "loss": 1.9155, + "step": 103800 + }, + { + "epoch": 8.15, + "learning_rate": 0.0005, + "loss": 1.9245, + "step": 103900 + }, + { + "epoch": 8.16, + "learning_rate": 0.0005, + "loss": 1.9388, + "step": 104000 + }, + { + "epoch": 8.16, + "learning_rate": 0.0005, + "loss": 1.9247, + "step": 104100 + }, + { + "epoch": 8.17, + "learning_rate": 0.0005, + "loss": 1.9535, + "step": 104200 + }, + { + "epoch": 8.18, + "learning_rate": 0.0005, + "loss": 1.9312, + "step": 104300 + }, + { + "epoch": 8.19, + "learning_rate": 0.0005, + "loss": 1.924, + "step": 104400 + }, + { + "epoch": 8.19, + "learning_rate": 0.0005, + "loss": 1.944, + "step": 104500 + }, + { + "epoch": 8.2, + "learning_rate": 0.0005, + "loss": 1.9328, + "step": 104600 + }, + { + "epoch": 8.21, + "learning_rate": 0.0005, + "loss": 1.926, + "step": 104700 + }, + { + "epoch": 8.22, + "learning_rate": 0.0005, + "loss": 1.9264, + "step": 104800 + }, + { + "epoch": 8.23, + "learning_rate": 0.0005, + "loss": 1.92, + "step": 104900 + }, + { + "epoch": 8.23, + "learning_rate": 0.0005, + "loss": 1.94, + "step": 105000 + }, + { + "epoch": 8.24, + "learning_rate": 0.0005, + "loss": 1.9447, + "step": 105100 + }, + { + "epoch": 8.25, + "learning_rate": 0.0005, + "loss": 1.9307, + "step": 105200 + }, + { + "epoch": 8.26, + "learning_rate": 0.0005, + "loss": 1.9461, + "step": 105300 + }, + { + "epoch": 8.27, + "learning_rate": 0.0005, + "loss": 1.9426, + "step": 105400 + }, + { + "epoch": 8.27, + "learning_rate": 0.0005, + "loss": 1.9638, + "step": 105500 + }, + { + "epoch": 8.28, + "learning_rate": 0.0005, + "loss": 1.9381, + "step": 105600 + }, + { + "epoch": 8.29, + "learning_rate": 0.0005, + "loss": 1.9362, + "step": 105700 + }, + { + "epoch": 8.3, + "learning_rate": 0.0005, + "loss": 1.9646, + "step": 105800 + }, + { + "epoch": 8.3, + "learning_rate": 0.0005, + "loss": 1.9434, + "step": 105900 + }, + { + "epoch": 8.31, + "learning_rate": 0.0005, + "loss": 1.9305, + "step": 106000 + }, + { + "epoch": 8.32, + "learning_rate": 0.0005, + "loss": 1.9263, + "step": 106100 + }, + { + "epoch": 8.33, + "learning_rate": 0.0005, + "loss": 1.9568, + "step": 106200 + }, + { + "epoch": 8.34, + "learning_rate": 0.0005, + "loss": 1.941, + "step": 106300 + }, + { + "epoch": 8.34, + "learning_rate": 0.0005, + "loss": 1.9541, + "step": 106400 + }, + { + "epoch": 8.35, + "learning_rate": 0.0005, + "loss": 1.9446, + "step": 106500 + }, + { + "epoch": 8.36, + "learning_rate": 0.0005, + "loss": 1.9513, + "step": 106600 + }, + { + "epoch": 8.37, + "learning_rate": 0.0005, + "loss": 1.9394, + "step": 106700 + }, + { + "epoch": 8.38, + "learning_rate": 0.0005, + "loss": 1.9623, + "step": 106800 + }, + { + "epoch": 8.38, + "learning_rate": 0.0005, + "loss": 1.9638, + "step": 106900 + }, + { + "epoch": 8.39, + "learning_rate": 0.0005, + "loss": 1.9246, + "step": 107000 + }, + { + "epoch": 8.4, + "learning_rate": 0.0005, + "loss": 1.9464, + "step": 107100 + }, + { + "epoch": 8.41, + "learning_rate": 0.0005, + "loss": 1.9515, + "step": 107200 + }, + { + "epoch": 8.41, + "learning_rate": 0.0005, + "loss": 1.9604, + "step": 107300 + }, + { + "epoch": 8.42, + "learning_rate": 0.0005, + "loss": 1.9294, + "step": 107400 + }, + { + "epoch": 8.43, + "learning_rate": 0.0005, + "loss": 1.9376, + "step": 107500 + }, + { + "epoch": 8.44, + "learning_rate": 0.0005, + "loss": 1.9506, + "step": 107600 + }, + { + "epoch": 8.45, + "learning_rate": 0.0005, + "loss": 1.9304, + "step": 107700 + }, + { + "epoch": 8.45, + "learning_rate": 0.0005, + "loss": 1.958, + "step": 107800 + }, + { + "epoch": 8.46, + "learning_rate": 0.0005, + "loss": 1.9809, + "step": 107900 + }, + { + "epoch": 8.47, + "learning_rate": 0.0005, + "loss": 1.9446, + "step": 108000 + }, + { + "epoch": 8.48, + "learning_rate": 0.0005, + "loss": 1.9503, + "step": 108100 + }, + { + "epoch": 8.48, + "learning_rate": 0.0005, + "loss": 1.9452, + "step": 108200 + }, + { + "epoch": 8.49, + "learning_rate": 0.0005, + "loss": 1.9523, + "step": 108300 + }, + { + "epoch": 8.5, + "learning_rate": 0.0005, + "loss": 1.9206, + "step": 108400 + }, + { + "epoch": 8.51, + "learning_rate": 0.0005, + "loss": 1.958, + "step": 108500 + }, + { + "epoch": 8.52, + "learning_rate": 0.0005, + "loss": 1.9445, + "step": 108600 + }, + { + "epoch": 8.52, + "learning_rate": 0.0005, + "loss": 1.9558, + "step": 108700 + }, + { + "epoch": 8.53, + "learning_rate": 0.0005, + "loss": 1.9497, + "step": 108800 + }, + { + "epoch": 8.54, + "learning_rate": 0.0005, + "loss": 1.9523, + "step": 108900 + }, + { + "epoch": 8.55, + "learning_rate": 0.0005, + "loss": 1.9334, + "step": 109000 + }, + { + "epoch": 8.56, + "learning_rate": 0.0005, + "loss": 1.9342, + "step": 109100 + }, + { + "epoch": 8.56, + "learning_rate": 0.0005, + "loss": 1.9514, + "step": 109200 + }, + { + "epoch": 8.57, + "learning_rate": 0.0005, + "loss": 1.979, + "step": 109300 + }, + { + "epoch": 8.58, + "learning_rate": 0.0005, + "loss": 1.9424, + "step": 109400 + }, + { + "epoch": 8.59, + "learning_rate": 0.0005, + "loss": 1.9586, + "step": 109500 + }, + { + "epoch": 8.59, + "learning_rate": 0.0005, + "loss": 1.9227, + "step": 109600 + }, + { + "epoch": 8.6, + "learning_rate": 0.0005, + "loss": 1.9335, + "step": 109700 + }, + { + "epoch": 8.61, + "learning_rate": 0.0005, + "loss": 1.9582, + "step": 109800 + }, + { + "epoch": 8.62, + "learning_rate": 0.0005, + "loss": 1.9475, + "step": 109900 + }, + { + "epoch": 8.63, + "learning_rate": 0.0005, + "loss": 1.9571, + "step": 110000 + }, + { + "epoch": 8.63, + "eval_gen_len": 18.777257879403194, + "eval_loss": 2.0414655208587646, + "eval_rouge1": 34.5053, + "eval_rouge2": 13.283, + "eval_rougeL": 28.19, + "eval_rougeLsum": 28.1836, + "eval_runtime": 360.6557, + "eval_samples_per_second": 31.407, + "eval_steps_per_second": 1.963, + "step": 110000 + }, + { + "epoch": 8.63, + "learning_rate": 0.0005, + "loss": 1.9556, + "step": 110100 + }, + { + "epoch": 8.64, + "learning_rate": 0.0005, + "loss": 1.9607, + "step": 110200 + }, + { + "epoch": 8.65, + "learning_rate": 0.0005, + "loss": 1.9676, + "step": 110300 + }, + { + "epoch": 8.66, + "learning_rate": 0.0005, + "loss": 1.9648, + "step": 110400 + }, + { + "epoch": 8.67, + "learning_rate": 0.0005, + "loss": 1.9451, + "step": 110500 + }, + { + "epoch": 8.67, + "learning_rate": 0.0005, + "loss": 1.9554, + "step": 110600 + }, + { + "epoch": 8.68, + "learning_rate": 0.0005, + "loss": 1.9645, + "step": 110700 + }, + { + "epoch": 8.69, + "learning_rate": 0.0005, + "loss": 1.9936, + "step": 110800 + }, + { + "epoch": 8.7, + "learning_rate": 0.0005, + "loss": 1.9629, + "step": 110900 + }, + { + "epoch": 8.7, + "learning_rate": 0.0005, + "loss": 1.9463, + "step": 111000 + }, + { + "epoch": 8.71, + "learning_rate": 0.0005, + "loss": 1.9464, + "step": 111100 + }, + { + "epoch": 8.72, + "learning_rate": 0.0005, + "loss": 1.9193, + "step": 111200 + }, + { + "epoch": 8.73, + "learning_rate": 0.0005, + "loss": 1.9795, + "step": 111300 + }, + { + "epoch": 8.74, + "learning_rate": 0.0005, + "loss": 1.9997, + "step": 111400 + }, + { + "epoch": 8.74, + "learning_rate": 0.0005, + "loss": 1.9712, + "step": 111500 + }, + { + "epoch": 8.75, + "learning_rate": 0.0005, + "loss": 1.9425, + "step": 111600 + }, + { + "epoch": 8.76, + "learning_rate": 0.0005, + "loss": 1.9544, + "step": 111700 + }, + { + "epoch": 8.77, + "learning_rate": 0.0005, + "loss": 1.9692, + "step": 111800 + }, + { + "epoch": 8.78, + "learning_rate": 0.0005, + "loss": 1.9595, + "step": 111900 + }, + { + "epoch": 8.78, + "learning_rate": 0.0005, + "loss": 1.974, + "step": 112000 + }, + { + "epoch": 8.79, + "learning_rate": 0.0005, + "loss": 1.9502, + "step": 112100 + }, + { + "epoch": 8.8, + "learning_rate": 0.0005, + "loss": 1.972, + "step": 112200 + }, + { + "epoch": 8.81, + "learning_rate": 0.0005, + "loss": 1.9803, + "step": 112300 + }, + { + "epoch": 8.81, + "learning_rate": 0.0005, + "loss": 1.9553, + "step": 112400 + }, + { + "epoch": 8.82, + "learning_rate": 0.0005, + "loss": 1.9793, + "step": 112500 + }, + { + "epoch": 8.83, + "learning_rate": 0.0005, + "loss": 1.9593, + "step": 112600 + }, + { + "epoch": 8.84, + "learning_rate": 0.0005, + "loss": 1.9687, + "step": 112700 + }, + { + "epoch": 8.85, + "learning_rate": 0.0005, + "loss": 1.949, + "step": 112800 + }, + { + "epoch": 8.85, + "learning_rate": 0.0005, + "loss": 1.9785, + "step": 112900 + }, + { + "epoch": 8.86, + "learning_rate": 0.0005, + "loss": 1.9625, + "step": 113000 + }, + { + "epoch": 8.87, + "learning_rate": 0.0005, + "loss": 1.9763, + "step": 113100 + }, + { + "epoch": 8.88, + "learning_rate": 0.0005, + "loss": 1.9741, + "step": 113200 + }, + { + "epoch": 8.88, + "learning_rate": 0.0005, + "loss": 1.9307, + "step": 113300 + }, + { + "epoch": 8.89, + "learning_rate": 0.0005, + "loss": 1.9471, + "step": 113400 + }, + { + "epoch": 8.9, + "learning_rate": 0.0005, + "loss": 1.9682, + "step": 113500 + }, + { + "epoch": 8.91, + "learning_rate": 0.0005, + "loss": 2.0082, + "step": 113600 + }, + { + "epoch": 8.92, + "learning_rate": 0.0005, + "loss": 1.9933, + "step": 113700 + }, + { + "epoch": 8.92, + "learning_rate": 0.0005, + "loss": 1.9674, + "step": 113800 + }, + { + "epoch": 8.93, + "learning_rate": 0.0005, + "loss": 1.9422, + "step": 113900 + }, + { + "epoch": 8.94, + "learning_rate": 0.0005, + "loss": 1.9523, + "step": 114000 + }, + { + "epoch": 8.95, + "learning_rate": 0.0005, + "loss": 1.9825, + "step": 114100 + }, + { + "epoch": 8.96, + "learning_rate": 0.0005, + "loss": 1.9663, + "step": 114200 + }, + { + "epoch": 8.96, + "learning_rate": 0.0005, + "loss": 1.9994, + "step": 114300 + }, + { + "epoch": 8.97, + "learning_rate": 0.0005, + "loss": 1.9455, + "step": 114400 + }, + { + "epoch": 8.98, + "learning_rate": 0.0005, + "loss": 1.9715, + "step": 114500 + }, + { + "epoch": 8.99, + "learning_rate": 0.0005, + "loss": 1.9657, + "step": 114600 + }, + { + "epoch": 8.99, + "learning_rate": 0.0005, + "loss": 1.9587, + "step": 114700 + }, + { + "epoch": 9.0, + "learning_rate": 0.0005, + "loss": 1.9472, + "step": 114800 + }, + { + "epoch": 9.01, + "learning_rate": 0.0005, + "loss": 1.8601, + "step": 114900 + }, + { + "epoch": 9.02, + "learning_rate": 0.0005, + "loss": 1.8969, + "step": 115000 + }, + { + "epoch": 9.03, + "learning_rate": 0.0005, + "loss": 1.8682, + "step": 115100 + }, + { + "epoch": 9.03, + "learning_rate": 0.0005, + "loss": 1.9005, + "step": 115200 + }, + { + "epoch": 9.04, + "learning_rate": 0.0005, + "loss": 1.8729, + "step": 115300 + }, + { + "epoch": 9.05, + "learning_rate": 0.0005, + "loss": 1.8889, + "step": 115400 + }, + { + "epoch": 9.06, + "learning_rate": 0.0005, + "loss": 1.8802, + "step": 115500 + }, + { + "epoch": 9.07, + "learning_rate": 0.0005, + "loss": 1.8617, + "step": 115600 + }, + { + "epoch": 9.07, + "learning_rate": 0.0005, + "loss": 1.8912, + "step": 115700 + }, + { + "epoch": 9.08, + "learning_rate": 0.0005, + "loss": 1.8954, + "step": 115800 + }, + { + "epoch": 9.09, + "learning_rate": 0.0005, + "loss": 1.9125, + "step": 115900 + }, + { + "epoch": 9.1, + "learning_rate": 0.0005, + "loss": 1.8967, + "step": 116000 + }, + { + "epoch": 9.1, + "learning_rate": 0.0005, + "loss": 1.8914, + "step": 116100 + }, + { + "epoch": 9.11, + "learning_rate": 0.0005, + "loss": 1.8867, + "step": 116200 + }, + { + "epoch": 9.12, + "learning_rate": 0.0005, + "loss": 1.9153, + "step": 116300 + }, + { + "epoch": 9.13, + "learning_rate": 0.0005, + "loss": 1.9004, + "step": 116400 + }, + { + "epoch": 9.14, + "learning_rate": 0.0005, + "loss": 1.8644, + "step": 116500 + }, + { + "epoch": 9.14, + "learning_rate": 0.0005, + "loss": 1.8856, + "step": 116600 + }, + { + "epoch": 9.15, + "learning_rate": 0.0005, + "loss": 1.8949, + "step": 116700 + }, + { + "epoch": 9.16, + "learning_rate": 0.0005, + "loss": 1.8782, + "step": 116800 + }, + { + "epoch": 9.17, + "learning_rate": 0.0005, + "loss": 1.8724, + "step": 116900 + }, + { + "epoch": 9.18, + "learning_rate": 0.0005, + "loss": 1.9133, + "step": 117000 + }, + { + "epoch": 9.18, + "learning_rate": 0.0005, + "loss": 1.8869, + "step": 117100 + }, + { + "epoch": 9.19, + "learning_rate": 0.0005, + "loss": 1.9042, + "step": 117200 + }, + { + "epoch": 9.2, + "learning_rate": 0.0005, + "loss": 1.8915, + "step": 117300 + }, + { + "epoch": 9.21, + "learning_rate": 0.0005, + "loss": 1.9083, + "step": 117400 + }, + { + "epoch": 9.21, + "learning_rate": 0.0005, + "loss": 1.9009, + "step": 117500 + }, + { + "epoch": 9.22, + "learning_rate": 0.0005, + "loss": 1.8974, + "step": 117600 + }, + { + "epoch": 9.23, + "learning_rate": 0.0005, + "loss": 1.9142, + "step": 117700 + }, + { + "epoch": 9.24, + "learning_rate": 0.0005, + "loss": 1.9136, + "step": 117800 + }, + { + "epoch": 9.25, + "learning_rate": 0.0005, + "loss": 1.9299, + "step": 117900 + }, + { + "epoch": 9.25, + "learning_rate": 0.0005, + "loss": 1.8972, + "step": 118000 + }, + { + "epoch": 9.26, + "learning_rate": 0.0005, + "loss": 1.9068, + "step": 118100 + }, + { + "epoch": 9.27, + "learning_rate": 0.0005, + "loss": 1.8926, + "step": 118200 + }, + { + "epoch": 9.28, + "learning_rate": 0.0005, + "loss": 1.9263, + "step": 118300 + }, + { + "epoch": 9.28, + "learning_rate": 0.0005, + "loss": 1.9052, + "step": 118400 + }, + { + "epoch": 9.29, + "learning_rate": 0.0005, + "loss": 1.9119, + "step": 118500 + }, + { + "epoch": 9.3, + "learning_rate": 0.0005, + "loss": 1.8979, + "step": 118600 + }, + { + "epoch": 9.31, + "learning_rate": 0.0005, + "loss": 1.8753, + "step": 118700 + }, + { + "epoch": 9.32, + "learning_rate": 0.0005, + "loss": 1.8867, + "step": 118800 + }, + { + "epoch": 9.32, + "learning_rate": 0.0005, + "loss": 1.8759, + "step": 118900 + }, + { + "epoch": 9.33, + "learning_rate": 0.0005, + "loss": 1.932, + "step": 119000 + }, + { + "epoch": 9.34, + "learning_rate": 0.0005, + "loss": 1.9213, + "step": 119100 + }, + { + "epoch": 9.35, + "learning_rate": 0.0005, + "loss": 1.9241, + "step": 119200 + }, + { + "epoch": 9.36, + "learning_rate": 0.0005, + "loss": 1.9039, + "step": 119300 + }, + { + "epoch": 9.36, + "learning_rate": 0.0005, + "loss": 1.9411, + "step": 119400 + }, + { + "epoch": 9.37, + "learning_rate": 0.0005, + "loss": 1.9071, + "step": 119500 + }, + { + "epoch": 9.38, + "learning_rate": 0.0005, + "loss": 1.9214, + "step": 119600 + }, + { + "epoch": 9.39, + "learning_rate": 0.0005, + "loss": 1.8785, + "step": 119700 + }, + { + "epoch": 9.39, + "learning_rate": 0.0005, + "loss": 1.9119, + "step": 119800 + }, + { + "epoch": 9.4, + "learning_rate": 0.0005, + "loss": 1.9278, + "step": 119900 + }, + { + "epoch": 9.41, + "learning_rate": 0.0005, + "loss": 1.9196, + "step": 120000 + }, + { + "epoch": 9.41, + "eval_gen_len": 18.812042023483713, + "eval_loss": 2.018998384475708, + "eval_rouge1": 34.5454, + "eval_rouge2": 13.3829, + "eval_rougeL": 28.2239, + "eval_rougeLsum": 28.2114, + "eval_runtime": 359.993, + "eval_samples_per_second": 31.465, + "eval_steps_per_second": 1.967, + "step": 120000 + }, + { + "epoch": 9.42, + "learning_rate": 0.0005, + "loss": 1.9188, + "step": 120100 + }, + { + "epoch": 9.43, + "learning_rate": 0.0005, + "loss": 1.9056, + "step": 120200 + }, + { + "epoch": 9.43, + "learning_rate": 0.0005, + "loss": 1.9468, + "step": 120300 + }, + { + "epoch": 9.44, + "learning_rate": 0.0005, + "loss": 1.9084, + "step": 120400 + }, + { + "epoch": 9.45, + "learning_rate": 0.0005, + "loss": 1.9327, + "step": 120500 + }, + { + "epoch": 9.46, + "learning_rate": 0.0005, + "loss": 1.9426, + "step": 120600 + }, + { + "epoch": 9.47, + "learning_rate": 0.0005, + "loss": 1.9135, + "step": 120700 + }, + { + "epoch": 9.47, + "learning_rate": 0.0005, + "loss": 1.8908, + "step": 120800 + }, + { + "epoch": 9.48, + "learning_rate": 0.0005, + "loss": 1.9273, + "step": 120900 + }, + { + "epoch": 9.49, + "learning_rate": 0.0005, + "loss": 1.9364, + "step": 121000 + }, + { + "epoch": 9.5, + "learning_rate": 0.0005, + "loss": 1.906, + "step": 121100 + }, + { + "epoch": 9.5, + "learning_rate": 0.0005, + "loss": 1.9405, + "step": 121200 + }, + { + "epoch": 9.51, + "learning_rate": 0.0005, + "loss": 1.9018, + "step": 121300 + }, + { + "epoch": 9.52, + "learning_rate": 0.0005, + "loss": 1.9183, + "step": 121400 + }, + { + "epoch": 9.53, + "learning_rate": 0.0005, + "loss": 1.9258, + "step": 121500 + }, + { + "epoch": 9.54, + "learning_rate": 0.0005, + "loss": 1.8952, + "step": 121600 + }, + { + "epoch": 9.54, + "learning_rate": 0.0005, + "loss": 1.9412, + "step": 121700 + }, + { + "epoch": 9.55, + "learning_rate": 0.0005, + "loss": 1.9284, + "step": 121800 + }, + { + "epoch": 9.56, + "learning_rate": 0.0005, + "loss": 1.9197, + "step": 121900 + }, + { + "epoch": 9.57, + "learning_rate": 0.0005, + "loss": 1.9171, + "step": 122000 + }, + { + "epoch": 9.57, + "learning_rate": 0.0005, + "loss": 1.9278, + "step": 122100 + }, + { + "epoch": 9.58, + "learning_rate": 0.0005, + "loss": 1.9395, + "step": 122200 + }, + { + "epoch": 9.59, + "learning_rate": 0.0005, + "loss": 1.936, + "step": 122300 + }, + { + "epoch": 9.6, + "learning_rate": 0.0005, + "loss": 1.9, + "step": 122400 + }, + { + "epoch": 9.61, + "learning_rate": 0.0005, + "loss": 1.9328, + "step": 122500 + }, + { + "epoch": 9.61, + "learning_rate": 0.0005, + "loss": 1.9329, + "step": 122600 + }, + { + "epoch": 9.62, + "learning_rate": 0.0005, + "loss": 1.9407, + "step": 122700 + }, + { + "epoch": 9.63, + "learning_rate": 0.0005, + "loss": 1.9219, + "step": 122800 + }, + { + "epoch": 9.64, + "learning_rate": 0.0005, + "loss": 1.9634, + "step": 122900 + }, + { + "epoch": 9.65, + "learning_rate": 0.0005, + "loss": 1.9186, + "step": 123000 + }, + { + "epoch": 9.65, + "learning_rate": 0.0005, + "loss": 1.9361, + "step": 123100 + }, + { + "epoch": 9.66, + "learning_rate": 0.0005, + "loss": 1.9279, + "step": 123200 + }, + { + "epoch": 9.67, + "learning_rate": 0.0005, + "loss": 1.9251, + "step": 123300 + }, + { + "epoch": 9.68, + "learning_rate": 0.0005, + "loss": 1.9143, + "step": 123400 + }, + { + "epoch": 9.68, + "learning_rate": 0.0005, + "loss": 1.9565, + "step": 123500 + }, + { + "epoch": 9.69, + "learning_rate": 0.0005, + "loss": 1.9419, + "step": 123600 + }, + { + "epoch": 9.7, + "learning_rate": 0.0005, + "loss": 1.9147, + "step": 123700 + }, + { + "epoch": 9.71, + "learning_rate": 0.0005, + "loss": 1.8996, + "step": 123800 + }, + { + "epoch": 9.72, + "learning_rate": 0.0005, + "loss": 1.9299, + "step": 123900 + }, + { + "epoch": 9.72, + "learning_rate": 0.0005, + "loss": 1.9303, + "step": 124000 + }, + { + "epoch": 9.73, + "learning_rate": 0.0005, + "loss": 1.9447, + "step": 124100 + }, + { + "epoch": 9.74, + "learning_rate": 0.0005, + "loss": 1.9251, + "step": 124200 + }, + { + "epoch": 9.75, + "learning_rate": 0.0005, + "loss": 1.9279, + "step": 124300 + }, + { + "epoch": 9.76, + "learning_rate": 0.0005, + "loss": 1.8948, + "step": 124400 + }, + { + "epoch": 9.76, + "learning_rate": 0.0005, + "loss": 1.9331, + "step": 124500 + }, + { + "epoch": 9.77, + "learning_rate": 0.0005, + "loss": 1.9378, + "step": 124600 + }, + { + "epoch": 9.78, + "learning_rate": 0.0005, + "loss": 1.9366, + "step": 124700 + }, + { + "epoch": 9.79, + "learning_rate": 0.0005, + "loss": 1.9875, + "step": 124800 + }, + { + "epoch": 9.79, + "learning_rate": 0.0005, + "loss": 1.9354, + "step": 124900 + }, + { + "epoch": 9.8, + "learning_rate": 0.0005, + "loss": 1.9318, + "step": 125000 + }, + { + "epoch": 9.81, + "learning_rate": 0.0005, + "loss": 1.9058, + "step": 125100 + }, + { + "epoch": 9.82, + "learning_rate": 0.0005, + "loss": 1.9133, + "step": 125200 + }, + { + "epoch": 9.83, + "learning_rate": 0.0005, + "loss": 1.9353, + "step": 125300 + }, + { + "epoch": 9.83, + "learning_rate": 0.0005, + "loss": 1.9616, + "step": 125400 + }, + { + "epoch": 9.84, + "learning_rate": 0.0005, + "loss": 1.9395, + "step": 125500 + }, + { + "epoch": 9.85, + "learning_rate": 0.0005, + "loss": 1.9405, + "step": 125600 + }, + { + "epoch": 9.86, + "learning_rate": 0.0005, + "loss": 1.8969, + "step": 125700 + }, + { + "epoch": 9.87, + "learning_rate": 0.0005, + "loss": 1.9206, + "step": 125800 + }, + { + "epoch": 9.87, + "learning_rate": 0.0005, + "loss": 1.8971, + "step": 125900 + }, + { + "epoch": 9.88, + "learning_rate": 0.0005, + "loss": 1.912, + "step": 126000 + }, + { + "epoch": 9.89, + "learning_rate": 0.0005, + "loss": 1.9517, + "step": 126100 + }, + { + "epoch": 9.9, + "learning_rate": 0.0005, + "loss": 1.9252, + "step": 126200 + }, + { + "epoch": 9.9, + "learning_rate": 0.0005, + "loss": 1.9225, + "step": 126300 + }, + { + "epoch": 9.91, + "learning_rate": 0.0005, + "loss": 1.943, + "step": 126400 + }, + { + "epoch": 9.92, + "learning_rate": 0.0005, + "loss": 1.9287, + "step": 126500 + }, + { + "epoch": 9.93, + "learning_rate": 0.0005, + "loss": 1.9797, + "step": 126600 + }, + { + "epoch": 9.94, + "learning_rate": 0.0005, + "loss": 1.9319, + "step": 126700 + }, + { + "epoch": 9.94, + "learning_rate": 0.0005, + "loss": 1.9392, + "step": 126800 + }, + { + "epoch": 9.95, + "learning_rate": 0.0005, + "loss": 1.9354, + "step": 126900 + }, + { + "epoch": 9.96, + "learning_rate": 0.0005, + "loss": 1.9127, + "step": 127000 + }, + { + "epoch": 9.97, + "learning_rate": 0.0005, + "loss": 1.9682, + "step": 127100 + }, + { + "epoch": 9.97, + "learning_rate": 0.0005, + "loss": 1.9153, + "step": 127200 + }, + { + "epoch": 9.98, + "learning_rate": 0.0005, + "loss": 1.9246, + "step": 127300 + }, + { + "epoch": 9.99, + "learning_rate": 0.0005, + "loss": 1.9302, + "step": 127400 + }, + { + "epoch": 10.0, + "learning_rate": 0.0005, + "loss": 1.9188, + "step": 127500 + }, + { + "epoch": 10.01, + "learning_rate": 0.0005, + "loss": 1.9, + "step": 127600 + }, + { + "epoch": 10.01, + "learning_rate": 0.0005, + "loss": 1.847, + "step": 127700 + }, + { + "epoch": 10.02, + "learning_rate": 0.0005, + "loss": 1.8283, + "step": 127800 + }, + { + "epoch": 10.03, + "learning_rate": 0.0005, + "loss": 1.8143, + "step": 127900 + }, + { + "epoch": 10.04, + "learning_rate": 0.0005, + "loss": 1.8273, + "step": 128000 + }, + { + "epoch": 10.05, + "learning_rate": 0.0005, + "loss": 1.8709, + "step": 128100 + }, + { + "epoch": 10.05, + "learning_rate": 0.0005, + "loss": 1.8487, + "step": 128200 + }, + { + "epoch": 10.06, + "learning_rate": 0.0005, + "loss": 1.8281, + "step": 128300 + }, + { + "epoch": 10.07, + "learning_rate": 0.0005, + "loss": 1.8641, + "step": 128400 + }, + { + "epoch": 10.08, + "learning_rate": 0.0005, + "loss": 1.862, + "step": 128500 + }, + { + "epoch": 10.08, + "learning_rate": 0.0005, + "loss": 1.8973, + "step": 128600 + }, + { + "epoch": 10.09, + "learning_rate": 0.0005, + "loss": 1.8294, + "step": 128700 + }, + { + "epoch": 10.1, + "learning_rate": 0.0005, + "loss": 1.8656, + "step": 128800 + }, + { + "epoch": 10.11, + "learning_rate": 0.0005, + "loss": 1.8619, + "step": 128900 + }, + { + "epoch": 10.12, + "learning_rate": 0.0005, + "loss": 1.8493, + "step": 129000 + }, + { + "epoch": 10.12, + "learning_rate": 0.0005, + "loss": 1.8471, + "step": 129100 + }, + { + "epoch": 10.13, + "learning_rate": 0.0005, + "loss": 1.8663, + "step": 129200 + }, + { + "epoch": 10.14, + "learning_rate": 0.0005, + "loss": 1.8781, + "step": 129300 + }, + { + "epoch": 10.15, + "learning_rate": 0.0005, + "loss": 1.8959, + "step": 129400 + }, + { + "epoch": 10.16, + "learning_rate": 0.0005, + "loss": 1.8703, + "step": 129500 + }, + { + "epoch": 10.16, + "learning_rate": 0.0005, + "loss": 1.8461, + "step": 129600 + }, + { + "epoch": 10.17, + "learning_rate": 0.0005, + "loss": 1.8471, + "step": 129700 + }, + { + "epoch": 10.18, + "learning_rate": 0.0005, + "loss": 1.8793, + "step": 129800 + }, + { + "epoch": 10.19, + "learning_rate": 0.0005, + "loss": 1.9014, + "step": 129900 + }, + { + "epoch": 10.19, + "learning_rate": 0.0005, + "loss": 1.8524, + "step": 130000 + }, + { + "epoch": 10.19, + "eval_gen_len": 18.745651981989937, + "eval_loss": 2.035221815109253, + "eval_rouge1": 34.7913, + "eval_rouge2": 13.5607, + "eval_rougeL": 28.4207, + "eval_rougeLsum": 28.4117, + "eval_runtime": 358.3534, + "eval_samples_per_second": 31.608, + "eval_steps_per_second": 1.976, + "step": 130000 + }, + { + "epoch": 10.2, + "learning_rate": 0.0005, + "loss": 1.8683, + "step": 130100 + }, + { + "epoch": 10.21, + "learning_rate": 0.0005, + "loss": 1.8492, + "step": 130200 + }, + { + "epoch": 10.22, + "learning_rate": 0.0005, + "loss": 1.8901, + "step": 130300 + }, + { + "epoch": 10.23, + "learning_rate": 0.0005, + "loss": 1.8592, + "step": 130400 + }, + { + "epoch": 10.23, + "learning_rate": 0.0005, + "loss": 1.8946, + "step": 130500 + }, + { + "epoch": 10.24, + "learning_rate": 0.0005, + "loss": 1.8495, + "step": 130600 + }, + { + "epoch": 10.25, + "learning_rate": 0.0005, + "loss": 1.8598, + "step": 130700 + }, + { + "epoch": 10.26, + "learning_rate": 0.0005, + "loss": 1.8755, + "step": 130800 + }, + { + "epoch": 10.27, + "learning_rate": 0.0005, + "loss": 1.8473, + "step": 130900 + }, + { + "epoch": 10.27, + "learning_rate": 0.0005, + "loss": 1.8724, + "step": 131000 + }, + { + "epoch": 10.28, + "learning_rate": 0.0005, + "loss": 1.8806, + "step": 131100 + }, + { + "epoch": 10.29, + "learning_rate": 0.0005, + "loss": 1.8776, + "step": 131200 + }, + { + "epoch": 10.3, + "learning_rate": 0.0005, + "loss": 1.8739, + "step": 131300 + }, + { + "epoch": 10.3, + "learning_rate": 0.0005, + "loss": 1.8831, + "step": 131400 + }, + { + "epoch": 10.31, + "learning_rate": 0.0005, + "loss": 1.8764, + "step": 131500 + }, + { + "epoch": 10.32, + "learning_rate": 0.0005, + "loss": 1.8965, + "step": 131600 + }, + { + "epoch": 10.33, + "learning_rate": 0.0005, + "loss": 1.8842, + "step": 131700 + }, + { + "epoch": 10.34, + "learning_rate": 0.0005, + "loss": 1.8578, + "step": 131800 + }, + { + "epoch": 10.34, + "learning_rate": 0.0005, + "loss": 1.8977, + "step": 131900 + }, + { + "epoch": 10.35, + "learning_rate": 0.0005, + "loss": 1.8717, + "step": 132000 + }, + { + "epoch": 10.36, + "learning_rate": 0.0005, + "loss": 1.8595, + "step": 132100 + }, + { + "epoch": 10.37, + "learning_rate": 0.0005, + "loss": 1.8855, + "step": 132200 + }, + { + "epoch": 10.37, + "learning_rate": 0.0005, + "loss": 1.9059, + "step": 132300 + }, + { + "epoch": 10.38, + "learning_rate": 0.0005, + "loss": 1.8721, + "step": 132400 + }, + { + "epoch": 10.39, + "learning_rate": 0.0005, + "loss": 1.9069, + "step": 132500 + }, + { + "epoch": 10.4, + "learning_rate": 0.0005, + "loss": 1.8871, + "step": 132600 + }, + { + "epoch": 10.41, + "learning_rate": 0.0005, + "loss": 1.888, + "step": 132700 + }, + { + "epoch": 10.41, + "learning_rate": 0.0005, + "loss": 1.8623, + "step": 132800 + }, + { + "epoch": 10.42, + "learning_rate": 0.0005, + "loss": 1.8942, + "step": 132900 + }, + { + "epoch": 10.43, + "learning_rate": 0.0005, + "loss": 1.9247, + "step": 133000 + }, + { + "epoch": 10.44, + "learning_rate": 0.0005, + "loss": 1.871, + "step": 133100 + }, + { + "epoch": 10.45, + "learning_rate": 0.0005, + "loss": 1.8971, + "step": 133200 + }, + { + "epoch": 10.45, + "learning_rate": 0.0005, + "loss": 1.8707, + "step": 133300 + }, + { + "epoch": 10.46, + "learning_rate": 0.0005, + "loss": 1.8797, + "step": 133400 + }, + { + "epoch": 10.47, + "learning_rate": 0.0005, + "loss": 1.8896, + "step": 133500 + }, + { + "epoch": 10.48, + "learning_rate": 0.0005, + "loss": 1.878, + "step": 133600 + }, + { + "epoch": 10.48, + "learning_rate": 0.0005, + "loss": 1.8719, + "step": 133700 + }, + { + "epoch": 10.49, + "learning_rate": 0.0005, + "loss": 1.8828, + "step": 133800 + }, + { + "epoch": 10.5, + "learning_rate": 0.0005, + "loss": 1.91, + "step": 133900 + }, + { + "epoch": 10.51, + "learning_rate": 0.0005, + "loss": 1.8776, + "step": 134000 + }, + { + "epoch": 10.52, + "learning_rate": 0.0005, + "loss": 1.8828, + "step": 134100 + }, + { + "epoch": 10.52, + "learning_rate": 0.0005, + "loss": 1.8739, + "step": 134200 + }, + { + "epoch": 10.53, + "learning_rate": 0.0005, + "loss": 1.9038, + "step": 134300 + }, + { + "epoch": 10.54, + "learning_rate": 0.0005, + "loss": 1.9058, + "step": 134400 + }, + { + "epoch": 10.55, + "learning_rate": 0.0005, + "loss": 1.8967, + "step": 134500 + }, + { + "epoch": 10.56, + "learning_rate": 0.0005, + "loss": 1.9057, + "step": 134600 + }, + { + "epoch": 10.56, + "learning_rate": 0.0005, + "loss": 1.8868, + "step": 134700 + }, + { + "epoch": 10.57, + "learning_rate": 0.0005, + "loss": 1.8957, + "step": 134800 + }, + { + "epoch": 10.58, + "learning_rate": 0.0005, + "loss": 1.8816, + "step": 134900 + }, + { + "epoch": 10.59, + "learning_rate": 0.0005, + "loss": 1.8901, + "step": 135000 + }, + { + "epoch": 10.59, + "learning_rate": 0.0005, + "loss": 1.8964, + "step": 135100 + }, + { + "epoch": 10.6, + "learning_rate": 0.0005, + "loss": 1.8975, + "step": 135200 + }, + { + "epoch": 10.61, + "learning_rate": 0.0005, + "loss": 1.9138, + "step": 135300 + }, + { + "epoch": 10.62, + "learning_rate": 0.0005, + "loss": 1.9164, + "step": 135400 + }, + { + "epoch": 10.63, + "learning_rate": 0.0005, + "loss": 1.8969, + "step": 135500 + }, + { + "epoch": 10.63, + "learning_rate": 0.0005, + "loss": 1.9051, + "step": 135600 + }, + { + "epoch": 10.64, + "learning_rate": 0.0005, + "loss": 1.8947, + "step": 135700 + }, + { + "epoch": 10.65, + "learning_rate": 0.0005, + "loss": 1.9156, + "step": 135800 + }, + { + "epoch": 10.66, + "learning_rate": 0.0005, + "loss": 1.906, + "step": 135900 + }, + { + "epoch": 10.66, + "learning_rate": 0.0005, + "loss": 1.8948, + "step": 136000 + }, + { + "epoch": 10.67, + "learning_rate": 0.0005, + "loss": 1.8997, + "step": 136100 + }, + { + "epoch": 10.68, + "learning_rate": 0.0005, + "loss": 1.9112, + "step": 136200 + }, + { + "epoch": 10.69, + "learning_rate": 0.0005, + "loss": 1.9091, + "step": 136300 + }, + { + "epoch": 10.7, + "learning_rate": 0.0005, + "loss": 1.9083, + "step": 136400 + }, + { + "epoch": 10.7, + "learning_rate": 0.0005, + "loss": 1.9142, + "step": 136500 + }, + { + "epoch": 10.71, + "learning_rate": 0.0005, + "loss": 1.9074, + "step": 136600 + }, + { + "epoch": 10.72, + "learning_rate": 0.0005, + "loss": 1.8614, + "step": 136700 + }, + { + "epoch": 10.73, + "learning_rate": 0.0005, + "loss": 1.8908, + "step": 136800 + }, + { + "epoch": 10.74, + "learning_rate": 0.0005, + "loss": 1.9128, + "step": 136900 + }, + { + "epoch": 10.74, + "learning_rate": 0.0005, + "loss": 1.9006, + "step": 137000 + }, + { + "epoch": 10.75, + "learning_rate": 0.0005, + "loss": 1.8909, + "step": 137100 + }, + { + "epoch": 10.76, + "learning_rate": 0.0005, + "loss": 1.8812, + "step": 137200 + }, + { + "epoch": 10.77, + "learning_rate": 0.0005, + "loss": 1.9118, + "step": 137300 + }, + { + "epoch": 10.77, + "learning_rate": 0.0005, + "loss": 1.8966, + "step": 137400 + }, + { + "epoch": 10.78, + "learning_rate": 0.0005, + "loss": 1.9206, + "step": 137500 + }, + { + "epoch": 10.79, + "learning_rate": 0.0005, + "loss": 1.9065, + "step": 137600 + }, + { + "epoch": 10.8, + "learning_rate": 0.0005, + "loss": 1.8795, + "step": 137700 + }, + { + "epoch": 10.81, + "learning_rate": 0.0005, + "loss": 1.9157, + "step": 137800 + }, + { + "epoch": 10.81, + "learning_rate": 0.0005, + "loss": 1.9352, + "step": 137900 + }, + { + "epoch": 10.82, + "learning_rate": 0.0005, + "loss": 1.8882, + "step": 138000 + }, + { + "epoch": 10.83, + "learning_rate": 0.0005, + "loss": 1.8767, + "step": 138100 + }, + { + "epoch": 10.84, + "learning_rate": 0.0005, + "loss": 1.8922, + "step": 138200 + }, + { + "epoch": 10.85, + "learning_rate": 0.0005, + "loss": 1.902, + "step": 138300 + }, + { + "epoch": 10.85, + "learning_rate": 0.0005, + "loss": 1.9059, + "step": 138400 + }, + { + "epoch": 10.86, + "learning_rate": 0.0005, + "loss": 1.9211, + "step": 138500 + }, + { + "epoch": 10.87, + "learning_rate": 0.0005, + "loss": 1.8916, + "step": 138600 + }, + { + "epoch": 10.88, + "learning_rate": 0.0005, + "loss": 1.936, + "step": 138700 + }, + { + "epoch": 10.88, + "learning_rate": 0.0005, + "loss": 1.9028, + "step": 138800 + }, + { + "epoch": 10.89, + "learning_rate": 0.0005, + "loss": 1.9066, + "step": 138900 + }, + { + "epoch": 10.9, + "learning_rate": 0.0005, + "loss": 1.8575, + "step": 139000 + }, + { + "epoch": 10.91, + "learning_rate": 0.0005, + "loss": 1.9116, + "step": 139100 + }, + { + "epoch": 10.92, + "learning_rate": 0.0005, + "loss": 1.9183, + "step": 139200 + }, + { + "epoch": 10.92, + "learning_rate": 0.0005, + "loss": 1.8794, + "step": 139300 + }, + { + "epoch": 10.93, + "learning_rate": 0.0005, + "loss": 1.9203, + "step": 139400 + }, + { + "epoch": 10.94, + "learning_rate": 0.0005, + "loss": 1.9218, + "step": 139500 + }, + { + "epoch": 10.95, + "learning_rate": 0.0005, + "loss": 1.9005, + "step": 139600 + }, + { + "epoch": 10.96, + "learning_rate": 0.0005, + "loss": 1.9146, + "step": 139700 + }, + { + "epoch": 10.96, + "learning_rate": 0.0005, + "loss": 1.9101, + "step": 139800 + }, + { + "epoch": 10.97, + "learning_rate": 0.0005, + "loss": 1.9226, + "step": 139900 + }, + { + "epoch": 10.98, + "learning_rate": 0.0005, + "loss": 1.8968, + "step": 140000 + }, + { + "epoch": 10.98, + "eval_gen_len": 18.780524410700096, + "eval_loss": 2.0128087997436523, + "eval_rouge1": 34.8079, + "eval_rouge2": 13.6337, + "eval_rougeL": 28.4248, + "eval_rougeLsum": 28.4272, + "eval_runtime": 363.0976, + "eval_samples_per_second": 31.195, + "eval_steps_per_second": 1.95, + "step": 140000 + }, + { + "epoch": 10.99, + "learning_rate": 0.0005, + "loss": 1.908, + "step": 140100 + }, + { + "epoch": 10.99, + "learning_rate": 0.0005, + "loss": 1.9198, + "step": 140200 + }, + { + "epoch": 11.0, + "learning_rate": 0.0005, + "loss": 1.8756, + "step": 140300 + }, + { + "epoch": 11.01, + "learning_rate": 0.0005, + "loss": 1.842, + "step": 140400 + }, + { + "epoch": 11.02, + "learning_rate": 0.0005, + "loss": 1.8369, + "step": 140500 + }, + { + "epoch": 11.03, + "learning_rate": 0.0005, + "loss": 1.7975, + "step": 140600 + }, + { + "epoch": 11.03, + "learning_rate": 0.0005, + "loss": 1.8204, + "step": 140700 + }, + { + "epoch": 11.04, + "learning_rate": 0.0005, + "loss": 1.8019, + "step": 140800 + }, + { + "epoch": 11.05, + "learning_rate": 0.0005, + "loss": 1.8289, + "step": 140900 + }, + { + "epoch": 11.06, + "learning_rate": 0.0005, + "loss": 1.8119, + "step": 141000 + }, + { + "epoch": 11.06, + "learning_rate": 0.0005, + "loss": 1.8118, + "step": 141100 + }, + { + "epoch": 11.07, + "learning_rate": 0.0005, + "loss": 1.8111, + "step": 141200 + }, + { + "epoch": 11.08, + "learning_rate": 0.0005, + "loss": 1.8147, + "step": 141300 + }, + { + "epoch": 11.09, + "learning_rate": 0.0005, + "loss": 1.8222, + "step": 141400 + }, + { + "epoch": 11.1, + "learning_rate": 0.0005, + "loss": 1.8397, + "step": 141500 + }, + { + "epoch": 11.1, + "learning_rate": 0.0005, + "loss": 1.8353, + "step": 141600 + }, + { + "epoch": 11.11, + "learning_rate": 0.0005, + "loss": 1.854, + "step": 141700 + }, + { + "epoch": 11.12, + "learning_rate": 0.0005, + "loss": 1.8107, + "step": 141800 + }, + { + "epoch": 11.13, + "learning_rate": 0.0005, + "loss": 1.8227, + "step": 141900 + }, + { + "epoch": 11.14, + "learning_rate": 0.0005, + "loss": 1.8332, + "step": 142000 + }, + { + "epoch": 11.14, + "learning_rate": 0.0005, + "loss": 1.8161, + "step": 142100 + }, + { + "epoch": 11.15, + "learning_rate": 0.0005, + "loss": 1.8332, + "step": 142200 + }, + { + "epoch": 11.16, + "learning_rate": 0.0005, + "loss": 1.8283, + "step": 142300 + }, + { + "epoch": 11.17, + "learning_rate": 0.0005, + "loss": 1.8493, + "step": 142400 + }, + { + "epoch": 11.17, + "learning_rate": 0.0005, + "loss": 1.8284, + "step": 142500 + }, + { + "epoch": 11.18, + "learning_rate": 0.0005, + "loss": 1.8085, + "step": 142600 + }, + { + "epoch": 11.19, + "learning_rate": 0.0005, + "loss": 1.8523, + "step": 142700 + }, + { + "epoch": 11.2, + "learning_rate": 0.0005, + "loss": 1.8412, + "step": 142800 + }, + { + "epoch": 11.21, + "learning_rate": 0.0005, + "loss": 1.8628, + "step": 142900 + }, + { + "epoch": 11.21, + "learning_rate": 0.0005, + "loss": 1.8296, + "step": 143000 + }, + { + "epoch": 11.22, + "learning_rate": 0.0005, + "loss": 1.8591, + "step": 143100 + }, + { + "epoch": 11.23, + "learning_rate": 0.0005, + "loss": 1.8804, + "step": 143200 + }, + { + "epoch": 11.24, + "learning_rate": 0.0005, + "loss": 1.8345, + "step": 143300 + }, + { + "epoch": 11.25, + "learning_rate": 0.0005, + "loss": 1.8262, + "step": 143400 + }, + { + "epoch": 11.25, + "learning_rate": 0.0005, + "loss": 1.8762, + "step": 143500 + }, + { + "epoch": 11.26, + "learning_rate": 0.0005, + "loss": 1.8287, + "step": 143600 + }, + { + "epoch": 11.27, + "learning_rate": 0.0005, + "loss": 1.8452, + "step": 143700 + }, + { + "epoch": 11.28, + "learning_rate": 0.0005, + "loss": 1.8575, + "step": 143800 + }, + { + "epoch": 11.28, + "learning_rate": 0.0005, + "loss": 1.8701, + "step": 143900 + }, + { + "epoch": 11.29, + "learning_rate": 0.0005, + "loss": 1.8291, + "step": 144000 + }, + { + "epoch": 11.3, + "learning_rate": 0.0005, + "loss": 1.8768, + "step": 144100 + }, + { + "epoch": 11.31, + "learning_rate": 0.0005, + "loss": 1.8742, + "step": 144200 + }, + { + "epoch": 11.32, + "learning_rate": 0.0005, + "loss": 1.8481, + "step": 144300 + }, + { + "epoch": 11.32, + "learning_rate": 0.0005, + "loss": 1.8406, + "step": 144400 + }, + { + "epoch": 11.33, + "learning_rate": 0.0005, + "loss": 1.8386, + "step": 144500 + }, + { + "epoch": 11.34, + "learning_rate": 0.0005, + "loss": 1.8815, + "step": 144600 + }, + { + "epoch": 11.35, + "learning_rate": 0.0005, + "loss": 1.8318, + "step": 144700 + }, + { + "epoch": 11.36, + "learning_rate": 0.0005, + "loss": 1.8546, + "step": 144800 + }, + { + "epoch": 11.36, + "learning_rate": 0.0005, + "loss": 1.889, + "step": 144900 + }, + { + "epoch": 11.37, + "learning_rate": 0.0005, + "loss": 1.8426, + "step": 145000 + }, + { + "epoch": 11.38, + "learning_rate": 0.0005, + "loss": 1.8647, + "step": 145100 + }, + { + "epoch": 11.39, + "learning_rate": 0.0005, + "loss": 1.8627, + "step": 145200 + }, + { + "epoch": 11.39, + "learning_rate": 0.0005, + "loss": 1.8513, + "step": 145300 + }, + { + "epoch": 11.4, + "learning_rate": 0.0005, + "loss": 1.8939, + "step": 145400 + }, + { + "epoch": 11.41, + "learning_rate": 0.0005, + "loss": 1.8071, + "step": 145500 + }, + { + "epoch": 11.42, + "learning_rate": 0.0005, + "loss": 1.871, + "step": 145600 + }, + { + "epoch": 11.43, + "learning_rate": 0.0005, + "loss": 1.8685, + "step": 145700 + }, + { + "epoch": 11.43, + "learning_rate": 0.0005, + "loss": 1.8702, + "step": 145800 + }, + { + "epoch": 11.44, + "learning_rate": 0.0005, + "loss": 1.8751, + "step": 145900 + }, + { + "epoch": 11.45, + "learning_rate": 0.0005, + "loss": 1.8614, + "step": 146000 + }, + { + "epoch": 11.46, + "learning_rate": 0.0005, + "loss": 1.8781, + "step": 146100 + }, + { + "epoch": 11.46, + "learning_rate": 0.0005, + "loss": 1.8487, + "step": 146200 + }, + { + "epoch": 11.47, + "learning_rate": 0.0005, + "loss": 1.829, + "step": 146300 + }, + { + "epoch": 11.48, + "learning_rate": 0.0005, + "loss": 1.8417, + "step": 146400 + }, + { + "epoch": 11.49, + "learning_rate": 0.0005, + "loss": 1.8634, + "step": 146500 + }, + { + "epoch": 11.5, + "learning_rate": 0.0005, + "loss": 1.8713, + "step": 146600 + }, + { + "epoch": 11.5, + "learning_rate": 0.0005, + "loss": 1.8597, + "step": 146700 + }, + { + "epoch": 11.51, + "learning_rate": 0.0005, + "loss": 1.8772, + "step": 146800 + }, + { + "epoch": 11.52, + "learning_rate": 0.0005, + "loss": 1.8325, + "step": 146900 + }, + { + "epoch": 11.53, + "learning_rate": 0.0005, + "loss": 1.8753, + "step": 147000 + }, + { + "epoch": 11.54, + "learning_rate": 0.0005, + "loss": 1.8868, + "step": 147100 + }, + { + "epoch": 11.54, + "learning_rate": 0.0005, + "loss": 1.8634, + "step": 147200 + }, + { + "epoch": 11.55, + "learning_rate": 0.0005, + "loss": 1.8759, + "step": 147300 + }, + { + "epoch": 11.56, + "learning_rate": 0.0005, + "loss": 1.8571, + "step": 147400 + }, + { + "epoch": 11.57, + "learning_rate": 0.0005, + "loss": 1.8848, + "step": 147500 + }, + { + "epoch": 11.57, + "learning_rate": 0.0005, + "loss": 1.8947, + "step": 147600 + }, + { + "epoch": 11.58, + "learning_rate": 0.0005, + "loss": 1.8594, + "step": 147700 + }, + { + "epoch": 11.59, + "learning_rate": 0.0005, + "loss": 1.8686, + "step": 147800 + }, + { + "epoch": 11.6, + "learning_rate": 0.0005, + "loss": 1.8718, + "step": 147900 + }, + { + "epoch": 11.61, + "learning_rate": 0.0005, + "loss": 1.8632, + "step": 148000 + }, + { + "epoch": 11.61, + "learning_rate": 0.0005, + "loss": 1.8723, + "step": 148100 + }, + { + "epoch": 11.62, + "learning_rate": 0.0005, + "loss": 1.845, + "step": 148200 + }, + { + "epoch": 11.63, + "learning_rate": 0.0005, + "loss": 1.9104, + "step": 148300 + }, + { + "epoch": 11.64, + "learning_rate": 0.0005, + "loss": 1.8627, + "step": 148400 + }, + { + "epoch": 11.65, + "learning_rate": 0.0005, + "loss": 1.86, + "step": 148500 + }, + { + "epoch": 11.65, + "learning_rate": 0.0005, + "loss": 1.8749, + "step": 148600 + }, + { + "epoch": 11.66, + "learning_rate": 0.0005, + "loss": 1.8809, + "step": 148700 + }, + { + "epoch": 11.67, + "learning_rate": 0.0005, + "loss": 1.8597, + "step": 148800 + }, + { + "epoch": 11.68, + "learning_rate": 0.0005, + "loss": 1.8427, + "step": 148900 + }, + { + "epoch": 11.68, + "learning_rate": 0.0005, + "loss": 1.8585, + "step": 149000 + }, + { + "epoch": 11.69, + "learning_rate": 0.0005, + "loss": 1.8673, + "step": 149100 + }, + { + "epoch": 11.7, + "learning_rate": 0.0005, + "loss": 1.871, + "step": 149200 + }, + { + "epoch": 11.71, + "learning_rate": 0.0005, + "loss": 1.8847, + "step": 149300 + }, + { + "epoch": 11.72, + "learning_rate": 0.0005, + "loss": 1.8439, + "step": 149400 + }, + { + "epoch": 11.72, + "learning_rate": 0.0005, + "loss": 1.8545, + "step": 149500 + }, + { + "epoch": 11.73, + "learning_rate": 0.0005, + "loss": 1.873, + "step": 149600 + }, + { + "epoch": 11.74, + "learning_rate": 0.0005, + "loss": 1.8647, + "step": 149700 + }, + { + "epoch": 11.75, + "learning_rate": 0.0005, + "loss": 1.8725, + "step": 149800 + }, + { + "epoch": 11.76, + "learning_rate": 0.0005, + "loss": 1.8905, + "step": 149900 + }, + { + "epoch": 11.76, + "learning_rate": 0.0005, + "loss": 1.8758, + "step": 150000 + }, + { + "epoch": 11.76, + "eval_gen_len": 18.80771607663106, + "eval_loss": 2.0203864574432373, + "eval_rouge1": 34.6581, + "eval_rouge2": 13.5851, + "eval_rougeL": 28.3861, + "eval_rougeLsum": 28.3839, + "eval_runtime": 370.8862, + "eval_samples_per_second": 30.54, + "eval_steps_per_second": 1.909, + "step": 150000 + }, + { + "epoch": 11.77, + "learning_rate": 0.0005, + "loss": 1.8741, + "step": 150100 + }, + { + "epoch": 11.78, + "learning_rate": 0.0005, + "loss": 1.8762, + "step": 150200 + }, + { + "epoch": 11.79, + "learning_rate": 0.0005, + "loss": 1.8504, + "step": 150300 + }, + { + "epoch": 11.79, + "learning_rate": 0.0005, + "loss": 1.8605, + "step": 150400 + }, + { + "epoch": 11.8, + "learning_rate": 0.0005, + "loss": 1.8651, + "step": 150500 + }, + { + "epoch": 11.81, + "learning_rate": 0.0005, + "loss": 1.8913, + "step": 150600 + }, + { + "epoch": 11.82, + "learning_rate": 0.0005, + "loss": 1.9204, + "step": 150700 + }, + { + "epoch": 11.83, + "learning_rate": 0.0005, + "loss": 1.8737, + "step": 150800 + }, + { + "epoch": 11.83, + "learning_rate": 0.0005, + "loss": 1.8794, + "step": 150900 + }, + { + "epoch": 11.84, + "learning_rate": 0.0005, + "loss": 1.9059, + "step": 151000 + }, + { + "epoch": 11.85, + "learning_rate": 0.0005, + "loss": 1.9089, + "step": 151100 + }, + { + "epoch": 11.86, + "learning_rate": 0.0005, + "loss": 1.8743, + "step": 151200 + }, + { + "epoch": 11.86, + "learning_rate": 0.0005, + "loss": 1.8677, + "step": 151300 + }, + { + "epoch": 11.87, + "learning_rate": 0.0005, + "loss": 1.8821, + "step": 151400 + }, + { + "epoch": 11.88, + "learning_rate": 0.0005, + "loss": 1.8667, + "step": 151500 + }, + { + "epoch": 11.89, + "learning_rate": 0.0005, + "loss": 1.8949, + "step": 151600 + }, + { + "epoch": 11.9, + "learning_rate": 0.0005, + "loss": 1.8714, + "step": 151700 + }, + { + "epoch": 11.9, + "learning_rate": 0.0005, + "loss": 1.8637, + "step": 151800 + }, + { + "epoch": 11.91, + "learning_rate": 0.0005, + "loss": 1.8878, + "step": 151900 + }, + { + "epoch": 11.92, + "learning_rate": 0.0005, + "loss": 1.8625, + "step": 152000 + }, + { + "epoch": 11.93, + "learning_rate": 0.0005, + "loss": 1.8858, + "step": 152100 + }, + { + "epoch": 11.94, + "learning_rate": 0.0005, + "loss": 1.8811, + "step": 152200 + }, + { + "epoch": 11.94, + "learning_rate": 0.0005, + "loss": 1.8735, + "step": 152300 + }, + { + "epoch": 11.95, + "learning_rate": 0.0005, + "loss": 1.8728, + "step": 152400 + }, + { + "epoch": 11.96, + "learning_rate": 0.0005, + "loss": 1.8751, + "step": 152500 + }, + { + "epoch": 11.97, + "learning_rate": 0.0005, + "loss": 1.8996, + "step": 152600 + }, + { + "epoch": 11.97, + "learning_rate": 0.0005, + "loss": 1.9027, + "step": 152700 + }, + { + "epoch": 11.98, + "learning_rate": 0.0005, + "loss": 1.8875, + "step": 152800 + }, + { + "epoch": 11.99, + "learning_rate": 0.0005, + "loss": 1.8858, + "step": 152900 + }, + { + "epoch": 12.0, + "learning_rate": 0.0005, + "loss": 1.8822, + "step": 153000 + }, + { + "epoch": 12.01, + "learning_rate": 0.0005, + "loss": 1.8282, + "step": 153100 + }, + { + "epoch": 12.01, + "learning_rate": 0.0005, + "loss": 1.7881, + "step": 153200 + }, + { + "epoch": 12.02, + "learning_rate": 0.0005, + "loss": 1.7834, + "step": 153300 + }, + { + "epoch": 12.03, + "learning_rate": 0.0005, + "loss": 1.7981, + "step": 153400 + }, + { + "epoch": 12.04, + "learning_rate": 0.0005, + "loss": 1.796, + "step": 153500 + }, + { + "epoch": 12.05, + "learning_rate": 0.0005, + "loss": 1.7744, + "step": 153600 + }, + { + "epoch": 12.05, + "learning_rate": 0.0005, + "loss": 1.7785, + "step": 153700 + }, + { + "epoch": 12.06, + "learning_rate": 0.0005, + "loss": 1.7937, + "step": 153800 + }, + { + "epoch": 12.07, + "learning_rate": 0.0005, + "loss": 1.7746, + "step": 153900 + }, + { + "epoch": 12.08, + "learning_rate": 0.0005, + "loss": 1.8114, + "step": 154000 + }, + { + "epoch": 12.08, + "learning_rate": 0.0005, + "loss": 1.814, + "step": 154100 + }, + { + "epoch": 12.09, + "learning_rate": 0.0005, + "loss": 1.7709, + "step": 154200 + }, + { + "epoch": 12.1, + "learning_rate": 0.0005, + "loss": 1.7994, + "step": 154300 + }, + { + "epoch": 12.11, + "learning_rate": 0.0005, + "loss": 1.8088, + "step": 154400 + }, + { + "epoch": 12.12, + "learning_rate": 0.0005, + "loss": 1.8016, + "step": 154500 + }, + { + "epoch": 12.12, + "learning_rate": 0.0005, + "loss": 1.8205, + "step": 154600 + }, + { + "epoch": 12.13, + "learning_rate": 0.0005, + "loss": 1.826, + "step": 154700 + }, + { + "epoch": 12.14, + "learning_rate": 0.0005, + "loss": 1.8543, + "step": 154800 + }, + { + "epoch": 12.15, + "learning_rate": 0.0005, + "loss": 1.8301, + "step": 154900 + }, + { + "epoch": 12.15, + "learning_rate": 0.0005, + "loss": 1.797, + "step": 155000 + }, + { + "epoch": 12.16, + "learning_rate": 0.0005, + "loss": 1.8202, + "step": 155100 + }, + { + "epoch": 12.17, + "learning_rate": 0.0005, + "loss": 1.8305, + "step": 155200 + }, + { + "epoch": 12.18, + "learning_rate": 0.0005, + "loss": 1.8421, + "step": 155300 + }, + { + "epoch": 12.19, + "learning_rate": 0.0005, + "loss": 1.8219, + "step": 155400 + }, + { + "epoch": 12.19, + "learning_rate": 0.0005, + "loss": 1.848, + "step": 155500 + }, + { + "epoch": 12.2, + "learning_rate": 0.0005, + "loss": 1.8103, + "step": 155600 + }, + { + "epoch": 12.21, + "learning_rate": 0.0005, + "loss": 1.8199, + "step": 155700 + }, + { + "epoch": 12.22, + "learning_rate": 0.0005, + "loss": 1.8423, + "step": 155800 + }, + { + "epoch": 12.23, + "learning_rate": 0.0005, + "loss": 1.8153, + "step": 155900 + }, + { + "epoch": 12.23, + "learning_rate": 0.0005, + "loss": 1.8072, + "step": 156000 + }, + { + "epoch": 12.24, + "learning_rate": 0.0005, + "loss": 1.8351, + "step": 156100 + }, + { + "epoch": 12.25, + "learning_rate": 0.0005, + "loss": 1.8175, + "step": 156200 + }, + { + "epoch": 12.26, + "learning_rate": 0.0005, + "loss": 1.8411, + "step": 156300 + }, + { + "epoch": 12.26, + "learning_rate": 0.0005, + "loss": 1.8558, + "step": 156400 + }, + { + "epoch": 12.27, + "learning_rate": 0.0005, + "loss": 1.83, + "step": 156500 + }, + { + "epoch": 12.28, + "learning_rate": 0.0005, + "loss": 1.8168, + "step": 156600 + }, + { + "epoch": 12.29, + "learning_rate": 0.0005, + "loss": 1.8293, + "step": 156700 + }, + { + "epoch": 12.3, + "learning_rate": 0.0005, + "loss": 1.8293, + "step": 156800 + }, + { + "epoch": 12.3, + "learning_rate": 0.0005, + "loss": 1.8264, + "step": 156900 + }, + { + "epoch": 12.31, + "learning_rate": 0.0005, + "loss": 1.832, + "step": 157000 + }, + { + "epoch": 12.32, + "learning_rate": 0.0005, + "loss": 1.8167, + "step": 157100 + }, + { + "epoch": 12.33, + "learning_rate": 0.0005, + "loss": 1.8483, + "step": 157200 + }, + { + "epoch": 12.34, + "learning_rate": 0.0005, + "loss": 1.836, + "step": 157300 + }, + { + "epoch": 12.34, + "learning_rate": 0.0005, + "loss": 1.8409, + "step": 157400 + }, + { + "epoch": 12.35, + "learning_rate": 0.0005, + "loss": 1.8207, + "step": 157500 + }, + { + "epoch": 12.36, + "learning_rate": 0.0005, + "loss": 1.8434, + "step": 157600 + }, + { + "epoch": 12.37, + "learning_rate": 0.0005, + "loss": 1.8394, + "step": 157700 + }, + { + "epoch": 12.37, + "learning_rate": 0.0005, + "loss": 1.8232, + "step": 157800 + }, + { + "epoch": 12.38, + "learning_rate": 0.0005, + "loss": 1.8328, + "step": 157900 + }, + { + "epoch": 12.39, + "learning_rate": 0.0005, + "loss": 1.8358, + "step": 158000 + }, + { + "epoch": 12.4, + "learning_rate": 0.0005, + "loss": 1.8278, + "step": 158100 + }, + { + "epoch": 12.41, + "learning_rate": 0.0005, + "loss": 1.7989, + "step": 158200 + }, + { + "epoch": 12.41, + "learning_rate": 0.0005, + "loss": 1.8207, + "step": 158300 + }, + { + "epoch": 12.42, + "learning_rate": 0.0005, + "loss": 1.8147, + "step": 158400 + }, + { + "epoch": 12.43, + "learning_rate": 0.0005, + "loss": 1.8319, + "step": 158500 + }, + { + "epoch": 12.44, + "learning_rate": 0.0005, + "loss": 1.823, + "step": 158600 + }, + { + "epoch": 12.45, + "learning_rate": 0.0005, + "loss": 1.8673, + "step": 158700 + }, + { + "epoch": 12.45, + "learning_rate": 0.0005, + "loss": 1.8437, + "step": 158800 + }, + { + "epoch": 12.46, + "learning_rate": 0.0005, + "loss": 1.8654, + "step": 158900 + }, + { + "epoch": 12.47, + "learning_rate": 0.0005, + "loss": 1.8534, + "step": 159000 + }, + { + "epoch": 12.48, + "learning_rate": 0.0005, + "loss": 1.8327, + "step": 159100 + }, + { + "epoch": 12.48, + "learning_rate": 0.0005, + "loss": 1.8365, + "step": 159200 + }, + { + "epoch": 12.49, + "learning_rate": 0.0005, + "loss": 1.8231, + "step": 159300 + }, + { + "epoch": 12.5, + "learning_rate": 0.0005, + "loss": 1.8104, + "step": 159400 + }, + { + "epoch": 12.51, + "learning_rate": 0.0005, + "loss": 1.8582, + "step": 159500 + }, + { + "epoch": 12.52, + "learning_rate": 0.0005, + "loss": 1.8159, + "step": 159600 + }, + { + "epoch": 12.52, + "learning_rate": 0.0005, + "loss": 1.8491, + "step": 159700 + }, + { + "epoch": 12.53, + "learning_rate": 0.0005, + "loss": 1.8316, + "step": 159800 + }, + { + "epoch": 12.54, + "learning_rate": 0.0005, + "loss": 1.8465, + "step": 159900 + }, + { + "epoch": 12.55, + "learning_rate": 0.0005, + "loss": 1.831, + "step": 160000 + }, + { + "epoch": 12.55, + "eval_gen_len": 18.794914805332393, + "eval_loss": 2.022197961807251, + "eval_rouge1": 35.0141, + "eval_rouge2": 13.742, + "eval_rougeL": 28.6326, + "eval_rougeLsum": 28.6266, + "eval_runtime": 366.073, + "eval_samples_per_second": 30.942, + "eval_steps_per_second": 1.934, + "step": 160000 + }, + { + "epoch": 12.55, + "learning_rate": 0.0005, + "loss": 1.8515, + "step": 160100 + }, + { + "epoch": 12.56, + "learning_rate": 0.0005, + "loss": 1.8349, + "step": 160200 + }, + { + "epoch": 12.57, + "learning_rate": 0.0005, + "loss": 1.8328, + "step": 160300 + }, + { + "epoch": 12.58, + "learning_rate": 0.0005, + "loss": 1.8438, + "step": 160400 + }, + { + "epoch": 12.59, + "learning_rate": 0.0005, + "loss": 1.8385, + "step": 160500 + }, + { + "epoch": 12.59, + "learning_rate": 0.0005, + "loss": 1.8492, + "step": 160600 + }, + { + "epoch": 12.6, + "learning_rate": 0.0005, + "loss": 1.8458, + "step": 160700 + }, + { + "epoch": 12.61, + "learning_rate": 0.0005, + "loss": 1.8523, + "step": 160800 + }, + { + "epoch": 12.62, + "learning_rate": 0.0005, + "loss": 1.8413, + "step": 160900 + }, + { + "epoch": 12.63, + "learning_rate": 0.0005, + "loss": 1.8455, + "step": 161000 + }, + { + "epoch": 12.63, + "learning_rate": 0.0005, + "loss": 1.8402, + "step": 161100 + }, + { + "epoch": 12.64, + "learning_rate": 0.0005, + "loss": 1.836, + "step": 161200 + }, + { + "epoch": 12.65, + "learning_rate": 0.0005, + "loss": 1.826, + "step": 161300 + }, + { + "epoch": 12.66, + "learning_rate": 0.0005, + "loss": 1.8346, + "step": 161400 + }, + { + "epoch": 12.66, + "learning_rate": 0.0005, + "loss": 1.8614, + "step": 161500 + }, + { + "epoch": 12.67, + "learning_rate": 0.0005, + "loss": 1.8132, + "step": 161600 + }, + { + "epoch": 12.68, + "learning_rate": 0.0005, + "loss": 1.8829, + "step": 161700 + }, + { + "epoch": 12.69, + "learning_rate": 0.0005, + "loss": 1.8424, + "step": 161800 + }, + { + "epoch": 12.7, + "learning_rate": 0.0005, + "loss": 1.8492, + "step": 161900 + }, + { + "epoch": 12.7, + "learning_rate": 0.0005, + "loss": 1.8379, + "step": 162000 + }, + { + "epoch": 12.71, + "learning_rate": 0.0005, + "loss": 1.8563, + "step": 162100 + }, + { + "epoch": 12.72, + "learning_rate": 0.0005, + "loss": 1.8041, + "step": 162200 + }, + { + "epoch": 12.73, + "learning_rate": 0.0005, + "loss": 1.8593, + "step": 162300 + }, + { + "epoch": 12.74, + "learning_rate": 0.0005, + "loss": 1.8275, + "step": 162400 + }, + { + "epoch": 12.74, + "learning_rate": 0.0005, + "loss": 1.8515, + "step": 162500 + }, + { + "epoch": 12.75, + "learning_rate": 0.0005, + "loss": 1.8346, + "step": 162600 + }, + { + "epoch": 12.76, + "learning_rate": 0.0005, + "loss": 1.856, + "step": 162700 + }, + { + "epoch": 12.77, + "learning_rate": 0.0005, + "loss": 1.8382, + "step": 162800 + }, + { + "epoch": 12.77, + "learning_rate": 0.0005, + "loss": 1.8589, + "step": 162900 + }, + { + "epoch": 12.78, + "learning_rate": 0.0005, + "loss": 1.8555, + "step": 163000 + }, + { + "epoch": 12.79, + "learning_rate": 0.0005, + "loss": 1.8464, + "step": 163100 + }, + { + "epoch": 12.8, + "learning_rate": 0.0005, + "loss": 1.8498, + "step": 163200 + }, + { + "epoch": 12.81, + "learning_rate": 0.0005, + "loss": 1.8542, + "step": 163300 + }, + { + "epoch": 12.81, + "learning_rate": 0.0005, + "loss": 1.8414, + "step": 163400 + }, + { + "epoch": 12.82, + "learning_rate": 0.0005, + "loss": 1.8498, + "step": 163500 + }, + { + "epoch": 12.83, + "learning_rate": 0.0005, + "loss": 1.8572, + "step": 163600 + }, + { + "epoch": 12.84, + "learning_rate": 0.0005, + "loss": 1.8524, + "step": 163700 + }, + { + "epoch": 12.85, + "learning_rate": 0.0005, + "loss": 1.8547, + "step": 163800 + }, + { + "epoch": 12.85, + "learning_rate": 0.0005, + "loss": 1.8326, + "step": 163900 + }, + { + "epoch": 12.86, + "learning_rate": 0.0005, + "loss": 1.8452, + "step": 164000 + }, + { + "epoch": 12.87, + "learning_rate": 0.0005, + "loss": 1.8259, + "step": 164100 + }, + { + "epoch": 12.88, + "learning_rate": 0.0005, + "loss": 1.8649, + "step": 164200 + }, + { + "epoch": 12.88, + "learning_rate": 0.0005, + "loss": 1.8535, + "step": 164300 + }, + { + "epoch": 12.89, + "learning_rate": 0.0005, + "loss": 1.8384, + "step": 164400 + }, + { + "epoch": 12.9, + "learning_rate": 0.0005, + "loss": 1.8549, + "step": 164500 + }, + { + "epoch": 12.91, + "learning_rate": 0.0005, + "loss": 1.888, + "step": 164600 + }, + { + "epoch": 12.92, + "learning_rate": 0.0005, + "loss": 1.8618, + "step": 164700 + }, + { + "epoch": 12.92, + "learning_rate": 0.0005, + "loss": 1.8834, + "step": 164800 + }, + { + "epoch": 12.93, + "learning_rate": 0.0005, + "loss": 1.8608, + "step": 164900 + }, + { + "epoch": 12.94, + "learning_rate": 0.0005, + "loss": 1.8636, + "step": 165000 + }, + { + "epoch": 12.95, + "learning_rate": 0.0005, + "loss": 1.8552, + "step": 165100 + }, + { + "epoch": 12.95, + "learning_rate": 0.0005, + "loss": 1.8288, + "step": 165200 + }, + { + "epoch": 12.96, + "learning_rate": 0.0005, + "loss": 1.8426, + "step": 165300 + }, + { + "epoch": 12.97, + "learning_rate": 0.0005, + "loss": 1.8586, + "step": 165400 + }, + { + "epoch": 12.98, + "learning_rate": 0.0005, + "loss": 1.8693, + "step": 165500 + }, + { + "epoch": 12.99, + "learning_rate": 0.0005, + "loss": 1.8556, + "step": 165600 + }, + { + "epoch": 12.99, + "learning_rate": 0.0005, + "loss": 1.8689, + "step": 165700 + }, + { + "epoch": 13.0, + "learning_rate": 0.0005, + "loss": 1.8191, + "step": 165800 + }, + { + "epoch": 13.01, + "learning_rate": 0.0005, + "loss": 1.7918, + "step": 165900 + }, + { + "epoch": 13.02, + "learning_rate": 0.0005, + "loss": 1.8066, + "step": 166000 + }, + { + "epoch": 13.03, + "learning_rate": 0.0005, + "loss": 1.7596, + "step": 166100 + }, + { + "epoch": 13.03, + "learning_rate": 0.0005, + "loss": 1.7748, + "step": 166200 + }, + { + "epoch": 13.04, + "learning_rate": 0.0005, + "loss": 1.7756, + "step": 166300 + }, + { + "epoch": 13.05, + "learning_rate": 0.0005, + "loss": 1.7872, + "step": 166400 + }, + { + "epoch": 13.06, + "learning_rate": 0.0005, + "loss": 1.78, + "step": 166500 + }, + { + "epoch": 13.06, + "learning_rate": 0.0005, + "loss": 1.7967, + "step": 166600 + }, + { + "epoch": 13.07, + "learning_rate": 0.0005, + "loss": 1.7719, + "step": 166700 + }, + { + "epoch": 13.08, + "learning_rate": 0.0005, + "loss": 1.7773, + "step": 166800 + }, + { + "epoch": 13.09, + "learning_rate": 0.0005, + "loss": 1.7931, + "step": 166900 + }, + { + "epoch": 13.1, + "learning_rate": 0.0005, + "loss": 1.7814, + "step": 167000 + }, + { + "epoch": 13.1, + "learning_rate": 0.0005, + "loss": 1.7707, + "step": 167100 + }, + { + "epoch": 13.11, + "learning_rate": 0.0005, + "loss": 1.7711, + "step": 167200 + }, + { + "epoch": 13.12, + "learning_rate": 0.0005, + "loss": 1.7758, + "step": 167300 + }, + { + "epoch": 13.13, + "learning_rate": 0.0005, + "loss": 1.772, + "step": 167400 + }, + { + "epoch": 13.14, + "learning_rate": 0.0005, + "loss": 1.7794, + "step": 167500 + }, + { + "epoch": 13.14, + "learning_rate": 0.0005, + "loss": 1.8027, + "step": 167600 + }, + { + "epoch": 13.15, + "learning_rate": 0.0005, + "loss": 1.7688, + "step": 167700 + }, + { + "epoch": 13.16, + "learning_rate": 0.0005, + "loss": 1.7958, + "step": 167800 + }, + { + "epoch": 13.17, + "learning_rate": 0.0005, + "loss": 1.8164, + "step": 167900 + }, + { + "epoch": 13.17, + "learning_rate": 0.0005, + "loss": 1.7801, + "step": 168000 + }, + { + "epoch": 13.18, + "learning_rate": 0.0005, + "loss": 1.7964, + "step": 168100 + }, + { + "epoch": 13.19, + "learning_rate": 0.0005, + "loss": 1.784, + "step": 168200 + }, + { + "epoch": 13.2, + "learning_rate": 0.0005, + "loss": 1.8051, + "step": 168300 + }, + { + "epoch": 13.21, + "learning_rate": 0.0005, + "loss": 1.7861, + "step": 168400 + }, + { + "epoch": 13.21, + "learning_rate": 0.0005, + "loss": 1.7915, + "step": 168500 + }, + { + "epoch": 13.22, + "learning_rate": 0.0005, + "loss": 1.8157, + "step": 168600 + }, + { + "epoch": 13.23, + "learning_rate": 0.0005, + "loss": 1.7867, + "step": 168700 + }, + { + "epoch": 13.24, + "learning_rate": 0.0005, + "loss": 1.7561, + "step": 168800 + }, + { + "epoch": 13.24, + "learning_rate": 0.0005, + "loss": 1.7943, + "step": 168900 + }, + { + "epoch": 13.25, + "learning_rate": 0.0005, + "loss": 1.7808, + "step": 169000 + }, + { + "epoch": 13.26, + "learning_rate": 0.0005, + "loss": 1.7856, + "step": 169100 + }, + { + "epoch": 13.27, + "learning_rate": 0.0005, + "loss": 1.8041, + "step": 169200 + }, + { + "epoch": 13.28, + "learning_rate": 0.0005, + "loss": 1.8003, + "step": 169300 + }, + { + "epoch": 13.28, + "learning_rate": 0.0005, + "loss": 1.8013, + "step": 169400 + }, + { + "epoch": 13.29, + "learning_rate": 0.0005, + "loss": 1.7939, + "step": 169500 + }, + { + "epoch": 13.3, + "learning_rate": 0.0005, + "loss": 1.8059, + "step": 169600 + }, + { + "epoch": 13.31, + "learning_rate": 0.0005, + "loss": 1.8036, + "step": 169700 + }, + { + "epoch": 13.32, + "learning_rate": 0.0005, + "loss": 1.7763, + "step": 169800 + }, + { + "epoch": 13.32, + "learning_rate": 0.0005, + "loss": 1.7864, + "step": 169900 + }, + { + "epoch": 13.33, + "learning_rate": 0.0005, + "loss": 1.8027, + "step": 170000 + }, + { + "epoch": 13.33, + "eval_gen_len": 18.75721726847356, + "eval_loss": 2.0271575450897217, + "eval_rouge1": 35.2864, + "eval_rouge2": 13.9499, + "eval_rougeL": 28.9284, + "eval_rougeLsum": 28.9215, + "eval_runtime": 360.6463, + "eval_samples_per_second": 31.408, + "eval_steps_per_second": 1.963, + "step": 170000 + }, + { + "epoch": 13.34, + "learning_rate": 0.0005, + "loss": 1.8209, + "step": 170100 + }, + { + "epoch": 13.35, + "learning_rate": 0.0005, + "loss": 1.7839, + "step": 170200 + }, + { + "epoch": 13.35, + "learning_rate": 0.0005, + "loss": 1.8168, + "step": 170300 + }, + { + "epoch": 13.36, + "learning_rate": 0.0005, + "loss": 1.8084, + "step": 170400 + }, + { + "epoch": 13.37, + "learning_rate": 0.0005, + "loss": 1.7727, + "step": 170500 + }, + { + "epoch": 13.38, + "learning_rate": 0.0005, + "loss": 1.7981, + "step": 170600 + }, + { + "epoch": 13.39, + "learning_rate": 0.0005, + "loss": 1.806, + "step": 170700 + }, + { + "epoch": 13.39, + "learning_rate": 0.0005, + "loss": 1.7888, + "step": 170800 + }, + { + "epoch": 13.4, + "learning_rate": 0.0005, + "loss": 1.8181, + "step": 170900 + }, + { + "epoch": 13.41, + "learning_rate": 0.0005, + "loss": 1.8123, + "step": 171000 + }, + { + "epoch": 13.42, + "learning_rate": 0.0005, + "loss": 1.8251, + "step": 171100 + }, + { + "epoch": 13.43, + "learning_rate": 0.0005, + "loss": 1.8066, + "step": 171200 + }, + { + "epoch": 13.43, + "learning_rate": 0.0005, + "loss": 1.7705, + "step": 171300 + }, + { + "epoch": 13.44, + "learning_rate": 0.0005, + "loss": 1.7931, + "step": 171400 + }, + { + "epoch": 13.45, + "learning_rate": 0.0005, + "loss": 1.8041, + "step": 171500 + }, + { + "epoch": 13.46, + "learning_rate": 0.0005, + "loss": 1.8164, + "step": 171600 + }, + { + "epoch": 13.46, + "learning_rate": 0.0005, + "loss": 1.8147, + "step": 171700 + }, + { + "epoch": 13.47, + "learning_rate": 0.0005, + "loss": 1.8222, + "step": 171800 + }, + { + "epoch": 13.48, + "learning_rate": 0.0005, + "loss": 1.8062, + "step": 171900 + }, + { + "epoch": 13.49, + "learning_rate": 0.0005, + "loss": 1.8085, + "step": 172000 + }, + { + "epoch": 13.5, + "learning_rate": 0.0005, + "loss": 1.8116, + "step": 172100 + }, + { + "epoch": 13.5, + "learning_rate": 0.0005, + "loss": 1.8293, + "step": 172200 + }, + { + "epoch": 13.51, + "learning_rate": 0.0005, + "loss": 1.8549, + "step": 172300 + }, + { + "epoch": 13.52, + "learning_rate": 0.0005, + "loss": 1.8357, + "step": 172400 + }, + { + "epoch": 13.53, + "learning_rate": 0.0005, + "loss": 1.8385, + "step": 172500 + }, + { + "epoch": 13.54, + "learning_rate": 0.0005, + "loss": 1.8104, + "step": 172600 + }, + { + "epoch": 13.54, + "learning_rate": 0.0005, + "loss": 1.8192, + "step": 172700 + }, + { + "epoch": 13.55, + "learning_rate": 0.0005, + "loss": 1.7848, + "step": 172800 + }, + { + "epoch": 13.56, + "learning_rate": 0.0005, + "loss": 1.826, + "step": 172900 + }, + { + "epoch": 13.57, + "learning_rate": 0.0005, + "loss": 1.812, + "step": 173000 + }, + { + "epoch": 13.57, + "learning_rate": 0.0005, + "loss": 1.8255, + "step": 173100 + }, + { + "epoch": 13.58, + "learning_rate": 0.0005, + "loss": 1.8037, + "step": 173200 + }, + { + "epoch": 13.59, + "learning_rate": 0.0005, + "loss": 1.8181, + "step": 173300 + }, + { + "epoch": 13.6, + "learning_rate": 0.0005, + "loss": 1.8365, + "step": 173400 + }, + { + "epoch": 13.61, + "learning_rate": 0.0005, + "loss": 1.8409, + "step": 173500 + }, + { + "epoch": 13.61, + "learning_rate": 0.0005, + "loss": 1.8047, + "step": 173600 + }, + { + "epoch": 13.62, + "learning_rate": 0.0005, + "loss": 1.8418, + "step": 173700 + }, + { + "epoch": 13.63, + "learning_rate": 0.0005, + "loss": 1.8107, + "step": 173800 + }, + { + "epoch": 13.64, + "learning_rate": 0.0005, + "loss": 1.8037, + "step": 173900 + }, + { + "epoch": 13.64, + "learning_rate": 0.0005, + "loss": 1.8453, + "step": 174000 + }, + { + "epoch": 13.65, + "learning_rate": 0.0005, + "loss": 1.8125, + "step": 174100 + }, + { + "epoch": 13.66, + "learning_rate": 0.0005, + "loss": 1.8392, + "step": 174200 + }, + { + "epoch": 13.67, + "learning_rate": 0.0005, + "loss": 1.7939, + "step": 174300 + }, + { + "epoch": 13.68, + "learning_rate": 0.0005, + "loss": 1.8239, + "step": 174400 + }, + { + "epoch": 13.68, + "learning_rate": 0.0005, + "loss": 1.8236, + "step": 174500 + }, + { + "epoch": 13.69, + "learning_rate": 0.0005, + "loss": 1.8072, + "step": 174600 + }, + { + "epoch": 13.7, + "learning_rate": 0.0005, + "loss": 1.8417, + "step": 174700 + }, + { + "epoch": 13.71, + "learning_rate": 0.0005, + "loss": 1.8382, + "step": 174800 + }, + { + "epoch": 13.72, + "learning_rate": 0.0005, + "loss": 1.7897, + "step": 174900 + }, + { + "epoch": 13.72, + "learning_rate": 0.0005, + "loss": 1.8221, + "step": 175000 + }, + { + "epoch": 13.73, + "learning_rate": 0.0005, + "loss": 1.8289, + "step": 175100 + }, + { + "epoch": 13.74, + "learning_rate": 0.0005, + "loss": 1.8416, + "step": 175200 + }, + { + "epoch": 13.75, + "learning_rate": 0.0005, + "loss": 1.8429, + "step": 175300 + }, + { + "epoch": 13.75, + "learning_rate": 0.0005, + "loss": 1.8408, + "step": 175400 + }, + { + "epoch": 13.76, + "learning_rate": 0.0005, + "loss": 1.836, + "step": 175500 + }, + { + "epoch": 13.77, + "learning_rate": 0.0005, + "loss": 1.8222, + "step": 175600 + }, + { + "epoch": 13.78, + "learning_rate": 0.0005, + "loss": 1.8146, + "step": 175700 + }, + { + "epoch": 13.79, + "learning_rate": 0.0005, + "loss": 1.8134, + "step": 175800 + }, + { + "epoch": 13.79, + "learning_rate": 0.0005, + "loss": 1.7945, + "step": 175900 + }, + { + "epoch": 13.8, + "learning_rate": 0.0005, + "loss": 1.8222, + "step": 176000 + }, + { + "epoch": 13.81, + "learning_rate": 0.0005, + "loss": 1.8414, + "step": 176100 + }, + { + "epoch": 13.82, + "learning_rate": 0.0005, + "loss": 1.8085, + "step": 176200 + }, + { + "epoch": 13.83, + "learning_rate": 0.0005, + "loss": 1.8338, + "step": 176300 + }, + { + "epoch": 13.83, + "learning_rate": 0.0005, + "loss": 1.8468, + "step": 176400 + }, + { + "epoch": 13.84, + "learning_rate": 0.0005, + "loss": 1.8403, + "step": 176500 + }, + { + "epoch": 13.85, + "learning_rate": 0.0005, + "loss": 1.8421, + "step": 176600 + }, + { + "epoch": 13.86, + "learning_rate": 0.0005, + "loss": 1.8111, + "step": 176700 + }, + { + "epoch": 13.86, + "learning_rate": 0.0005, + "loss": 1.8125, + "step": 176800 + }, + { + "epoch": 13.87, + "learning_rate": 0.0005, + "loss": 1.8113, + "step": 176900 + }, + { + "epoch": 13.88, + "learning_rate": 0.0005, + "loss": 1.8023, + "step": 177000 + }, + { + "epoch": 13.89, + "learning_rate": 0.0005, + "loss": 1.8249, + "step": 177100 + }, + { + "epoch": 13.9, + "learning_rate": 0.0005, + "loss": 1.8316, + "step": 177200 + }, + { + "epoch": 13.9, + "learning_rate": 0.0005, + "loss": 1.8266, + "step": 177300 + }, + { + "epoch": 13.91, + "learning_rate": 0.0005, + "loss": 1.8324, + "step": 177400 + }, + { + "epoch": 13.92, + "learning_rate": 0.0005, + "loss": 1.8145, + "step": 177500 + }, + { + "epoch": 13.93, + "learning_rate": 0.0005, + "loss": 1.8484, + "step": 177600 + }, + { + "epoch": 13.94, + "learning_rate": 0.0005, + "loss": 1.8215, + "step": 177700 + }, + { + "epoch": 13.94, + "learning_rate": 0.0005, + "loss": 1.83, + "step": 177800 + }, + { + "epoch": 13.95, + "learning_rate": 0.0005, + "loss": 1.8315, + "step": 177900 + }, + { + "epoch": 13.96, + "learning_rate": 0.0005, + "loss": 1.882, + "step": 178000 + }, + { + "epoch": 13.97, + "learning_rate": 0.0005, + "loss": 1.8308, + "step": 178100 + }, + { + "epoch": 13.97, + "learning_rate": 0.0005, + "loss": 1.8354, + "step": 178200 + }, + { + "epoch": 13.98, + "learning_rate": 0.0005, + "loss": 1.8254, + "step": 178300 + }, + { + "epoch": 13.99, + "learning_rate": 0.0005, + "loss": 1.8696, + "step": 178400 + }, + { + "epoch": 14.0, + "learning_rate": 0.0005, + "loss": 1.8469, + "step": 178500 + }, + { + "epoch": 14.01, + "learning_rate": 0.0005, + "loss": 1.7778, + "step": 178600 + }, + { + "epoch": 14.01, + "learning_rate": 0.0005, + "loss": 1.7202, + "step": 178700 + }, + { + "epoch": 14.02, + "learning_rate": 0.0005, + "loss": 1.7704, + "step": 178800 + }, + { + "epoch": 14.03, + "learning_rate": 0.0005, + "loss": 1.7825, + "step": 178900 + }, + { + "epoch": 14.04, + "learning_rate": 0.0005, + "loss": 1.7526, + "step": 179000 + }, + { + "epoch": 14.04, + "learning_rate": 0.0005, + "loss": 1.777, + "step": 179100 + }, + { + "epoch": 14.05, + "learning_rate": 0.0005, + "loss": 1.7551, + "step": 179200 + }, + { + "epoch": 14.06, + "learning_rate": 0.0005, + "loss": 1.7501, + "step": 179300 + }, + { + "epoch": 14.07, + "learning_rate": 0.0005, + "loss": 1.7711, + "step": 179400 + }, + { + "epoch": 14.08, + "learning_rate": 0.0005, + "loss": 1.756, + "step": 179500 + }, + { + "epoch": 14.08, + "learning_rate": 0.0005, + "loss": 1.76, + "step": 179600 + }, + { + "epoch": 14.09, + "learning_rate": 0.0005, + "loss": 1.7467, + "step": 179700 + }, + { + "epoch": 14.1, + "learning_rate": 0.0005, + "loss": 1.7563, + "step": 179800 + }, + { + "epoch": 14.11, + "learning_rate": 0.0005, + "loss": 1.7338, + "step": 179900 + }, + { + "epoch": 14.12, + "learning_rate": 0.0005, + "loss": 1.7544, + "step": 180000 + }, + { + "epoch": 14.12, + "eval_gen_len": 18.749006797916483, + "eval_loss": 2.0176799297332764, + "eval_rouge1": 35.1221, + "eval_rouge2": 13.8252, + "eval_rougeL": 28.7503, + "eval_rougeLsum": 28.7366, + "eval_runtime": 359.2813, + "eval_samples_per_second": 31.527, + "eval_steps_per_second": 1.971, + "step": 180000 + }, + { + "epoch": 14.12, + "learning_rate": 0.0005, + "loss": 1.7795, + "step": 180100 + }, + { + "epoch": 14.13, + "learning_rate": 0.0005, + "loss": 1.7548, + "step": 180200 + }, + { + "epoch": 14.14, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 180300 + }, + { + "epoch": 14.15, + "learning_rate": 0.0005, + "loss": 1.7341, + "step": 180400 + }, + { + "epoch": 14.15, + "learning_rate": 0.0005, + "loss": 1.7936, + "step": 180500 + }, + { + "epoch": 14.16, + "learning_rate": 0.0005, + "loss": 1.7506, + "step": 180600 + }, + { + "epoch": 14.17, + "learning_rate": 0.0005, + "loss": 1.7988, + "step": 180700 + }, + { + "epoch": 14.18, + "learning_rate": 0.0005, + "loss": 1.7907, + "step": 180800 + }, + { + "epoch": 14.19, + "learning_rate": 0.0005, + "loss": 1.7804, + "step": 180900 + }, + { + "epoch": 14.19, + "learning_rate": 0.0005, + "loss": 1.7554, + "step": 181000 + }, + { + "epoch": 14.2, + "learning_rate": 0.0005, + "loss": 1.7807, + "step": 181100 + }, + { + "epoch": 14.21, + "learning_rate": 0.0005, + "loss": 1.7703, + "step": 181200 + }, + { + "epoch": 14.22, + "learning_rate": 0.0005, + "loss": 1.7901, + "step": 181300 + }, + { + "epoch": 14.23, + "learning_rate": 0.0005, + "loss": 1.796, + "step": 181400 + }, + { + "epoch": 14.23, + "learning_rate": 0.0005, + "loss": 1.789, + "step": 181500 + }, + { + "epoch": 14.24, + "learning_rate": 0.0005, + "loss": 1.7764, + "step": 181600 + }, + { + "epoch": 14.25, + "learning_rate": 0.0005, + "loss": 1.7761, + "step": 181700 + }, + { + "epoch": 14.26, + "learning_rate": 0.0005, + "loss": 1.7824, + "step": 181800 + }, + { + "epoch": 14.26, + "learning_rate": 0.0005, + "loss": 1.7522, + "step": 181900 + }, + { + "epoch": 14.27, + "learning_rate": 0.0005, + "loss": 1.7723, + "step": 182000 + }, + { + "epoch": 14.28, + "learning_rate": 0.0005, + "loss": 1.7678, + "step": 182100 + }, + { + "epoch": 14.29, + "learning_rate": 0.0005, + "loss": 1.7886, + "step": 182200 + }, + { + "epoch": 14.3, + "learning_rate": 0.0005, + "loss": 1.7694, + "step": 182300 + }, + { + "epoch": 14.3, + "learning_rate": 0.0005, + "loss": 1.7767, + "step": 182400 + }, + { + "epoch": 14.31, + "learning_rate": 0.0005, + "loss": 1.7684, + "step": 182500 + }, + { + "epoch": 14.32, + "learning_rate": 0.0005, + "loss": 1.7488, + "step": 182600 + }, + { + "epoch": 14.33, + "learning_rate": 0.0005, + "loss": 1.7981, + "step": 182700 + }, + { + "epoch": 14.34, + "learning_rate": 0.0005, + "loss": 1.7871, + "step": 182800 + }, + { + "epoch": 14.34, + "learning_rate": 0.0005, + "loss": 1.7587, + "step": 182900 + }, + { + "epoch": 14.35, + "learning_rate": 0.0005, + "loss": 1.8053, + "step": 183000 + }, + { + "epoch": 14.36, + "learning_rate": 0.0005, + "loss": 1.7758, + "step": 183100 + }, + { + "epoch": 14.37, + "learning_rate": 0.0005, + "loss": 1.7966, + "step": 183200 + }, + { + "epoch": 14.37, + "learning_rate": 0.0005, + "loss": 1.7478, + "step": 183300 + }, + { + "epoch": 14.38, + "learning_rate": 0.0005, + "loss": 1.7651, + "step": 183400 + }, + { + "epoch": 14.39, + "learning_rate": 0.0005, + "loss": 1.7924, + "step": 183500 + }, + { + "epoch": 14.4, + "learning_rate": 0.0005, + "loss": 1.7729, + "step": 183600 + }, + { + "epoch": 14.41, + "learning_rate": 0.0005, + "loss": 1.7818, + "step": 183700 + }, + { + "epoch": 14.41, + "learning_rate": 0.0005, + "loss": 1.7805, + "step": 183800 + }, + { + "epoch": 14.42, + "learning_rate": 0.0005, + "loss": 1.8025, + "step": 183900 + }, + { + "epoch": 14.43, + "learning_rate": 0.0005, + "loss": 1.7769, + "step": 184000 + }, + { + "epoch": 14.44, + "learning_rate": 0.0005, + "loss": 1.798, + "step": 184100 + }, + { + "epoch": 14.44, + "learning_rate": 0.0005, + "loss": 1.7751, + "step": 184200 + }, + { + "epoch": 14.45, + "learning_rate": 0.0005, + "loss": 1.7869, + "step": 184300 + }, + { + "epoch": 14.46, + "learning_rate": 0.0005, + "loss": 1.7882, + "step": 184400 + }, + { + "epoch": 14.47, + "learning_rate": 0.0005, + "loss": 1.7908, + "step": 184500 + }, + { + "epoch": 14.48, + "learning_rate": 0.0005, + "loss": 1.819, + "step": 184600 + }, + { + "epoch": 14.48, + "learning_rate": 0.0005, + "loss": 1.794, + "step": 184700 + }, + { + "epoch": 14.49, + "learning_rate": 0.0005, + "loss": 1.7894, + "step": 184800 + }, + { + "epoch": 14.5, + "learning_rate": 0.0005, + "loss": 1.8124, + "step": 184900 + }, + { + "epoch": 14.51, + "learning_rate": 0.0005, + "loss": 1.7684, + "step": 185000 + }, + { + "epoch": 14.52, + "learning_rate": 0.0005, + "loss": 1.7929, + "step": 185100 + }, + { + "epoch": 14.52, + "learning_rate": 0.0005, + "loss": 1.7906, + "step": 185200 + }, + { + "epoch": 14.53, + "learning_rate": 0.0005, + "loss": 1.7852, + "step": 185300 + }, + { + "epoch": 14.54, + "learning_rate": 0.0005, + "loss": 1.7982, + "step": 185400 + }, + { + "epoch": 14.55, + "learning_rate": 0.0005, + "loss": 1.8103, + "step": 185500 + }, + { + "epoch": 14.55, + "learning_rate": 0.0005, + "loss": 1.8011, + "step": 185600 + }, + { + "epoch": 14.56, + "learning_rate": 0.0005, + "loss": 1.77, + "step": 185700 + }, + { + "epoch": 14.57, + "learning_rate": 0.0005, + "loss": 1.7581, + "step": 185800 + }, + { + "epoch": 14.58, + "learning_rate": 0.0005, + "loss": 1.7856, + "step": 185900 + }, + { + "epoch": 14.59, + "learning_rate": 0.0005, + "loss": 1.7908, + "step": 186000 + }, + { + "epoch": 14.59, + "learning_rate": 0.0005, + "loss": 1.7839, + "step": 186100 + }, + { + "epoch": 14.6, + "learning_rate": 0.0005, + "loss": 1.8084, + "step": 186200 + }, + { + "epoch": 14.61, + "learning_rate": 0.0005, + "loss": 1.7967, + "step": 186300 + }, + { + "epoch": 14.62, + "learning_rate": 0.0005, + "loss": 1.7827, + "step": 186400 + }, + { + "epoch": 14.63, + "learning_rate": 0.0005, + "loss": 1.8011, + "step": 186500 + }, + { + "epoch": 14.63, + "learning_rate": 0.0005, + "loss": 1.8161, + "step": 186600 + }, + { + "epoch": 14.64, + "learning_rate": 0.0005, + "loss": 1.8012, + "step": 186700 + }, + { + "epoch": 14.65, + "learning_rate": 0.0005, + "loss": 1.8069, + "step": 186800 + }, + { + "epoch": 14.66, + "learning_rate": 0.0005, + "loss": 1.8105, + "step": 186900 + }, + { + "epoch": 14.66, + "learning_rate": 0.0005, + "loss": 1.7905, + "step": 187000 + }, + { + "epoch": 14.67, + "learning_rate": 0.0005, + "loss": 1.803, + "step": 187100 + }, + { + "epoch": 14.68, + "learning_rate": 0.0005, + "loss": 1.7951, + "step": 187200 + }, + { + "epoch": 14.69, + "learning_rate": 0.0005, + "loss": 1.8003, + "step": 187300 + }, + { + "epoch": 14.7, + "learning_rate": 0.0005, + "loss": 1.7834, + "step": 187400 + }, + { + "epoch": 14.7, + "learning_rate": 0.0005, + "loss": 1.827, + "step": 187500 + }, + { + "epoch": 14.71, + "learning_rate": 0.0005, + "loss": 1.7933, + "step": 187600 + }, + { + "epoch": 14.72, + "learning_rate": 0.0005, + "loss": 1.8315, + "step": 187700 + }, + { + "epoch": 14.73, + "learning_rate": 0.0005, + "loss": 1.7726, + "step": 187800 + }, + { + "epoch": 14.73, + "learning_rate": 0.0005, + "loss": 1.7941, + "step": 187900 + }, + { + "epoch": 14.74, + "learning_rate": 0.0005, + "loss": 1.7815, + "step": 188000 + }, + { + "epoch": 14.75, + "learning_rate": 0.0005, + "loss": 1.7911, + "step": 188100 + }, + { + "epoch": 14.76, + "learning_rate": 0.0005, + "loss": 1.8138, + "step": 188200 + }, + { + "epoch": 14.77, + "learning_rate": 0.0005, + "loss": 1.8124, + "step": 188300 + }, + { + "epoch": 14.77, + "learning_rate": 0.0005, + "loss": 1.7925, + "step": 188400 + }, + { + "epoch": 14.78, + "learning_rate": 0.0005, + "loss": 1.7887, + "step": 188500 + }, + { + "epoch": 14.79, + "learning_rate": 0.0005, + "loss": 1.8255, + "step": 188600 + }, + { + "epoch": 14.8, + "learning_rate": 0.0005, + "loss": 1.7837, + "step": 188700 + }, + { + "epoch": 14.81, + "learning_rate": 0.0005, + "loss": 1.8091, + "step": 188800 + }, + { + "epoch": 14.81, + "learning_rate": 0.0005, + "loss": 1.8004, + "step": 188900 + }, + { + "epoch": 14.82, + "learning_rate": 0.0005, + "loss": 1.8177, + "step": 189000 + }, + { + "epoch": 14.83, + "learning_rate": 0.0005, + "loss": 1.8072, + "step": 189100 + }, + { + "epoch": 14.84, + "learning_rate": 0.0005, + "loss": 1.8119, + "step": 189200 + }, + { + "epoch": 14.84, + "learning_rate": 0.0005, + "loss": 1.7918, + "step": 189300 + }, + { + "epoch": 14.85, + "learning_rate": 0.0005, + "loss": 1.797, + "step": 189400 + }, + { + "epoch": 14.86, + "learning_rate": 0.0005, + "loss": 1.8045, + "step": 189500 + }, + { + "epoch": 14.87, + "learning_rate": 0.0005, + "loss": 1.815, + "step": 189600 + }, + { + "epoch": 14.88, + "learning_rate": 0.0005, + "loss": 1.8034, + "step": 189700 + }, + { + "epoch": 14.88, + "learning_rate": 0.0005, + "loss": 1.7908, + "step": 189800 + }, + { + "epoch": 14.89, + "learning_rate": 0.0005, + "loss": 1.8085, + "step": 189900 + }, + { + "epoch": 14.9, + "learning_rate": 0.0005, + "loss": 1.8096, + "step": 190000 + }, + { + "epoch": 14.9, + "eval_gen_len": 18.80727465348283, + "eval_loss": 2.011542558670044, + "eval_rouge1": 35.1261, + "eval_rouge2": 13.9669, + "eval_rougeL": 28.8032, + "eval_rougeLsum": 28.7986, + "eval_runtime": 359.3121, + "eval_samples_per_second": 31.524, + "eval_steps_per_second": 1.97, + "step": 190000 + }, + { + "epoch": 14.91, + "learning_rate": 0.0005, + "loss": 1.7785, + "step": 190100 + }, + { + "epoch": 14.92, + "learning_rate": 0.0005, + "loss": 1.8173, + "step": 190200 + }, + { + "epoch": 14.92, + "learning_rate": 0.0005, + "loss": 1.8061, + "step": 190300 + }, + { + "epoch": 14.93, + "learning_rate": 0.0005, + "loss": 1.8585, + "step": 190400 + }, + { + "epoch": 14.94, + "learning_rate": 0.0005, + "loss": 1.8302, + "step": 190500 + }, + { + "epoch": 14.95, + "learning_rate": 0.0005, + "loss": 1.8016, + "step": 190600 + }, + { + "epoch": 14.95, + "learning_rate": 0.0005, + "loss": 1.8222, + "step": 190700 + }, + { + "epoch": 14.96, + "learning_rate": 0.0005, + "loss": 1.8069, + "step": 190800 + }, + { + "epoch": 14.97, + "learning_rate": 0.0005, + "loss": 1.8366, + "step": 190900 + }, + { + "epoch": 14.98, + "learning_rate": 0.0005, + "loss": 1.8089, + "step": 191000 + }, + { + "epoch": 14.99, + "learning_rate": 0.0005, + "loss": 1.8057, + "step": 191100 + }, + { + "epoch": 14.99, + "learning_rate": 0.0005, + "loss": 1.8068, + "step": 191200 + }, + { + "epoch": 15.0, + "learning_rate": 0.0005, + "loss": 1.7916, + "step": 191300 + }, + { + "epoch": 15.01, + "learning_rate": 0.0005, + "loss": 1.7352, + "step": 191400 + }, + { + "epoch": 15.02, + "learning_rate": 0.0005, + "loss": 1.7374, + "step": 191500 + }, + { + "epoch": 15.03, + "learning_rate": 0.0005, + "loss": 1.7113, + "step": 191600 + }, + { + "epoch": 15.03, + "learning_rate": 0.0005, + "loss": 1.7363, + "step": 191700 + }, + { + "epoch": 15.04, + "learning_rate": 0.0005, + "loss": 1.759, + "step": 191800 + }, + { + "epoch": 15.05, + "learning_rate": 0.0005, + "loss": 1.7333, + "step": 191900 + }, + { + "epoch": 15.06, + "learning_rate": 0.0005, + "loss": 1.7388, + "step": 192000 + }, + { + "epoch": 15.06, + "learning_rate": 0.0005, + "loss": 1.7501, + "step": 192100 + }, + { + "epoch": 15.07, + "learning_rate": 0.0005, + "loss": 1.7265, + "step": 192200 + }, + { + "epoch": 15.08, + "learning_rate": 0.0005, + "loss": 1.7163, + "step": 192300 + }, + { + "epoch": 15.09, + "learning_rate": 0.0005, + "loss": 1.7372, + "step": 192400 + }, + { + "epoch": 15.1, + "learning_rate": 0.0005, + "loss": 1.7249, + "step": 192500 + }, + { + "epoch": 15.1, + "learning_rate": 0.0005, + "loss": 1.7426, + "step": 192600 + }, + { + "epoch": 15.11, + "learning_rate": 0.0005, + "loss": 1.7574, + "step": 192700 + }, + { + "epoch": 15.12, + "learning_rate": 0.0005, + "loss": 1.7542, + "step": 192800 + }, + { + "epoch": 15.13, + "learning_rate": 0.0005, + "loss": 1.7373, + "step": 192900 + }, + { + "epoch": 15.13, + "learning_rate": 0.0005, + "loss": 1.7739, + "step": 193000 + }, + { + "epoch": 15.14, + "learning_rate": 0.0005, + "loss": 1.7292, + "step": 193100 + }, + { + "epoch": 15.15, + "learning_rate": 0.0005, + "loss": 1.7559, + "step": 193200 + }, + { + "epoch": 15.16, + "learning_rate": 0.0005, + "loss": 1.7664, + "step": 193300 + }, + { + "epoch": 15.17, + "learning_rate": 0.0005, + "loss": 1.775, + "step": 193400 + }, + { + "epoch": 15.17, + "learning_rate": 0.0005, + "loss": 1.7353, + "step": 193500 + }, + { + "epoch": 15.18, + "learning_rate": 0.0005, + "loss": 1.7059, + "step": 193600 + }, + { + "epoch": 15.19, + "learning_rate": 0.0005, + "loss": 1.7496, + "step": 193700 + }, + { + "epoch": 15.2, + "learning_rate": 0.0005, + "loss": 1.7722, + "step": 193800 + }, + { + "epoch": 15.21, + "learning_rate": 0.0005, + "loss": 1.7518, + "step": 193900 + }, + { + "epoch": 15.21, + "learning_rate": 0.0005, + "loss": 1.7517, + "step": 194000 + }, + { + "epoch": 15.22, + "learning_rate": 0.0005, + "loss": 1.7901, + "step": 194100 + }, + { + "epoch": 15.23, + "learning_rate": 0.0005, + "loss": 1.7402, + "step": 194200 + }, + { + "epoch": 15.24, + "learning_rate": 0.0005, + "loss": 1.7294, + "step": 194300 + }, + { + "epoch": 15.24, + "learning_rate": 0.0005, + "loss": 1.7142, + "step": 194400 + }, + { + "epoch": 15.25, + "learning_rate": 0.0005, + "loss": 1.7213, + "step": 194500 + }, + { + "epoch": 15.26, + "learning_rate": 0.0005, + "loss": 1.7418, + "step": 194600 + }, + { + "epoch": 15.27, + "learning_rate": 0.0005, + "loss": 1.7736, + "step": 194700 + }, + { + "epoch": 15.28, + "learning_rate": 0.0005, + "loss": 1.7709, + "step": 194800 + }, + { + "epoch": 15.28, + "learning_rate": 0.0005, + "loss": 1.7365, + "step": 194900 + }, + { + "epoch": 15.29, + "learning_rate": 0.0005, + "loss": 1.7571, + "step": 195000 + }, + { + "epoch": 15.3, + "learning_rate": 0.0005, + "loss": 1.7207, + "step": 195100 + }, + { + "epoch": 15.31, + "learning_rate": 0.0005, + "loss": 1.7616, + "step": 195200 + }, + { + "epoch": 15.32, + "learning_rate": 0.0005, + "loss": 1.752, + "step": 195300 + }, + { + "epoch": 15.32, + "learning_rate": 0.0005, + "loss": 1.737, + "step": 195400 + }, + { + "epoch": 15.33, + "learning_rate": 0.0005, + "loss": 1.7397, + "step": 195500 + }, + { + "epoch": 15.34, + "learning_rate": 0.0005, + "loss": 1.7586, + "step": 195600 + }, + { + "epoch": 15.35, + "learning_rate": 0.0005, + "loss": 1.7357, + "step": 195700 + }, + { + "epoch": 15.35, + "learning_rate": 0.0005, + "loss": 1.742, + "step": 195800 + }, + { + "epoch": 15.36, + "learning_rate": 0.0005, + "loss": 1.7802, + "step": 195900 + }, + { + "epoch": 15.37, + "learning_rate": 0.0005, + "loss": 1.7528, + "step": 196000 + }, + { + "epoch": 15.38, + "learning_rate": 0.0005, + "loss": 1.7203, + "step": 196100 + }, + { + "epoch": 15.39, + "learning_rate": 0.0005, + "loss": 1.7815, + "step": 196200 + }, + { + "epoch": 15.39, + "learning_rate": 0.0005, + "loss": 1.7547, + "step": 196300 + }, + { + "epoch": 15.4, + "learning_rate": 0.0005, + "loss": 1.761, + "step": 196400 + }, + { + "epoch": 15.41, + "learning_rate": 0.0005, + "loss": 1.7246, + "step": 196500 + }, + { + "epoch": 15.42, + "learning_rate": 0.0005, + "loss": 1.7984, + "step": 196600 + }, + { + "epoch": 15.43, + "learning_rate": 0.0005, + "loss": 1.7696, + "step": 196700 + }, + { + "epoch": 15.43, + "learning_rate": 0.0005, + "loss": 1.7424, + "step": 196800 + }, + { + "epoch": 15.44, + "learning_rate": 0.0005, + "loss": 1.7836, + "step": 196900 + }, + { + "epoch": 15.45, + "learning_rate": 0.0005, + "loss": 1.763, + "step": 197000 + }, + { + "epoch": 15.46, + "learning_rate": 0.0005, + "loss": 1.7935, + "step": 197100 + }, + { + "epoch": 15.46, + "learning_rate": 0.0005, + "loss": 1.757, + "step": 197200 + }, + { + "epoch": 15.47, + "learning_rate": 0.0005, + "loss": 1.7406, + "step": 197300 + }, + { + "epoch": 15.48, + "learning_rate": 0.0005, + "loss": 1.7726, + "step": 197400 + }, + { + "epoch": 15.49, + "learning_rate": 0.0005, + "loss": 1.7588, + "step": 197500 + }, + { + "epoch": 15.5, + "learning_rate": 0.0005, + "loss": 1.775, + "step": 197600 + }, + { + "epoch": 15.5, + "learning_rate": 0.0005, + "loss": 1.7454, + "step": 197700 + }, + { + "epoch": 15.51, + "learning_rate": 0.0005, + "loss": 1.7705, + "step": 197800 + }, + { + "epoch": 15.52, + "learning_rate": 0.0005, + "loss": 1.8015, + "step": 197900 + }, + { + "epoch": 15.53, + "learning_rate": 0.0005, + "loss": 1.7881, + "step": 198000 + }, + { + "epoch": 15.53, + "learning_rate": 0.0005, + "loss": 1.7773, + "step": 198100 + }, + { + "epoch": 15.54, + "learning_rate": 0.0005, + "loss": 1.7813, + "step": 198200 + }, + { + "epoch": 15.55, + "learning_rate": 0.0005, + "loss": 1.7729, + "step": 198300 + }, + { + "epoch": 15.56, + "learning_rate": 0.0005, + "loss": 1.77, + "step": 198400 + }, + { + "epoch": 15.57, + "learning_rate": 0.0005, + "loss": 1.7687, + "step": 198500 + }, + { + "epoch": 15.57, + "learning_rate": 0.0005, + "loss": 1.786, + "step": 198600 + }, + { + "epoch": 15.58, + "learning_rate": 0.0005, + "loss": 1.765, + "step": 198700 + }, + { + "epoch": 15.59, + "learning_rate": 0.0005, + "loss": 1.7615, + "step": 198800 + }, + { + "epoch": 15.6, + "learning_rate": 0.0005, + "loss": 1.7887, + "step": 198900 + }, + { + "epoch": 15.61, + "learning_rate": 0.0005, + "loss": 1.7714, + "step": 199000 + }, + { + "epoch": 15.61, + "learning_rate": 0.0005, + "loss": 1.8128, + "step": 199100 + }, + { + "epoch": 15.62, + "learning_rate": 0.0005, + "loss": 1.7684, + "step": 199200 + }, + { + "epoch": 15.63, + "learning_rate": 0.0005, + "loss": 1.769, + "step": 199300 + }, + { + "epoch": 15.64, + "learning_rate": 0.0005, + "loss": 1.7688, + "step": 199400 + }, + { + "epoch": 15.64, + "learning_rate": 0.0005, + "loss": 1.7897, + "step": 199500 + }, + { + "epoch": 15.65, + "learning_rate": 0.0005, + "loss": 1.7827, + "step": 199600 + }, + { + "epoch": 15.66, + "learning_rate": 0.0005, + "loss": 1.775, + "step": 199700 + }, + { + "epoch": 15.67, + "learning_rate": 0.0005, + "loss": 1.7955, + "step": 199800 + }, + { + "epoch": 15.68, + "learning_rate": 0.0005, + "loss": 1.7774, + "step": 199900 + }, + { + "epoch": 15.68, + "learning_rate": 0.0005, + "loss": 1.8084, + "step": 200000 + }, + { + "epoch": 15.68, + "eval_gen_len": 18.788293458108942, + "eval_loss": 2.014420747756958, + "eval_rouge1": 35.1009, + "eval_rouge2": 13.9441, + "eval_rougeL": 28.7814, + "eval_rougeLsum": 28.7778, + "eval_runtime": 359.2501, + "eval_samples_per_second": 31.53, + "eval_steps_per_second": 1.971, + "step": 200000 + }, + { + "epoch": 15.69, + "learning_rate": 0.0005, + "loss": 1.7672, + "step": 200100 + }, + { + "epoch": 15.7, + "learning_rate": 0.0005, + "loss": 1.7826, + "step": 200200 + }, + { + "epoch": 15.71, + "learning_rate": 0.0005, + "loss": 1.7923, + "step": 200300 + }, + { + "epoch": 15.72, + "learning_rate": 0.0005, + "loss": 1.7803, + "step": 200400 + }, + { + "epoch": 15.72, + "learning_rate": 0.0005, + "loss": 1.7878, + "step": 200500 + }, + { + "epoch": 15.73, + "learning_rate": 0.0005, + "loss": 1.7987, + "step": 200600 + }, + { + "epoch": 15.74, + "learning_rate": 0.0005, + "loss": 1.775, + "step": 200700 + }, + { + "epoch": 15.75, + "learning_rate": 0.0005, + "loss": 1.7686, + "step": 200800 + }, + { + "epoch": 15.75, + "learning_rate": 0.0005, + "loss": 1.7859, + "step": 200900 + }, + { + "epoch": 15.76, + "learning_rate": 0.0005, + "loss": 1.7605, + "step": 201000 + }, + { + "epoch": 15.77, + "learning_rate": 0.0005, + "loss": 1.8035, + "step": 201100 + }, + { + "epoch": 15.78, + "learning_rate": 0.0005, + "loss": 1.7632, + "step": 201200 + }, + { + "epoch": 15.79, + "learning_rate": 0.0005, + "loss": 1.7976, + "step": 201300 + }, + { + "epoch": 15.79, + "learning_rate": 0.0005, + "loss": 1.7718, + "step": 201400 + }, + { + "epoch": 15.8, + "learning_rate": 0.0005, + "loss": 1.7786, + "step": 201500 + }, + { + "epoch": 15.81, + "learning_rate": 0.0005, + "loss": 1.7525, + "step": 201600 + }, + { + "epoch": 15.82, + "learning_rate": 0.0005, + "loss": 1.7773, + "step": 201700 + }, + { + "epoch": 15.82, + "learning_rate": 0.0005, + "loss": 1.7534, + "step": 201800 + }, + { + "epoch": 15.83, + "learning_rate": 0.0005, + "loss": 1.7707, + "step": 201900 + }, + { + "epoch": 15.84, + "learning_rate": 0.0005, + "loss": 1.7756, + "step": 202000 + }, + { + "epoch": 15.85, + "learning_rate": 0.0005, + "loss": 1.797, + "step": 202100 + }, + { + "epoch": 15.86, + "learning_rate": 0.0005, + "loss": 1.7775, + "step": 202200 + }, + { + "epoch": 15.86, + "learning_rate": 0.0005, + "loss": 1.8021, + "step": 202300 + }, + { + "epoch": 15.87, + "learning_rate": 0.0005, + "loss": 1.8092, + "step": 202400 + }, + { + "epoch": 15.88, + "learning_rate": 0.0005, + "loss": 1.802, + "step": 202500 + }, + { + "epoch": 15.89, + "learning_rate": 0.0005, + "loss": 1.7979, + "step": 202600 + }, + { + "epoch": 15.9, + "learning_rate": 0.0005, + "loss": 1.8046, + "step": 202700 + }, + { + "epoch": 15.9, + "learning_rate": 0.0005, + "loss": 1.7676, + "step": 202800 + }, + { + "epoch": 15.91, + "learning_rate": 0.0005, + "loss": 1.7956, + "step": 202900 + }, + { + "epoch": 15.92, + "learning_rate": 0.0005, + "loss": 1.7877, + "step": 203000 + }, + { + "epoch": 15.93, + "learning_rate": 0.0005, + "loss": 1.7923, + "step": 203100 + }, + { + "epoch": 15.93, + "learning_rate": 0.0005, + "loss": 1.7798, + "step": 203200 + }, + { + "epoch": 15.94, + "learning_rate": 0.0005, + "loss": 1.805, + "step": 203300 + }, + { + "epoch": 15.95, + "learning_rate": 0.0005, + "loss": 1.7862, + "step": 203400 + }, + { + "epoch": 15.96, + "learning_rate": 0.0005, + "loss": 1.8017, + "step": 203500 + }, + { + "epoch": 15.97, + "learning_rate": 0.0005, + "loss": 1.7967, + "step": 203600 + }, + { + "epoch": 15.97, + "learning_rate": 0.0005, + "loss": 1.7794, + "step": 203700 + }, + { + "epoch": 15.98, + "learning_rate": 0.0005, + "loss": 1.8175, + "step": 203800 + }, + { + "epoch": 15.99, + "learning_rate": 0.0005, + "loss": 1.7917, + "step": 203900 + }, + { + "epoch": 16.0, + "learning_rate": 0.0005, + "loss": 1.7841, + "step": 204000 + }, + { + "epoch": 16.01, + "learning_rate": 0.0005, + "loss": 1.7394, + "step": 204100 + }, + { + "epoch": 16.01, + "learning_rate": 0.0005, + "loss": 1.6738, + "step": 204200 + }, + { + "epoch": 16.02, + "learning_rate": 0.0005, + "loss": 1.6943, + "step": 204300 + }, + { + "epoch": 16.03, + "learning_rate": 0.0005, + "loss": 1.7132, + "step": 204400 + }, + { + "epoch": 16.04, + "learning_rate": 0.0005, + "loss": 1.7137, + "step": 204500 + }, + { + "epoch": 16.04, + "learning_rate": 0.0005, + "loss": 1.7141, + "step": 204600 + }, + { + "epoch": 16.05, + "learning_rate": 0.0005, + "loss": 1.6959, + "step": 204700 + }, + { + "epoch": 16.06, + "learning_rate": 0.0005, + "loss": 1.7451, + "step": 204800 + }, + { + "epoch": 16.07, + "learning_rate": 0.0005, + "loss": 1.713, + "step": 204900 + }, + { + "epoch": 16.08, + "learning_rate": 0.0005, + "loss": 1.7364, + "step": 205000 + }, + { + "epoch": 16.08, + "learning_rate": 0.0005, + "loss": 1.7192, + "step": 205100 + }, + { + "epoch": 16.09, + "learning_rate": 0.0005, + "loss": 1.7199, + "step": 205200 + }, + { + "epoch": 16.1, + "learning_rate": 0.0005, + "loss": 1.734, + "step": 205300 + }, + { + "epoch": 16.11, + "learning_rate": 0.0005, + "loss": 1.6913, + "step": 205400 + }, + { + "epoch": 16.12, + "learning_rate": 0.0005, + "loss": 1.7102, + "step": 205500 + }, + { + "epoch": 16.12, + "learning_rate": 0.0005, + "loss": 1.7346, + "step": 205600 + }, + { + "epoch": 16.13, + "learning_rate": 0.0005, + "loss": 1.7258, + "step": 205700 + }, + { + "epoch": 16.14, + "learning_rate": 0.0005, + "loss": 1.6992, + "step": 205800 + }, + { + "epoch": 16.15, + "learning_rate": 0.0005, + "loss": 1.7197, + "step": 205900 + }, + { + "epoch": 16.15, + "learning_rate": 0.0005, + "loss": 1.7702, + "step": 206000 + }, + { + "epoch": 16.16, + "learning_rate": 0.0005, + "loss": 1.7227, + "step": 206100 + }, + { + "epoch": 16.17, + "learning_rate": 0.0005, + "loss": 1.7175, + "step": 206200 + }, + { + "epoch": 16.18, + "learning_rate": 0.0005, + "loss": 1.7367, + "step": 206300 + }, + { + "epoch": 16.19, + "learning_rate": 0.0005, + "loss": 1.7115, + "step": 206400 + }, + { + "epoch": 16.19, + "learning_rate": 0.0005, + "loss": 1.751, + "step": 206500 + }, + { + "epoch": 16.2, + "learning_rate": 0.0005, + "loss": 1.7377, + "step": 206600 + }, + { + "epoch": 16.21, + "learning_rate": 0.0005, + "loss": 1.7243, + "step": 206700 + }, + { + "epoch": 16.22, + "learning_rate": 0.0005, + "loss": 1.7267, + "step": 206800 + }, + { + "epoch": 16.22, + "learning_rate": 0.0005, + "loss": 1.7448, + "step": 206900 + }, + { + "epoch": 16.23, + "learning_rate": 0.0005, + "loss": 1.7357, + "step": 207000 + }, + { + "epoch": 16.24, + "learning_rate": 0.0005, + "loss": 1.7057, + "step": 207100 + }, + { + "epoch": 16.25, + "learning_rate": 0.0005, + "loss": 1.7444, + "step": 207200 + }, + { + "epoch": 16.26, + "learning_rate": 0.0005, + "loss": 1.7404, + "step": 207300 + }, + { + "epoch": 16.26, + "learning_rate": 0.0005, + "loss": 1.7089, + "step": 207400 + }, + { + "epoch": 16.27, + "learning_rate": 0.0005, + "loss": 1.7147, + "step": 207500 + }, + { + "epoch": 16.28, + "learning_rate": 0.0005, + "loss": 1.745, + "step": 207600 + }, + { + "epoch": 16.29, + "learning_rate": 0.0005, + "loss": 1.7442, + "step": 207700 + }, + { + "epoch": 16.3, + "learning_rate": 0.0005, + "loss": 1.7184, + "step": 207800 + }, + { + "epoch": 16.3, + "learning_rate": 0.0005, + "loss": 1.724, + "step": 207900 + }, + { + "epoch": 16.31, + "learning_rate": 0.0005, + "loss": 1.7249, + "step": 208000 + }, + { + "epoch": 16.32, + "learning_rate": 0.0005, + "loss": 1.7072, + "step": 208100 + }, + { + "epoch": 16.33, + "learning_rate": 0.0005, + "loss": 1.7377, + "step": 208200 + }, + { + "epoch": 16.33, + "learning_rate": 0.0005, + "loss": 1.7466, + "step": 208300 + }, + { + "epoch": 16.34, + "learning_rate": 0.0005, + "loss": 1.7426, + "step": 208400 + }, + { + "epoch": 16.35, + "learning_rate": 0.0005, + "loss": 1.7699, + "step": 208500 + }, + { + "epoch": 16.36, + "learning_rate": 0.0005, + "loss": 1.7459, + "step": 208600 + }, + { + "epoch": 16.37, + "learning_rate": 0.0005, + "loss": 1.7269, + "step": 208700 + }, + { + "epoch": 16.37, + "learning_rate": 0.0005, + "loss": 1.7108, + "step": 208800 + }, + { + "epoch": 16.38, + "learning_rate": 0.0005, + "loss": 1.7786, + "step": 208900 + }, + { + "epoch": 16.39, + "learning_rate": 0.0005, + "loss": 1.7564, + "step": 209000 + }, + { + "epoch": 16.4, + "learning_rate": 0.0005, + "loss": 1.7284, + "step": 209100 + }, + { + "epoch": 16.41, + "learning_rate": 0.0005, + "loss": 1.728, + "step": 209200 + }, + { + "epoch": 16.41, + "learning_rate": 0.0005, + "loss": 1.7521, + "step": 209300 + }, + { + "epoch": 16.42, + "learning_rate": 0.0005, + "loss": 1.7642, + "step": 209400 + }, + { + "epoch": 16.43, + "learning_rate": 0.0005, + "loss": 1.7576, + "step": 209500 + }, + { + "epoch": 16.44, + "learning_rate": 0.0005, + "loss": 1.7292, + "step": 209600 + }, + { + "epoch": 16.44, + "learning_rate": 0.0005, + "loss": 1.7253, + "step": 209700 + }, + { + "epoch": 16.45, + "learning_rate": 0.0005, + "loss": 1.7542, + "step": 209800 + }, + { + "epoch": 16.46, + "learning_rate": 0.0005, + "loss": 1.7427, + "step": 209900 + }, + { + "epoch": 16.47, + "learning_rate": 0.0005, + "loss": 1.7272, + "step": 210000 + }, + { + "epoch": 16.47, + "eval_gen_len": 18.780789264589036, + "eval_loss": 2.018299102783203, + "eval_rouge1": 35.0675, + "eval_rouge2": 13.9534, + "eval_rougeL": 28.8007, + "eval_rougeLsum": 28.7924, + "eval_runtime": 359.03, + "eval_samples_per_second": 31.549, + "eval_steps_per_second": 1.972, + "step": 210000 + }, + { + "epoch": 16.48, + "learning_rate": 0.0005, + "loss": 1.7419, + "step": 210100 + }, + { + "epoch": 16.48, + "learning_rate": 0.0005, + "loss": 1.7509, + "step": 210200 + }, + { + "epoch": 16.49, + "learning_rate": 0.0005, + "loss": 1.7605, + "step": 210300 + }, + { + "epoch": 16.5, + "learning_rate": 0.0005, + "loss": 1.7442, + "step": 210400 + }, + { + "epoch": 16.51, + "learning_rate": 0.0005, + "loss": 1.7492, + "step": 210500 + }, + { + "epoch": 16.52, + "learning_rate": 0.0005, + "loss": 1.7504, + "step": 210600 + }, + { + "epoch": 16.52, + "learning_rate": 0.0005, + "loss": 1.7649, + "step": 210700 + }, + { + "epoch": 16.53, + "learning_rate": 0.0005, + "loss": 1.7518, + "step": 210800 + }, + { + "epoch": 16.54, + "learning_rate": 0.0005, + "loss": 1.7234, + "step": 210900 + }, + { + "epoch": 16.55, + "learning_rate": 0.0005, + "loss": 1.7515, + "step": 211000 + }, + { + "epoch": 16.55, + "learning_rate": 0.0005, + "loss": 1.7684, + "step": 211100 + }, + { + "epoch": 16.56, + "learning_rate": 0.0005, + "loss": 1.7516, + "step": 211200 + }, + { + "epoch": 16.57, + "learning_rate": 0.0005, + "loss": 1.7365, + "step": 211300 + }, + { + "epoch": 16.58, + "learning_rate": 0.0005, + "loss": 1.7448, + "step": 211400 + }, + { + "epoch": 16.59, + "learning_rate": 0.0005, + "loss": 1.7608, + "step": 211500 + }, + { + "epoch": 16.59, + "learning_rate": 0.0005, + "loss": 1.7207, + "step": 211600 + }, + { + "epoch": 16.6, + "learning_rate": 0.0005, + "loss": 1.7543, + "step": 211700 + }, + { + "epoch": 16.61, + "learning_rate": 0.0005, + "loss": 1.7662, + "step": 211800 + }, + { + "epoch": 16.62, + "learning_rate": 0.0005, + "loss": 1.7219, + "step": 211900 + }, + { + "epoch": 16.62, + "learning_rate": 0.0005, + "loss": 1.7558, + "step": 212000 + }, + { + "epoch": 16.63, + "learning_rate": 0.0005, + "loss": 1.7425, + "step": 212100 + }, + { + "epoch": 16.64, + "learning_rate": 0.0005, + "loss": 1.7497, + "step": 212200 + }, + { + "epoch": 16.65, + "learning_rate": 0.0005, + "loss": 1.7257, + "step": 212300 + }, + { + "epoch": 16.66, + "learning_rate": 0.0005, + "loss": 1.7546, + "step": 212400 + }, + { + "epoch": 16.66, + "learning_rate": 0.0005, + "loss": 1.7781, + "step": 212500 + }, + { + "epoch": 16.67, + "learning_rate": 0.0005, + "loss": 1.7755, + "step": 212600 + }, + { + "epoch": 16.68, + "learning_rate": 0.0005, + "loss": 1.7529, + "step": 212700 + }, + { + "epoch": 16.69, + "learning_rate": 0.0005, + "loss": 1.7525, + "step": 212800 + }, + { + "epoch": 16.7, + "learning_rate": 0.0005, + "loss": 1.7715, + "step": 212900 + }, + { + "epoch": 16.7, + "learning_rate": 0.0005, + "loss": 1.7378, + "step": 213000 + }, + { + "epoch": 16.71, + "learning_rate": 0.0005, + "loss": 1.7646, + "step": 213100 + }, + { + "epoch": 16.72, + "learning_rate": 0.0005, + "loss": 1.7489, + "step": 213200 + }, + { + "epoch": 16.73, + "learning_rate": 0.0005, + "loss": 1.7538, + "step": 213300 + }, + { + "epoch": 16.73, + "learning_rate": 0.0005, + "loss": 1.7877, + "step": 213400 + }, + { + "epoch": 16.74, + "learning_rate": 0.0005, + "loss": 1.7733, + "step": 213500 + }, + { + "epoch": 16.75, + "learning_rate": 0.0005, + "loss": 1.7807, + "step": 213600 + }, + { + "epoch": 16.76, + "learning_rate": 0.0005, + "loss": 1.7642, + "step": 213700 + }, + { + "epoch": 16.77, + "learning_rate": 0.0005, + "loss": 1.7609, + "step": 213800 + }, + { + "epoch": 16.77, + "learning_rate": 0.0005, + "loss": 1.7679, + "step": 213900 + }, + { + "epoch": 16.78, + "learning_rate": 0.0005, + "loss": 1.7719, + "step": 214000 + }, + { + "epoch": 16.79, + "learning_rate": 0.0005, + "loss": 1.7707, + "step": 214100 + }, + { + "epoch": 16.8, + "learning_rate": 0.0005, + "loss": 1.7702, + "step": 214200 + }, + { + "epoch": 16.81, + "learning_rate": 0.0005, + "loss": 1.7301, + "step": 214300 + }, + { + "epoch": 16.81, + "learning_rate": 0.0005, + "loss": 1.7522, + "step": 214400 + }, + { + "epoch": 16.82, + "learning_rate": 0.0005, + "loss": 1.7738, + "step": 214500 + }, + { + "epoch": 16.83, + "learning_rate": 0.0005, + "loss": 1.7551, + "step": 214600 + }, + { + "epoch": 16.84, + "learning_rate": 0.0005, + "loss": 1.742, + "step": 214700 + }, + { + "epoch": 16.84, + "learning_rate": 0.0005, + "loss": 1.7641, + "step": 214800 + }, + { + "epoch": 16.85, + "learning_rate": 0.0005, + "loss": 1.7828, + "step": 214900 + }, + { + "epoch": 16.86, + "learning_rate": 0.0005, + "loss": 1.7799, + "step": 215000 + }, + { + "epoch": 16.87, + "learning_rate": 0.0005, + "loss": 1.7476, + "step": 215100 + }, + { + "epoch": 16.88, + "learning_rate": 0.0005, + "loss": 1.7662, + "step": 215200 + }, + { + "epoch": 16.88, + "learning_rate": 0.0005, + "loss": 1.7764, + "step": 215300 + }, + { + "epoch": 16.89, + "learning_rate": 0.0005, + "loss": 1.79, + "step": 215400 + }, + { + "epoch": 16.9, + "learning_rate": 0.0005, + "loss": 1.7862, + "step": 215500 + }, + { + "epoch": 16.91, + "learning_rate": 0.0005, + "loss": 1.7792, + "step": 215600 + }, + { + "epoch": 16.91, + "learning_rate": 0.0005, + "loss": 1.7772, + "step": 215700 + }, + { + "epoch": 16.92, + "learning_rate": 0.0005, + "loss": 1.8195, + "step": 215800 + }, + { + "epoch": 16.93, + "learning_rate": 0.0005, + "loss": 1.7426, + "step": 215900 + }, + { + "epoch": 16.94, + "learning_rate": 0.0005, + "loss": 1.781, + "step": 216000 + }, + { + "epoch": 16.95, + "learning_rate": 0.0005, + "loss": 1.7399, + "step": 216100 + }, + { + "epoch": 16.95, + "learning_rate": 0.0005, + "loss": 1.774, + "step": 216200 + }, + { + "epoch": 16.96, + "learning_rate": 0.0005, + "loss": 1.7855, + "step": 216300 + }, + { + "epoch": 16.97, + "learning_rate": 0.0005, + "loss": 1.785, + "step": 216400 + }, + { + "epoch": 16.98, + "learning_rate": 0.0005, + "loss": 1.7603, + "step": 216500 + }, + { + "epoch": 16.99, + "learning_rate": 0.0005, + "loss": 1.7576, + "step": 216600 + }, + { + "epoch": 16.99, + "learning_rate": 0.0005, + "loss": 1.7952, + "step": 216700 + }, + { + "epoch": 17.0, + "learning_rate": 0.0005, + "loss": 1.7494, + "step": 216800 + }, + { + "epoch": 17.01, + "learning_rate": 0.0005, + "loss": 1.7225, + "step": 216900 + }, + { + "epoch": 17.02, + "learning_rate": 0.0005, + "loss": 1.6896, + "step": 217000 + }, + { + "epoch": 17.02, + "learning_rate": 0.0005, + "loss": 1.7069, + "step": 217100 + }, + { + "epoch": 17.03, + "learning_rate": 0.0005, + "loss": 1.6877, + "step": 217200 + }, + { + "epoch": 17.04, + "learning_rate": 0.0005, + "loss": 1.6936, + "step": 217300 + }, + { + "epoch": 17.05, + "learning_rate": 0.0005, + "loss": 1.6868, + "step": 217400 + }, + { + "epoch": 17.06, + "learning_rate": 0.0005, + "loss": 1.6801, + "step": 217500 + }, + { + "epoch": 17.06, + "learning_rate": 0.0005, + "loss": 1.7087, + "step": 217600 + }, + { + "epoch": 17.07, + "learning_rate": 0.0005, + "loss": 1.677, + "step": 217700 + }, + { + "epoch": 17.08, + "learning_rate": 0.0005, + "loss": 1.7113, + "step": 217800 + }, + { + "epoch": 17.09, + "learning_rate": 0.0005, + "loss": 1.7145, + "step": 217900 + }, + { + "epoch": 17.1, + "learning_rate": 0.0005, + "loss": 1.6894, + "step": 218000 + }, + { + "epoch": 17.1, + "learning_rate": 0.0005, + "loss": 1.7059, + "step": 218100 + }, + { + "epoch": 17.11, + "learning_rate": 0.0005, + "loss": 1.6714, + "step": 218200 + }, + { + "epoch": 17.12, + "learning_rate": 0.0005, + "loss": 1.7143, + "step": 218300 + }, + { + "epoch": 17.13, + "learning_rate": 0.0005, + "loss": 1.7091, + "step": 218400 + }, + { + "epoch": 17.13, + "learning_rate": 0.0005, + "loss": 1.6903, + "step": 218500 + }, + { + "epoch": 17.14, + "learning_rate": 0.0005, + "loss": 1.7064, + "step": 218600 + }, + { + "epoch": 17.15, + "learning_rate": 0.0005, + "loss": 1.6881, + "step": 218700 + }, + { + "epoch": 17.16, + "learning_rate": 0.0005, + "loss": 1.6891, + "step": 218800 + }, + { + "epoch": 17.17, + "learning_rate": 0.0005, + "loss": 1.7152, + "step": 218900 + }, + { + "epoch": 17.17, + "learning_rate": 0.0005, + "loss": 1.7019, + "step": 219000 + }, + { + "epoch": 17.18, + "learning_rate": 0.0005, + "loss": 1.7281, + "step": 219100 + }, + { + "epoch": 17.19, + "learning_rate": 0.0005, + "loss": 1.7069, + "step": 219200 + }, + { + "epoch": 17.2, + "learning_rate": 0.0005, + "loss": 1.7337, + "step": 219300 + }, + { + "epoch": 17.21, + "learning_rate": 0.0005, + "loss": 1.7059, + "step": 219400 + }, + { + "epoch": 17.21, + "learning_rate": 0.0005, + "loss": 1.7184, + "step": 219500 + }, + { + "epoch": 17.22, + "learning_rate": 0.0005, + "loss": 1.7176, + "step": 219600 + }, + { + "epoch": 17.23, + "learning_rate": 0.0005, + "loss": 1.6903, + "step": 219700 + }, + { + "epoch": 17.24, + "learning_rate": 0.0005, + "loss": 1.7106, + "step": 219800 + }, + { + "epoch": 17.24, + "learning_rate": 0.0005, + "loss": 1.7179, + "step": 219900 + }, + { + "epoch": 17.25, + "learning_rate": 0.0005, + "loss": 1.7223, + "step": 220000 + }, + { + "epoch": 17.25, + "eval_gen_len": 18.785556634589916, + "eval_loss": 2.0205140113830566, + "eval_rouge1": 35.2932, + "eval_rouge2": 14.0579, + "eval_rougeL": 28.9651, + "eval_rougeLsum": 28.9628, + "eval_runtime": 358.8009, + "eval_samples_per_second": 31.569, + "eval_steps_per_second": 1.973, + "step": 220000 + }, + { + "epoch": 17.26, + "learning_rate": 0.0005, + "loss": 1.7245, + "step": 220100 + }, + { + "epoch": 17.27, + "learning_rate": 0.0005, + "loss": 1.6959, + "step": 220200 + }, + { + "epoch": 17.28, + "learning_rate": 0.0005, + "loss": 1.7083, + "step": 220300 + }, + { + "epoch": 17.28, + "learning_rate": 0.0005, + "loss": 1.7132, + "step": 220400 + }, + { + "epoch": 17.29, + "learning_rate": 0.0005, + "loss": 1.7206, + "step": 220500 + }, + { + "epoch": 17.3, + "learning_rate": 0.0005, + "loss": 1.7128, + "step": 220600 + }, + { + "epoch": 17.31, + "learning_rate": 0.0005, + "loss": 1.711, + "step": 220700 + }, + { + "epoch": 17.31, + "learning_rate": 0.0005, + "loss": 1.7169, + "step": 220800 + }, + { + "epoch": 17.32, + "learning_rate": 0.0005, + "loss": 1.6995, + "step": 220900 + }, + { + "epoch": 17.33, + "learning_rate": 0.0005, + "loss": 1.7092, + "step": 221000 + }, + { + "epoch": 17.34, + "learning_rate": 0.0005, + "loss": 1.7284, + "step": 221100 + }, + { + "epoch": 17.35, + "learning_rate": 0.0005, + "loss": 1.7167, + "step": 221200 + }, + { + "epoch": 17.35, + "learning_rate": 0.0005, + "loss": 1.7499, + "step": 221300 + }, + { + "epoch": 17.36, + "learning_rate": 0.0005, + "loss": 1.693, + "step": 221400 + }, + { + "epoch": 17.37, + "learning_rate": 0.0005, + "loss": 1.7267, + "step": 221500 + }, + { + "epoch": 17.38, + "learning_rate": 0.0005, + "loss": 1.6923, + "step": 221600 + }, + { + "epoch": 17.39, + "learning_rate": 0.0005, + "loss": 1.7253, + "step": 221700 + }, + { + "epoch": 17.39, + "learning_rate": 0.0005, + "loss": 1.7074, + "step": 221800 + }, + { + "epoch": 17.4, + "learning_rate": 0.0005, + "loss": 1.6936, + "step": 221900 + }, + { + "epoch": 17.41, + "learning_rate": 0.0005, + "loss": 1.7314, + "step": 222000 + }, + { + "epoch": 17.42, + "learning_rate": 0.0005, + "loss": 1.7109, + "step": 222100 + }, + { + "epoch": 17.42, + "learning_rate": 0.0005, + "loss": 1.7574, + "step": 222200 + }, + { + "epoch": 17.43, + "learning_rate": 0.0005, + "loss": 1.6917, + "step": 222300 + }, + { + "epoch": 17.44, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 222400 + }, + { + "epoch": 17.45, + "learning_rate": 0.0005, + "loss": 1.7132, + "step": 222500 + }, + { + "epoch": 17.46, + "learning_rate": 0.0005, + "loss": 1.7103, + "step": 222600 + }, + { + "epoch": 17.46, + "learning_rate": 0.0005, + "loss": 1.717, + "step": 222700 + }, + { + "epoch": 17.47, + "learning_rate": 0.0005, + "loss": 1.7215, + "step": 222800 + }, + { + "epoch": 17.48, + "learning_rate": 0.0005, + "loss": 1.7374, + "step": 222900 + }, + { + "epoch": 17.49, + "learning_rate": 0.0005, + "loss": 1.6961, + "step": 223000 + }, + { + "epoch": 17.5, + "learning_rate": 0.0005, + "loss": 1.7261, + "step": 223100 + }, + { + "epoch": 17.5, + "learning_rate": 0.0005, + "loss": 1.7277, + "step": 223200 + }, + { + "epoch": 17.51, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 223300 + }, + { + "epoch": 17.52, + "learning_rate": 0.0005, + "loss": 1.7192, + "step": 223400 + }, + { + "epoch": 17.53, + "learning_rate": 0.0005, + "loss": 1.7496, + "step": 223500 + }, + { + "epoch": 17.53, + "learning_rate": 0.0005, + "loss": 1.7392, + "step": 223600 + }, + { + "epoch": 17.54, + "learning_rate": 0.0005, + "loss": 1.7387, + "step": 223700 + }, + { + "epoch": 17.55, + "learning_rate": 0.0005, + "loss": 1.6937, + "step": 223800 + }, + { + "epoch": 17.56, + "learning_rate": 0.0005, + "loss": 1.7106, + "step": 223900 + }, + { + "epoch": 17.57, + "learning_rate": 0.0005, + "loss": 1.7596, + "step": 224000 + }, + { + "epoch": 17.57, + "learning_rate": 0.0005, + "loss": 1.743, + "step": 224100 + }, + { + "epoch": 17.58, + "learning_rate": 0.0005, + "loss": 1.7321, + "step": 224200 + }, + { + "epoch": 17.59, + "learning_rate": 0.0005, + "loss": 1.7312, + "step": 224300 + }, + { + "epoch": 17.6, + "learning_rate": 0.0005, + "loss": 1.737, + "step": 224400 + }, + { + "epoch": 17.61, + "learning_rate": 0.0005, + "loss": 1.7505, + "step": 224500 + }, + { + "epoch": 17.61, + "learning_rate": 0.0005, + "loss": 1.7366, + "step": 224600 + }, + { + "epoch": 17.62, + "learning_rate": 0.0005, + "loss": 1.7694, + "step": 224700 + }, + { + "epoch": 17.63, + "learning_rate": 0.0005, + "loss": 1.7462, + "step": 224800 + }, + { + "epoch": 17.64, + "learning_rate": 0.0005, + "loss": 1.7142, + "step": 224900 + }, + { + "epoch": 17.64, + "learning_rate": 0.0005, + "loss": 1.7402, + "step": 225000 + }, + { + "epoch": 17.65, + "learning_rate": 0.0005, + "loss": 1.7443, + "step": 225100 + }, + { + "epoch": 17.66, + "learning_rate": 0.0005, + "loss": 1.7299, + "step": 225200 + }, + { + "epoch": 17.67, + "learning_rate": 0.0005, + "loss": 1.7341, + "step": 225300 + }, + { + "epoch": 17.68, + "learning_rate": 0.0005, + "loss": 1.7406, + "step": 225400 + }, + { + "epoch": 17.68, + "learning_rate": 0.0005, + "loss": 1.7466, + "step": 225500 + }, + { + "epoch": 17.69, + "learning_rate": 0.0005, + "loss": 1.7195, + "step": 225600 + }, + { + "epoch": 17.7, + "learning_rate": 0.0005, + "loss": 1.7175, + "step": 225700 + }, + { + "epoch": 17.71, + "learning_rate": 0.0005, + "loss": 1.7269, + "step": 225800 + }, + { + "epoch": 17.71, + "learning_rate": 0.0005, + "loss": 1.7371, + "step": 225900 + }, + { + "epoch": 17.72, + "learning_rate": 0.0005, + "loss": 1.7362, + "step": 226000 + }, + { + "epoch": 17.73, + "learning_rate": 0.0005, + "loss": 1.7418, + "step": 226100 + }, + { + "epoch": 17.74, + "learning_rate": 0.0005, + "loss": 1.7583, + "step": 226200 + }, + { + "epoch": 17.75, + "learning_rate": 0.0005, + "loss": 1.7261, + "step": 226300 + }, + { + "epoch": 17.75, + "learning_rate": 0.0005, + "loss": 1.7558, + "step": 226400 + }, + { + "epoch": 17.76, + "learning_rate": 0.0005, + "loss": 1.7613, + "step": 226500 + }, + { + "epoch": 17.77, + "learning_rate": 0.0005, + "loss": 1.7171, + "step": 226600 + }, + { + "epoch": 17.78, + "learning_rate": 0.0005, + "loss": 1.7267, + "step": 226700 + }, + { + "epoch": 17.79, + "learning_rate": 0.0005, + "loss": 1.7391, + "step": 226800 + }, + { + "epoch": 17.79, + "learning_rate": 0.0005, + "loss": 1.7509, + "step": 226900 + }, + { + "epoch": 17.8, + "learning_rate": 0.0005, + "loss": 1.7602, + "step": 227000 + }, + { + "epoch": 17.81, + "learning_rate": 0.0005, + "loss": 1.7495, + "step": 227100 + }, + { + "epoch": 17.82, + "learning_rate": 0.0005, + "loss": 1.7496, + "step": 227200 + }, + { + "epoch": 17.82, + "learning_rate": 0.0005, + "loss": 1.7514, + "step": 227300 + }, + { + "epoch": 17.83, + "learning_rate": 0.0005, + "loss": 1.7236, + "step": 227400 + }, + { + "epoch": 17.84, + "learning_rate": 0.0005, + "loss": 1.7744, + "step": 227500 + }, + { + "epoch": 17.85, + "learning_rate": 0.0005, + "loss": 1.7376, + "step": 227600 + }, + { + "epoch": 17.86, + "learning_rate": 0.0005, + "loss": 1.7838, + "step": 227700 + }, + { + "epoch": 17.86, + "learning_rate": 0.0005, + "loss": 1.7483, + "step": 227800 + }, + { + "epoch": 17.87, + "learning_rate": 0.0005, + "loss": 1.7373, + "step": 227900 + }, + { + "epoch": 17.88, + "learning_rate": 0.0005, + "loss": 1.763, + "step": 228000 + }, + { + "epoch": 17.89, + "learning_rate": 0.0005, + "loss": 1.7521, + "step": 228100 + }, + { + "epoch": 17.9, + "learning_rate": 0.0005, + "loss": 1.7713, + "step": 228200 + }, + { + "epoch": 17.9, + "learning_rate": 0.0005, + "loss": 1.7472, + "step": 228300 + }, + { + "epoch": 17.91, + "learning_rate": 0.0005, + "loss": 1.7377, + "step": 228400 + }, + { + "epoch": 17.92, + "learning_rate": 0.0005, + "loss": 1.7492, + "step": 228500 + }, + { + "epoch": 17.93, + "learning_rate": 0.0005, + "loss": 1.7903, + "step": 228600 + }, + { + "epoch": 17.93, + "learning_rate": 0.0005, + "loss": 1.7734, + "step": 228700 + }, + { + "epoch": 17.94, + "learning_rate": 0.0005, + "loss": 1.743, + "step": 228800 + }, + { + "epoch": 17.95, + "learning_rate": 0.0005, + "loss": 1.7376, + "step": 228900 + }, + { + "epoch": 17.96, + "learning_rate": 0.0005, + "loss": 1.7402, + "step": 229000 + }, + { + "epoch": 17.97, + "learning_rate": 0.0005, + "loss": 1.7615, + "step": 229100 + }, + { + "epoch": 17.97, + "learning_rate": 0.0005, + "loss": 1.7213, + "step": 229200 + }, + { + "epoch": 17.98, + "learning_rate": 0.0005, + "loss": 1.7485, + "step": 229300 + }, + { + "epoch": 17.99, + "learning_rate": 0.0005, + "loss": 1.7361, + "step": 229400 + }, + { + "epoch": 18.0, + "learning_rate": 0.0005, + "loss": 1.7568, + "step": 229500 + }, + { + "epoch": 18.01, + "learning_rate": 0.0005, + "loss": 1.6941, + "step": 229600 + }, + { + "epoch": 18.01, + "learning_rate": 0.0005, + "loss": 1.6589, + "step": 229700 + }, + { + "epoch": 18.02, + "learning_rate": 0.0005, + "loss": 1.6802, + "step": 229800 + }, + { + "epoch": 18.03, + "learning_rate": 0.0005, + "loss": 1.6694, + "step": 229900 + }, + { + "epoch": 18.04, + "learning_rate": 0.0005, + "loss": 1.6742, + "step": 230000 + }, + { + "epoch": 18.04, + "eval_gen_len": 18.784497219034165, + "eval_loss": 2.0137345790863037, + "eval_rouge1": 35.3791, + "eval_rouge2": 14.1491, + "eval_rougeL": 29.0175, + "eval_rougeLsum": 29.0086, + "eval_runtime": 361.5305, + "eval_samples_per_second": 31.331, + "eval_steps_per_second": 1.958, + "step": 230000 + }, + { + "epoch": 18.04, + "learning_rate": 0.0005, + "loss": 1.6665, + "step": 230100 + }, + { + "epoch": 18.05, + "learning_rate": 0.0005, + "loss": 1.6606, + "step": 230200 + }, + { + "epoch": 18.06, + "learning_rate": 0.0005, + "loss": 1.677, + "step": 230300 + }, + { + "epoch": 18.07, + "learning_rate": 0.0005, + "loss": 1.6896, + "step": 230400 + }, + { + "epoch": 18.08, + "learning_rate": 0.0005, + "loss": 1.6786, + "step": 230500 + }, + { + "epoch": 18.08, + "learning_rate": 0.0005, + "loss": 1.653, + "step": 230600 + }, + { + "epoch": 18.09, + "learning_rate": 0.0005, + "loss": 1.7007, + "step": 230700 + }, + { + "epoch": 18.1, + "learning_rate": 0.0005, + "loss": 1.6996, + "step": 230800 + }, + { + "epoch": 18.11, + "learning_rate": 0.0005, + "loss": 1.6833, + "step": 230900 + }, + { + "epoch": 18.11, + "learning_rate": 0.0005, + "loss": 1.6877, + "step": 231000 + }, + { + "epoch": 18.12, + "learning_rate": 0.0005, + "loss": 1.7055, + "step": 231100 + }, + { + "epoch": 18.13, + "learning_rate": 0.0005, + "loss": 1.6772, + "step": 231200 + }, + { + "epoch": 18.14, + "learning_rate": 0.0005, + "loss": 1.6763, + "step": 231300 + }, + { + "epoch": 18.15, + "learning_rate": 0.0005, + "loss": 1.69, + "step": 231400 + }, + { + "epoch": 18.15, + "learning_rate": 0.0005, + "loss": 1.6732, + "step": 231500 + }, + { + "epoch": 18.16, + "learning_rate": 0.0005, + "loss": 1.6955, + "step": 231600 + }, + { + "epoch": 18.17, + "learning_rate": 0.0005, + "loss": 1.7267, + "step": 231700 + }, + { + "epoch": 18.18, + "learning_rate": 0.0005, + "loss": 1.6848, + "step": 231800 + }, + { + "epoch": 18.19, + "learning_rate": 0.0005, + "loss": 1.6835, + "step": 231900 + }, + { + "epoch": 18.19, + "learning_rate": 0.0005, + "loss": 1.689, + "step": 232000 + }, + { + "epoch": 18.2, + "learning_rate": 0.0005, + "loss": 1.6716, + "step": 232100 + }, + { + "epoch": 18.21, + "learning_rate": 0.0005, + "loss": 1.6968, + "step": 232200 + }, + { + "epoch": 18.22, + "learning_rate": 0.0005, + "loss": 1.6927, + "step": 232300 + }, + { + "epoch": 18.22, + "learning_rate": 0.0005, + "loss": 1.678, + "step": 232400 + }, + { + "epoch": 18.23, + "learning_rate": 0.0005, + "loss": 1.6848, + "step": 232500 + }, + { + "epoch": 18.24, + "learning_rate": 0.0005, + "loss": 1.6858, + "step": 232600 + }, + { + "epoch": 18.25, + "learning_rate": 0.0005, + "loss": 1.6891, + "step": 232700 + }, + { + "epoch": 18.26, + "learning_rate": 0.0005, + "loss": 1.7047, + "step": 232800 + }, + { + "epoch": 18.26, + "learning_rate": 0.0005, + "loss": 1.6695, + "step": 232900 + }, + { + "epoch": 18.27, + "learning_rate": 0.0005, + "loss": 1.7147, + "step": 233000 + }, + { + "epoch": 18.28, + "learning_rate": 0.0005, + "loss": 1.681, + "step": 233100 + }, + { + "epoch": 18.29, + "learning_rate": 0.0005, + "loss": 1.703, + "step": 233200 + }, + { + "epoch": 18.3, + "learning_rate": 0.0005, + "loss": 1.7181, + "step": 233300 + }, + { + "epoch": 18.3, + "learning_rate": 0.0005, + "loss": 1.6858, + "step": 233400 + }, + { + "epoch": 18.31, + "learning_rate": 0.0005, + "loss": 1.695, + "step": 233500 + }, + { + "epoch": 18.32, + "learning_rate": 0.0005, + "loss": 1.6958, + "step": 233600 + }, + { + "epoch": 18.33, + "learning_rate": 0.0005, + "loss": 1.6873, + "step": 233700 + }, + { + "epoch": 18.33, + "learning_rate": 0.0005, + "loss": 1.7194, + "step": 233800 + }, + { + "epoch": 18.34, + "learning_rate": 0.0005, + "loss": 1.7067, + "step": 233900 + }, + { + "epoch": 18.35, + "learning_rate": 0.0005, + "loss": 1.7099, + "step": 234000 + }, + { + "epoch": 18.36, + "learning_rate": 0.0005, + "loss": 1.6824, + "step": 234100 + }, + { + "epoch": 18.37, + "learning_rate": 0.0005, + "loss": 1.6951, + "step": 234200 + }, + { + "epoch": 18.37, + "learning_rate": 0.0005, + "loss": 1.6935, + "step": 234300 + }, + { + "epoch": 18.38, + "learning_rate": 0.0005, + "loss": 1.7125, + "step": 234400 + }, + { + "epoch": 18.39, + "learning_rate": 0.0005, + "loss": 1.729, + "step": 234500 + }, + { + "epoch": 18.4, + "learning_rate": 0.0005, + "loss": 1.7548, + "step": 234600 + }, + { + "epoch": 18.4, + "learning_rate": 0.0005, + "loss": 1.6915, + "step": 234700 + }, + { + "epoch": 18.41, + "learning_rate": 0.0005, + "loss": 1.677, + "step": 234800 + }, + { + "epoch": 18.42, + "learning_rate": 0.0005, + "loss": 1.6872, + "step": 234900 + }, + { + "epoch": 18.43, + "learning_rate": 0.0005, + "loss": 1.6984, + "step": 235000 + }, + { + "epoch": 18.44, + "learning_rate": 0.0005, + "loss": 1.6906, + "step": 235100 + }, + { + "epoch": 18.44, + "learning_rate": 0.0005, + "loss": 1.7143, + "step": 235200 + }, + { + "epoch": 18.45, + "learning_rate": 0.0005, + "loss": 1.7065, + "step": 235300 + }, + { + "epoch": 18.46, + "learning_rate": 0.0005, + "loss": 1.7155, + "step": 235400 + }, + { + "epoch": 18.47, + "learning_rate": 0.0005, + "loss": 1.7136, + "step": 235500 + }, + { + "epoch": 18.48, + "learning_rate": 0.0005, + "loss": 1.703, + "step": 235600 + }, + { + "epoch": 18.48, + "learning_rate": 0.0005, + "loss": 1.6929, + "step": 235700 + }, + { + "epoch": 18.49, + "learning_rate": 0.0005, + "loss": 1.7074, + "step": 235800 + }, + { + "epoch": 18.5, + "learning_rate": 0.0005, + "loss": 1.6988, + "step": 235900 + }, + { + "epoch": 18.51, + "learning_rate": 0.0005, + "loss": 1.7208, + "step": 236000 + }, + { + "epoch": 18.51, + "learning_rate": 0.0005, + "loss": 1.7038, + "step": 236100 + }, + { + "epoch": 18.52, + "learning_rate": 0.0005, + "loss": 1.7127, + "step": 236200 + }, + { + "epoch": 18.53, + "learning_rate": 0.0005, + "loss": 1.7226, + "step": 236300 + }, + { + "epoch": 18.54, + "learning_rate": 0.0005, + "loss": 1.7001, + "step": 236400 + }, + { + "epoch": 18.55, + "learning_rate": 0.0005, + "loss": 1.6968, + "step": 236500 + }, + { + "epoch": 18.55, + "learning_rate": 0.0005, + "loss": 1.6991, + "step": 236600 + }, + { + "epoch": 18.56, + "learning_rate": 0.0005, + "loss": 1.7272, + "step": 236700 + }, + { + "epoch": 18.57, + "learning_rate": 0.0005, + "loss": 1.7066, + "step": 236800 + }, + { + "epoch": 18.58, + "learning_rate": 0.0005, + "loss": 1.7438, + "step": 236900 + }, + { + "epoch": 18.59, + "learning_rate": 0.0005, + "loss": 1.7206, + "step": 237000 + }, + { + "epoch": 18.59, + "learning_rate": 0.0005, + "loss": 1.751, + "step": 237100 + }, + { + "epoch": 18.6, + "learning_rate": 0.0005, + "loss": 1.7057, + "step": 237200 + }, + { + "epoch": 18.61, + "learning_rate": 0.0005, + "loss": 1.7001, + "step": 237300 + }, + { + "epoch": 18.62, + "learning_rate": 0.0005, + "loss": 1.7321, + "step": 237400 + }, + { + "epoch": 18.62, + "learning_rate": 0.0005, + "loss": 1.733, + "step": 237500 + }, + { + "epoch": 18.63, + "learning_rate": 0.0005, + "loss": 1.704, + "step": 237600 + }, + { + "epoch": 18.64, + "learning_rate": 0.0005, + "loss": 1.7253, + "step": 237700 + }, + { + "epoch": 18.65, + "learning_rate": 0.0005, + "loss": 1.7128, + "step": 237800 + }, + { + "epoch": 18.66, + "learning_rate": 0.0005, + "loss": 1.7277, + "step": 237900 + }, + { + "epoch": 18.66, + "learning_rate": 0.0005, + "loss": 1.7428, + "step": 238000 + }, + { + "epoch": 18.67, + "learning_rate": 0.0005, + "loss": 1.7269, + "step": 238100 + }, + { + "epoch": 18.68, + "learning_rate": 0.0005, + "loss": 1.7137, + "step": 238200 + }, + { + "epoch": 18.69, + "learning_rate": 0.0005, + "loss": 1.7148, + "step": 238300 + }, + { + "epoch": 18.7, + "learning_rate": 0.0005, + "loss": 1.7229, + "step": 238400 + }, + { + "epoch": 18.7, + "learning_rate": 0.0005, + "loss": 1.7405, + "step": 238500 + }, + { + "epoch": 18.71, + "learning_rate": 0.0005, + "loss": 1.7477, + "step": 238600 + }, + { + "epoch": 18.72, + "learning_rate": 0.0005, + "loss": 1.7204, + "step": 238700 + }, + { + "epoch": 18.73, + "learning_rate": 0.0005, + "loss": 1.7565, + "step": 238800 + }, + { + "epoch": 18.73, + "learning_rate": 0.0005, + "loss": 1.7104, + "step": 238900 + }, + { + "epoch": 18.74, + "learning_rate": 0.0005, + "loss": 1.748, + "step": 239000 + }, + { + "epoch": 18.75, + "learning_rate": 0.0005, + "loss": 1.7363, + "step": 239100 + }, + { + "epoch": 18.76, + "learning_rate": 0.0005, + "loss": 1.7393, + "step": 239200 + }, + { + "epoch": 18.77, + "learning_rate": 0.0005, + "loss": 1.7151, + "step": 239300 + }, + { + "epoch": 18.77, + "learning_rate": 0.0005, + "loss": 1.7496, + "step": 239400 + }, + { + "epoch": 18.78, + "learning_rate": 0.0005, + "loss": 1.7605, + "step": 239500 + }, + { + "epoch": 18.79, + "learning_rate": 0.0005, + "loss": 1.7371, + "step": 239600 + }, + { + "epoch": 18.8, + "learning_rate": 0.0005, + "loss": 1.7388, + "step": 239700 + }, + { + "epoch": 18.8, + "learning_rate": 0.0005, + "loss": 1.7166, + "step": 239800 + }, + { + "epoch": 18.81, + "learning_rate": 0.0005, + "loss": 1.7391, + "step": 239900 + }, + { + "epoch": 18.82, + "learning_rate": 0.0005, + "loss": 1.7056, + "step": 240000 + }, + { + "epoch": 18.82, + "eval_gen_len": 18.775668756069567, + "eval_loss": 2.0135035514831543, + "eval_rouge1": 35.5094, + "eval_rouge2": 14.2676, + "eval_rougeL": 29.0733, + "eval_rougeLsum": 29.0642, + "eval_runtime": 364.5866, + "eval_samples_per_second": 31.068, + "eval_steps_per_second": 1.942, + "step": 240000 + }, + { + "epoch": 18.83, + "learning_rate": 0.0005, + "loss": 1.7024, + "step": 240100 + }, + { + "epoch": 18.84, + "learning_rate": 0.0005, + "loss": 1.6971, + "step": 240200 + }, + { + "epoch": 18.84, + "learning_rate": 0.0005, + "loss": 1.6981, + "step": 240300 + }, + { + "epoch": 18.85, + "learning_rate": 0.0005, + "loss": 1.7311, + "step": 240400 + }, + { + "epoch": 18.86, + "learning_rate": 0.0005, + "loss": 1.7257, + "step": 240500 + }, + { + "epoch": 18.87, + "learning_rate": 0.0005, + "loss": 1.6868, + "step": 240600 + }, + { + "epoch": 18.88, + "learning_rate": 0.0005, + "loss": 1.7249, + "step": 240700 + }, + { + "epoch": 18.88, + "learning_rate": 0.0005, + "loss": 1.7212, + "step": 240800 + }, + { + "epoch": 18.89, + "learning_rate": 0.0005, + "loss": 1.7294, + "step": 240900 + }, + { + "epoch": 18.9, + "learning_rate": 0.0005, + "loss": 1.7355, + "step": 241000 + }, + { + "epoch": 18.91, + "learning_rate": 0.0005, + "loss": 1.7353, + "step": 241100 + }, + { + "epoch": 18.91, + "learning_rate": 0.0005, + "loss": 1.7472, + "step": 241200 + }, + { + "epoch": 18.92, + "learning_rate": 0.0005, + "loss": 1.7534, + "step": 241300 + }, + { + "epoch": 18.93, + "learning_rate": 0.0005, + "loss": 1.7067, + "step": 241400 + }, + { + "epoch": 18.94, + "learning_rate": 0.0005, + "loss": 1.7399, + "step": 241500 + }, + { + "epoch": 18.95, + "learning_rate": 0.0005, + "loss": 1.7203, + "step": 241600 + }, + { + "epoch": 18.95, + "learning_rate": 0.0005, + "loss": 1.7496, + "step": 241700 + }, + { + "epoch": 18.96, + "learning_rate": 0.0005, + "loss": 1.7307, + "step": 241800 + }, + { + "epoch": 18.97, + "learning_rate": 0.0005, + "loss": 1.7298, + "step": 241900 + }, + { + "epoch": 18.98, + "learning_rate": 0.0005, + "loss": 1.7253, + "step": 242000 + }, + { + "epoch": 18.99, + "learning_rate": 0.0005, + "loss": 1.7364, + "step": 242100 + }, + { + "epoch": 18.99, + "learning_rate": 0.0005, + "loss": 1.7221, + "step": 242200 + }, + { + "epoch": 19.0, + "learning_rate": 0.0005, + "loss": 1.7532, + "step": 242300 + }, + { + "epoch": 19.01, + "learning_rate": 0.0005, + "loss": 1.6476, + "step": 242400 + }, + { + "epoch": 19.02, + "learning_rate": 0.0005, + "loss": 1.6458, + "step": 242500 + }, + { + "epoch": 19.02, + "learning_rate": 0.0005, + "loss": 1.627, + "step": 242600 + }, + { + "epoch": 19.03, + "learning_rate": 0.0005, + "loss": 1.6363, + "step": 242700 + }, + { + "epoch": 19.04, + "learning_rate": 0.0005, + "loss": 1.6743, + "step": 242800 + }, + { + "epoch": 19.05, + "learning_rate": 0.0005, + "loss": 1.666, + "step": 242900 + }, + { + "epoch": 19.06, + "learning_rate": 0.0005, + "loss": 1.6507, + "step": 243000 + }, + { + "epoch": 19.06, + "learning_rate": 0.0005, + "loss": 1.6605, + "step": 243100 + }, + { + "epoch": 19.07, + "learning_rate": 0.0005, + "loss": 1.6324, + "step": 243200 + }, + { + "epoch": 19.08, + "learning_rate": 0.0005, + "loss": 1.6725, + "step": 243300 + }, + { + "epoch": 19.09, + "learning_rate": 0.0005, + "loss": 1.6617, + "step": 243400 + }, + { + "epoch": 19.1, + "learning_rate": 0.0005, + "loss": 1.6396, + "step": 243500 + }, + { + "epoch": 19.1, + "learning_rate": 0.0005, + "loss": 1.6576, + "step": 243600 + }, + { + "epoch": 19.11, + "learning_rate": 0.0005, + "loss": 1.6778, + "step": 243700 + }, + { + "epoch": 19.12, + "learning_rate": 0.0005, + "loss": 1.6664, + "step": 243800 + }, + { + "epoch": 19.13, + "learning_rate": 0.0005, + "loss": 1.7057, + "step": 243900 + }, + { + "epoch": 19.13, + "learning_rate": 0.0005, + "loss": 1.6805, + "step": 244000 + }, + { + "epoch": 19.14, + "learning_rate": 0.0005, + "loss": 1.6807, + "step": 244100 + }, + { + "epoch": 19.15, + "learning_rate": 0.0005, + "loss": 1.6879, + "step": 244200 + }, + { + "epoch": 19.16, + "learning_rate": 0.0005, + "loss": 1.6696, + "step": 244300 + }, + { + "epoch": 19.17, + "learning_rate": 0.0005, + "loss": 1.6631, + "step": 244400 + }, + { + "epoch": 19.17, + "learning_rate": 0.0005, + "loss": 1.672, + "step": 244500 + }, + { + "epoch": 19.18, + "learning_rate": 0.0005, + "loss": 1.6705, + "step": 244600 + }, + { + "epoch": 19.19, + "learning_rate": 0.0005, + "loss": 1.6861, + "step": 244700 + }, + { + "epoch": 19.2, + "learning_rate": 0.0005, + "loss": 1.6579, + "step": 244800 + }, + { + "epoch": 19.2, + "learning_rate": 0.0005, + "loss": 1.6765, + "step": 244900 + }, + { + "epoch": 19.21, + "learning_rate": 0.0005, + "loss": 1.7092, + "step": 245000 + }, + { + "epoch": 19.22, + "learning_rate": 0.0005, + "loss": 1.6675, + "step": 245100 + }, + { + "epoch": 19.23, + "learning_rate": 0.0005, + "loss": 1.6635, + "step": 245200 + }, + { + "epoch": 19.24, + "learning_rate": 0.0005, + "loss": 1.678, + "step": 245300 + }, + { + "epoch": 19.24, + "learning_rate": 0.0005, + "loss": 1.664, + "step": 245400 + }, + { + "epoch": 19.25, + "learning_rate": 0.0005, + "loss": 1.6954, + "step": 245500 + }, + { + "epoch": 19.26, + "learning_rate": 0.0005, + "loss": 1.6661, + "step": 245600 + }, + { + "epoch": 19.27, + "learning_rate": 0.0005, + "loss": 1.6942, + "step": 245700 + }, + { + "epoch": 19.28, + "learning_rate": 0.0005, + "loss": 1.6785, + "step": 245800 + }, + { + "epoch": 19.28, + "learning_rate": 0.0005, + "loss": 1.6845, + "step": 245900 + }, + { + "epoch": 19.29, + "learning_rate": 0.0005, + "loss": 1.6905, + "step": 246000 + }, + { + "epoch": 19.3, + "learning_rate": 0.0005, + "loss": 1.7119, + "step": 246100 + }, + { + "epoch": 19.31, + "learning_rate": 0.0005, + "loss": 1.6895, + "step": 246200 + }, + { + "epoch": 19.31, + "learning_rate": 0.0005, + "loss": 1.6958, + "step": 246300 + }, + { + "epoch": 19.32, + "learning_rate": 0.0005, + "loss": 1.6489, + "step": 246400 + }, + { + "epoch": 19.33, + "learning_rate": 0.0005, + "loss": 1.672, + "step": 246500 + }, + { + "epoch": 19.34, + "learning_rate": 0.0005, + "loss": 1.7014, + "step": 246600 + }, + { + "epoch": 19.35, + "learning_rate": 0.0005, + "loss": 1.6828, + "step": 246700 + }, + { + "epoch": 19.35, + "learning_rate": 0.0005, + "loss": 1.6354, + "step": 246800 + }, + { + "epoch": 19.36, + "learning_rate": 0.0005, + "loss": 1.6998, + "step": 246900 + }, + { + "epoch": 19.37, + "learning_rate": 0.0005, + "loss": 1.679, + "step": 247000 + }, + { + "epoch": 19.38, + "learning_rate": 0.0005, + "loss": 1.6901, + "step": 247100 + }, + { + "epoch": 19.39, + "learning_rate": 0.0005, + "loss": 1.6976, + "step": 247200 + }, + { + "epoch": 19.39, + "learning_rate": 0.0005, + "loss": 1.6883, + "step": 247300 + }, + { + "epoch": 19.4, + "learning_rate": 0.0005, + "loss": 1.6643, + "step": 247400 + }, + { + "epoch": 19.41, + "learning_rate": 0.0005, + "loss": 1.6772, + "step": 247500 + }, + { + "epoch": 19.42, + "learning_rate": 0.0005, + "loss": 1.6931, + "step": 247600 + }, + { + "epoch": 19.42, + "learning_rate": 0.0005, + "loss": 1.6521, + "step": 247700 + }, + { + "epoch": 19.43, + "learning_rate": 0.0005, + "loss": 1.7068, + "step": 247800 + }, + { + "epoch": 19.44, + "learning_rate": 0.0005, + "loss": 1.6937, + "step": 247900 + }, + { + "epoch": 19.45, + "learning_rate": 0.0005, + "loss": 1.72, + "step": 248000 + }, + { + "epoch": 19.46, + "learning_rate": 0.0005, + "loss": 1.7136, + "step": 248100 + }, + { + "epoch": 19.46, + "learning_rate": 0.0005, + "loss": 1.6937, + "step": 248200 + }, + { + "epoch": 19.47, + "learning_rate": 0.0005, + "loss": 1.6706, + "step": 248300 + }, + { + "epoch": 19.48, + "learning_rate": 0.0005, + "loss": 1.6952, + "step": 248400 + }, + { + "epoch": 19.49, + "learning_rate": 0.0005, + "loss": 1.698, + "step": 248500 + }, + { + "epoch": 19.49, + "learning_rate": 0.0005, + "loss": 1.699, + "step": 248600 + }, + { + "epoch": 19.5, + "learning_rate": 0.0005, + "loss": 1.6925, + "step": 248700 + }, + { + "epoch": 19.51, + "learning_rate": 0.0005, + "loss": 1.6807, + "step": 248800 + }, + { + "epoch": 19.52, + "learning_rate": 0.0005, + "loss": 1.6993, + "step": 248900 + }, + { + "epoch": 19.53, + "learning_rate": 0.0005, + "loss": 1.6907, + "step": 249000 + }, + { + "epoch": 19.53, + "learning_rate": 0.0005, + "loss": 1.7015, + "step": 249100 + }, + { + "epoch": 19.54, + "learning_rate": 0.0005, + "loss": 1.7157, + "step": 249200 + }, + { + "epoch": 19.55, + "learning_rate": 0.0005, + "loss": 1.6905, + "step": 249300 + }, + { + "epoch": 19.56, + "learning_rate": 0.0005, + "loss": 1.6841, + "step": 249400 + }, + { + "epoch": 19.57, + "learning_rate": 0.0005, + "loss": 1.6621, + "step": 249500 + }, + { + "epoch": 19.57, + "learning_rate": 0.0005, + "loss": 1.7077, + "step": 249600 + }, + { + "epoch": 19.58, + "learning_rate": 0.0005, + "loss": 1.6981, + "step": 249700 + }, + { + "epoch": 19.59, + "learning_rate": 0.0005, + "loss": 1.6905, + "step": 249800 + }, + { + "epoch": 19.6, + "learning_rate": 0.0005, + "loss": 1.7028, + "step": 249900 + }, + { + "epoch": 19.6, + "learning_rate": 0.0005, + "loss": 1.6849, + "step": 250000 + }, + { + "epoch": 19.6, + "eval_gen_len": 18.80047673700009, + "eval_loss": 2.020770311355591, + "eval_rouge1": 35.1918, + "eval_rouge2": 14.0743, + "eval_rougeL": 28.8604, + "eval_rougeLsum": 28.846, + "eval_runtime": 358.4886, + "eval_samples_per_second": 31.597, + "eval_steps_per_second": 1.975, + "step": 250000 + }, + { + "epoch": 19.61, + "learning_rate": 0.0005, + "loss": 1.6831, + "step": 250100 + }, + { + "epoch": 19.62, + "learning_rate": 0.0005, + "loss": 1.7192, + "step": 250200 + }, + { + "epoch": 19.63, + "learning_rate": 0.0005, + "loss": 1.7143, + "step": 250300 + }, + { + "epoch": 19.64, + "learning_rate": 0.0005, + "loss": 1.708, + "step": 250400 + }, + { + "epoch": 19.64, + "learning_rate": 0.0005, + "loss": 1.714, + "step": 250500 + }, + { + "epoch": 19.65, + "learning_rate": 0.0005, + "loss": 1.7072, + "step": 250600 + }, + { + "epoch": 19.66, + "learning_rate": 0.0005, + "loss": 1.7227, + "step": 250700 + }, + { + "epoch": 19.67, + "learning_rate": 0.0005, + "loss": 1.7089, + "step": 250800 + }, + { + "epoch": 19.68, + "learning_rate": 0.0005, + "loss": 1.7213, + "step": 250900 + }, + { + "epoch": 19.68, + "learning_rate": 0.0005, + "loss": 1.7167, + "step": 251000 + }, + { + "epoch": 19.69, + "learning_rate": 0.0005, + "loss": 1.7012, + "step": 251100 + }, + { + "epoch": 19.7, + "learning_rate": 0.0005, + "loss": 1.6903, + "step": 251200 + }, + { + "epoch": 19.71, + "learning_rate": 0.0005, + "loss": 1.6836, + "step": 251300 + }, + { + "epoch": 19.71, + "learning_rate": 0.0005, + "loss": 1.6714, + "step": 251400 + }, + { + "epoch": 19.72, + "learning_rate": 0.0005, + "loss": 1.7137, + "step": 251500 + }, + { + "epoch": 19.73, + "learning_rate": 0.0005, + "loss": 1.6813, + "step": 251600 + }, + { + "epoch": 19.74, + "learning_rate": 0.0005, + "loss": 1.7059, + "step": 251700 + }, + { + "epoch": 19.75, + "learning_rate": 0.0005, + "loss": 1.6909, + "step": 251800 + }, + { + "epoch": 19.75, + "learning_rate": 0.0005, + "loss": 1.7011, + "step": 251900 + }, + { + "epoch": 19.76, + "learning_rate": 0.0005, + "loss": 1.7112, + "step": 252000 + }, + { + "epoch": 19.77, + "learning_rate": 0.0005, + "loss": 1.7428, + "step": 252100 + }, + { + "epoch": 19.78, + "learning_rate": 0.0005, + "loss": 1.7332, + "step": 252200 + }, + { + "epoch": 19.79, + "learning_rate": 0.0005, + "loss": 1.7068, + "step": 252300 + }, + { + "epoch": 19.79, + "learning_rate": 0.0005, + "loss": 1.7146, + "step": 252400 + }, + { + "epoch": 19.8, + "learning_rate": 0.0005, + "loss": 1.7073, + "step": 252500 + }, + { + "epoch": 19.81, + "learning_rate": 0.0005, + "loss": 1.7071, + "step": 252600 + }, + { + "epoch": 19.82, + "learning_rate": 0.0005, + "loss": 1.7196, + "step": 252700 + }, + { + "epoch": 19.82, + "learning_rate": 0.0005, + "loss": 1.7188, + "step": 252800 + }, + { + "epoch": 19.83, + "learning_rate": 0.0005, + "loss": 1.691, + "step": 252900 + }, + { + "epoch": 19.84, + "learning_rate": 0.0005, + "loss": 1.7275, + "step": 253000 + }, + { + "epoch": 19.85, + "learning_rate": 0.0005, + "loss": 1.6993, + "step": 253100 + }, + { + "epoch": 19.86, + "learning_rate": 0.0005, + "loss": 1.7168, + "step": 253200 + }, + { + "epoch": 19.86, + "learning_rate": 0.0005, + "loss": 1.6927, + "step": 253300 + }, + { + "epoch": 19.87, + "learning_rate": 0.0005, + "loss": 1.7066, + "step": 253400 + }, + { + "epoch": 19.88, + "learning_rate": 0.0005, + "loss": 1.7022, + "step": 253500 + }, + { + "epoch": 19.89, + "learning_rate": 0.0005, + "loss": 1.6908, + "step": 253600 + }, + { + "epoch": 19.89, + "learning_rate": 0.0005, + "loss": 1.7124, + "step": 253700 + }, + { + "epoch": 19.9, + "learning_rate": 0.0005, + "loss": 1.7099, + "step": 253800 + }, + { + "epoch": 19.91, + "learning_rate": 0.0005, + "loss": 1.7335, + "step": 253900 + }, + { + "epoch": 19.92, + "learning_rate": 0.0005, + "loss": 1.7446, + "step": 254000 + }, + { + "epoch": 19.93, + "learning_rate": 0.0005, + "loss": 1.7187, + "step": 254100 + }, + { + "epoch": 19.93, + "learning_rate": 0.0005, + "loss": 1.6978, + "step": 254200 + }, + { + "epoch": 19.94, + "learning_rate": 0.0005, + "loss": 1.7339, + "step": 254300 + }, + { + "epoch": 19.95, + "learning_rate": 0.0005, + "loss": 1.698, + "step": 254400 + }, + { + "epoch": 19.96, + "learning_rate": 0.0005, + "loss": 1.7041, + "step": 254500 + }, + { + "epoch": 19.97, + "learning_rate": 0.0005, + "loss": 1.7173, + "step": 254600 + }, + { + "epoch": 19.97, + "learning_rate": 0.0005, + "loss": 1.7025, + "step": 254700 + }, + { + "epoch": 19.98, + "learning_rate": 0.0005, + "loss": 1.6967, + "step": 254800 + }, + { + "epoch": 19.99, + "learning_rate": 0.0005, + "loss": 1.6963, + "step": 254900 + }, + { + "epoch": 20.0, + "learning_rate": 0.0005, + "loss": 1.7125, + "step": 255000 + }, + { + "epoch": 20.0, + "learning_rate": 0.0005, + "loss": 1.6839, + "step": 255100 + }, + { + "epoch": 20.01, + "learning_rate": 0.0005, + "loss": 1.6403, + "step": 255200 + }, + { + "epoch": 20.02, + "learning_rate": 0.0005, + "loss": 1.6344, + "step": 255300 + }, + { + "epoch": 20.03, + "learning_rate": 0.0005, + "loss": 1.643, + "step": 255400 + }, + { + "epoch": 20.04, + "learning_rate": 0.0005, + "loss": 1.6433, + "step": 255500 + }, + { + "epoch": 20.04, + "learning_rate": 0.0005, + "loss": 1.631, + "step": 255600 + }, + { + "epoch": 20.05, + "learning_rate": 0.0005, + "loss": 1.662, + "step": 255700 + }, + { + "epoch": 20.06, + "learning_rate": 0.0005, + "loss": 1.6108, + "step": 255800 + }, + { + "epoch": 20.07, + "learning_rate": 0.0005, + "loss": 1.6505, + "step": 255900 + }, + { + "epoch": 20.08, + "learning_rate": 0.0005, + "loss": 1.6687, + "step": 256000 + }, + { + "epoch": 20.08, + "learning_rate": 0.0005, + "loss": 1.6718, + "step": 256100 + }, + { + "epoch": 20.09, + "learning_rate": 0.0005, + "loss": 1.6497, + "step": 256200 + }, + { + "epoch": 20.1, + "learning_rate": 0.0005, + "loss": 1.6455, + "step": 256300 + }, + { + "epoch": 20.11, + "learning_rate": 0.0005, + "loss": 1.6545, + "step": 256400 + }, + { + "epoch": 20.11, + "learning_rate": 0.0005, + "loss": 1.6349, + "step": 256500 + }, + { + "epoch": 20.12, + "learning_rate": 0.0005, + "loss": 1.6368, + "step": 256600 + }, + { + "epoch": 20.13, + "learning_rate": 0.0005, + "loss": 1.624, + "step": 256700 + }, + { + "epoch": 20.14, + "learning_rate": 0.0005, + "loss": 1.6588, + "step": 256800 + }, + { + "epoch": 20.15, + "learning_rate": 0.0005, + "loss": 1.6254, + "step": 256900 + }, + { + "epoch": 20.15, + "learning_rate": 0.0005, + "loss": 1.6433, + "step": 257000 + }, + { + "epoch": 20.16, + "learning_rate": 0.0005, + "loss": 1.6505, + "step": 257100 + }, + { + "epoch": 20.17, + "learning_rate": 0.0005, + "loss": 1.6722, + "step": 257200 + }, + { + "epoch": 20.18, + "learning_rate": 0.0005, + "loss": 1.6373, + "step": 257300 + }, + { + "epoch": 20.19, + "learning_rate": 0.0005, + "loss": 1.6586, + "step": 257400 + }, + { + "epoch": 20.19, + "learning_rate": 0.0005, + "loss": 1.6745, + "step": 257500 + }, + { + "epoch": 20.2, + "learning_rate": 0.0005, + "loss": 1.665, + "step": 257600 + }, + { + "epoch": 20.21, + "learning_rate": 0.0005, + "loss": 1.656, + "step": 257700 + }, + { + "epoch": 20.22, + "learning_rate": 0.0005, + "loss": 1.6748, + "step": 257800 + }, + { + "epoch": 20.22, + "learning_rate": 0.0005, + "loss": 1.6635, + "step": 257900 + }, + { + "epoch": 20.23, + "learning_rate": 0.0005, + "loss": 1.6579, + "step": 258000 + }, + { + "epoch": 20.24, + "learning_rate": 0.0005, + "loss": 1.6716, + "step": 258100 + }, + { + "epoch": 20.25, + "learning_rate": 0.0005, + "loss": 1.6748, + "step": 258200 + }, + { + "epoch": 20.26, + "learning_rate": 0.0005, + "loss": 1.688, + "step": 258300 + }, + { + "epoch": 20.26, + "learning_rate": 0.0005, + "loss": 1.6389, + "step": 258400 + }, + { + "epoch": 20.27, + "learning_rate": 0.0005, + "loss": 1.6642, + "step": 258500 + }, + { + "epoch": 20.28, + "learning_rate": 0.0005, + "loss": 1.6493, + "step": 258600 + }, + { + "epoch": 20.29, + "learning_rate": 0.0005, + "loss": 1.6601, + "step": 258700 + }, + { + "epoch": 20.29, + "learning_rate": 0.0005, + "loss": 1.6424, + "step": 258800 + }, + { + "epoch": 20.3, + "learning_rate": 0.0005, + "loss": 1.6607, + "step": 258900 + }, + { + "epoch": 20.31, + "learning_rate": 0.0005, + "loss": 1.6403, + "step": 259000 + }, + { + "epoch": 20.32, + "learning_rate": 0.0005, + "loss": 1.693, + "step": 259100 + }, + { + "epoch": 20.33, + "learning_rate": 0.0005, + "loss": 1.6657, + "step": 259200 + }, + { + "epoch": 20.33, + "learning_rate": 0.0005, + "loss": 1.6836, + "step": 259300 + }, + { + "epoch": 20.34, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 259400 + }, + { + "epoch": 20.35, + "learning_rate": 0.0005, + "loss": 1.6512, + "step": 259500 + }, + { + "epoch": 20.36, + "learning_rate": 0.0005, + "loss": 1.671, + "step": 259600 + }, + { + "epoch": 20.37, + "learning_rate": 0.0005, + "loss": 1.6558, + "step": 259700 + }, + { + "epoch": 20.37, + "learning_rate": 0.0005, + "loss": 1.6636, + "step": 259800 + }, + { + "epoch": 20.38, + "learning_rate": 0.0005, + "loss": 1.6533, + "step": 259900 + }, + { + "epoch": 20.39, + "learning_rate": 0.0005, + "loss": 1.6784, + "step": 260000 + }, + { + "epoch": 20.39, + "eval_gen_len": 18.790235719961156, + "eval_loss": 2.0271031856536865, + "eval_rouge1": 35.165, + "eval_rouge2": 14.0225, + "eval_rougeL": 28.8773, + "eval_rougeLsum": 28.8644, + "eval_runtime": 359.9035, + "eval_samples_per_second": 31.472, + "eval_steps_per_second": 1.967, + "step": 260000 + }, + { + "epoch": 20.4, + "learning_rate": 0.0005, + "loss": 1.6699, + "step": 260100 + }, + { + "epoch": 20.4, + "learning_rate": 0.0005, + "loss": 1.6597, + "step": 260200 + }, + { + "epoch": 20.41, + "learning_rate": 0.0005, + "loss": 1.6813, + "step": 260300 + }, + { + "epoch": 20.42, + "learning_rate": 0.0005, + "loss": 1.6641, + "step": 260400 + }, + { + "epoch": 20.43, + "learning_rate": 0.0005, + "loss": 1.6587, + "step": 260500 + }, + { + "epoch": 20.44, + "learning_rate": 0.0005, + "loss": 1.6786, + "step": 260600 + }, + { + "epoch": 20.44, + "learning_rate": 0.0005, + "loss": 1.6915, + "step": 260700 + }, + { + "epoch": 20.45, + "learning_rate": 0.0005, + "loss": 1.659, + "step": 260800 + }, + { + "epoch": 20.46, + "learning_rate": 0.0005, + "loss": 1.6866, + "step": 260900 + }, + { + "epoch": 20.47, + "learning_rate": 0.0005, + "loss": 1.6817, + "step": 261000 + }, + { + "epoch": 20.48, + "learning_rate": 0.0005, + "loss": 1.6575, + "step": 261100 + }, + { + "epoch": 20.48, + "learning_rate": 0.0005, + "loss": 1.6908, + "step": 261200 + }, + { + "epoch": 20.49, + "learning_rate": 0.0005, + "loss": 1.6865, + "step": 261300 + }, + { + "epoch": 20.5, + "learning_rate": 0.0005, + "loss": 1.6798, + "step": 261400 + }, + { + "epoch": 20.51, + "learning_rate": 0.0005, + "loss": 1.6394, + "step": 261500 + }, + { + "epoch": 20.51, + "learning_rate": 0.0005, + "loss": 1.6804, + "step": 261600 + }, + { + "epoch": 20.52, + "learning_rate": 0.0005, + "loss": 1.679, + "step": 261700 + }, + { + "epoch": 20.53, + "learning_rate": 0.0005, + "loss": 1.7161, + "step": 261800 + }, + { + "epoch": 20.54, + "learning_rate": 0.0005, + "loss": 1.6768, + "step": 261900 + }, + { + "epoch": 20.55, + "learning_rate": 0.0005, + "loss": 1.7062, + "step": 262000 + }, + { + "epoch": 20.55, + "learning_rate": 0.0005, + "loss": 1.6279, + "step": 262100 + }, + { + "epoch": 20.56, + "learning_rate": 0.0005, + "loss": 1.6862, + "step": 262200 + }, + { + "epoch": 20.57, + "learning_rate": 0.0005, + "loss": 1.7075, + "step": 262300 + }, + { + "epoch": 20.58, + "learning_rate": 0.0005, + "loss": 1.6938, + "step": 262400 + }, + { + "epoch": 20.59, + "learning_rate": 0.0005, + "loss": 1.7088, + "step": 262500 + }, + { + "epoch": 20.59, + "learning_rate": 0.0005, + "loss": 1.6599, + "step": 262600 + }, + { + "epoch": 20.6, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 262700 + }, + { + "epoch": 20.61, + "learning_rate": 0.0005, + "loss": 1.6543, + "step": 262800 + }, + { + "epoch": 20.62, + "learning_rate": 0.0005, + "loss": 1.6803, + "step": 262900 + }, + { + "epoch": 20.62, + "learning_rate": 0.0005, + "loss": 1.6728, + "step": 263000 + }, + { + "epoch": 20.63, + "learning_rate": 0.0005, + "loss": 1.6599, + "step": 263100 + }, + { + "epoch": 20.64, + "learning_rate": 0.0005, + "loss": 1.6822, + "step": 263200 + }, + { + "epoch": 20.65, + "learning_rate": 0.0005, + "loss": 1.6958, + "step": 263300 + }, + { + "epoch": 20.66, + "learning_rate": 0.0005, + "loss": 1.7065, + "step": 263400 + }, + { + "epoch": 20.66, + "learning_rate": 0.0005, + "loss": 1.7043, + "step": 263500 + }, + { + "epoch": 20.67, + "learning_rate": 0.0005, + "loss": 1.6914, + "step": 263600 + }, + { + "epoch": 20.68, + "learning_rate": 0.0005, + "loss": 1.701, + "step": 263700 + }, + { + "epoch": 20.69, + "learning_rate": 0.0005, + "loss": 1.6959, + "step": 263800 + }, + { + "epoch": 20.69, + "learning_rate": 0.0005, + "loss": 1.6852, + "step": 263900 + }, + { + "epoch": 20.7, + "learning_rate": 0.0005, + "loss": 1.6723, + "step": 264000 + }, + { + "epoch": 20.71, + "learning_rate": 0.0005, + "loss": 1.6832, + "step": 264100 + }, + { + "epoch": 20.72, + "learning_rate": 0.0005, + "loss": 1.65, + "step": 264200 + }, + { + "epoch": 20.73, + "learning_rate": 0.0005, + "loss": 1.6954, + "step": 264300 + }, + { + "epoch": 20.73, + "learning_rate": 0.0005, + "loss": 1.6687, + "step": 264400 + }, + { + "epoch": 20.74, + "learning_rate": 0.0005, + "loss": 1.693, + "step": 264500 + }, + { + "epoch": 20.75, + "learning_rate": 0.0005, + "loss": 1.6815, + "step": 264600 + }, + { + "epoch": 20.76, + "learning_rate": 0.0005, + "loss": 1.6688, + "step": 264700 + }, + { + "epoch": 20.77, + "learning_rate": 0.0005, + "loss": 1.6602, + "step": 264800 + }, + { + "epoch": 20.77, + "learning_rate": 0.0005, + "loss": 1.6585, + "step": 264900 + }, + { + "epoch": 20.78, + "learning_rate": 0.0005, + "loss": 1.6895, + "step": 265000 + }, + { + "epoch": 20.79, + "learning_rate": 0.0005, + "loss": 1.6925, + "step": 265100 + }, + { + "epoch": 20.8, + "learning_rate": 0.0005, + "loss": 1.7074, + "step": 265200 + }, + { + "epoch": 20.8, + "learning_rate": 0.0005, + "loss": 1.7013, + "step": 265300 + }, + { + "epoch": 20.81, + "learning_rate": 0.0005, + "loss": 1.6727, + "step": 265400 + }, + { + "epoch": 20.82, + "learning_rate": 0.0005, + "loss": 1.6998, + "step": 265500 + }, + { + "epoch": 20.83, + "learning_rate": 0.0005, + "loss": 1.687, + "step": 265600 + }, + { + "epoch": 20.84, + "learning_rate": 0.0005, + "loss": 1.6983, + "step": 265700 + }, + { + "epoch": 20.84, + "learning_rate": 0.0005, + "loss": 1.7113, + "step": 265800 + }, + { + "epoch": 20.85, + "learning_rate": 0.0005, + "loss": 1.6944, + "step": 265900 + }, + { + "epoch": 20.86, + "learning_rate": 0.0005, + "loss": 1.7157, + "step": 266000 + }, + { + "epoch": 20.87, + "learning_rate": 0.0005, + "loss": 1.7116, + "step": 266100 + }, + { + "epoch": 20.88, + "learning_rate": 0.0005, + "loss": 1.7307, + "step": 266200 + }, + { + "epoch": 20.88, + "learning_rate": 0.0005, + "loss": 1.6907, + "step": 266300 + }, + { + "epoch": 20.89, + "learning_rate": 0.0005, + "loss": 1.6861, + "step": 266400 + }, + { + "epoch": 20.9, + "learning_rate": 0.0005, + "loss": 1.6892, + "step": 266500 + }, + { + "epoch": 20.91, + "learning_rate": 0.0005, + "loss": 1.6904, + "step": 266600 + }, + { + "epoch": 20.91, + "learning_rate": 0.0005, + "loss": 1.7172, + "step": 266700 + }, + { + "epoch": 20.92, + "learning_rate": 0.0005, + "loss": 1.7, + "step": 266800 + }, + { + "epoch": 20.93, + "learning_rate": 0.0005, + "loss": 1.6893, + "step": 266900 + }, + { + "epoch": 20.94, + "learning_rate": 0.0005, + "loss": 1.6898, + "step": 267000 + }, + { + "epoch": 20.95, + "learning_rate": 0.0005, + "loss": 1.7073, + "step": 267100 + }, + { + "epoch": 20.95, + "learning_rate": 0.0005, + "loss": 1.703, + "step": 267200 + }, + { + "epoch": 20.96, + "learning_rate": 0.0005, + "loss": 1.6952, + "step": 267300 + }, + { + "epoch": 20.97, + "learning_rate": 0.0005, + "loss": 1.6893, + "step": 267400 + }, + { + "epoch": 20.98, + "learning_rate": 0.0005, + "loss": 1.7366, + "step": 267500 + }, + { + "epoch": 20.98, + "learning_rate": 0.0005, + "loss": 1.7323, + "step": 267600 + }, + { + "epoch": 20.99, + "learning_rate": 0.0005, + "loss": 1.7095, + "step": 267700 + }, + { + "epoch": 21.0, + "learning_rate": 0.0005, + "loss": 1.6746, + "step": 267800 + }, + { + "epoch": 21.01, + "learning_rate": 0.0005, + "loss": 1.5848, + "step": 267900 + }, + { + "epoch": 21.02, + "learning_rate": 0.0005, + "loss": 1.64, + "step": 268000 + }, + { + "epoch": 21.02, + "learning_rate": 0.0005, + "loss": 1.6432, + "step": 268100 + }, + { + "epoch": 21.03, + "learning_rate": 0.0005, + "loss": 1.6222, + "step": 268200 + }, + { + "epoch": 21.04, + "learning_rate": 0.0005, + "loss": 1.6085, + "step": 268300 + }, + { + "epoch": 21.05, + "learning_rate": 0.0005, + "loss": 1.6059, + "step": 268400 + }, + { + "epoch": 21.06, + "learning_rate": 0.0005, + "loss": 1.6361, + "step": 268500 + }, + { + "epoch": 21.06, + "learning_rate": 0.0005, + "loss": 1.6125, + "step": 268600 + }, + { + "epoch": 21.07, + "learning_rate": 0.0005, + "loss": 1.6033, + "step": 268700 + }, + { + "epoch": 21.08, + "learning_rate": 0.0005, + "loss": 1.6163, + "step": 268800 + }, + { + "epoch": 21.09, + "learning_rate": 0.0005, + "loss": 1.6157, + "step": 268900 + }, + { + "epoch": 21.09, + "learning_rate": 0.0005, + "loss": 1.6505, + "step": 269000 + }, + { + "epoch": 21.1, + "learning_rate": 0.0005, + "loss": 1.6267, + "step": 269100 + }, + { + "epoch": 21.11, + "learning_rate": 0.0005, + "loss": 1.6007, + "step": 269200 + }, + { + "epoch": 21.12, + "learning_rate": 0.0005, + "loss": 1.6224, + "step": 269300 + }, + { + "epoch": 21.13, + "learning_rate": 0.0005, + "loss": 1.6151, + "step": 269400 + }, + { + "epoch": 21.13, + "learning_rate": 0.0005, + "loss": 1.6311, + "step": 269500 + }, + { + "epoch": 21.14, + "learning_rate": 0.0005, + "loss": 1.6143, + "step": 269600 + }, + { + "epoch": 21.15, + "learning_rate": 0.0005, + "loss": 1.6499, + "step": 269700 + }, + { + "epoch": 21.16, + "learning_rate": 0.0005, + "loss": 1.6286, + "step": 269800 + }, + { + "epoch": 21.17, + "learning_rate": 0.0005, + "loss": 1.6327, + "step": 269900 + }, + { + "epoch": 21.17, + "learning_rate": 0.0005, + "loss": 1.6273, + "step": 270000 + }, + { + "epoch": 21.17, + "eval_gen_len": 18.794120243665578, + "eval_loss": 2.0332844257354736, + "eval_rouge1": 35.267, + "eval_rouge2": 14.1753, + "eval_rougeL": 28.973, + "eval_rougeLsum": 28.9735, + "eval_runtime": 364.1194, + "eval_samples_per_second": 31.108, + "eval_steps_per_second": 1.944, + "step": 270000 + }, + { + "epoch": 21.18, + "learning_rate": 0.0005, + "loss": 1.6709, + "step": 270100 + }, + { + "epoch": 21.19, + "learning_rate": 0.0005, + "loss": 1.6506, + "step": 270200 + }, + { + "epoch": 21.2, + "learning_rate": 0.0005, + "loss": 1.6591, + "step": 270300 + }, + { + "epoch": 21.2, + "learning_rate": 0.0005, + "loss": 1.6446, + "step": 270400 + }, + { + "epoch": 21.21, + "learning_rate": 0.0005, + "loss": 1.6299, + "step": 270500 + }, + { + "epoch": 21.22, + "learning_rate": 0.0005, + "loss": 1.6684, + "step": 270600 + }, + { + "epoch": 21.23, + "learning_rate": 0.0005, + "loss": 1.6636, + "step": 270700 + }, + { + "epoch": 21.24, + "learning_rate": 0.0005, + "loss": 1.6326, + "step": 270800 + }, + { + "epoch": 21.24, + "learning_rate": 0.0005, + "loss": 1.6458, + "step": 270900 + }, + { + "epoch": 21.25, + "learning_rate": 0.0005, + "loss": 1.6513, + "step": 271000 + }, + { + "epoch": 21.26, + "learning_rate": 0.0005, + "loss": 1.6618, + "step": 271100 + }, + { + "epoch": 21.27, + "learning_rate": 0.0005, + "loss": 1.6441, + "step": 271200 + }, + { + "epoch": 21.28, + "learning_rate": 0.0005, + "loss": 1.64, + "step": 271300 + }, + { + "epoch": 21.28, + "learning_rate": 0.0005, + "loss": 1.6454, + "step": 271400 + }, + { + "epoch": 21.29, + "learning_rate": 0.0005, + "loss": 1.6246, + "step": 271500 + }, + { + "epoch": 21.3, + "learning_rate": 0.0005, + "loss": 1.6632, + "step": 271600 + }, + { + "epoch": 21.31, + "learning_rate": 0.0005, + "loss": 1.6478, + "step": 271700 + }, + { + "epoch": 21.31, + "learning_rate": 0.0005, + "loss": 1.6651, + "step": 271800 + }, + { + "epoch": 21.32, + "learning_rate": 0.0005, + "loss": 1.626, + "step": 271900 + }, + { + "epoch": 21.33, + "learning_rate": 0.0005, + "loss": 1.6519, + "step": 272000 + }, + { + "epoch": 21.34, + "learning_rate": 0.0005, + "loss": 1.66, + "step": 272100 + }, + { + "epoch": 21.35, + "learning_rate": 0.0005, + "loss": 1.6479, + "step": 272200 + }, + { + "epoch": 21.35, + "learning_rate": 0.0005, + "loss": 1.6556, + "step": 272300 + }, + { + "epoch": 21.36, + "learning_rate": 0.0005, + "loss": 1.6563, + "step": 272400 + }, + { + "epoch": 21.37, + "learning_rate": 0.0005, + "loss": 1.6392, + "step": 272500 + }, + { + "epoch": 21.38, + "learning_rate": 0.0005, + "loss": 1.6474, + "step": 272600 + }, + { + "epoch": 21.38, + "learning_rate": 0.0005, + "loss": 1.6544, + "step": 272700 + }, + { + "epoch": 21.39, + "learning_rate": 0.0005, + "loss": 1.6677, + "step": 272800 + }, + { + "epoch": 21.4, + "learning_rate": 0.0005, + "loss": 1.6584, + "step": 272900 + }, + { + "epoch": 21.41, + "learning_rate": 0.0005, + "loss": 1.656, + "step": 273000 + }, + { + "epoch": 21.42, + "learning_rate": 0.0005, + "loss": 1.6487, + "step": 273100 + }, + { + "epoch": 21.42, + "learning_rate": 0.0005, + "loss": 1.6614, + "step": 273200 + }, + { + "epoch": 21.43, + "learning_rate": 0.0005, + "loss": 1.6839, + "step": 273300 + }, + { + "epoch": 21.44, + "learning_rate": 0.0005, + "loss": 1.6518, + "step": 273400 + }, + { + "epoch": 21.45, + "learning_rate": 0.0005, + "loss": 1.6572, + "step": 273500 + }, + { + "epoch": 21.46, + "learning_rate": 0.0005, + "loss": 1.6604, + "step": 273600 + }, + { + "epoch": 21.46, + "learning_rate": 0.0005, + "loss": 1.683, + "step": 273700 + }, + { + "epoch": 21.47, + "learning_rate": 0.0005, + "loss": 1.6378, + "step": 273800 + }, + { + "epoch": 21.48, + "learning_rate": 0.0005, + "loss": 1.6439, + "step": 273900 + }, + { + "epoch": 21.49, + "learning_rate": 0.0005, + "loss": 1.6667, + "step": 274000 + }, + { + "epoch": 21.49, + "learning_rate": 0.0005, + "loss": 1.6472, + "step": 274100 + }, + { + "epoch": 21.5, + "learning_rate": 0.0005, + "loss": 1.639, + "step": 274200 + }, + { + "epoch": 21.51, + "learning_rate": 0.0005, + "loss": 1.6757, + "step": 274300 + }, + { + "epoch": 21.52, + "learning_rate": 0.0005, + "loss": 1.6633, + "step": 274400 + }, + { + "epoch": 21.53, + "learning_rate": 0.0005, + "loss": 1.65, + "step": 274500 + }, + { + "epoch": 21.53, + "learning_rate": 0.0005, + "loss": 1.67, + "step": 274600 + }, + { + "epoch": 21.54, + "learning_rate": 0.0005, + "loss": 1.6726, + "step": 274700 + }, + { + "epoch": 21.55, + "learning_rate": 0.0005, + "loss": 1.6634, + "step": 274800 + }, + { + "epoch": 21.56, + "learning_rate": 0.0005, + "loss": 1.6844, + "step": 274900 + }, + { + "epoch": 21.57, + "learning_rate": 0.0005, + "loss": 1.6708, + "step": 275000 + }, + { + "epoch": 21.57, + "learning_rate": 0.0005, + "loss": 1.6485, + "step": 275100 + }, + { + "epoch": 21.58, + "learning_rate": 0.0005, + "loss": 1.6648, + "step": 275200 + }, + { + "epoch": 21.59, + "learning_rate": 0.0005, + "loss": 1.6691, + "step": 275300 + }, + { + "epoch": 21.6, + "learning_rate": 0.0005, + "loss": 1.6751, + "step": 275400 + }, + { + "epoch": 21.6, + "learning_rate": 0.0005, + "loss": 1.6635, + "step": 275500 + }, + { + "epoch": 21.61, + "learning_rate": 0.0005, + "loss": 1.6693, + "step": 275600 + }, + { + "epoch": 21.62, + "learning_rate": 0.0005, + "loss": 1.6764, + "step": 275700 + }, + { + "epoch": 21.63, + "learning_rate": 0.0005, + "loss": 1.6629, + "step": 275800 + }, + { + "epoch": 21.64, + "learning_rate": 0.0005, + "loss": 1.6837, + "step": 275900 + }, + { + "epoch": 21.64, + "learning_rate": 0.0005, + "loss": 1.6856, + "step": 276000 + }, + { + "epoch": 21.65, + "learning_rate": 0.0005, + "loss": 1.6608, + "step": 276100 + }, + { + "epoch": 21.66, + "learning_rate": 0.0005, + "loss": 1.665, + "step": 276200 + }, + { + "epoch": 21.67, + "learning_rate": 0.0005, + "loss": 1.6734, + "step": 276300 + }, + { + "epoch": 21.68, + "learning_rate": 0.0005, + "loss": 1.6561, + "step": 276400 + }, + { + "epoch": 21.68, + "learning_rate": 0.0005, + "loss": 1.6762, + "step": 276500 + }, + { + "epoch": 21.69, + "learning_rate": 0.0005, + "loss": 1.6431, + "step": 276600 + }, + { + "epoch": 21.7, + "learning_rate": 0.0005, + "loss": 1.6631, + "step": 276700 + }, + { + "epoch": 21.71, + "learning_rate": 0.0005, + "loss": 1.7076, + "step": 276800 + }, + { + "epoch": 21.71, + "learning_rate": 0.0005, + "loss": 1.6691, + "step": 276900 + }, + { + "epoch": 21.72, + "learning_rate": 0.0005, + "loss": 1.6689, + "step": 277000 + }, + { + "epoch": 21.73, + "learning_rate": 0.0005, + "loss": 1.6895, + "step": 277100 + }, + { + "epoch": 21.74, + "learning_rate": 0.0005, + "loss": 1.701, + "step": 277200 + }, + { + "epoch": 21.75, + "learning_rate": 0.0005, + "loss": 1.6691, + "step": 277300 + }, + { + "epoch": 21.75, + "learning_rate": 0.0005, + "loss": 1.6708, + "step": 277400 + }, + { + "epoch": 21.76, + "learning_rate": 0.0005, + "loss": 1.6708, + "step": 277500 + }, + { + "epoch": 21.77, + "learning_rate": 0.0005, + "loss": 1.7038, + "step": 277600 + }, + { + "epoch": 21.78, + "learning_rate": 0.0005, + "loss": 1.6813, + "step": 277700 + }, + { + "epoch": 21.78, + "learning_rate": 0.0005, + "loss": 1.6738, + "step": 277800 + }, + { + "epoch": 21.79, + "learning_rate": 0.0005, + "loss": 1.6991, + "step": 277900 + }, + { + "epoch": 21.8, + "learning_rate": 0.0005, + "loss": 1.7033, + "step": 278000 + }, + { + "epoch": 21.81, + "learning_rate": 0.0005, + "loss": 1.6716, + "step": 278100 + }, + { + "epoch": 21.82, + "learning_rate": 0.0005, + "loss": 1.678, + "step": 278200 + }, + { + "epoch": 21.82, + "learning_rate": 0.0005, + "loss": 1.6645, + "step": 278300 + }, + { + "epoch": 21.83, + "learning_rate": 0.0005, + "loss": 1.6913, + "step": 278400 + }, + { + "epoch": 21.84, + "learning_rate": 0.0005, + "loss": 1.6807, + "step": 278500 + }, + { + "epoch": 21.85, + "learning_rate": 0.0005, + "loss": 1.6646, + "step": 278600 + }, + { + "epoch": 21.86, + "learning_rate": 0.0005, + "loss": 1.6968, + "step": 278700 + }, + { + "epoch": 21.86, + "learning_rate": 0.0005, + "loss": 1.6384, + "step": 278800 + }, + { + "epoch": 21.87, + "learning_rate": 0.0005, + "loss": 1.6578, + "step": 278900 + }, + { + "epoch": 21.88, + "learning_rate": 0.0005, + "loss": 1.6761, + "step": 279000 + }, + { + "epoch": 21.89, + "learning_rate": 0.0005, + "loss": 1.6749, + "step": 279100 + }, + { + "epoch": 21.89, + "learning_rate": 0.0005, + "loss": 1.6953, + "step": 279200 + }, + { + "epoch": 21.9, + "learning_rate": 0.0005, + "loss": 1.6829, + "step": 279300 + }, + { + "epoch": 21.91, + "learning_rate": 0.0005, + "loss": 1.6733, + "step": 279400 + }, + { + "epoch": 21.92, + "learning_rate": 0.0005, + "loss": 1.6732, + "step": 279500 + }, + { + "epoch": 21.93, + "learning_rate": 0.0005, + "loss": 1.6764, + "step": 279600 + }, + { + "epoch": 21.93, + "learning_rate": 0.0005, + "loss": 1.6588, + "step": 279700 + }, + { + "epoch": 21.94, + "learning_rate": 0.0005, + "loss": 1.6779, + "step": 279800 + }, + { + "epoch": 21.95, + "learning_rate": 0.0005, + "loss": 1.6869, + "step": 279900 + }, + { + "epoch": 21.96, + "learning_rate": 0.0005, + "loss": 1.7009, + "step": 280000 + }, + { + "epoch": 21.96, + "eval_gen_len": 18.80851063829787, + "eval_loss": 2.0156733989715576, + "eval_rouge1": 35.3219, + "eval_rouge2": 14.2115, + "eval_rougeL": 28.9797, + "eval_rougeLsum": 28.9688, + "eval_runtime": 361.1372, + "eval_samples_per_second": 31.365, + "eval_steps_per_second": 1.96, + "step": 280000 + }, + { + "epoch": 21.97, + "learning_rate": 0.0005, + "loss": 1.6889, + "step": 280100 + }, + { + "epoch": 21.97, + "learning_rate": 0.0005, + "loss": 1.6603, + "step": 280200 + }, + { + "epoch": 21.98, + "learning_rate": 0.0005, + "loss": 1.6909, + "step": 280300 + }, + { + "epoch": 21.99, + "learning_rate": 0.0005, + "loss": 1.7038, + "step": 280400 + }, + { + "epoch": 22.0, + "learning_rate": 0.0005, + "loss": 1.6765, + "step": 280500 + }, + { + "epoch": 22.0, + "learning_rate": 0.0005, + "loss": 1.6226, + "step": 280600 + }, + { + "epoch": 22.01, + "learning_rate": 0.0005, + "loss": 1.6016, + "step": 280700 + }, + { + "epoch": 22.02, + "learning_rate": 0.0005, + "loss": 1.6137, + "step": 280800 + }, + { + "epoch": 22.03, + "learning_rate": 0.0005, + "loss": 1.6311, + "step": 280900 + }, + { + "epoch": 22.04, + "learning_rate": 0.0005, + "loss": 1.6032, + "step": 281000 + }, + { + "epoch": 22.04, + "learning_rate": 0.0005, + "loss": 1.586, + "step": 281100 + }, + { + "epoch": 22.05, + "learning_rate": 0.0005, + "loss": 1.6228, + "step": 281200 + }, + { + "epoch": 22.06, + "learning_rate": 0.0005, + "loss": 1.5936, + "step": 281300 + }, + { + "epoch": 22.07, + "learning_rate": 0.0005, + "loss": 1.6262, + "step": 281400 + }, + { + "epoch": 22.07, + "learning_rate": 0.0005, + "loss": 1.6091, + "step": 281500 + }, + { + "epoch": 22.08, + "learning_rate": 0.0005, + "loss": 1.6138, + "step": 281600 + }, + { + "epoch": 22.09, + "learning_rate": 0.0005, + "loss": 1.6125, + "step": 281700 + }, + { + "epoch": 22.1, + "learning_rate": 0.0005, + "loss": 1.636, + "step": 281800 + }, + { + "epoch": 22.11, + "learning_rate": 0.0005, + "loss": 1.5914, + "step": 281900 + }, + { + "epoch": 22.11, + "learning_rate": 0.0005, + "loss": 1.6495, + "step": 282000 + }, + { + "epoch": 22.12, + "learning_rate": 0.0005, + "loss": 1.6053, + "step": 282100 + }, + { + "epoch": 22.13, + "learning_rate": 0.0005, + "loss": 1.6091, + "step": 282200 + }, + { + "epoch": 22.14, + "learning_rate": 0.0005, + "loss": 1.6329, + "step": 282300 + }, + { + "epoch": 22.15, + "learning_rate": 0.0005, + "loss": 1.6053, + "step": 282400 + }, + { + "epoch": 22.15, + "learning_rate": 0.0005, + "loss": 1.5926, + "step": 282500 + }, + { + "epoch": 22.16, + "learning_rate": 0.0005, + "loss": 1.5977, + "step": 282600 + }, + { + "epoch": 22.17, + "learning_rate": 0.0005, + "loss": 1.59, + "step": 282700 + }, + { + "epoch": 22.18, + "learning_rate": 0.0005, + "loss": 1.6346, + "step": 282800 + }, + { + "epoch": 22.18, + "learning_rate": 0.0005, + "loss": 1.5948, + "step": 282900 + }, + { + "epoch": 22.19, + "learning_rate": 0.0005, + "loss": 1.6221, + "step": 283000 + }, + { + "epoch": 22.2, + "learning_rate": 0.0005, + "loss": 1.6059, + "step": 283100 + }, + { + "epoch": 22.21, + "learning_rate": 0.0005, + "loss": 1.6255, + "step": 283200 + }, + { + "epoch": 22.22, + "learning_rate": 0.0005, + "loss": 1.6016, + "step": 283300 + }, + { + "epoch": 22.22, + "learning_rate": 0.0005, + "loss": 1.6287, + "step": 283400 + }, + { + "epoch": 22.23, + "learning_rate": 0.0005, + "loss": 1.6254, + "step": 283500 + }, + { + "epoch": 22.24, + "learning_rate": 0.0005, + "loss": 1.623, + "step": 283600 + }, + { + "epoch": 22.25, + "learning_rate": 0.0005, + "loss": 1.6471, + "step": 283700 + }, + { + "epoch": 22.26, + "learning_rate": 0.0005, + "loss": 1.6425, + "step": 283800 + }, + { + "epoch": 22.26, + "learning_rate": 0.0005, + "loss": 1.6553, + "step": 283900 + }, + { + "epoch": 22.27, + "learning_rate": 0.0005, + "loss": 1.6522, + "step": 284000 + }, + { + "epoch": 22.28, + "learning_rate": 0.0005, + "loss": 1.6446, + "step": 284100 + }, + { + "epoch": 22.29, + "learning_rate": 0.0005, + "loss": 1.6494, + "step": 284200 + }, + { + "epoch": 22.29, + "learning_rate": 0.0005, + "loss": 1.6355, + "step": 284300 + }, + { + "epoch": 22.3, + "learning_rate": 0.0005, + "loss": 1.612, + "step": 284400 + }, + { + "epoch": 22.31, + "learning_rate": 0.0005, + "loss": 1.6425, + "step": 284500 + }, + { + "epoch": 22.32, + "learning_rate": 0.0005, + "loss": 1.6426, + "step": 284600 + }, + { + "epoch": 22.33, + "learning_rate": 0.0005, + "loss": 1.6647, + "step": 284700 + }, + { + "epoch": 22.33, + "learning_rate": 0.0005, + "loss": 1.6259, + "step": 284800 + }, + { + "epoch": 22.34, + "learning_rate": 0.0005, + "loss": 1.6348, + "step": 284900 + }, + { + "epoch": 22.35, + "learning_rate": 0.0005, + "loss": 1.6271, + "step": 285000 + }, + { + "epoch": 22.36, + "learning_rate": 0.0005, + "loss": 1.6255, + "step": 285100 + }, + { + "epoch": 22.37, + "learning_rate": 0.0005, + "loss": 1.6273, + "step": 285200 + }, + { + "epoch": 22.37, + "learning_rate": 0.0005, + "loss": 1.6208, + "step": 285300 + }, + { + "epoch": 22.38, + "learning_rate": 0.0005, + "loss": 1.639, + "step": 285400 + }, + { + "epoch": 22.39, + "learning_rate": 0.0005, + "loss": 1.6401, + "step": 285500 + }, + { + "epoch": 22.4, + "learning_rate": 0.0005, + "loss": 1.631, + "step": 285600 + }, + { + "epoch": 22.4, + "learning_rate": 0.0005, + "loss": 1.6275, + "step": 285700 + }, + { + "epoch": 22.41, + "learning_rate": 0.0005, + "loss": 1.6289, + "step": 285800 + }, + { + "epoch": 22.42, + "learning_rate": 0.0005, + "loss": 1.6153, + "step": 285900 + }, + { + "epoch": 22.43, + "learning_rate": 0.0005, + "loss": 1.6571, + "step": 286000 + }, + { + "epoch": 22.44, + "learning_rate": 0.0005, + "loss": 1.6038, + "step": 286100 + }, + { + "epoch": 22.44, + "learning_rate": 0.0005, + "loss": 1.6535, + "step": 286200 + }, + { + "epoch": 22.45, + "learning_rate": 0.0005, + "loss": 1.6414, + "step": 286300 + }, + { + "epoch": 22.46, + "learning_rate": 0.0005, + "loss": 1.642, + "step": 286400 + }, + { + "epoch": 22.47, + "learning_rate": 0.0005, + "loss": 1.6574, + "step": 286500 + }, + { + "epoch": 22.47, + "learning_rate": 0.0005, + "loss": 1.612, + "step": 286600 + }, + { + "epoch": 22.48, + "learning_rate": 0.0005, + "loss": 1.6388, + "step": 286700 + }, + { + "epoch": 22.49, + "learning_rate": 0.0005, + "loss": 1.6577, + "step": 286800 + }, + { + "epoch": 22.5, + "learning_rate": 0.0005, + "loss": 1.6306, + "step": 286900 + }, + { + "epoch": 22.51, + "learning_rate": 0.0005, + "loss": 1.6493, + "step": 287000 + }, + { + "epoch": 22.51, + "learning_rate": 0.0005, + "loss": 1.6496, + "step": 287100 + }, + { + "epoch": 22.52, + "learning_rate": 0.0005, + "loss": 1.6655, + "step": 287200 + }, + { + "epoch": 22.53, + "learning_rate": 0.0005, + "loss": 1.6347, + "step": 287300 + }, + { + "epoch": 22.54, + "learning_rate": 0.0005, + "loss": 1.6628, + "step": 287400 + }, + { + "epoch": 22.55, + "learning_rate": 0.0005, + "loss": 1.6369, + "step": 287500 + }, + { + "epoch": 22.55, + "learning_rate": 0.0005, + "loss": 1.6818, + "step": 287600 + }, + { + "epoch": 22.56, + "learning_rate": 0.0005, + "loss": 1.6497, + "step": 287700 + }, + { + "epoch": 22.57, + "learning_rate": 0.0005, + "loss": 1.6406, + "step": 287800 + }, + { + "epoch": 22.58, + "learning_rate": 0.0005, + "loss": 1.6353, + "step": 287900 + }, + { + "epoch": 22.58, + "learning_rate": 0.0005, + "loss": 1.6637, + "step": 288000 + }, + { + "epoch": 22.59, + "learning_rate": 0.0005, + "loss": 1.6416, + "step": 288100 + }, + { + "epoch": 22.6, + "learning_rate": 0.0005, + "loss": 1.6283, + "step": 288200 + }, + { + "epoch": 22.61, + "learning_rate": 0.0005, + "loss": 1.667, + "step": 288300 + }, + { + "epoch": 22.62, + "learning_rate": 0.0005, + "loss": 1.6687, + "step": 288400 + }, + { + "epoch": 22.62, + "learning_rate": 0.0005, + "loss": 1.6424, + "step": 288500 + }, + { + "epoch": 22.63, + "learning_rate": 0.0005, + "loss": 1.661, + "step": 288600 + }, + { + "epoch": 22.64, + "learning_rate": 0.0005, + "loss": 1.6544, + "step": 288700 + }, + { + "epoch": 22.65, + "learning_rate": 0.0005, + "loss": 1.66, + "step": 288800 + }, + { + "epoch": 22.66, + "learning_rate": 0.0005, + "loss": 1.661, + "step": 288900 + }, + { + "epoch": 22.66, + "learning_rate": 0.0005, + "loss": 1.6509, + "step": 289000 + }, + { + "epoch": 22.67, + "learning_rate": 0.0005, + "loss": 1.6761, + "step": 289100 + }, + { + "epoch": 22.68, + "learning_rate": 0.0005, + "loss": 1.6578, + "step": 289200 + }, + { + "epoch": 22.69, + "learning_rate": 0.0005, + "loss": 1.6454, + "step": 289300 + }, + { + "epoch": 22.69, + "learning_rate": 0.0005, + "loss": 1.6606, + "step": 289400 + }, + { + "epoch": 22.7, + "learning_rate": 0.0005, + "loss": 1.6326, + "step": 289500 + }, + { + "epoch": 22.71, + "learning_rate": 0.0005, + "loss": 1.6758, + "step": 289600 + }, + { + "epoch": 22.72, + "learning_rate": 0.0005, + "loss": 1.6564, + "step": 289700 + }, + { + "epoch": 22.73, + "learning_rate": 0.0005, + "loss": 1.656, + "step": 289800 + }, + { + "epoch": 22.73, + "learning_rate": 0.0005, + "loss": 1.6915, + "step": 289900 + }, + { + "epoch": 22.74, + "learning_rate": 0.0005, + "loss": 1.6582, + "step": 290000 + }, + { + "epoch": 22.74, + "eval_gen_len": 18.805862099408493, + "eval_loss": 2.0187759399414062, + "eval_rouge1": 35.4028, + "eval_rouge2": 14.3182, + "eval_rougeL": 29.1245, + "eval_rougeLsum": 29.1239, + "eval_runtime": 358.1851, + "eval_samples_per_second": 31.623, + "eval_steps_per_second": 1.977, + "step": 290000 + }, + { + "epoch": 22.75, + "learning_rate": 0.0005, + "loss": 1.6384, + "step": 290100 + }, + { + "epoch": 22.76, + "learning_rate": 0.0005, + "loss": 1.6645, + "step": 290200 + }, + { + "epoch": 22.77, + "learning_rate": 0.0005, + "loss": 1.6506, + "step": 290300 + }, + { + "epoch": 22.77, + "learning_rate": 0.0005, + "loss": 1.6602, + "step": 290400 + }, + { + "epoch": 22.78, + "learning_rate": 0.0005, + "loss": 1.6624, + "step": 290500 + }, + { + "epoch": 22.79, + "learning_rate": 0.0005, + "loss": 1.661, + "step": 290600 + }, + { + "epoch": 22.8, + "learning_rate": 0.0005, + "loss": 1.659, + "step": 290700 + }, + { + "epoch": 22.8, + "learning_rate": 0.0005, + "loss": 1.7059, + "step": 290800 + }, + { + "epoch": 22.81, + "learning_rate": 0.0005, + "loss": 1.6346, + "step": 290900 + }, + { + "epoch": 22.82, + "learning_rate": 0.0005, + "loss": 1.6551, + "step": 291000 + }, + { + "epoch": 22.83, + "learning_rate": 0.0005, + "loss": 1.6512, + "step": 291100 + }, + { + "epoch": 22.84, + "learning_rate": 0.0005, + "loss": 1.6733, + "step": 291200 + }, + { + "epoch": 22.84, + "learning_rate": 0.0005, + "loss": 1.6672, + "step": 291300 + }, + { + "epoch": 22.85, + "learning_rate": 0.0005, + "loss": 1.6735, + "step": 291400 + }, + { + "epoch": 22.86, + "learning_rate": 0.0005, + "loss": 1.6639, + "step": 291500 + }, + { + "epoch": 22.87, + "learning_rate": 0.0005, + "loss": 1.6709, + "step": 291600 + }, + { + "epoch": 22.87, + "learning_rate": 0.0005, + "loss": 1.6485, + "step": 291700 + }, + { + "epoch": 22.88, + "learning_rate": 0.0005, + "loss": 1.6701, + "step": 291800 + }, + { + "epoch": 22.89, + "learning_rate": 0.0005, + "loss": 1.6622, + "step": 291900 + }, + { + "epoch": 22.9, + "learning_rate": 0.0005, + "loss": 1.6651, + "step": 292000 + }, + { + "epoch": 22.91, + "learning_rate": 0.0005, + "loss": 1.6575, + "step": 292100 + }, + { + "epoch": 22.91, + "learning_rate": 0.0005, + "loss": 1.661, + "step": 292200 + }, + { + "epoch": 22.92, + "learning_rate": 0.0005, + "loss": 1.6728, + "step": 292300 + }, + { + "epoch": 22.93, + "learning_rate": 0.0005, + "loss": 1.6599, + "step": 292400 + }, + { + "epoch": 22.94, + "learning_rate": 0.0005, + "loss": 1.6628, + "step": 292500 + }, + { + "epoch": 22.95, + "learning_rate": 0.0005, + "loss": 1.6632, + "step": 292600 + }, + { + "epoch": 22.95, + "learning_rate": 0.0005, + "loss": 1.642, + "step": 292700 + }, + { + "epoch": 22.96, + "learning_rate": 0.0005, + "loss": 1.6767, + "step": 292800 + }, + { + "epoch": 22.97, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 292900 + }, + { + "epoch": 22.98, + "learning_rate": 0.0005, + "loss": 1.6488, + "step": 293000 + }, + { + "epoch": 22.98, + "learning_rate": 0.0005, + "loss": 1.6915, + "step": 293100 + }, + { + "epoch": 22.99, + "learning_rate": 0.0005, + "loss": 1.6965, + "step": 293200 + }, + { + "epoch": 23.0, + "learning_rate": 0.0005, + "loss": 1.6508, + "step": 293300 + }, + { + "epoch": 23.01, + "learning_rate": 0.0005, + "loss": 1.5917, + "step": 293400 + }, + { + "epoch": 23.02, + "learning_rate": 0.0005, + "loss": 1.5818, + "step": 293500 + }, + { + "epoch": 23.02, + "learning_rate": 0.0005, + "loss": 1.6064, + "step": 293600 + }, + { + "epoch": 23.03, + "learning_rate": 0.0005, + "loss": 1.6003, + "step": 293700 + }, + { + "epoch": 23.04, + "learning_rate": 0.0005, + "loss": 1.5817, + "step": 293800 + }, + { + "epoch": 23.05, + "learning_rate": 0.0005, + "loss": 1.5865, + "step": 293900 + }, + { + "epoch": 23.06, + "learning_rate": 0.0005, + "loss": 1.5629, + "step": 294000 + }, + { + "epoch": 23.06, + "learning_rate": 0.0005, + "loss": 1.5881, + "step": 294100 + }, + { + "epoch": 23.07, + "learning_rate": 0.0005, + "loss": 1.5776, + "step": 294200 + }, + { + "epoch": 23.08, + "learning_rate": 0.0005, + "loss": 1.59, + "step": 294300 + }, + { + "epoch": 23.09, + "learning_rate": 0.0005, + "loss": 1.5971, + "step": 294400 + }, + { + "epoch": 23.09, + "learning_rate": 0.0005, + "loss": 1.6124, + "step": 294500 + }, + { + "epoch": 23.1, + "learning_rate": 0.0005, + "loss": 1.6203, + "step": 294600 + }, + { + "epoch": 23.11, + "learning_rate": 0.0005, + "loss": 1.574, + "step": 294700 + }, + { + "epoch": 23.12, + "learning_rate": 0.0005, + "loss": 1.6149, + "step": 294800 + }, + { + "epoch": 23.13, + "learning_rate": 0.0005, + "loss": 1.6092, + "step": 294900 + }, + { + "epoch": 23.13, + "learning_rate": 0.0005, + "loss": 1.5889, + "step": 295000 + }, + { + "epoch": 23.14, + "learning_rate": 0.0005, + "loss": 1.5929, + "step": 295100 + }, + { + "epoch": 23.15, + "learning_rate": 0.0005, + "loss": 1.6198, + "step": 295200 + }, + { + "epoch": 23.16, + "learning_rate": 0.0005, + "loss": 1.5972, + "step": 295300 + }, + { + "epoch": 23.16, + "learning_rate": 0.0005, + "loss": 1.5884, + "step": 295400 + }, + { + "epoch": 23.17, + "learning_rate": 0.0005, + "loss": 1.5839, + "step": 295500 + }, + { + "epoch": 23.18, + "learning_rate": 0.0005, + "loss": 1.6108, + "step": 295600 + }, + { + "epoch": 23.19, + "learning_rate": 0.0005, + "loss": 1.5895, + "step": 295700 + }, + { + "epoch": 23.2, + "learning_rate": 0.0005, + "loss": 1.5911, + "step": 295800 + }, + { + "epoch": 23.2, + "learning_rate": 0.0005, + "loss": 1.5722, + "step": 295900 + }, + { + "epoch": 23.21, + "learning_rate": 0.0005, + "loss": 1.6111, + "step": 296000 + }, + { + "epoch": 23.22, + "learning_rate": 0.0005, + "loss": 1.6076, + "step": 296100 + }, + { + "epoch": 23.23, + "learning_rate": 0.0005, + "loss": 1.6228, + "step": 296200 + }, + { + "epoch": 23.24, + "learning_rate": 0.0005, + "loss": 1.5836, + "step": 296300 + }, + { + "epoch": 23.24, + "learning_rate": 0.0005, + "loss": 1.616, + "step": 296400 + }, + { + "epoch": 23.25, + "learning_rate": 0.0005, + "loss": 1.6038, + "step": 296500 + }, + { + "epoch": 23.26, + "learning_rate": 0.0005, + "loss": 1.5998, + "step": 296600 + }, + { + "epoch": 23.27, + "learning_rate": 0.0005, + "loss": 1.6057, + "step": 296700 + }, + { + "epoch": 23.27, + "learning_rate": 0.0005, + "loss": 1.6069, + "step": 296800 + }, + { + "epoch": 23.28, + "learning_rate": 0.0005, + "loss": 1.6165, + "step": 296900 + }, + { + "epoch": 23.29, + "learning_rate": 0.0005, + "loss": 1.6075, + "step": 297000 + }, + { + "epoch": 23.3, + "learning_rate": 0.0005, + "loss": 1.6274, + "step": 297100 + }, + { + "epoch": 23.31, + "learning_rate": 0.0005, + "loss": 1.6295, + "step": 297200 + }, + { + "epoch": 23.31, + "learning_rate": 0.0005, + "loss": 1.6176, + "step": 297300 + }, + { + "epoch": 23.32, + "learning_rate": 0.0005, + "loss": 1.6193, + "step": 297400 + }, + { + "epoch": 23.33, + "learning_rate": 0.0005, + "loss": 1.6231, + "step": 297500 + }, + { + "epoch": 23.34, + "learning_rate": 0.0005, + "loss": 1.6155, + "step": 297600 + }, + { + "epoch": 23.35, + "learning_rate": 0.0005, + "loss": 1.6239, + "step": 297700 + }, + { + "epoch": 23.35, + "learning_rate": 0.0005, + "loss": 1.6217, + "step": 297800 + }, + { + "epoch": 23.36, + "learning_rate": 0.0005, + "loss": 1.5893, + "step": 297900 + }, + { + "epoch": 23.37, + "learning_rate": 0.0005, + "loss": 1.632, + "step": 298000 + }, + { + "epoch": 23.38, + "learning_rate": 0.0005, + "loss": 1.614, + "step": 298100 + }, + { + "epoch": 23.38, + "learning_rate": 0.0005, + "loss": 1.604, + "step": 298200 + }, + { + "epoch": 23.39, + "learning_rate": 0.0005, + "loss": 1.6348, + "step": 298300 + }, + { + "epoch": 23.4, + "learning_rate": 0.0005, + "loss": 1.6524, + "step": 298400 + }, + { + "epoch": 23.41, + "learning_rate": 0.0005, + "loss": 1.6346, + "step": 298500 + }, + { + "epoch": 23.42, + "learning_rate": 0.0005, + "loss": 1.6335, + "step": 298600 + }, + { + "epoch": 23.42, + "learning_rate": 0.0005, + "loss": 1.6356, + "step": 298700 + }, + { + "epoch": 23.43, + "learning_rate": 0.0005, + "loss": 1.6304, + "step": 298800 + }, + { + "epoch": 23.44, + "learning_rate": 0.0005, + "loss": 1.6099, + "step": 298900 + }, + { + "epoch": 23.45, + "learning_rate": 0.0005, + "loss": 1.6269, + "step": 299000 + }, + { + "epoch": 23.46, + "learning_rate": 0.0005, + "loss": 1.6192, + "step": 299100 + }, + { + "epoch": 23.46, + "learning_rate": 0.0005, + "loss": 1.6656, + "step": 299200 + }, + { + "epoch": 23.47, + "learning_rate": 0.0005, + "loss": 1.6019, + "step": 299300 + }, + { + "epoch": 23.48, + "learning_rate": 0.0005, + "loss": 1.6528, + "step": 299400 + }, + { + "epoch": 23.49, + "learning_rate": 0.0005, + "loss": 1.6116, + "step": 299500 + }, + { + "epoch": 23.49, + "learning_rate": 0.0005, + "loss": 1.6486, + "step": 299600 + }, + { + "epoch": 23.5, + "learning_rate": 0.0005, + "loss": 1.6194, + "step": 299700 + }, + { + "epoch": 23.51, + "learning_rate": 0.0005, + "loss": 1.6463, + "step": 299800 + }, + { + "epoch": 23.52, + "learning_rate": 0.0005, + "loss": 1.628, + "step": 299900 + }, + { + "epoch": 23.53, + "learning_rate": 0.0005, + "loss": 1.6391, + "step": 300000 + }, + { + "epoch": 23.53, + "eval_gen_len": 18.781583826255847, + "eval_loss": 2.024906635284424, + "eval_rouge1": 35.371, + "eval_rouge2": 14.234, + "eval_rougeL": 29.0754, + "eval_rougeLsum": 29.072, + "eval_runtime": 361.8269, + "eval_samples_per_second": 31.305, + "eval_steps_per_second": 1.957, + "step": 300000 + }, + { + "epoch": 23.53, + "learning_rate": 0.0005, + "loss": 1.666, + "step": 300100 + }, + { + "epoch": 23.54, + "learning_rate": 0.0005, + "loss": 1.6385, + "step": 300200 + }, + { + "epoch": 23.55, + "learning_rate": 0.0005, + "loss": 1.6464, + "step": 300300 + }, + { + "epoch": 23.56, + "learning_rate": 0.0005, + "loss": 1.6211, + "step": 300400 + }, + { + "epoch": 23.56, + "learning_rate": 0.0005, + "loss": 1.6403, + "step": 300500 + }, + { + "epoch": 23.57, + "learning_rate": 0.0005, + "loss": 1.6051, + "step": 300600 + }, + { + "epoch": 23.58, + "learning_rate": 0.0005, + "loss": 1.6281, + "step": 300700 + }, + { + "epoch": 23.59, + "learning_rate": 0.0005, + "loss": 1.628, + "step": 300800 + }, + { + "epoch": 23.6, + "learning_rate": 0.0005, + "loss": 1.6414, + "step": 300900 + }, + { + "epoch": 23.6, + "learning_rate": 0.0005, + "loss": 1.6388, + "step": 301000 + }, + { + "epoch": 23.61, + "learning_rate": 0.0005, + "loss": 1.6349, + "step": 301100 + }, + { + "epoch": 23.62, + "learning_rate": 0.0005, + "loss": 1.6436, + "step": 301200 + }, + { + "epoch": 23.63, + "learning_rate": 0.0005, + "loss": 1.6358, + "step": 301300 + }, + { + "epoch": 23.64, + "learning_rate": 0.0005, + "loss": 1.6356, + "step": 301400 + }, + { + "epoch": 23.64, + "learning_rate": 0.0005, + "loss": 1.6494, + "step": 301500 + }, + { + "epoch": 23.65, + "learning_rate": 0.0005, + "loss": 1.6575, + "step": 301600 + }, + { + "epoch": 23.66, + "learning_rate": 0.0005, + "loss": 1.6476, + "step": 301700 + }, + { + "epoch": 23.67, + "learning_rate": 0.0005, + "loss": 1.6487, + "step": 301800 + }, + { + "epoch": 23.67, + "learning_rate": 0.0005, + "loss": 1.6258, + "step": 301900 + }, + { + "epoch": 23.68, + "learning_rate": 0.0005, + "loss": 1.6489, + "step": 302000 + }, + { + "epoch": 23.69, + "learning_rate": 0.0005, + "loss": 1.6275, + "step": 302100 + }, + { + "epoch": 23.7, + "learning_rate": 0.0005, + "loss": 1.6157, + "step": 302200 + }, + { + "epoch": 23.71, + "learning_rate": 0.0005, + "loss": 1.6603, + "step": 302300 + }, + { + "epoch": 23.71, + "learning_rate": 0.0005, + "loss": 1.6366, + "step": 302400 + }, + { + "epoch": 23.72, + "learning_rate": 0.0005, + "loss": 1.648, + "step": 302500 + }, + { + "epoch": 23.73, + "learning_rate": 0.0005, + "loss": 1.6612, + "step": 302600 + }, + { + "epoch": 23.74, + "learning_rate": 0.0005, + "loss": 1.6281, + "step": 302700 + }, + { + "epoch": 23.75, + "learning_rate": 0.0005, + "loss": 1.6699, + "step": 302800 + }, + { + "epoch": 23.75, + "learning_rate": 0.0005, + "loss": 1.6539, + "step": 302900 + }, + { + "epoch": 23.76, + "learning_rate": 0.0005, + "loss": 1.6383, + "step": 303000 + }, + { + "epoch": 23.77, + "learning_rate": 0.0005, + "loss": 1.6336, + "step": 303100 + }, + { + "epoch": 23.78, + "learning_rate": 0.0005, + "loss": 1.6608, + "step": 303200 + }, + { + "epoch": 23.78, + "learning_rate": 0.0005, + "loss": 1.6244, + "step": 303300 + }, + { + "epoch": 23.79, + "learning_rate": 0.0005, + "loss": 1.6521, + "step": 303400 + }, + { + "epoch": 23.8, + "learning_rate": 0.0005, + "loss": 1.6382, + "step": 303500 + }, + { + "epoch": 23.81, + "learning_rate": 0.0005, + "loss": 1.6328, + "step": 303600 + }, + { + "epoch": 23.82, + "learning_rate": 0.0005, + "loss": 1.6603, + "step": 303700 + }, + { + "epoch": 23.82, + "learning_rate": 0.0005, + "loss": 1.6212, + "step": 303800 + }, + { + "epoch": 23.83, + "learning_rate": 0.0005, + "loss": 1.6531, + "step": 303900 + }, + { + "epoch": 23.84, + "learning_rate": 0.0005, + "loss": 1.6388, + "step": 304000 + }, + { + "epoch": 23.85, + "learning_rate": 0.0005, + "loss": 1.65, + "step": 304100 + }, + { + "epoch": 23.86, + "learning_rate": 0.0005, + "loss": 1.6484, + "step": 304200 + }, + { + "epoch": 23.86, + "learning_rate": 0.0005, + "loss": 1.6867, + "step": 304300 + }, + { + "epoch": 23.87, + "learning_rate": 0.0005, + "loss": 1.6901, + "step": 304400 + }, + { + "epoch": 23.88, + "learning_rate": 0.0005, + "loss": 1.6416, + "step": 304500 + }, + { + "epoch": 23.89, + "learning_rate": 0.0005, + "loss": 1.6623, + "step": 304600 + }, + { + "epoch": 23.89, + "learning_rate": 0.0005, + "loss": 1.651, + "step": 304700 + }, + { + "epoch": 23.9, + "learning_rate": 0.0005, + "loss": 1.6387, + "step": 304800 + }, + { + "epoch": 23.91, + "learning_rate": 0.0005, + "loss": 1.6614, + "step": 304900 + }, + { + "epoch": 23.92, + "learning_rate": 0.0005, + "loss": 1.6405, + "step": 305000 + }, + { + "epoch": 23.93, + "learning_rate": 0.0005, + "loss": 1.6339, + "step": 305100 + }, + { + "epoch": 23.93, + "learning_rate": 0.0005, + "loss": 1.6873, + "step": 305200 + }, + { + "epoch": 23.94, + "learning_rate": 0.0005, + "loss": 1.6774, + "step": 305300 + }, + { + "epoch": 23.95, + "learning_rate": 0.0005, + "loss": 1.6678, + "step": 305400 + }, + { + "epoch": 23.96, + "learning_rate": 0.0005, + "loss": 1.6646, + "step": 305500 + }, + { + "epoch": 23.96, + "learning_rate": 0.0005, + "loss": 1.6671, + "step": 305600 + }, + { + "epoch": 23.97, + "learning_rate": 0.0005, + "loss": 1.6392, + "step": 305700 + }, + { + "epoch": 23.98, + "learning_rate": 0.0005, + "loss": 1.6524, + "step": 305800 + }, + { + "epoch": 23.99, + "learning_rate": 0.0005, + "loss": 1.6699, + "step": 305900 + }, + { + "epoch": 24.0, + "learning_rate": 0.0005, + "loss": 1.6702, + "step": 306000 + }, + { + "epoch": 24.0, + "learning_rate": 0.0005, + "loss": 1.6122, + "step": 306100 + }, + { + "epoch": 24.01, + "learning_rate": 0.0005, + "loss": 1.5641, + "step": 306200 + }, + { + "epoch": 24.02, + "learning_rate": 0.0005, + "loss": 1.5877, + "step": 306300 + }, + { + "epoch": 24.03, + "learning_rate": 0.0005, + "loss": 1.5558, + "step": 306400 + }, + { + "epoch": 24.04, + "learning_rate": 0.0005, + "loss": 1.5677, + "step": 306500 + }, + { + "epoch": 24.04, + "learning_rate": 0.0005, + "loss": 1.5845, + "step": 306600 + }, + { + "epoch": 24.05, + "learning_rate": 0.0005, + "loss": 1.6126, + "step": 306700 + }, + { + "epoch": 24.06, + "learning_rate": 0.0005, + "loss": 1.5886, + "step": 306800 + }, + { + "epoch": 24.07, + "learning_rate": 0.0005, + "loss": 1.5871, + "step": 306900 + }, + { + "epoch": 24.07, + "learning_rate": 0.0005, + "loss": 1.5685, + "step": 307000 + }, + { + "epoch": 24.08, + "learning_rate": 0.0005, + "loss": 1.5822, + "step": 307100 + }, + { + "epoch": 24.09, + "learning_rate": 0.0005, + "loss": 1.5561, + "step": 307200 + }, + { + "epoch": 24.1, + "learning_rate": 0.0005, + "loss": 1.5736, + "step": 307300 + }, + { + "epoch": 24.11, + "learning_rate": 0.0005, + "loss": 1.6063, + "step": 307400 + }, + { + "epoch": 24.11, + "learning_rate": 0.0005, + "loss": 1.5786, + "step": 307500 + }, + { + "epoch": 24.12, + "learning_rate": 0.0005, + "loss": 1.5952, + "step": 307600 + }, + { + "epoch": 24.13, + "learning_rate": 0.0005, + "loss": 1.5937, + "step": 307700 + }, + { + "epoch": 24.14, + "learning_rate": 0.0005, + "loss": 1.5623, + "step": 307800 + }, + { + "epoch": 24.15, + "learning_rate": 0.0005, + "loss": 1.5925, + "step": 307900 + }, + { + "epoch": 24.15, + "learning_rate": 0.0005, + "loss": 1.5756, + "step": 308000 + }, + { + "epoch": 24.16, + "learning_rate": 0.0005, + "loss": 1.5649, + "step": 308100 + }, + { + "epoch": 24.17, + "learning_rate": 0.0005, + "loss": 1.5827, + "step": 308200 + }, + { + "epoch": 24.18, + "learning_rate": 0.0005, + "loss": 1.5905, + "step": 308300 + }, + { + "epoch": 24.18, + "learning_rate": 0.0005, + "loss": 1.6098, + "step": 308400 + }, + { + "epoch": 24.19, + "learning_rate": 0.0005, + "loss": 1.5813, + "step": 308500 + }, + { + "epoch": 24.2, + "learning_rate": 0.0005, + "loss": 1.5959, + "step": 308600 + }, + { + "epoch": 24.21, + "learning_rate": 0.0005, + "loss": 1.602, + "step": 308700 + }, + { + "epoch": 24.22, + "learning_rate": 0.0005, + "loss": 1.5968, + "step": 308800 + }, + { + "epoch": 24.22, + "learning_rate": 0.0005, + "loss": 1.5857, + "step": 308900 + }, + { + "epoch": 24.23, + "learning_rate": 0.0005, + "loss": 1.5816, + "step": 309000 + }, + { + "epoch": 24.24, + "learning_rate": 0.0005, + "loss": 1.6208, + "step": 309100 + }, + { + "epoch": 24.25, + "learning_rate": 0.0005, + "loss": 1.6071, + "step": 309200 + }, + { + "epoch": 24.26, + "learning_rate": 0.0005, + "loss": 1.6001, + "step": 309300 + }, + { + "epoch": 24.26, + "learning_rate": 0.0005, + "loss": 1.5837, + "step": 309400 + }, + { + "epoch": 24.27, + "learning_rate": 0.0005, + "loss": 1.5884, + "step": 309500 + }, + { + "epoch": 24.28, + "learning_rate": 0.0005, + "loss": 1.6021, + "step": 309600 + }, + { + "epoch": 24.29, + "learning_rate": 0.0005, + "loss": 1.6263, + "step": 309700 + }, + { + "epoch": 24.29, + "learning_rate": 0.0005, + "loss": 1.5973, + "step": 309800 + }, + { + "epoch": 24.3, + "learning_rate": 0.0005, + "loss": 1.61, + "step": 309900 + }, + { + "epoch": 24.31, + "learning_rate": 0.0005, + "loss": 1.6117, + "step": 310000 + }, + { + "epoch": 24.31, + "eval_gen_len": 18.821841617374414, + "eval_loss": 2.028640031814575, + "eval_rouge1": 35.3532, + "eval_rouge2": 14.2979, + "eval_rougeL": 29.0784, + "eval_rougeLsum": 29.0753, + "eval_runtime": 359.726, + "eval_samples_per_second": 31.488, + "eval_steps_per_second": 1.968, + "step": 310000 + }, + { + "epoch": 24.32, + "learning_rate": 0.0005, + "loss": 1.6213, + "step": 310100 + }, + { + "epoch": 24.33, + "learning_rate": 0.0005, + "loss": 1.6144, + "step": 310200 + }, + { + "epoch": 24.33, + "learning_rate": 0.0005, + "loss": 1.6167, + "step": 310300 + }, + { + "epoch": 24.34, + "learning_rate": 0.0005, + "loss": 1.5916, + "step": 310400 + }, + { + "epoch": 24.35, + "learning_rate": 0.0005, + "loss": 1.6089, + "step": 310500 + }, + { + "epoch": 24.36, + "learning_rate": 0.0005, + "loss": 1.6133, + "step": 310600 + }, + { + "epoch": 24.36, + "learning_rate": 0.0005, + "loss": 1.5997, + "step": 310700 + }, + { + "epoch": 24.37, + "learning_rate": 0.0005, + "loss": 1.5821, + "step": 310800 + }, + { + "epoch": 24.38, + "learning_rate": 0.0005, + "loss": 1.5782, + "step": 310900 + }, + { + "epoch": 24.39, + "learning_rate": 0.0005, + "loss": 1.6115, + "step": 311000 + }, + { + "epoch": 24.4, + "learning_rate": 0.0005, + "loss": 1.586, + "step": 311100 + }, + { + "epoch": 24.4, + "learning_rate": 0.0005, + "loss": 1.608, + "step": 311200 + }, + { + "epoch": 24.41, + "learning_rate": 0.0005, + "loss": 1.5944, + "step": 311300 + }, + { + "epoch": 24.42, + "learning_rate": 0.0005, + "loss": 1.6191, + "step": 311400 + }, + { + "epoch": 24.43, + "learning_rate": 0.0005, + "loss": 1.5982, + "step": 311500 + }, + { + "epoch": 24.44, + "learning_rate": 0.0005, + "loss": 1.6038, + "step": 311600 + }, + { + "epoch": 24.44, + "learning_rate": 0.0005, + "loss": 1.6403, + "step": 311700 + }, + { + "epoch": 24.45, + "learning_rate": 0.0005, + "loss": 1.6233, + "step": 311800 + }, + { + "epoch": 24.46, + "learning_rate": 0.0005, + "loss": 1.6071, + "step": 311900 + }, + { + "epoch": 24.47, + "learning_rate": 0.0005, + "loss": 1.6255, + "step": 312000 + }, + { + "epoch": 24.47, + "learning_rate": 0.0005, + "loss": 1.6234, + "step": 312100 + }, + { + "epoch": 24.48, + "learning_rate": 0.0005, + "loss": 1.6308, + "step": 312200 + }, + { + "epoch": 24.49, + "learning_rate": 0.0005, + "loss": 1.6172, + "step": 312300 + }, + { + "epoch": 24.5, + "learning_rate": 0.0005, + "loss": 1.6077, + "step": 312400 + }, + { + "epoch": 24.51, + "learning_rate": 0.0005, + "loss": 1.6378, + "step": 312500 + }, + { + "epoch": 24.51, + "learning_rate": 0.0005, + "loss": 1.6038, + "step": 312600 + }, + { + "epoch": 24.52, + "learning_rate": 0.0005, + "loss": 1.6411, + "step": 312700 + }, + { + "epoch": 24.53, + "learning_rate": 0.0005, + "loss": 1.6063, + "step": 312800 + }, + { + "epoch": 24.54, + "learning_rate": 0.0005, + "loss": 1.638, + "step": 312900 + }, + { + "epoch": 24.55, + "learning_rate": 0.0005, + "loss": 1.6137, + "step": 313000 + }, + { + "epoch": 24.55, + "learning_rate": 0.0005, + "loss": 1.6173, + "step": 313100 + }, + { + "epoch": 24.56, + "learning_rate": 0.0005, + "loss": 1.5994, + "step": 313200 + }, + { + "epoch": 24.57, + "learning_rate": 0.0005, + "loss": 1.6321, + "step": 313300 + }, + { + "epoch": 24.58, + "learning_rate": 0.0005, + "loss": 1.6248, + "step": 313400 + }, + { + "epoch": 24.58, + "learning_rate": 0.0005, + "loss": 1.622, + "step": 313500 + }, + { + "epoch": 24.59, + "learning_rate": 0.0005, + "loss": 1.623, + "step": 313600 + }, + { + "epoch": 24.6, + "learning_rate": 0.0005, + "loss": 1.6185, + "step": 313700 + }, + { + "epoch": 24.61, + "learning_rate": 0.0005, + "loss": 1.6278, + "step": 313800 + }, + { + "epoch": 24.62, + "learning_rate": 0.0005, + "loss": 1.6302, + "step": 313900 + }, + { + "epoch": 24.62, + "learning_rate": 0.0005, + "loss": 1.5845, + "step": 314000 + }, + { + "epoch": 24.63, + "learning_rate": 0.0005, + "loss": 1.6256, + "step": 314100 + }, + { + "epoch": 24.64, + "learning_rate": 0.0005, + "loss": 1.6112, + "step": 314200 + }, + { + "epoch": 24.65, + "learning_rate": 0.0005, + "loss": 1.6205, + "step": 314300 + }, + { + "epoch": 24.65, + "learning_rate": 0.0005, + "loss": 1.6244, + "step": 314400 + }, + { + "epoch": 24.66, + "learning_rate": 0.0005, + "loss": 1.6834, + "step": 314500 + }, + { + "epoch": 24.67, + "learning_rate": 0.0005, + "loss": 1.6155, + "step": 314600 + }, + { + "epoch": 24.68, + "learning_rate": 0.0005, + "loss": 1.6176, + "step": 314700 + }, + { + "epoch": 24.69, + "learning_rate": 0.0005, + "loss": 1.6207, + "step": 314800 + }, + { + "epoch": 24.69, + "learning_rate": 0.0005, + "loss": 1.6335, + "step": 314900 + }, + { + "epoch": 24.7, + "learning_rate": 0.0005, + "loss": 1.6422, + "step": 315000 + }, + { + "epoch": 24.71, + "learning_rate": 0.0005, + "loss": 1.612, + "step": 315100 + }, + { + "epoch": 24.72, + "learning_rate": 0.0005, + "loss": 1.6276, + "step": 315200 + }, + { + "epoch": 24.73, + "learning_rate": 0.0005, + "loss": 1.6261, + "step": 315300 + }, + { + "epoch": 24.73, + "learning_rate": 0.0005, + "loss": 1.6383, + "step": 315400 + }, + { + "epoch": 24.74, + "learning_rate": 0.0005, + "loss": 1.6474, + "step": 315500 + }, + { + "epoch": 24.75, + "learning_rate": 0.0005, + "loss": 1.6421, + "step": 315600 + }, + { + "epoch": 24.76, + "learning_rate": 0.0005, + "loss": 1.6137, + "step": 315700 + }, + { + "epoch": 24.76, + "learning_rate": 0.0005, + "loss": 1.6436, + "step": 315800 + }, + { + "epoch": 24.77, + "learning_rate": 0.0005, + "loss": 1.6318, + "step": 315900 + }, + { + "epoch": 24.78, + "learning_rate": 0.0005, + "loss": 1.6004, + "step": 316000 + }, + { + "epoch": 24.79, + "learning_rate": 0.0005, + "loss": 1.6413, + "step": 316100 + }, + { + "epoch": 24.8, + "learning_rate": 0.0005, + "loss": 1.5908, + "step": 316200 + }, + { + "epoch": 24.8, + "learning_rate": 0.0005, + "loss": 1.6252, + "step": 316300 + }, + { + "epoch": 24.81, + "learning_rate": 0.0005, + "loss": 1.6307, + "step": 316400 + }, + { + "epoch": 24.82, + "learning_rate": 0.0005, + "loss": 1.6039, + "step": 316500 + }, + { + "epoch": 24.83, + "learning_rate": 0.0005, + "loss": 1.635, + "step": 316600 + }, + { + "epoch": 24.84, + "learning_rate": 0.0005, + "loss": 1.6595, + "step": 316700 + }, + { + "epoch": 24.84, + "learning_rate": 0.0005, + "loss": 1.6373, + "step": 316800 + }, + { + "epoch": 24.85, + "learning_rate": 0.0005, + "loss": 1.6513, + "step": 316900 + }, + { + "epoch": 24.86, + "learning_rate": 0.0005, + "loss": 1.6329, + "step": 317000 + }, + { + "epoch": 24.87, + "learning_rate": 0.0005, + "loss": 1.6245, + "step": 317100 + }, + { + "epoch": 24.87, + "learning_rate": 0.0005, + "loss": 1.6224, + "step": 317200 + }, + { + "epoch": 24.88, + "learning_rate": 0.0005, + "loss": 1.6333, + "step": 317300 + }, + { + "epoch": 24.89, + "learning_rate": 0.0005, + "loss": 1.6248, + "step": 317400 + }, + { + "epoch": 24.9, + "learning_rate": 0.0005, + "loss": 1.6371, + "step": 317500 + }, + { + "epoch": 24.91, + "learning_rate": 0.0005, + "loss": 1.6373, + "step": 317600 + }, + { + "epoch": 24.91, + "learning_rate": 0.0005, + "loss": 1.6378, + "step": 317700 + }, + { + "epoch": 24.92, + "learning_rate": 0.0005, + "loss": 1.6357, + "step": 317800 + }, + { + "epoch": 24.93, + "learning_rate": 0.0005, + "loss": 1.6325, + "step": 317900 + }, + { + "epoch": 24.94, + "learning_rate": 0.0005, + "loss": 1.6113, + "step": 318000 + }, + { + "epoch": 24.95, + "learning_rate": 0.0005, + "loss": 1.6174, + "step": 318100 + }, + { + "epoch": 24.95, + "learning_rate": 0.0005, + "loss": 1.6451, + "step": 318200 + }, + { + "epoch": 24.96, + "learning_rate": 0.0005, + "loss": 1.6558, + "step": 318300 + }, + { + "epoch": 24.97, + "learning_rate": 0.0005, + "loss": 1.662, + "step": 318400 + }, + { + "epoch": 24.98, + "learning_rate": 0.0005, + "loss": 1.6724, + "step": 318500 + }, + { + "epoch": 24.98, + "learning_rate": 0.0005, + "loss": 1.6367, + "step": 318600 + }, + { + "epoch": 24.99, + "learning_rate": 0.0005, + "loss": 1.6426, + "step": 318700 + }, + { + "epoch": 25.0, + "learning_rate": 0.0005, + "loss": 1.6336, + "step": 318800 + }, + { + "epoch": 25.01, + "learning_rate": 0.0005, + "loss": 1.5282, + "step": 318900 + }, + { + "epoch": 25.02, + "learning_rate": 0.0005, + "loss": 1.5774, + "step": 319000 + }, + { + "epoch": 25.02, + "learning_rate": 0.0005, + "loss": 1.5832, + "step": 319100 + }, + { + "epoch": 25.03, + "learning_rate": 0.0005, + "loss": 1.5514, + "step": 319200 + }, + { + "epoch": 25.04, + "learning_rate": 0.0005, + "loss": 1.5626, + "step": 319300 + }, + { + "epoch": 25.05, + "learning_rate": 0.0005, + "loss": 1.5512, + "step": 319400 + }, + { + "epoch": 25.05, + "learning_rate": 0.0005, + "loss": 1.5384, + "step": 319500 + }, + { + "epoch": 25.06, + "learning_rate": 0.0005, + "loss": 1.5862, + "step": 319600 + }, + { + "epoch": 25.07, + "learning_rate": 0.0005, + "loss": 1.5757, + "step": 319700 + }, + { + "epoch": 25.08, + "learning_rate": 0.0005, + "loss": 1.5636, + "step": 319800 + }, + { + "epoch": 25.09, + "learning_rate": 0.0005, + "loss": 1.5657, + "step": 319900 + }, + { + "epoch": 25.09, + "learning_rate": 0.0005, + "loss": 1.5626, + "step": 320000 + }, + { + "epoch": 25.09, + "eval_gen_len": 18.80259556811159, + "eval_loss": 2.050901174545288, + "eval_rouge1": 35.454, + "eval_rouge2": 14.3262, + "eval_rougeL": 29.1638, + "eval_rougeLsum": 29.1575, + "eval_runtime": 378.8445, + "eval_samples_per_second": 29.899, + "eval_steps_per_second": 1.869, + "step": 320000 + }, + { + "epoch": 25.1, + "learning_rate": 0.0005, + "loss": 1.5716, + "step": 320100 + }, + { + "epoch": 25.11, + "learning_rate": 0.0005, + "loss": 1.5771, + "step": 320200 + }, + { + "epoch": 25.12, + "learning_rate": 0.0005, + "loss": 1.5626, + "step": 320300 + }, + { + "epoch": 25.13, + "learning_rate": 0.0005, + "loss": 1.5642, + "step": 320400 + }, + { + "epoch": 25.13, + "learning_rate": 0.0005, + "loss": 1.5571, + "step": 320500 + }, + { + "epoch": 25.14, + "learning_rate": 0.0005, + "loss": 1.5582, + "step": 320600 + }, + { + "epoch": 25.15, + "learning_rate": 0.0005, + "loss": 1.5973, + "step": 320700 + }, + { + "epoch": 25.16, + "learning_rate": 0.0005, + "loss": 1.5949, + "step": 320800 + }, + { + "epoch": 25.16, + "learning_rate": 0.0005, + "loss": 1.6062, + "step": 320900 + }, + { + "epoch": 25.17, + "learning_rate": 0.0005, + "loss": 1.5625, + "step": 321000 + }, + { + "epoch": 25.18, + "learning_rate": 0.0005, + "loss": 1.6049, + "step": 321100 + }, + { + "epoch": 25.19, + "learning_rate": 0.0005, + "loss": 1.5804, + "step": 321200 + }, + { + "epoch": 25.2, + "learning_rate": 0.0005, + "loss": 1.5686, + "step": 321300 + }, + { + "epoch": 25.2, + "learning_rate": 0.0005, + "loss": 1.5573, + "step": 321400 + }, + { + "epoch": 25.21, + "learning_rate": 0.0005, + "loss": 1.5394, + "step": 321500 + }, + { + "epoch": 25.22, + "learning_rate": 0.0005, + "loss": 1.5871, + "step": 321600 + }, + { + "epoch": 25.23, + "learning_rate": 0.0005, + "loss": 1.5794, + "step": 321700 + }, + { + "epoch": 25.24, + "learning_rate": 0.0005, + "loss": 1.5796, + "step": 321800 + }, + { + "epoch": 25.24, + "learning_rate": 0.0005, + "loss": 1.581, + "step": 321900 + }, + { + "epoch": 25.25, + "learning_rate": 0.0005, + "loss": 1.5899, + "step": 322000 + }, + { + "epoch": 25.26, + "learning_rate": 0.0005, + "loss": 1.5909, + "step": 322100 + }, + { + "epoch": 25.27, + "learning_rate": 0.0005, + "loss": 1.5914, + "step": 322200 + }, + { + "epoch": 25.27, + "learning_rate": 0.0005, + "loss": 1.564, + "step": 322300 + }, + { + "epoch": 25.28, + "learning_rate": 0.0005, + "loss": 1.595, + "step": 322400 + }, + { + "epoch": 25.29, + "learning_rate": 0.0005, + "loss": 1.5983, + "step": 322500 + }, + { + "epoch": 25.3, + "learning_rate": 0.0005, + "loss": 1.5696, + "step": 322600 + }, + { + "epoch": 25.31, + "learning_rate": 0.0005, + "loss": 1.5909, + "step": 322700 + }, + { + "epoch": 25.31, + "learning_rate": 0.0005, + "loss": 1.5952, + "step": 322800 + }, + { + "epoch": 25.32, + "learning_rate": 0.0005, + "loss": 1.5782, + "step": 322900 + }, + { + "epoch": 25.33, + "learning_rate": 0.0005, + "loss": 1.6011, + "step": 323000 + }, + { + "epoch": 25.34, + "learning_rate": 0.0005, + "loss": 1.5609, + "step": 323100 + }, + { + "epoch": 25.35, + "learning_rate": 0.0005, + "loss": 1.5914, + "step": 323200 + }, + { + "epoch": 25.35, + "learning_rate": 0.0005, + "loss": 1.5832, + "step": 323300 + }, + { + "epoch": 25.36, + "learning_rate": 0.0005, + "loss": 1.5871, + "step": 323400 + }, + { + "epoch": 25.37, + "learning_rate": 0.0005, + "loss": 1.5713, + "step": 323500 + }, + { + "epoch": 25.38, + "learning_rate": 0.0005, + "loss": 1.6133, + "step": 323600 + }, + { + "epoch": 25.38, + "learning_rate": 0.0005, + "loss": 1.6008, + "step": 323700 + }, + { + "epoch": 25.39, + "learning_rate": 0.0005, + "loss": 1.6153, + "step": 323800 + }, + { + "epoch": 25.4, + "learning_rate": 0.0005, + "loss": 1.6015, + "step": 323900 + }, + { + "epoch": 25.41, + "learning_rate": 0.0005, + "loss": 1.5809, + "step": 324000 + }, + { + "epoch": 25.42, + "learning_rate": 0.0005, + "loss": 1.5851, + "step": 324100 + }, + { + "epoch": 25.42, + "learning_rate": 0.0005, + "loss": 1.5882, + "step": 324200 + }, + { + "epoch": 25.43, + "learning_rate": 0.0005, + "loss": 1.5865, + "step": 324300 + }, + { + "epoch": 25.44, + "learning_rate": 0.0005, + "loss": 1.6289, + "step": 324400 + }, + { + "epoch": 25.45, + "learning_rate": 0.0005, + "loss": 1.5973, + "step": 324500 + }, + { + "epoch": 25.45, + "learning_rate": 0.0005, + "loss": 1.6141, + "step": 324600 + }, + { + "epoch": 25.46, + "learning_rate": 0.0005, + "loss": 1.5939, + "step": 324700 + }, + { + "epoch": 25.47, + "learning_rate": 0.0005, + "loss": 1.5722, + "step": 324800 + }, + { + "epoch": 25.48, + "learning_rate": 0.0005, + "loss": 1.587, + "step": 324900 + }, + { + "epoch": 25.49, + "learning_rate": 0.0005, + "loss": 1.5885, + "step": 325000 + }, + { + "epoch": 25.49, + "learning_rate": 0.0005, + "loss": 1.6158, + "step": 325100 + }, + { + "epoch": 25.5, + "learning_rate": 0.0005, + "loss": 1.5998, + "step": 325200 + }, + { + "epoch": 25.51, + "learning_rate": 0.0005, + "loss": 1.6076, + "step": 325300 + }, + { + "epoch": 25.52, + "learning_rate": 0.0005, + "loss": 1.6122, + "step": 325400 + }, + { + "epoch": 25.53, + "learning_rate": 0.0005, + "loss": 1.598, + "step": 325500 + }, + { + "epoch": 25.53, + "learning_rate": 0.0005, + "loss": 1.6054, + "step": 325600 + }, + { + "epoch": 25.54, + "learning_rate": 0.0005, + "loss": 1.6074, + "step": 325700 + }, + { + "epoch": 25.55, + "learning_rate": 0.0005, + "loss": 1.5875, + "step": 325800 + }, + { + "epoch": 25.56, + "learning_rate": 0.0005, + "loss": 1.5863, + "step": 325900 + }, + { + "epoch": 25.56, + "learning_rate": 0.0005, + "loss": 1.6041, + "step": 326000 + }, + { + "epoch": 25.57, + "learning_rate": 0.0005, + "loss": 1.6147, + "step": 326100 + }, + { + "epoch": 25.58, + "learning_rate": 0.0005, + "loss": 1.6279, + "step": 326200 + }, + { + "epoch": 25.59, + "learning_rate": 0.0005, + "loss": 1.6338, + "step": 326300 + }, + { + "epoch": 25.6, + "learning_rate": 0.0005, + "loss": 1.6134, + "step": 326400 + }, + { + "epoch": 25.6, + "learning_rate": 0.0005, + "loss": 1.5858, + "step": 326500 + }, + { + "epoch": 25.61, + "learning_rate": 0.0005, + "loss": 1.6179, + "step": 326600 + }, + { + "epoch": 25.62, + "learning_rate": 0.0005, + "loss": 1.6083, + "step": 326700 + }, + { + "epoch": 25.63, + "learning_rate": 0.0005, + "loss": 1.6161, + "step": 326800 + }, + { + "epoch": 25.64, + "learning_rate": 0.0005, + "loss": 1.6253, + "step": 326900 + }, + { + "epoch": 25.64, + "learning_rate": 0.0005, + "loss": 1.6429, + "step": 327000 + }, + { + "epoch": 25.65, + "learning_rate": 0.0005, + "loss": 1.6, + "step": 327100 + }, + { + "epoch": 25.66, + "learning_rate": 0.0005, + "loss": 1.5922, + "step": 327200 + }, + { + "epoch": 25.67, + "learning_rate": 0.0005, + "loss": 1.6395, + "step": 327300 + }, + { + "epoch": 25.67, + "learning_rate": 0.0005, + "loss": 1.61, + "step": 327400 + }, + { + "epoch": 25.68, + "learning_rate": 0.0005, + "loss": 1.6241, + "step": 327500 + }, + { + "epoch": 25.69, + "learning_rate": 0.0005, + "loss": 1.6458, + "step": 327600 + }, + { + "epoch": 25.7, + "learning_rate": 0.0005, + "loss": 1.597, + "step": 327700 + }, + { + "epoch": 25.71, + "learning_rate": 0.0005, + "loss": 1.6225, + "step": 327800 + }, + { + "epoch": 25.71, + "learning_rate": 0.0005, + "loss": 1.6317, + "step": 327900 + }, + { + "epoch": 25.72, + "learning_rate": 0.0005, + "loss": 1.6394, + "step": 328000 + }, + { + "epoch": 25.73, + "learning_rate": 0.0005, + "loss": 1.5723, + "step": 328100 + }, + { + "epoch": 25.74, + "learning_rate": 0.0005, + "loss": 1.6114, + "step": 328200 + }, + { + "epoch": 25.74, + "learning_rate": 0.0005, + "loss": 1.5943, + "step": 328300 + }, + { + "epoch": 25.75, + "learning_rate": 0.0005, + "loss": 1.6357, + "step": 328400 + }, + { + "epoch": 25.76, + "learning_rate": 0.0005, + "loss": 1.5943, + "step": 328500 + }, + { + "epoch": 25.77, + "learning_rate": 0.0005, + "loss": 1.6326, + "step": 328600 + }, + { + "epoch": 25.78, + "learning_rate": 0.0005, + "loss": 1.62, + "step": 328700 + }, + { + "epoch": 25.78, + "learning_rate": 0.0005, + "loss": 1.6162, + "step": 328800 + }, + { + "epoch": 25.79, + "learning_rate": 0.0005, + "loss": 1.6115, + "step": 328900 + }, + { + "epoch": 25.8, + "learning_rate": 0.0005, + "loss": 1.5891, + "step": 329000 + }, + { + "epoch": 25.81, + "learning_rate": 0.0005, + "loss": 1.6339, + "step": 329100 + }, + { + "epoch": 25.82, + "learning_rate": 0.0005, + "loss": 1.6204, + "step": 329200 + }, + { + "epoch": 25.82, + "learning_rate": 0.0005, + "loss": 1.6211, + "step": 329300 + }, + { + "epoch": 25.83, + "learning_rate": 0.0005, + "loss": 1.6182, + "step": 329400 + }, + { + "epoch": 25.84, + "learning_rate": 0.0005, + "loss": 1.5841, + "step": 329500 + }, + { + "epoch": 25.85, + "learning_rate": 0.0005, + "loss": 1.6014, + "step": 329600 + }, + { + "epoch": 25.85, + "learning_rate": 0.0005, + "loss": 1.6241, + "step": 329700 + }, + { + "epoch": 25.86, + "learning_rate": 0.0005, + "loss": 1.6218, + "step": 329800 + }, + { + "epoch": 25.87, + "learning_rate": 0.0005, + "loss": 1.6071, + "step": 329900 + }, + { + "epoch": 25.88, + "learning_rate": 0.0005, + "loss": 1.6151, + "step": 330000 + }, + { + "epoch": 25.88, + "eval_gen_len": 18.749359936435066, + "eval_loss": 2.0329372882843018, + "eval_rouge1": 35.4835, + "eval_rouge2": 14.3358, + "eval_rougeL": 29.1764, + "eval_rougeLsum": 29.1679, + "eval_runtime": 354.0201, + "eval_samples_per_second": 31.995, + "eval_steps_per_second": 2.0, + "step": 330000 + }, + { + "epoch": 25.89, + "learning_rate": 0.0005, + "loss": 1.6346, + "step": 330100 + }, + { + "epoch": 25.89, + "learning_rate": 0.0005, + "loss": 1.5966, + "step": 330200 + }, + { + "epoch": 25.9, + "learning_rate": 0.0005, + "loss": 1.6239, + "step": 330300 + }, + { + "epoch": 25.91, + "learning_rate": 0.0005, + "loss": 1.6197, + "step": 330400 + }, + { + "epoch": 25.92, + "learning_rate": 0.0005, + "loss": 1.6218, + "step": 330500 + }, + { + "epoch": 25.93, + "learning_rate": 0.0005, + "loss": 1.6206, + "step": 330600 + }, + { + "epoch": 25.93, + "learning_rate": 0.0005, + "loss": 1.627, + "step": 330700 + }, + { + "epoch": 25.94, + "learning_rate": 0.0005, + "loss": 1.6337, + "step": 330800 + }, + { + "epoch": 25.95, + "learning_rate": 0.0005, + "loss": 1.6318, + "step": 330900 + }, + { + "epoch": 25.96, + "learning_rate": 0.0005, + "loss": 1.6188, + "step": 331000 + }, + { + "epoch": 25.96, + "learning_rate": 0.0005, + "loss": 1.6249, + "step": 331100 + }, + { + "epoch": 25.97, + "learning_rate": 0.0005, + "loss": 1.6343, + "step": 331200 + }, + { + "epoch": 25.98, + "learning_rate": 0.0005, + "loss": 1.629, + "step": 331300 + }, + { + "epoch": 25.99, + "learning_rate": 0.0005, + "loss": 1.611, + "step": 331400 + }, + { + "epoch": 26.0, + "learning_rate": 0.0005, + "loss": 1.6585, + "step": 331500 + }, + { + "epoch": 26.0, + "learning_rate": 0.0005, + "loss": 1.5997, + "step": 331600 + }, + { + "epoch": 26.01, + "learning_rate": 0.0005, + "loss": 1.5408, + "step": 331700 + }, + { + "epoch": 26.02, + "learning_rate": 0.0005, + "loss": 1.5374, + "step": 331800 + }, + { + "epoch": 26.03, + "learning_rate": 0.0005, + "loss": 1.5412, + "step": 331900 + }, + { + "epoch": 26.04, + "learning_rate": 0.0005, + "loss": 1.5719, + "step": 332000 + }, + { + "epoch": 26.04, + "learning_rate": 0.0005, + "loss": 1.5398, + "step": 332100 + }, + { + "epoch": 26.05, + "learning_rate": 0.0005, + "loss": 1.5591, + "step": 332200 + }, + { + "epoch": 26.06, + "learning_rate": 0.0005, + "loss": 1.5313, + "step": 332300 + }, + { + "epoch": 26.07, + "learning_rate": 0.0005, + "loss": 1.5362, + "step": 332400 + }, + { + "epoch": 26.07, + "learning_rate": 0.0005, + "loss": 1.5486, + "step": 332500 + }, + { + "epoch": 26.08, + "learning_rate": 0.0005, + "loss": 1.5609, + "step": 332600 + }, + { + "epoch": 26.09, + "learning_rate": 0.0005, + "loss": 1.5272, + "step": 332700 + }, + { + "epoch": 26.1, + "learning_rate": 0.0005, + "loss": 1.5446, + "step": 332800 + }, + { + "epoch": 26.11, + "learning_rate": 0.0005, + "loss": 1.5752, + "step": 332900 + }, + { + "epoch": 26.11, + "learning_rate": 0.0005, + "loss": 1.5672, + "step": 333000 + }, + { + "epoch": 26.12, + "learning_rate": 0.0005, + "loss": 1.5693, + "step": 333100 + }, + { + "epoch": 26.13, + "learning_rate": 0.0005, + "loss": 1.5406, + "step": 333200 + }, + { + "epoch": 26.14, + "learning_rate": 0.0005, + "loss": 1.5762, + "step": 333300 + }, + { + "epoch": 26.14, + "learning_rate": 0.0005, + "loss": 1.5586, + "step": 333400 + }, + { + "epoch": 26.15, + "learning_rate": 0.0005, + "loss": 1.5743, + "step": 333500 + }, + { + "epoch": 26.16, + "learning_rate": 0.0005, + "loss": 1.5519, + "step": 333600 + }, + { + "epoch": 26.17, + "learning_rate": 0.0005, + "loss": 1.5945, + "step": 333700 + }, + { + "epoch": 26.18, + "learning_rate": 0.0005, + "loss": 1.5771, + "step": 333800 + }, + { + "epoch": 26.18, + "learning_rate": 0.0005, + "loss": 1.5625, + "step": 333900 + }, + { + "epoch": 26.19, + "learning_rate": 0.0005, + "loss": 1.5387, + "step": 334000 + }, + { + "epoch": 26.2, + "learning_rate": 0.0005, + "loss": 1.5574, + "step": 334100 + }, + { + "epoch": 26.21, + "learning_rate": 0.0005, + "loss": 1.562, + "step": 334200 + }, + { + "epoch": 26.22, + "learning_rate": 0.0005, + "loss": 1.5775, + "step": 334300 + }, + { + "epoch": 26.22, + "learning_rate": 0.0005, + "loss": 1.5707, + "step": 334400 + }, + { + "epoch": 26.23, + "learning_rate": 0.0005, + "loss": 1.5796, + "step": 334500 + }, + { + "epoch": 26.24, + "learning_rate": 0.0005, + "loss": 1.5527, + "step": 334600 + }, + { + "epoch": 26.25, + "learning_rate": 0.0005, + "loss": 1.5768, + "step": 334700 + }, + { + "epoch": 26.25, + "learning_rate": 0.0005, + "loss": 1.5504, + "step": 334800 + }, + { + "epoch": 26.26, + "learning_rate": 0.0005, + "loss": 1.5557, + "step": 334900 + }, + { + "epoch": 26.27, + "learning_rate": 0.0005, + "loss": 1.5739, + "step": 335000 + }, + { + "epoch": 26.28, + "learning_rate": 0.0005, + "loss": 1.5477, + "step": 335100 + }, + { + "epoch": 26.29, + "learning_rate": 0.0005, + "loss": 1.566, + "step": 335200 + }, + { + "epoch": 26.29, + "learning_rate": 0.0005, + "loss": 1.5769, + "step": 335300 + }, + { + "epoch": 26.3, + "learning_rate": 0.0005, + "loss": 1.5689, + "step": 335400 + }, + { + "epoch": 26.31, + "learning_rate": 0.0005, + "loss": 1.5647, + "step": 335500 + }, + { + "epoch": 26.32, + "learning_rate": 0.0005, + "loss": 1.5846, + "step": 335600 + }, + { + "epoch": 26.33, + "learning_rate": 0.0005, + "loss": 1.5499, + "step": 335700 + }, + { + "epoch": 26.33, + "learning_rate": 0.0005, + "loss": 1.5697, + "step": 335800 + }, + { + "epoch": 26.34, + "learning_rate": 0.0005, + "loss": 1.6024, + "step": 335900 + }, + { + "epoch": 26.35, + "learning_rate": 0.0005, + "loss": 1.5963, + "step": 336000 + }, + { + "epoch": 26.36, + "learning_rate": 0.0005, + "loss": 1.5966, + "step": 336100 + }, + { + "epoch": 26.36, + "learning_rate": 0.0005, + "loss": 1.5425, + "step": 336200 + }, + { + "epoch": 26.37, + "learning_rate": 0.0005, + "loss": 1.6052, + "step": 336300 + }, + { + "epoch": 26.38, + "learning_rate": 0.0005, + "loss": 1.5511, + "step": 336400 + }, + { + "epoch": 26.39, + "learning_rate": 0.0005, + "loss": 1.57, + "step": 336500 + }, + { + "epoch": 26.4, + "learning_rate": 0.0005, + "loss": 1.5823, + "step": 336600 + }, + { + "epoch": 26.4, + "learning_rate": 0.0005, + "loss": 1.5938, + "step": 336700 + }, + { + "epoch": 26.41, + "learning_rate": 0.0005, + "loss": 1.5801, + "step": 336800 + }, + { + "epoch": 26.42, + "learning_rate": 0.0005, + "loss": 1.5825, + "step": 336900 + }, + { + "epoch": 26.43, + "learning_rate": 0.0005, + "loss": 1.5734, + "step": 337000 + }, + { + "epoch": 26.44, + "learning_rate": 0.0005, + "loss": 1.5825, + "step": 337100 + }, + { + "epoch": 26.44, + "learning_rate": 0.0005, + "loss": 1.5665, + "step": 337200 + }, + { + "epoch": 26.45, + "learning_rate": 0.0005, + "loss": 1.6113, + "step": 337300 + }, + { + "epoch": 26.46, + "learning_rate": 0.0005, + "loss": 1.6016, + "step": 337400 + }, + { + "epoch": 26.47, + "learning_rate": 0.0005, + "loss": 1.5879, + "step": 337500 + }, + { + "epoch": 26.47, + "learning_rate": 0.0005, + "loss": 1.5739, + "step": 337600 + }, + { + "epoch": 26.48, + "learning_rate": 0.0005, + "loss": 1.5764, + "step": 337700 + }, + { + "epoch": 26.49, + "learning_rate": 0.0005, + "loss": 1.6103, + "step": 337800 + }, + { + "epoch": 26.5, + "learning_rate": 0.0005, + "loss": 1.5855, + "step": 337900 + }, + { + "epoch": 26.51, + "learning_rate": 0.0005, + "loss": 1.5896, + "step": 338000 + }, + { + "epoch": 26.51, + "learning_rate": 0.0005, + "loss": 1.5746, + "step": 338100 + }, + { + "epoch": 26.52, + "learning_rate": 0.0005, + "loss": 1.579, + "step": 338200 + }, + { + "epoch": 26.53, + "learning_rate": 0.0005, + "loss": 1.614, + "step": 338300 + }, + { + "epoch": 26.54, + "learning_rate": 0.0005, + "loss": 1.6163, + "step": 338400 + }, + { + "epoch": 26.54, + "learning_rate": 0.0005, + "loss": 1.5803, + "step": 338500 + }, + { + "epoch": 26.55, + "learning_rate": 0.0005, + "loss": 1.6032, + "step": 338600 + }, + { + "epoch": 26.56, + "learning_rate": 0.0005, + "loss": 1.5849, + "step": 338700 + }, + { + "epoch": 26.57, + "learning_rate": 0.0005, + "loss": 1.6291, + "step": 338800 + }, + { + "epoch": 26.58, + "learning_rate": 0.0005, + "loss": 1.5912, + "step": 338900 + }, + { + "epoch": 26.58, + "learning_rate": 0.0005, + "loss": 1.6117, + "step": 339000 + }, + { + "epoch": 26.59, + "learning_rate": 0.0005, + "loss": 1.5926, + "step": 339100 + }, + { + "epoch": 26.6, + "learning_rate": 0.0005, + "loss": 1.5644, + "step": 339200 + }, + { + "epoch": 26.61, + "learning_rate": 0.0005, + "loss": 1.6154, + "step": 339300 + }, + { + "epoch": 26.62, + "learning_rate": 0.0005, + "loss": 1.5896, + "step": 339400 + }, + { + "epoch": 26.62, + "learning_rate": 0.0005, + "loss": 1.5688, + "step": 339500 + }, + { + "epoch": 26.63, + "learning_rate": 0.0005, + "loss": 1.6337, + "step": 339600 + }, + { + "epoch": 26.64, + "learning_rate": 0.0005, + "loss": 1.5815, + "step": 339700 + }, + { + "epoch": 26.65, + "learning_rate": 0.0005, + "loss": 1.6126, + "step": 339800 + }, + { + "epoch": 26.65, + "learning_rate": 0.0005, + "loss": 1.5892, + "step": 339900 + }, + { + "epoch": 26.66, + "learning_rate": 0.0005, + "loss": 1.6247, + "step": 340000 + }, + { + "epoch": 26.66, + "eval_gen_len": 18.760395515140814, + "eval_loss": 2.032465696334839, + "eval_rouge1": 35.301, + "eval_rouge2": 14.2909, + "eval_rougeL": 29.0546, + "eval_rougeLsum": 29.0366, + "eval_runtime": 354.3826, + "eval_samples_per_second": 31.963, + "eval_steps_per_second": 1.998, + "step": 340000 + }, + { + "epoch": 26.67, + "learning_rate": 0.0005, + "loss": 1.6087, + "step": 340100 + }, + { + "epoch": 26.68, + "learning_rate": 0.0005, + "loss": 1.5795, + "step": 340200 + }, + { + "epoch": 26.69, + "learning_rate": 0.0005, + "loss": 1.5813, + "step": 340300 + }, + { + "epoch": 26.69, + "learning_rate": 0.0005, + "loss": 1.5897, + "step": 340400 + }, + { + "epoch": 26.7, + "learning_rate": 0.0005, + "loss": 1.6026, + "step": 340500 + }, + { + "epoch": 26.71, + "learning_rate": 0.0005, + "loss": 1.6048, + "step": 340600 + }, + { + "epoch": 26.72, + "learning_rate": 0.0005, + "loss": 1.6041, + "step": 340700 + }, + { + "epoch": 26.73, + "learning_rate": 0.0005, + "loss": 1.5818, + "step": 340800 + }, + { + "epoch": 26.73, + "learning_rate": 0.0005, + "loss": 1.5973, + "step": 340900 + }, + { + "epoch": 26.74, + "learning_rate": 0.0005, + "loss": 1.6304, + "step": 341000 + }, + { + "epoch": 26.75, + "learning_rate": 0.0005, + "loss": 1.5943, + "step": 341100 + }, + { + "epoch": 26.76, + "learning_rate": 0.0005, + "loss": 1.604, + "step": 341200 + }, + { + "epoch": 26.76, + "learning_rate": 0.0005, + "loss": 1.5891, + "step": 341300 + }, + { + "epoch": 26.77, + "learning_rate": 0.0005, + "loss": 1.6235, + "step": 341400 + }, + { + "epoch": 26.78, + "learning_rate": 0.0005, + "loss": 1.6246, + "step": 341500 + }, + { + "epoch": 26.79, + "learning_rate": 0.0005, + "loss": 1.5878, + "step": 341600 + }, + { + "epoch": 26.8, + "learning_rate": 0.0005, + "loss": 1.6338, + "step": 341700 + }, + { + "epoch": 26.8, + "learning_rate": 0.0005, + "loss": 1.5986, + "step": 341800 + }, + { + "epoch": 26.81, + "learning_rate": 0.0005, + "loss": 1.5755, + "step": 341900 + }, + { + "epoch": 26.82, + "learning_rate": 0.0005, + "loss": 1.5932, + "step": 342000 + }, + { + "epoch": 26.83, + "learning_rate": 0.0005, + "loss": 1.6145, + "step": 342100 + }, + { + "epoch": 26.84, + "learning_rate": 0.0005, + "loss": 1.5958, + "step": 342200 + }, + { + "epoch": 26.84, + "learning_rate": 0.0005, + "loss": 1.5869, + "step": 342300 + }, + { + "epoch": 26.85, + "learning_rate": 0.0005, + "loss": 1.6158, + "step": 342400 + }, + { + "epoch": 26.86, + "learning_rate": 0.0005, + "loss": 1.6216, + "step": 342500 + }, + { + "epoch": 26.87, + "learning_rate": 0.0005, + "loss": 1.6138, + "step": 342600 + }, + { + "epoch": 26.87, + "learning_rate": 0.0005, + "loss": 1.5909, + "step": 342700 + }, + { + "epoch": 26.88, + "learning_rate": 0.0005, + "loss": 1.6228, + "step": 342800 + }, + { + "epoch": 26.89, + "learning_rate": 0.0005, + "loss": 1.6369, + "step": 342900 + }, + { + "epoch": 26.9, + "learning_rate": 0.0005, + "loss": 1.5997, + "step": 343000 + }, + { + "epoch": 26.91, + "learning_rate": 0.0005, + "loss": 1.6028, + "step": 343100 + }, + { + "epoch": 26.91, + "learning_rate": 0.0005, + "loss": 1.5992, + "step": 343200 + }, + { + "epoch": 26.92, + "learning_rate": 0.0005, + "loss": 1.6149, + "step": 343300 + }, + { + "epoch": 26.93, + "learning_rate": 0.0005, + "loss": 1.6065, + "step": 343400 + }, + { + "epoch": 26.94, + "learning_rate": 0.0005, + "loss": 1.5902, + "step": 343500 + }, + { + "epoch": 26.94, + "learning_rate": 0.0005, + "loss": 1.633, + "step": 343600 + }, + { + "epoch": 26.95, + "learning_rate": 0.0005, + "loss": 1.5864, + "step": 343700 + }, + { + "epoch": 26.96, + "learning_rate": 0.0005, + "loss": 1.6256, + "step": 343800 + }, + { + "epoch": 26.97, + "learning_rate": 0.0005, + "loss": 1.6103, + "step": 343900 + }, + { + "epoch": 26.98, + "learning_rate": 0.0005, + "loss": 1.6103, + "step": 344000 + }, + { + "epoch": 26.98, + "learning_rate": 0.0005, + "loss": 1.6147, + "step": 344100 + }, + { + "epoch": 26.99, + "learning_rate": 0.0005, + "loss": 1.6163, + "step": 344200 + }, + { + "epoch": 27.0, + "learning_rate": 0.0005, + "loss": 1.6354, + "step": 344300 + }, + { + "epoch": 27.01, + "learning_rate": 0.0005, + "loss": 1.5376, + "step": 344400 + }, + { + "epoch": 27.02, + "learning_rate": 0.0005, + "loss": 1.5317, + "step": 344500 + }, + { + "epoch": 27.02, + "learning_rate": 0.0005, + "loss": 1.5318, + "step": 344600 + }, + { + "epoch": 27.03, + "learning_rate": 0.0005, + "loss": 1.5563, + "step": 344700 + }, + { + "epoch": 27.04, + "learning_rate": 0.0005, + "loss": 1.5286, + "step": 344800 + }, + { + "epoch": 27.05, + "learning_rate": 0.0005, + "loss": 1.519, + "step": 344900 + }, + { + "epoch": 27.05, + "learning_rate": 0.0005, + "loss": 1.5067, + "step": 345000 + }, + { + "epoch": 27.06, + "learning_rate": 0.0005, + "loss": 1.5485, + "step": 345100 + }, + { + "epoch": 27.07, + "learning_rate": 0.0005, + "loss": 1.5473, + "step": 345200 + }, + { + "epoch": 27.08, + "learning_rate": 0.0005, + "loss": 1.5467, + "step": 345300 + }, + { + "epoch": 27.09, + "learning_rate": 0.0005, + "loss": 1.5235, + "step": 345400 + }, + { + "epoch": 27.09, + "learning_rate": 0.0005, + "loss": 1.5264, + "step": 345500 + }, + { + "epoch": 27.1, + "learning_rate": 0.0005, + "loss": 1.5376, + "step": 345600 + }, + { + "epoch": 27.11, + "learning_rate": 0.0005, + "loss": 1.5383, + "step": 345700 + }, + { + "epoch": 27.12, + "learning_rate": 0.0005, + "loss": 1.5499, + "step": 345800 + }, + { + "epoch": 27.13, + "learning_rate": 0.0005, + "loss": 1.5365, + "step": 345900 + }, + { + "epoch": 27.13, + "learning_rate": 0.0005, + "loss": 1.5262, + "step": 346000 + }, + { + "epoch": 27.14, + "learning_rate": 0.0005, + "loss": 1.5533, + "step": 346100 + }, + { + "epoch": 27.15, + "learning_rate": 0.0005, + "loss": 1.5306, + "step": 346200 + }, + { + "epoch": 27.16, + "learning_rate": 0.0005, + "loss": 1.5767, + "step": 346300 + }, + { + "epoch": 27.16, + "learning_rate": 0.0005, + "loss": 1.5315, + "step": 346400 + }, + { + "epoch": 27.17, + "learning_rate": 0.0005, + "loss": 1.5352, + "step": 346500 + }, + { + "epoch": 27.18, + "learning_rate": 0.0005, + "loss": 1.534, + "step": 346600 + }, + { + "epoch": 27.19, + "learning_rate": 0.0005, + "loss": 1.5646, + "step": 346700 + }, + { + "epoch": 27.2, + "learning_rate": 0.0005, + "loss": 1.5394, + "step": 346800 + }, + { + "epoch": 27.2, + "learning_rate": 0.0005, + "loss": 1.5655, + "step": 346900 + }, + { + "epoch": 27.21, + "learning_rate": 0.0005, + "loss": 1.5497, + "step": 347000 + }, + { + "epoch": 27.22, + "learning_rate": 0.0005, + "loss": 1.56, + "step": 347100 + }, + { + "epoch": 27.23, + "learning_rate": 0.0005, + "loss": 1.5485, + "step": 347200 + }, + { + "epoch": 27.23, + "learning_rate": 0.0005, + "loss": 1.5658, + "step": 347300 + }, + { + "epoch": 27.24, + "learning_rate": 0.0005, + "loss": 1.5506, + "step": 347400 + }, + { + "epoch": 27.25, + "learning_rate": 0.0005, + "loss": 1.5561, + "step": 347500 + }, + { + "epoch": 27.26, + "learning_rate": 0.0005, + "loss": 1.5674, + "step": 347600 + }, + { + "epoch": 27.27, + "learning_rate": 0.0005, + "loss": 1.5509, + "step": 347700 + }, + { + "epoch": 27.27, + "learning_rate": 0.0005, + "loss": 1.5593, + "step": 347800 + }, + { + "epoch": 27.28, + "learning_rate": 0.0005, + "loss": 1.557, + "step": 347900 + }, + { + "epoch": 27.29, + "learning_rate": 0.0005, + "loss": 1.5817, + "step": 348000 + }, + { + "epoch": 27.3, + "learning_rate": 0.0005, + "loss": 1.5781, + "step": 348100 + }, + { + "epoch": 27.31, + "learning_rate": 0.0005, + "loss": 1.574, + "step": 348200 + }, + { + "epoch": 27.31, + "learning_rate": 0.0005, + "loss": 1.5613, + "step": 348300 + }, + { + "epoch": 27.32, + "learning_rate": 0.0005, + "loss": 1.5685, + "step": 348400 + }, + { + "epoch": 27.33, + "learning_rate": 0.0005, + "loss": 1.5287, + "step": 348500 + }, + { + "epoch": 27.34, + "learning_rate": 0.0005, + "loss": 1.5573, + "step": 348600 + }, + { + "epoch": 27.34, + "learning_rate": 0.0005, + "loss": 1.565, + "step": 348700 + }, + { + "epoch": 27.35, + "learning_rate": 0.0005, + "loss": 1.5758, + "step": 348800 + }, + { + "epoch": 27.36, + "learning_rate": 0.0005, + "loss": 1.5418, + "step": 348900 + }, + { + "epoch": 27.37, + "learning_rate": 0.0005, + "loss": 1.557, + "step": 349000 + }, + { + "epoch": 27.38, + "learning_rate": 0.0005, + "loss": 1.5715, + "step": 349100 + }, + { + "epoch": 27.38, + "learning_rate": 0.0005, + "loss": 1.5413, + "step": 349200 + }, + { + "epoch": 27.39, + "learning_rate": 0.0005, + "loss": 1.5829, + "step": 349300 + }, + { + "epoch": 27.4, + "learning_rate": 0.0005, + "loss": 1.6001, + "step": 349400 + }, + { + "epoch": 27.41, + "learning_rate": 0.0005, + "loss": 1.5802, + "step": 349500 + }, + { + "epoch": 27.42, + "learning_rate": 0.0005, + "loss": 1.5757, + "step": 349600 + }, + { + "epoch": 27.42, + "learning_rate": 0.0005, + "loss": 1.566, + "step": 349700 + }, + { + "epoch": 27.43, + "learning_rate": 0.0005, + "loss": 1.5486, + "step": 349800 + }, + { + "epoch": 27.44, + "learning_rate": 0.0005, + "loss": 1.5868, + "step": 349900 + }, + { + "epoch": 27.45, + "learning_rate": 0.0005, + "loss": 1.5729, + "step": 350000 + }, + { + "epoch": 27.45, + "eval_gen_len": 18.799858744592566, + "eval_loss": 2.0437819957733154, + "eval_rouge1": 35.5653, + "eval_rouge2": 14.397, + "eval_rougeL": 29.2537, + "eval_rougeLsum": 29.2463, + "eval_runtime": 364.5616, + "eval_samples_per_second": 31.07, + "eval_steps_per_second": 1.942, + "step": 350000 + }, + { + "epoch": 27.45, + "learning_rate": 0.0005, + "loss": 1.567, + "step": 350100 + }, + { + "epoch": 27.46, + "learning_rate": 0.0005, + "loss": 1.5654, + "step": 350200 + }, + { + "epoch": 27.47, + "learning_rate": 0.0005, + "loss": 1.5725, + "step": 350300 + }, + { + "epoch": 27.48, + "learning_rate": 0.0005, + "loss": 1.5994, + "step": 350400 + }, + { + "epoch": 27.49, + "learning_rate": 0.0005, + "loss": 1.5819, + "step": 350500 + }, + { + "epoch": 27.49, + "learning_rate": 0.0005, + "loss": 1.5937, + "step": 350600 + }, + { + "epoch": 27.5, + "learning_rate": 0.0005, + "loss": 1.5933, + "step": 350700 + }, + { + "epoch": 27.51, + "learning_rate": 0.0005, + "loss": 1.5684, + "step": 350800 + }, + { + "epoch": 27.52, + "learning_rate": 0.0005, + "loss": 1.5684, + "step": 350900 + }, + { + "epoch": 27.53, + "learning_rate": 0.0005, + "loss": 1.5823, + "step": 351000 + }, + { + "epoch": 27.53, + "learning_rate": 0.0005, + "loss": 1.5831, + "step": 351100 + }, + { + "epoch": 27.54, + "learning_rate": 0.0005, + "loss": 1.5852, + "step": 351200 + }, + { + "epoch": 27.55, + "learning_rate": 0.0005, + "loss": 1.5816, + "step": 351300 + }, + { + "epoch": 27.56, + "learning_rate": 0.0005, + "loss": 1.5839, + "step": 351400 + }, + { + "epoch": 27.56, + "learning_rate": 0.0005, + "loss": 1.5611, + "step": 351500 + }, + { + "epoch": 27.57, + "learning_rate": 0.0005, + "loss": 1.5772, + "step": 351600 + }, + { + "epoch": 27.58, + "learning_rate": 0.0005, + "loss": 1.5868, + "step": 351700 + }, + { + "epoch": 27.59, + "learning_rate": 0.0005, + "loss": 1.5839, + "step": 351800 + }, + { + "epoch": 27.6, + "learning_rate": 0.0005, + "loss": 1.5971, + "step": 351900 + }, + { + "epoch": 27.6, + "learning_rate": 0.0005, + "loss": 1.6076, + "step": 352000 + }, + { + "epoch": 27.61, + "learning_rate": 0.0005, + "loss": 1.5971, + "step": 352100 + }, + { + "epoch": 27.62, + "learning_rate": 0.0005, + "loss": 1.5659, + "step": 352200 + }, + { + "epoch": 27.63, + "learning_rate": 0.0005, + "loss": 1.589, + "step": 352300 + }, + { + "epoch": 27.63, + "learning_rate": 0.0005, + "loss": 1.5862, + "step": 352400 + }, + { + "epoch": 27.64, + "learning_rate": 0.0005, + "loss": 1.5739, + "step": 352500 + }, + { + "epoch": 27.65, + "learning_rate": 0.0005, + "loss": 1.6004, + "step": 352600 + }, + { + "epoch": 27.66, + "learning_rate": 0.0005, + "loss": 1.558, + "step": 352700 + }, + { + "epoch": 27.67, + "learning_rate": 0.0005, + "loss": 1.5733, + "step": 352800 + }, + { + "epoch": 27.67, + "learning_rate": 0.0005, + "loss": 1.573, + "step": 352900 + }, + { + "epoch": 27.68, + "learning_rate": 0.0005, + "loss": 1.5761, + "step": 353000 + }, + { + "epoch": 27.69, + "learning_rate": 0.0005, + "loss": 1.5865, + "step": 353100 + }, + { + "epoch": 27.7, + "learning_rate": 0.0005, + "loss": 1.5739, + "step": 353200 + }, + { + "epoch": 27.71, + "learning_rate": 0.0005, + "loss": 1.5902, + "step": 353300 + }, + { + "epoch": 27.71, + "learning_rate": 0.0005, + "loss": 1.5813, + "step": 353400 + }, + { + "epoch": 27.72, + "learning_rate": 0.0005, + "loss": 1.5722, + "step": 353500 + }, + { + "epoch": 27.73, + "learning_rate": 0.0005, + "loss": 1.5662, + "step": 353600 + }, + { + "epoch": 27.74, + "learning_rate": 0.0005, + "loss": 1.5982, + "step": 353700 + }, + { + "epoch": 27.74, + "learning_rate": 0.0005, + "loss": 1.5977, + "step": 353800 + }, + { + "epoch": 27.75, + "learning_rate": 0.0005, + "loss": 1.5929, + "step": 353900 + }, + { + "epoch": 27.76, + "learning_rate": 0.0005, + "loss": 1.5885, + "step": 354000 + }, + { + "epoch": 27.77, + "learning_rate": 0.0005, + "loss": 1.6001, + "step": 354100 + }, + { + "epoch": 27.78, + "learning_rate": 0.0005, + "loss": 1.6063, + "step": 354200 + }, + { + "epoch": 27.78, + "learning_rate": 0.0005, + "loss": 1.5976, + "step": 354300 + }, + { + "epoch": 27.79, + "learning_rate": 0.0005, + "loss": 1.5982, + "step": 354400 + }, + { + "epoch": 27.8, + "learning_rate": 0.0005, + "loss": 1.603, + "step": 354500 + }, + { + "epoch": 27.81, + "learning_rate": 0.0005, + "loss": 1.5693, + "step": 354600 + }, + { + "epoch": 27.82, + "learning_rate": 0.0005, + "loss": 1.5969, + "step": 354700 + }, + { + "epoch": 27.82, + "learning_rate": 0.0005, + "loss": 1.5818, + "step": 354800 + }, + { + "epoch": 27.83, + "learning_rate": 0.0005, + "loss": 1.5917, + "step": 354900 + }, + { + "epoch": 27.84, + "learning_rate": 0.0005, + "loss": 1.6187, + "step": 355000 + }, + { + "epoch": 27.85, + "learning_rate": 0.0005, + "loss": 1.5905, + "step": 355100 + }, + { + "epoch": 27.85, + "learning_rate": 0.0005, + "loss": 1.5871, + "step": 355200 + }, + { + "epoch": 27.86, + "learning_rate": 0.0005, + "loss": 1.5929, + "step": 355300 + }, + { + "epoch": 27.87, + "learning_rate": 0.0005, + "loss": 1.6266, + "step": 355400 + }, + { + "epoch": 27.88, + "learning_rate": 0.0005, + "loss": 1.5843, + "step": 355500 + }, + { + "epoch": 27.89, + "learning_rate": 0.0005, + "loss": 1.5929, + "step": 355600 + }, + { + "epoch": 27.89, + "learning_rate": 0.0005, + "loss": 1.5763, + "step": 355700 + }, + { + "epoch": 27.9, + "learning_rate": 0.0005, + "loss": 1.5725, + "step": 355800 + }, + { + "epoch": 27.91, + "learning_rate": 0.0005, + "loss": 1.5876, + "step": 355900 + }, + { + "epoch": 27.92, + "learning_rate": 0.0005, + "loss": 1.6009, + "step": 356000 + }, + { + "epoch": 27.93, + "learning_rate": 0.0005, + "loss": 1.5801, + "step": 356100 + }, + { + "epoch": 27.93, + "learning_rate": 0.0005, + "loss": 1.6225, + "step": 356200 + }, + { + "epoch": 27.94, + "learning_rate": 0.0005, + "loss": 1.6086, + "step": 356300 + }, + { + "epoch": 27.95, + "learning_rate": 0.0005, + "loss": 1.5756, + "step": 356400 + }, + { + "epoch": 27.96, + "learning_rate": 0.0005, + "loss": 1.6061, + "step": 356500 + }, + { + "epoch": 27.96, + "learning_rate": 0.0005, + "loss": 1.6086, + "step": 356600 + }, + { + "epoch": 27.97, + "learning_rate": 0.0005, + "loss": 1.573, + "step": 356700 + }, + { + "epoch": 27.98, + "learning_rate": 0.0005, + "loss": 1.6276, + "step": 356800 + }, + { + "epoch": 27.99, + "learning_rate": 0.0005, + "loss": 1.5962, + "step": 356900 + }, + { + "epoch": 28.0, + "learning_rate": 0.0005, + "loss": 1.5872, + "step": 357000 + }, + { + "epoch": 28.0, + "learning_rate": 0.0005, + "loss": 1.5682, + "step": 357100 + }, + { + "epoch": 28.01, + "learning_rate": 0.0005, + "loss": 1.5024, + "step": 357200 + }, + { + "epoch": 28.02, + "learning_rate": 0.0005, + "loss": 1.5107, + "step": 357300 + }, + { + "epoch": 28.03, + "learning_rate": 0.0005, + "loss": 1.5163, + "step": 357400 + }, + { + "epoch": 28.03, + "learning_rate": 0.0005, + "loss": 1.526, + "step": 357500 + }, + { + "epoch": 28.04, + "learning_rate": 0.0005, + "loss": 1.5352, + "step": 357600 + }, + { + "epoch": 28.05, + "learning_rate": 0.0005, + "loss": 1.5269, + "step": 357700 + }, + { + "epoch": 28.06, + "learning_rate": 0.0005, + "loss": 1.5396, + "step": 357800 + }, + { + "epoch": 28.07, + "learning_rate": 0.0005, + "loss": 1.523, + "step": 357900 + }, + { + "epoch": 28.07, + "learning_rate": 0.0005, + "loss": 1.5314, + "step": 358000 + }, + { + "epoch": 28.08, + "learning_rate": 0.0005, + "loss": 1.525, + "step": 358100 + }, + { + "epoch": 28.09, + "learning_rate": 0.0005, + "loss": 1.5458, + "step": 358200 + }, + { + "epoch": 28.1, + "learning_rate": 0.0005, + "loss": 1.5383, + "step": 358300 + }, + { + "epoch": 28.11, + "learning_rate": 0.0005, + "loss": 1.5293, + "step": 358400 + }, + { + "epoch": 28.11, + "learning_rate": 0.0005, + "loss": 1.5114, + "step": 358500 + }, + { + "epoch": 28.12, + "learning_rate": 0.0005, + "loss": 1.5141, + "step": 358600 + }, + { + "epoch": 28.13, + "learning_rate": 0.0005, + "loss": 1.5213, + "step": 358700 + }, + { + "epoch": 28.14, + "learning_rate": 0.0005, + "loss": 1.5199, + "step": 358800 + }, + { + "epoch": 28.14, + "learning_rate": 0.0005, + "loss": 1.542, + "step": 358900 + }, + { + "epoch": 28.15, + "learning_rate": 0.0005, + "loss": 1.5349, + "step": 359000 + }, + { + "epoch": 28.16, + "learning_rate": 0.0005, + "loss": 1.494, + "step": 359100 + }, + { + "epoch": 28.17, + "learning_rate": 0.0005, + "loss": 1.5497, + "step": 359200 + }, + { + "epoch": 28.18, + "learning_rate": 0.0005, + "loss": 1.5573, + "step": 359300 + }, + { + "epoch": 28.18, + "learning_rate": 0.0005, + "loss": 1.533, + "step": 359400 + }, + { + "epoch": 28.19, + "learning_rate": 0.0005, + "loss": 1.5426, + "step": 359500 + }, + { + "epoch": 28.2, + "learning_rate": 0.0005, + "loss": 1.5489, + "step": 359600 + }, + { + "epoch": 28.21, + "learning_rate": 0.0005, + "loss": 1.5595, + "step": 359700 + }, + { + "epoch": 28.22, + "learning_rate": 0.0005, + "loss": 1.5472, + "step": 359800 + }, + { + "epoch": 28.22, + "learning_rate": 0.0005, + "loss": 1.5507, + "step": 359900 + }, + { + "epoch": 28.23, + "learning_rate": 0.0005, + "loss": 1.558, + "step": 360000 + }, + { + "epoch": 28.23, + "eval_gen_len": 18.821665048115122, + "eval_loss": 2.0342040061950684, + "eval_rouge1": 35.4848, + "eval_rouge2": 14.3471, + "eval_rougeL": 29.1774, + "eval_rougeLsum": 29.1699, + "eval_runtime": 355.9312, + "eval_samples_per_second": 31.824, + "eval_steps_per_second": 1.989, + "step": 360000 + }, + { + "epoch": 28.24, + "learning_rate": 0.0005, + "loss": 1.5385, + "step": 360100 + }, + { + "epoch": 28.25, + "learning_rate": 0.0005, + "loss": 1.5381, + "step": 360200 + }, + { + "epoch": 28.25, + "learning_rate": 0.0005, + "loss": 1.5552, + "step": 360300 + }, + { + "epoch": 28.26, + "learning_rate": 0.0005, + "loss": 1.5452, + "step": 360400 + }, + { + "epoch": 28.27, + "learning_rate": 0.0005, + "loss": 1.5726, + "step": 360500 + }, + { + "epoch": 28.28, + "learning_rate": 0.0005, + "loss": 1.5543, + "step": 360600 + }, + { + "epoch": 28.29, + "learning_rate": 0.0005, + "loss": 1.5459, + "step": 360700 + }, + { + "epoch": 28.29, + "learning_rate": 0.0005, + "loss": 1.5494, + "step": 360800 + }, + { + "epoch": 28.3, + "learning_rate": 0.0005, + "loss": 1.5852, + "step": 360900 + }, + { + "epoch": 28.31, + "learning_rate": 0.0005, + "loss": 1.5807, + "step": 361000 + }, + { + "epoch": 28.32, + "learning_rate": 0.0005, + "loss": 1.5449, + "step": 361100 + }, + { + "epoch": 28.32, + "learning_rate": 0.0005, + "loss": 1.5605, + "step": 361200 + }, + { + "epoch": 28.33, + "learning_rate": 0.0005, + "loss": 1.5405, + "step": 361300 + }, + { + "epoch": 28.34, + "learning_rate": 0.0005, + "loss": 1.5573, + "step": 361400 + }, + { + "epoch": 28.35, + "learning_rate": 0.0005, + "loss": 1.5437, + "step": 361500 + }, + { + "epoch": 28.36, + "learning_rate": 0.0005, + "loss": 1.5475, + "step": 361600 + }, + { + "epoch": 28.36, + "learning_rate": 0.0005, + "loss": 1.5543, + "step": 361700 + }, + { + "epoch": 28.37, + "learning_rate": 0.0005, + "loss": 1.5673, + "step": 361800 + }, + { + "epoch": 28.38, + "learning_rate": 0.0005, + "loss": 1.5429, + "step": 361900 + }, + { + "epoch": 28.39, + "learning_rate": 0.0005, + "loss": 1.5304, + "step": 362000 + }, + { + "epoch": 28.4, + "learning_rate": 0.0005, + "loss": 1.5438, + "step": 362100 + }, + { + "epoch": 28.4, + "learning_rate": 0.0005, + "loss": 1.5253, + "step": 362200 + }, + { + "epoch": 28.41, + "learning_rate": 0.0005, + "loss": 1.5658, + "step": 362300 + }, + { + "epoch": 28.42, + "learning_rate": 0.0005, + "loss": 1.5737, + "step": 362400 + }, + { + "epoch": 28.43, + "learning_rate": 0.0005, + "loss": 1.5934, + "step": 362500 + }, + { + "epoch": 28.43, + "learning_rate": 0.0005, + "loss": 1.5434, + "step": 362600 + }, + { + "epoch": 28.44, + "learning_rate": 0.0005, + "loss": 1.5473, + "step": 362700 + }, + { + "epoch": 28.45, + "learning_rate": 0.0005, + "loss": 1.5754, + "step": 362800 + }, + { + "epoch": 28.46, + "learning_rate": 0.0005, + "loss": 1.5531, + "step": 362900 + }, + { + "epoch": 28.47, + "learning_rate": 0.0005, + "loss": 1.5496, + "step": 363000 + }, + { + "epoch": 28.47, + "learning_rate": 0.0005, + "loss": 1.5591, + "step": 363100 + }, + { + "epoch": 28.48, + "learning_rate": 0.0005, + "loss": 1.5777, + "step": 363200 + }, + { + "epoch": 28.49, + "learning_rate": 0.0005, + "loss": 1.5633, + "step": 363300 + }, + { + "epoch": 28.5, + "learning_rate": 0.0005, + "loss": 1.569, + "step": 363400 + }, + { + "epoch": 28.51, + "learning_rate": 0.0005, + "loss": 1.5578, + "step": 363500 + }, + { + "epoch": 28.51, + "learning_rate": 0.0005, + "loss": 1.5764, + "step": 363600 + }, + { + "epoch": 28.52, + "learning_rate": 0.0005, + "loss": 1.5284, + "step": 363700 + }, + { + "epoch": 28.53, + "learning_rate": 0.0005, + "loss": 1.5656, + "step": 363800 + }, + { + "epoch": 28.54, + "learning_rate": 0.0005, + "loss": 1.5843, + "step": 363900 + }, + { + "epoch": 28.54, + "learning_rate": 0.0005, + "loss": 1.5774, + "step": 364000 + }, + { + "epoch": 28.55, + "learning_rate": 0.0005, + "loss": 1.5399, + "step": 364100 + }, + { + "epoch": 28.56, + "learning_rate": 0.0005, + "loss": 1.5777, + "step": 364200 + }, + { + "epoch": 28.57, + "learning_rate": 0.0005, + "loss": 1.5734, + "step": 364300 + }, + { + "epoch": 28.58, + "learning_rate": 0.0005, + "loss": 1.5538, + "step": 364400 + }, + { + "epoch": 28.58, + "learning_rate": 0.0005, + "loss": 1.551, + "step": 364500 + }, + { + "epoch": 28.59, + "learning_rate": 0.0005, + "loss": 1.5904, + "step": 364600 + }, + { + "epoch": 28.6, + "learning_rate": 0.0005, + "loss": 1.5802, + "step": 364700 + }, + { + "epoch": 28.61, + "learning_rate": 0.0005, + "loss": 1.5516, + "step": 364800 + }, + { + "epoch": 28.62, + "learning_rate": 0.0005, + "loss": 1.5416, + "step": 364900 + }, + { + "epoch": 28.62, + "learning_rate": 0.0005, + "loss": 1.5757, + "step": 365000 + }, + { + "epoch": 28.63, + "learning_rate": 0.0005, + "loss": 1.5526, + "step": 365100 + }, + { + "epoch": 28.64, + "learning_rate": 0.0005, + "loss": 1.5526, + "step": 365200 + }, + { + "epoch": 28.65, + "learning_rate": 0.0005, + "loss": 1.5748, + "step": 365300 + }, + { + "epoch": 28.65, + "learning_rate": 0.0005, + "loss": 1.5667, + "step": 365400 + }, + { + "epoch": 28.66, + "learning_rate": 0.0005, + "loss": 1.5389, + "step": 365500 + }, + { + "epoch": 28.67, + "learning_rate": 0.0005, + "loss": 1.5805, + "step": 365600 + }, + { + "epoch": 28.68, + "learning_rate": 0.0005, + "loss": 1.5537, + "step": 365700 + }, + { + "epoch": 28.69, + "learning_rate": 0.0005, + "loss": 1.5464, + "step": 365800 + }, + { + "epoch": 28.69, + "learning_rate": 0.0005, + "loss": 1.5525, + "step": 365900 + }, + { + "epoch": 28.7, + "learning_rate": 0.0005, + "loss": 1.5933, + "step": 366000 + }, + { + "epoch": 28.71, + "learning_rate": 0.0005, + "loss": 1.5686, + "step": 366100 + }, + { + "epoch": 28.72, + "learning_rate": 0.0005, + "loss": 1.5916, + "step": 366200 + }, + { + "epoch": 28.72, + "learning_rate": 0.0005, + "loss": 1.5913, + "step": 366300 + }, + { + "epoch": 28.73, + "learning_rate": 0.0005, + "loss": 1.5873, + "step": 366400 + }, + { + "epoch": 28.74, + "learning_rate": 0.0005, + "loss": 1.5629, + "step": 366500 + }, + { + "epoch": 28.75, + "learning_rate": 0.0005, + "loss": 1.5586, + "step": 366600 + }, + { + "epoch": 28.76, + "learning_rate": 0.0005, + "loss": 1.5848, + "step": 366700 + }, + { + "epoch": 28.76, + "learning_rate": 0.0005, + "loss": 1.6012, + "step": 366800 + }, + { + "epoch": 28.77, + "learning_rate": 0.0005, + "loss": 1.5689, + "step": 366900 + }, + { + "epoch": 28.78, + "learning_rate": 0.0005, + "loss": 1.6003, + "step": 367000 + }, + { + "epoch": 28.79, + "learning_rate": 0.0005, + "loss": 1.6016, + "step": 367100 + }, + { + "epoch": 28.8, + "learning_rate": 0.0005, + "loss": 1.6006, + "step": 367200 + }, + { + "epoch": 28.8, + "learning_rate": 0.0005, + "loss": 1.5989, + "step": 367300 + }, + { + "epoch": 28.81, + "learning_rate": 0.0005, + "loss": 1.5512, + "step": 367400 + }, + { + "epoch": 28.82, + "learning_rate": 0.0005, + "loss": 1.5561, + "step": 367500 + }, + { + "epoch": 28.83, + "learning_rate": 0.0005, + "loss": 1.5544, + "step": 367600 + }, + { + "epoch": 28.83, + "learning_rate": 0.0005, + "loss": 1.5827, + "step": 367700 + }, + { + "epoch": 28.84, + "learning_rate": 0.0005, + "loss": 1.5807, + "step": 367800 + }, + { + "epoch": 28.85, + "learning_rate": 0.0005, + "loss": 1.5896, + "step": 367900 + }, + { + "epoch": 28.86, + "learning_rate": 0.0005, + "loss": 1.5632, + "step": 368000 + }, + { + "epoch": 28.87, + "learning_rate": 0.0005, + "loss": 1.6038, + "step": 368100 + }, + { + "epoch": 28.87, + "learning_rate": 0.0005, + "loss": 1.572, + "step": 368200 + }, + { + "epoch": 28.88, + "learning_rate": 0.0005, + "loss": 1.5859, + "step": 368300 + }, + { + "epoch": 28.89, + "learning_rate": 0.0005, + "loss": 1.5879, + "step": 368400 + }, + { + "epoch": 28.9, + "learning_rate": 0.0005, + "loss": 1.5807, + "step": 368500 + }, + { + "epoch": 28.91, + "learning_rate": 0.0005, + "loss": 1.5997, + "step": 368600 + }, + { + "epoch": 28.91, + "learning_rate": 0.0005, + "loss": 1.5818, + "step": 368700 + }, + { + "epoch": 28.92, + "learning_rate": 0.0005, + "loss": 1.5872, + "step": 368800 + }, + { + "epoch": 28.93, + "learning_rate": 0.0005, + "loss": 1.5478, + "step": 368900 + }, + { + "epoch": 28.94, + "learning_rate": 0.0005, + "loss": 1.5551, + "step": 369000 + }, + { + "epoch": 28.94, + "learning_rate": 0.0005, + "loss": 1.565, + "step": 369100 + }, + { + "epoch": 28.95, + "learning_rate": 0.0005, + "loss": 1.5758, + "step": 369200 + }, + { + "epoch": 28.96, + "learning_rate": 0.0005, + "loss": 1.5759, + "step": 369300 + }, + { + "epoch": 28.97, + "learning_rate": 0.0005, + "loss": 1.5761, + "step": 369400 + }, + { + "epoch": 28.98, + "learning_rate": 0.0005, + "loss": 1.6011, + "step": 369500 + }, + { + "epoch": 28.98, + "learning_rate": 0.0005, + "loss": 1.5991, + "step": 369600 + }, + { + "epoch": 28.99, + "learning_rate": 0.0005, + "loss": 1.5906, + "step": 369700 + }, + { + "epoch": 29.0, + "learning_rate": 0.0005, + "loss": 1.5794, + "step": 369800 + }, + { + "epoch": 29.01, + "learning_rate": 0.0005, + "loss": 1.5154, + "step": 369900 + }, + { + "epoch": 29.02, + "learning_rate": 0.0005, + "loss": 1.5134, + "step": 370000 + }, + { + "epoch": 29.02, + "eval_gen_len": 18.791383420146552, + "eval_loss": 2.0581727027893066, + "eval_rouge1": 35.3426, + "eval_rouge2": 14.425, + "eval_rougeL": 29.1042, + "eval_rougeLsum": 29.0882, + "eval_runtime": 354.5816, + "eval_samples_per_second": 31.945, + "eval_steps_per_second": 1.997, + "step": 370000 + }, + { + "epoch": 29.02, + "learning_rate": 0.0005, + "loss": 1.5084, + "step": 370100 + }, + { + "epoch": 29.03, + "learning_rate": 0.0005, + "loss": 1.5087, + "step": 370200 + }, + { + "epoch": 29.04, + "learning_rate": 0.0005, + "loss": 1.5158, + "step": 370300 + }, + { + "epoch": 29.05, + "learning_rate": 0.0005, + "loss": 1.5025, + "step": 370400 + }, + { + "epoch": 29.05, + "learning_rate": 0.0005, + "loss": 1.5105, + "step": 370500 + }, + { + "epoch": 29.06, + "learning_rate": 0.0005, + "loss": 1.4915, + "step": 370600 + }, + { + "epoch": 29.07, + "learning_rate": 0.0005, + "loss": 1.5036, + "step": 370700 + }, + { + "epoch": 29.08, + "learning_rate": 0.0005, + "loss": 1.5132, + "step": 370800 + }, + { + "epoch": 29.09, + "learning_rate": 0.0005, + "loss": 1.5009, + "step": 370900 + }, + { + "epoch": 29.09, + "learning_rate": 0.0005, + "loss": 1.5353, + "step": 371000 + }, + { + "epoch": 29.1, + "learning_rate": 0.0005, + "loss": 1.4914, + "step": 371100 + }, + { + "epoch": 29.11, + "learning_rate": 0.0005, + "loss": 1.5104, + "step": 371200 + }, + { + "epoch": 29.12, + "learning_rate": 0.0005, + "loss": 1.5413, + "step": 371300 + }, + { + "epoch": 29.12, + "learning_rate": 0.0005, + "loss": 1.5274, + "step": 371400 + }, + { + "epoch": 29.13, + "learning_rate": 0.0005, + "loss": 1.5159, + "step": 371500 + }, + { + "epoch": 29.14, + "learning_rate": 0.0005, + "loss": 1.54, + "step": 371600 + }, + { + "epoch": 29.15, + "learning_rate": 0.0005, + "loss": 1.5592, + "step": 371700 + }, + { + "epoch": 29.16, + "learning_rate": 0.0005, + "loss": 1.5061, + "step": 371800 + }, + { + "epoch": 29.16, + "learning_rate": 0.0005, + "loss": 1.5144, + "step": 371900 + }, + { + "epoch": 29.17, + "learning_rate": 0.0005, + "loss": 1.5369, + "step": 372000 + }, + { + "epoch": 29.18, + "learning_rate": 0.0005, + "loss": 1.5371, + "step": 372100 + }, + { + "epoch": 29.19, + "learning_rate": 0.0005, + "loss": 1.5418, + "step": 372200 + }, + { + "epoch": 29.2, + "learning_rate": 0.0005, + "loss": 1.5116, + "step": 372300 + }, + { + "epoch": 29.2, + "learning_rate": 0.0005, + "loss": 1.507, + "step": 372400 + }, + { + "epoch": 29.21, + "learning_rate": 0.0005, + "loss": 1.5431, + "step": 372500 + }, + { + "epoch": 29.22, + "learning_rate": 0.0005, + "loss": 1.5175, + "step": 372600 + }, + { + "epoch": 29.23, + "learning_rate": 0.0005, + "loss": 1.5071, + "step": 372700 + }, + { + "epoch": 29.23, + "learning_rate": 0.0005, + "loss": 1.5442, + "step": 372800 + }, + { + "epoch": 29.24, + "learning_rate": 0.0005, + "loss": 1.5561, + "step": 372900 + }, + { + "epoch": 29.25, + "learning_rate": 0.0005, + "loss": 1.4922, + "step": 373000 + }, + { + "epoch": 29.26, + "learning_rate": 0.0005, + "loss": 1.5415, + "step": 373100 + }, + { + "epoch": 29.27, + "learning_rate": 0.0005, + "loss": 1.543, + "step": 373200 + }, + { + "epoch": 29.27, + "learning_rate": 0.0005, + "loss": 1.5519, + "step": 373300 + }, + { + "epoch": 29.28, + "learning_rate": 0.0005, + "loss": 1.5147, + "step": 373400 + }, + { + "epoch": 29.29, + "learning_rate": 0.0005, + "loss": 1.5495, + "step": 373500 + }, + { + "epoch": 29.3, + "learning_rate": 0.0005, + "loss": 1.5376, + "step": 373600 + }, + { + "epoch": 29.31, + "learning_rate": 0.0005, + "loss": 1.5431, + "step": 373700 + }, + { + "epoch": 29.31, + "learning_rate": 0.0005, + "loss": 1.5454, + "step": 373800 + }, + { + "epoch": 29.32, + "learning_rate": 0.0005, + "loss": 1.5311, + "step": 373900 + }, + { + "epoch": 29.33, + "learning_rate": 0.0005, + "loss": 1.531, + "step": 374000 + }, + { + "epoch": 29.34, + "learning_rate": 0.0005, + "loss": 1.548, + "step": 374100 + }, + { + "epoch": 29.34, + "learning_rate": 0.0005, + "loss": 1.5405, + "step": 374200 + }, + { + "epoch": 29.35, + "learning_rate": 0.0005, + "loss": 1.5334, + "step": 374300 + }, + { + "epoch": 29.36, + "learning_rate": 0.0005, + "loss": 1.5425, + "step": 374400 + }, + { + "epoch": 29.37, + "learning_rate": 0.0005, + "loss": 1.5558, + "step": 374500 + }, + { + "epoch": 29.38, + "learning_rate": 0.0005, + "loss": 1.5261, + "step": 374600 + }, + { + "epoch": 29.38, + "learning_rate": 0.0005, + "loss": 1.5543, + "step": 374700 + }, + { + "epoch": 29.39, + "learning_rate": 0.0005, + "loss": 1.5462, + "step": 374800 + }, + { + "epoch": 29.4, + "learning_rate": 0.0005, + "loss": 1.5245, + "step": 374900 + }, + { + "epoch": 29.41, + "learning_rate": 0.0005, + "loss": 1.5498, + "step": 375000 + }, + { + "epoch": 29.41, + "learning_rate": 0.0005, + "loss": 1.5805, + "step": 375100 + }, + { + "epoch": 29.42, + "learning_rate": 0.0005, + "loss": 1.5346, + "step": 375200 + }, + { + "epoch": 29.43, + "learning_rate": 0.0005, + "loss": 1.5383, + "step": 375300 + }, + { + "epoch": 29.44, + "learning_rate": 0.0005, + "loss": 1.5239, + "step": 375400 + }, + { + "epoch": 29.45, + "learning_rate": 0.0005, + "loss": 1.588, + "step": 375500 + }, + { + "epoch": 29.45, + "learning_rate": 0.0005, + "loss": 1.5122, + "step": 375600 + }, + { + "epoch": 29.46, + "learning_rate": 0.0005, + "loss": 1.5565, + "step": 375700 + }, + { + "epoch": 29.47, + "learning_rate": 0.0005, + "loss": 1.5167, + "step": 375800 + }, + { + "epoch": 29.48, + "learning_rate": 0.0005, + "loss": 1.5715, + "step": 375900 + }, + { + "epoch": 29.49, + "learning_rate": 0.0005, + "loss": 1.5449, + "step": 376000 + }, + { + "epoch": 29.49, + "learning_rate": 0.0005, + "loss": 1.5567, + "step": 376100 + }, + { + "epoch": 29.5, + "learning_rate": 0.0005, + "loss": 1.5546, + "step": 376200 + }, + { + "epoch": 29.51, + "learning_rate": 0.0005, + "loss": 1.5428, + "step": 376300 + }, + { + "epoch": 29.52, + "learning_rate": 0.0005, + "loss": 1.5627, + "step": 376400 + }, + { + "epoch": 29.52, + "learning_rate": 0.0005, + "loss": 1.5605, + "step": 376500 + }, + { + "epoch": 29.53, + "learning_rate": 0.0005, + "loss": 1.5493, + "step": 376600 + }, + { + "epoch": 29.54, + "learning_rate": 0.0005, + "loss": 1.5653, + "step": 376700 + }, + { + "epoch": 29.55, + "learning_rate": 0.0005, + "loss": 1.5443, + "step": 376800 + }, + { + "epoch": 29.56, + "learning_rate": 0.0005, + "loss": 1.5499, + "step": 376900 + }, + { + "epoch": 29.56, + "learning_rate": 0.0005, + "loss": 1.5444, + "step": 377000 + }, + { + "epoch": 29.57, + "learning_rate": 0.0005, + "loss": 1.5792, + "step": 377100 + }, + { + "epoch": 29.58, + "learning_rate": 0.0005, + "loss": 1.5403, + "step": 377200 + }, + { + "epoch": 29.59, + "learning_rate": 0.0005, + "loss": 1.5827, + "step": 377300 + }, + { + "epoch": 29.6, + "learning_rate": 0.0005, + "loss": 1.5682, + "step": 377400 + }, + { + "epoch": 29.6, + "learning_rate": 0.0005, + "loss": 1.5276, + "step": 377500 + }, + { + "epoch": 29.61, + "learning_rate": 0.0005, + "loss": 1.5516, + "step": 377600 + }, + { + "epoch": 29.62, + "learning_rate": 0.0005, + "loss": 1.5465, + "step": 377700 + }, + { + "epoch": 29.63, + "learning_rate": 0.0005, + "loss": 1.577, + "step": 377800 + }, + { + "epoch": 29.63, + "learning_rate": 0.0005, + "loss": 1.575, + "step": 377900 + }, + { + "epoch": 29.64, + "learning_rate": 0.0005, + "loss": 1.5309, + "step": 378000 + }, + { + "epoch": 29.65, + "learning_rate": 0.0005, + "loss": 1.5714, + "step": 378100 + }, + { + "epoch": 29.66, + "learning_rate": 0.0005, + "loss": 1.5524, + "step": 378200 + }, + { + "epoch": 29.67, + "learning_rate": 0.0005, + "loss": 1.5659, + "step": 378300 + }, + { + "epoch": 29.67, + "learning_rate": 0.0005, + "loss": 1.5679, + "step": 378400 + }, + { + "epoch": 29.68, + "learning_rate": 0.0005, + "loss": 1.5378, + "step": 378500 + }, + { + "epoch": 29.69, + "learning_rate": 0.0005, + "loss": 1.5523, + "step": 378600 + }, + { + "epoch": 29.7, + "learning_rate": 0.0005, + "loss": 1.5675, + "step": 378700 + }, + { + "epoch": 29.71, + "learning_rate": 0.0005, + "loss": 1.578, + "step": 378800 + }, + { + "epoch": 29.71, + "learning_rate": 0.0005, + "loss": 1.5386, + "step": 378900 + }, + { + "epoch": 29.72, + "learning_rate": 0.0005, + "loss": 1.5588, + "step": 379000 + }, + { + "epoch": 29.73, + "learning_rate": 0.0005, + "loss": 1.5356, + "step": 379100 + }, + { + "epoch": 29.74, + "learning_rate": 0.0005, + "loss": 1.5889, + "step": 379200 + }, + { + "epoch": 29.74, + "learning_rate": 0.0005, + "loss": 1.5759, + "step": 379300 + }, + { + "epoch": 29.75, + "learning_rate": 0.0005, + "loss": 1.556, + "step": 379400 + }, + { + "epoch": 29.76, + "learning_rate": 0.0005, + "loss": 1.5655, + "step": 379500 + }, + { + "epoch": 29.77, + "learning_rate": 0.0005, + "loss": 1.589, + "step": 379600 + }, + { + "epoch": 29.78, + "learning_rate": 0.0005, + "loss": 1.5837, + "step": 379700 + }, + { + "epoch": 29.78, + "learning_rate": 0.0005, + "loss": 1.5566, + "step": 379800 + }, + { + "epoch": 29.79, + "learning_rate": 0.0005, + "loss": 1.5692, + "step": 379900 + }, + { + "epoch": 29.8, + "learning_rate": 0.0005, + "loss": 1.5468, + "step": 380000 + }, + { + "epoch": 29.8, + "eval_gen_len": 18.78440893440452, + "eval_loss": 2.0447006225585938, + "eval_rouge1": 35.5171, + "eval_rouge2": 14.5247, + "eval_rougeL": 29.2402, + "eval_rougeLsum": 29.2352, + "eval_runtime": 354.064, + "eval_samples_per_second": 31.991, + "eval_steps_per_second": 2.0, + "step": 380000 + }, + { + "epoch": 29.81, + "learning_rate": 0.0005, + "loss": 1.5943, + "step": 380100 + }, + { + "epoch": 29.81, + "learning_rate": 0.0005, + "loss": 1.5811, + "step": 380200 + }, + { + "epoch": 29.82, + "learning_rate": 0.0005, + "loss": 1.5551, + "step": 380300 + }, + { + "epoch": 29.83, + "learning_rate": 0.0005, + "loss": 1.5615, + "step": 380400 + }, + { + "epoch": 29.84, + "learning_rate": 0.0005, + "loss": 1.5603, + "step": 380500 + }, + { + "epoch": 29.85, + "learning_rate": 0.0005, + "loss": 1.5292, + "step": 380600 + }, + { + "epoch": 29.85, + "learning_rate": 0.0005, + "loss": 1.5513, + "step": 380700 + }, + { + "epoch": 29.86, + "learning_rate": 0.0005, + "loss": 1.568, + "step": 380800 + }, + { + "epoch": 29.87, + "learning_rate": 0.0005, + "loss": 1.5795, + "step": 380900 + }, + { + "epoch": 29.88, + "learning_rate": 0.0005, + "loss": 1.5701, + "step": 381000 + }, + { + "epoch": 29.89, + "learning_rate": 0.0005, + "loss": 1.5592, + "step": 381100 + }, + { + "epoch": 29.89, + "learning_rate": 0.0005, + "loss": 1.5821, + "step": 381200 + }, + { + "epoch": 29.9, + "learning_rate": 0.0005, + "loss": 1.5689, + "step": 381300 + }, + { + "epoch": 29.91, + "learning_rate": 0.0005, + "loss": 1.5638, + "step": 381400 + }, + { + "epoch": 29.92, + "learning_rate": 0.0005, + "loss": 1.5541, + "step": 381500 + }, + { + "epoch": 29.92, + "learning_rate": 0.0005, + "loss": 1.5629, + "step": 381600 + }, + { + "epoch": 29.93, + "learning_rate": 0.0005, + "loss": 1.5603, + "step": 381700 + }, + { + "epoch": 29.94, + "learning_rate": 0.0005, + "loss": 1.5926, + "step": 381800 + }, + { + "epoch": 29.95, + "learning_rate": 0.0005, + "loss": 1.5746, + "step": 381900 + }, + { + "epoch": 29.96, + "learning_rate": 0.0005, + "loss": 1.5786, + "step": 382000 + }, + { + "epoch": 29.96, + "learning_rate": 0.0005, + "loss": 1.5756, + "step": 382100 + }, + { + "epoch": 29.97, + "learning_rate": 0.0005, + "loss": 1.5677, + "step": 382200 + }, + { + "epoch": 29.98, + "learning_rate": 0.0005, + "loss": 1.5615, + "step": 382300 + }, + { + "epoch": 29.99, + "learning_rate": 0.0005, + "loss": 1.5771, + "step": 382400 + }, + { + "epoch": 30.0, + "learning_rate": 0.0005, + "loss": 1.5749, + "step": 382500 + }, + { + "epoch": 30.0, + "learning_rate": 0.0005, + "loss": 1.5499, + "step": 382600 + }, + { + "epoch": 30.01, + "learning_rate": 0.0005, + "loss": 1.4999, + "step": 382700 + }, + { + "epoch": 30.02, + "learning_rate": 0.0005, + "loss": 1.5096, + "step": 382800 + }, + { + "epoch": 30.03, + "learning_rate": 0.0005, + "loss": 1.4812, + "step": 382900 + }, + { + "epoch": 30.03, + "learning_rate": 0.0005, + "loss": 1.4866, + "step": 383000 + }, + { + "epoch": 30.04, + "learning_rate": 0.0005, + "loss": 1.4996, + "step": 383100 + }, + { + "epoch": 30.05, + "learning_rate": 0.0005, + "loss": 1.5003, + "step": 383200 + }, + { + "epoch": 30.06, + "learning_rate": 0.0005, + "loss": 1.4927, + "step": 383300 + }, + { + "epoch": 30.07, + "learning_rate": 0.0005, + "loss": 1.4954, + "step": 383400 + }, + { + "epoch": 30.07, + "learning_rate": 0.0005, + "loss": 1.5292, + "step": 383500 + }, + { + "epoch": 30.08, + "learning_rate": 0.0005, + "loss": 1.5011, + "step": 383600 + }, + { + "epoch": 30.09, + "learning_rate": 0.0005, + "loss": 1.5287, + "step": 383700 + }, + { + "epoch": 30.1, + "learning_rate": 0.0005, + "loss": 1.485, + "step": 383800 + }, + { + "epoch": 30.11, + "learning_rate": 0.0005, + "loss": 1.5102, + "step": 383900 + }, + { + "epoch": 30.11, + "learning_rate": 0.0005, + "loss": 1.5175, + "step": 384000 + }, + { + "epoch": 30.12, + "learning_rate": 0.0005, + "loss": 1.5078, + "step": 384100 + }, + { + "epoch": 30.13, + "learning_rate": 0.0005, + "loss": 1.5168, + "step": 384200 + }, + { + "epoch": 30.14, + "learning_rate": 0.0005, + "loss": 1.5254, + "step": 384300 + }, + { + "epoch": 30.14, + "learning_rate": 0.0005, + "loss": 1.5083, + "step": 384400 + }, + { + "epoch": 30.15, + "learning_rate": 0.0005, + "loss": 1.5071, + "step": 384500 + }, + { + "epoch": 30.16, + "learning_rate": 0.0005, + "loss": 1.5103, + "step": 384600 + }, + { + "epoch": 30.17, + "learning_rate": 0.0005, + "loss": 1.4913, + "step": 384700 + }, + { + "epoch": 30.18, + "learning_rate": 0.0005, + "loss": 1.4917, + "step": 384800 + }, + { + "epoch": 30.18, + "learning_rate": 0.0005, + "loss": 1.5229, + "step": 384900 + }, + { + "epoch": 30.19, + "learning_rate": 0.0005, + "loss": 1.4823, + "step": 385000 + }, + { + "epoch": 30.2, + "learning_rate": 0.0005, + "loss": 1.5141, + "step": 385100 + }, + { + "epoch": 30.21, + "learning_rate": 0.0005, + "loss": 1.5119, + "step": 385200 + }, + { + "epoch": 30.21, + "learning_rate": 0.0005, + "loss": 1.5448, + "step": 385300 + }, + { + "epoch": 30.22, + "learning_rate": 0.0005, + "loss": 1.5142, + "step": 385400 + }, + { + "epoch": 30.23, + "learning_rate": 0.0005, + "loss": 1.5171, + "step": 385500 + }, + { + "epoch": 30.24, + "learning_rate": 0.0005, + "loss": 1.5354, + "step": 385600 + }, + { + "epoch": 30.25, + "learning_rate": 0.0005, + "loss": 1.4944, + "step": 385700 + }, + { + "epoch": 30.25, + "learning_rate": 0.0005, + "loss": 1.523, + "step": 385800 + }, + { + "epoch": 30.26, + "learning_rate": 0.0005, + "loss": 1.5474, + "step": 385900 + }, + { + "epoch": 30.27, + "learning_rate": 0.0005, + "loss": 1.5116, + "step": 386000 + }, + { + "epoch": 30.28, + "learning_rate": 0.0005, + "loss": 1.5224, + "step": 386100 + }, + { + "epoch": 30.29, + "learning_rate": 0.0005, + "loss": 1.5225, + "step": 386200 + }, + { + "epoch": 30.29, + "learning_rate": 0.0005, + "loss": 1.5191, + "step": 386300 + }, + { + "epoch": 30.3, + "learning_rate": 0.0005, + "loss": 1.5531, + "step": 386400 + }, + { + "epoch": 30.31, + "learning_rate": 0.0005, + "loss": 1.5301, + "step": 386500 + }, + { + "epoch": 30.32, + "learning_rate": 0.0005, + "loss": 1.5313, + "step": 386600 + }, + { + "epoch": 30.32, + "learning_rate": 0.0005, + "loss": 1.4778, + "step": 386700 + }, + { + "epoch": 30.33, + "learning_rate": 0.0005, + "loss": 1.5302, + "step": 386800 + }, + { + "epoch": 30.34, + "learning_rate": 0.0005, + "loss": 1.5164, + "step": 386900 + }, + { + "epoch": 30.35, + "learning_rate": 0.0005, + "loss": 1.5088, + "step": 387000 + }, + { + "epoch": 30.36, + "learning_rate": 0.0005, + "loss": 1.5215, + "step": 387100 + }, + { + "epoch": 30.36, + "learning_rate": 0.0005, + "loss": 1.5594, + "step": 387200 + }, + { + "epoch": 30.37, + "learning_rate": 0.0005, + "loss": 1.534, + "step": 387300 + }, + { + "epoch": 30.38, + "learning_rate": 0.0005, + "loss": 1.5421, + "step": 387400 + }, + { + "epoch": 30.39, + "learning_rate": 0.0005, + "loss": 1.5384, + "step": 387500 + }, + { + "epoch": 30.4, + "learning_rate": 0.0005, + "loss": 1.5201, + "step": 387600 + }, + { + "epoch": 30.4, + "learning_rate": 0.0005, + "loss": 1.5385, + "step": 387700 + }, + { + "epoch": 30.41, + "learning_rate": 0.0005, + "loss": 1.5387, + "step": 387800 + }, + { + "epoch": 30.42, + "learning_rate": 0.0005, + "loss": 1.5556, + "step": 387900 + }, + { + "epoch": 30.43, + "learning_rate": 0.0005, + "loss": 1.5433, + "step": 388000 + }, + { + "epoch": 30.43, + "learning_rate": 0.0005, + "loss": 1.5271, + "step": 388100 + }, + { + "epoch": 30.44, + "learning_rate": 0.0005, + "loss": 1.5368, + "step": 388200 + }, + { + "epoch": 30.45, + "learning_rate": 0.0005, + "loss": 1.544, + "step": 388300 + }, + { + "epoch": 30.46, + "learning_rate": 0.0005, + "loss": 1.5288, + "step": 388400 + }, + { + "epoch": 30.47, + "learning_rate": 0.0005, + "loss": 1.5404, + "step": 388500 + }, + { + "epoch": 30.47, + "learning_rate": 0.0005, + "loss": 1.5382, + "step": 388600 + }, + { + "epoch": 30.48, + "learning_rate": 0.0005, + "loss": 1.5495, + "step": 388700 + }, + { + "epoch": 30.49, + "learning_rate": 0.0005, + "loss": 1.58, + "step": 388800 + }, + { + "epoch": 30.5, + "learning_rate": 0.0005, + "loss": 1.5482, + "step": 388900 + }, + { + "epoch": 30.51, + "learning_rate": 0.0005, + "loss": 1.4916, + "step": 389000 + }, + { + "epoch": 30.51, + "learning_rate": 0.0005, + "loss": 1.5232, + "step": 389100 + }, + { + "epoch": 30.52, + "learning_rate": 0.0005, + "loss": 1.5288, + "step": 389200 + }, + { + "epoch": 30.53, + "learning_rate": 0.0005, + "loss": 1.5001, + "step": 389300 + }, + { + "epoch": 30.54, + "learning_rate": 0.0005, + "loss": 1.5628, + "step": 389400 + }, + { + "epoch": 30.54, + "learning_rate": 0.0005, + "loss": 1.5341, + "step": 389500 + }, + { + "epoch": 30.55, + "learning_rate": 0.0005, + "loss": 1.522, + "step": 389600 + }, + { + "epoch": 30.56, + "learning_rate": 0.0005, + "loss": 1.5442, + "step": 389700 + }, + { + "epoch": 30.57, + "learning_rate": 0.0005, + "loss": 1.549, + "step": 389800 + }, + { + "epoch": 30.58, + "learning_rate": 0.0005, + "loss": 1.5188, + "step": 389900 + }, + { + "epoch": 30.58, + "learning_rate": 0.0005, + "loss": 1.5073, + "step": 390000 + }, + { + "epoch": 30.58, + "eval_gen_len": 18.813278008298756, + "eval_loss": 2.047095775604248, + "eval_rouge1": 35.2735, + "eval_rouge2": 14.329, + "eval_rougeL": 29.0615, + "eval_rougeLsum": 29.0566, + "eval_runtime": 354.7807, + "eval_samples_per_second": 31.927, + "eval_steps_per_second": 1.996, + "step": 390000 + }, + { + "epoch": 30.59, + "learning_rate": 0.0005, + "loss": 1.5498, + "step": 390100 + }, + { + "epoch": 30.6, + "learning_rate": 0.0005, + "loss": 1.5313, + "step": 390200 + }, + { + "epoch": 30.61, + "learning_rate": 0.0005, + "loss": 1.5331, + "step": 390300 + }, + { + "epoch": 30.61, + "learning_rate": 0.0005, + "loss": 1.5503, + "step": 390400 + }, + { + "epoch": 30.62, + "learning_rate": 0.0005, + "loss": 1.5392, + "step": 390500 + }, + { + "epoch": 30.63, + "learning_rate": 0.0005, + "loss": 1.5448, + "step": 390600 + }, + { + "epoch": 30.64, + "learning_rate": 0.0005, + "loss": 1.5421, + "step": 390700 + }, + { + "epoch": 30.65, + "learning_rate": 0.0005, + "loss": 1.5446, + "step": 390800 + }, + { + "epoch": 30.65, + "learning_rate": 0.0005, + "loss": 1.5466, + "step": 390900 + }, + { + "epoch": 30.66, + "learning_rate": 0.0005, + "loss": 1.557, + "step": 391000 + }, + { + "epoch": 30.67, + "learning_rate": 0.0005, + "loss": 1.5456, + "step": 391100 + }, + { + "epoch": 30.68, + "learning_rate": 0.0005, + "loss": 1.5441, + "step": 391200 + }, + { + "epoch": 30.69, + "learning_rate": 0.0005, + "loss": 1.5624, + "step": 391300 + }, + { + "epoch": 30.69, + "learning_rate": 0.0005, + "loss": 1.5449, + "step": 391400 + }, + { + "epoch": 30.7, + "learning_rate": 0.0005, + "loss": 1.5352, + "step": 391500 + }, + { + "epoch": 30.71, + "learning_rate": 0.0005, + "loss": 1.5159, + "step": 391600 + }, + { + "epoch": 30.72, + "learning_rate": 0.0005, + "loss": 1.5566, + "step": 391700 + }, + { + "epoch": 30.72, + "learning_rate": 0.0005, + "loss": 1.528, + "step": 391800 + }, + { + "epoch": 30.73, + "learning_rate": 0.0005, + "loss": 1.5379, + "step": 391900 + }, + { + "epoch": 30.74, + "learning_rate": 0.0005, + "loss": 1.5754, + "step": 392000 + }, + { + "epoch": 30.75, + "learning_rate": 0.0005, + "loss": 1.5689, + "step": 392100 + }, + { + "epoch": 30.76, + "learning_rate": 0.0005, + "loss": 1.5678, + "step": 392200 + }, + { + "epoch": 30.76, + "learning_rate": 0.0005, + "loss": 1.5665, + "step": 392300 + }, + { + "epoch": 30.77, + "learning_rate": 0.0005, + "loss": 1.5618, + "step": 392400 + }, + { + "epoch": 30.78, + "learning_rate": 0.0005, + "loss": 1.5661, + "step": 392500 + }, + { + "epoch": 30.79, + "learning_rate": 0.0005, + "loss": 1.5586, + "step": 392600 + }, + { + "epoch": 30.8, + "learning_rate": 0.0005, + "loss": 1.556, + "step": 392700 + }, + { + "epoch": 30.8, + "learning_rate": 0.0005, + "loss": 1.5416, + "step": 392800 + }, + { + "epoch": 30.81, + "learning_rate": 0.0005, + "loss": 1.5362, + "step": 392900 + }, + { + "epoch": 30.82, + "learning_rate": 0.0005, + "loss": 1.554, + "step": 393000 + }, + { + "epoch": 30.83, + "learning_rate": 0.0005, + "loss": 1.563, + "step": 393100 + }, + { + "epoch": 30.83, + "learning_rate": 0.0005, + "loss": 1.5251, + "step": 393200 + }, + { + "epoch": 30.84, + "learning_rate": 0.0005, + "loss": 1.5873, + "step": 393300 + }, + { + "epoch": 30.85, + "learning_rate": 0.0005, + "loss": 1.5558, + "step": 393400 + }, + { + "epoch": 30.86, + "learning_rate": 0.0005, + "loss": 1.5633, + "step": 393500 + }, + { + "epoch": 30.87, + "learning_rate": 0.0005, + "loss": 1.5633, + "step": 393600 + }, + { + "epoch": 30.87, + "learning_rate": 0.0005, + "loss": 1.5309, + "step": 393700 + }, + { + "epoch": 30.88, + "learning_rate": 0.0005, + "loss": 1.5891, + "step": 393800 + }, + { + "epoch": 30.89, + "learning_rate": 0.0005, + "loss": 1.5933, + "step": 393900 + }, + { + "epoch": 30.9, + "learning_rate": 0.0005, + "loss": 1.5798, + "step": 394000 + }, + { + "epoch": 30.9, + "learning_rate": 0.0005, + "loss": 1.5414, + "step": 394100 + }, + { + "epoch": 30.91, + "learning_rate": 0.0005, + "loss": 1.5658, + "step": 394200 + }, + { + "epoch": 30.92, + "learning_rate": 0.0005, + "loss": 1.5304, + "step": 394300 + }, + { + "epoch": 30.93, + "learning_rate": 0.0005, + "loss": 1.5448, + "step": 394400 + }, + { + "epoch": 30.94, + "learning_rate": 0.0005, + "loss": 1.5352, + "step": 394500 + }, + { + "epoch": 30.94, + "learning_rate": 0.0005, + "loss": 1.5415, + "step": 394600 + }, + { + "epoch": 30.95, + "learning_rate": 0.0005, + "loss": 1.5869, + "step": 394700 + }, + { + "epoch": 30.96, + "learning_rate": 0.0005, + "loss": 1.5442, + "step": 394800 + }, + { + "epoch": 30.97, + "learning_rate": 0.0005, + "loss": 1.5851, + "step": 394900 + }, + { + "epoch": 30.98, + "learning_rate": 0.0005, + "loss": 1.5618, + "step": 395000 + }, + { + "epoch": 30.98, + "learning_rate": 0.0005, + "loss": 1.5343, + "step": 395100 + }, + { + "epoch": 30.99, + "learning_rate": 0.0005, + "loss": 1.5599, + "step": 395200 + }, + { + "epoch": 31.0, + "learning_rate": 0.0005, + "loss": 1.5824, + "step": 395300 + }, + { + "epoch": 31.01, + "learning_rate": 0.0005, + "loss": 1.4802, + "step": 395400 + }, + { + "epoch": 31.01, + "learning_rate": 0.0005, + "loss": 1.5035, + "step": 395500 + }, + { + "epoch": 31.02, + "learning_rate": 0.0005, + "loss": 1.4814, + "step": 395600 + }, + { + "epoch": 31.03, + "learning_rate": 0.0005, + "loss": 1.4673, + "step": 395700 + }, + { + "epoch": 31.04, + "learning_rate": 0.0005, + "loss": 1.4805, + "step": 395800 + }, + { + "epoch": 31.05, + "learning_rate": 0.0005, + "loss": 1.4997, + "step": 395900 + }, + { + "epoch": 31.05, + "learning_rate": 0.0005, + "loss": 1.4932, + "step": 396000 + }, + { + "epoch": 31.06, + "learning_rate": 0.0005, + "loss": 1.4683, + "step": 396100 + }, + { + "epoch": 31.07, + "learning_rate": 0.0005, + "loss": 1.4829, + "step": 396200 + }, + { + "epoch": 31.08, + "learning_rate": 0.0005, + "loss": 1.4926, + "step": 396300 + }, + { + "epoch": 31.09, + "learning_rate": 0.0005, + "loss": 1.4833, + "step": 396400 + }, + { + "epoch": 31.09, + "learning_rate": 0.0005, + "loss": 1.5129, + "step": 396500 + }, + { + "epoch": 31.1, + "learning_rate": 0.0005, + "loss": 1.4949, + "step": 396600 + }, + { + "epoch": 31.11, + "learning_rate": 0.0005, + "loss": 1.5111, + "step": 396700 + }, + { + "epoch": 31.12, + "learning_rate": 0.0005, + "loss": 1.4714, + "step": 396800 + }, + { + "epoch": 31.12, + "learning_rate": 0.0005, + "loss": 1.4798, + "step": 396900 + }, + { + "epoch": 31.13, + "learning_rate": 0.0005, + "loss": 1.4855, + "step": 397000 + }, + { + "epoch": 31.14, + "learning_rate": 0.0005, + "loss": 1.4933, + "step": 397100 + }, + { + "epoch": 31.15, + "learning_rate": 0.0005, + "loss": 1.4793, + "step": 397200 + }, + { + "epoch": 31.16, + "learning_rate": 0.0005, + "loss": 1.4858, + "step": 397300 + }, + { + "epoch": 31.16, + "learning_rate": 0.0005, + "loss": 1.5112, + "step": 397400 + }, + { + "epoch": 31.17, + "learning_rate": 0.0005, + "loss": 1.5084, + "step": 397500 + }, + { + "epoch": 31.18, + "learning_rate": 0.0005, + "loss": 1.4982, + "step": 397600 + }, + { + "epoch": 31.19, + "learning_rate": 0.0005, + "loss": 1.4986, + "step": 397700 + }, + { + "epoch": 31.2, + "learning_rate": 0.0005, + "loss": 1.5113, + "step": 397800 + }, + { + "epoch": 31.2, + "learning_rate": 0.0005, + "loss": 1.5151, + "step": 397900 + }, + { + "epoch": 31.21, + "learning_rate": 0.0005, + "loss": 1.4965, + "step": 398000 + }, + { + "epoch": 31.22, + "learning_rate": 0.0005, + "loss": 1.4944, + "step": 398100 + }, + { + "epoch": 31.23, + "learning_rate": 0.0005, + "loss": 1.4989, + "step": 398200 + }, + { + "epoch": 31.23, + "learning_rate": 0.0005, + "loss": 1.5089, + "step": 398300 + }, + { + "epoch": 31.24, + "learning_rate": 0.0005, + "loss": 1.51, + "step": 398400 + }, + { + "epoch": 31.25, + "learning_rate": 0.0005, + "loss": 1.5227, + "step": 398500 + }, + { + "epoch": 31.26, + "learning_rate": 0.0005, + "loss": 1.5136, + "step": 398600 + }, + { + "epoch": 31.27, + "learning_rate": 0.0005, + "loss": 1.4949, + "step": 398700 + }, + { + "epoch": 31.27, + "learning_rate": 0.0005, + "loss": 1.485, + "step": 398800 + }, + { + "epoch": 31.28, + "learning_rate": 0.0005, + "loss": 1.5228, + "step": 398900 + }, + { + "epoch": 31.29, + "learning_rate": 0.0005, + "loss": 1.5132, + "step": 399000 + }, + { + "epoch": 31.3, + "learning_rate": 0.0005, + "loss": 1.5193, + "step": 399100 + }, + { + "epoch": 31.3, + "learning_rate": 0.0005, + "loss": 1.5199, + "step": 399200 + }, + { + "epoch": 31.31, + "learning_rate": 0.0005, + "loss": 1.5519, + "step": 399300 + }, + { + "epoch": 31.32, + "learning_rate": 0.0005, + "loss": 1.5422, + "step": 399400 + }, + { + "epoch": 31.33, + "learning_rate": 0.0005, + "loss": 1.5042, + "step": 399500 + }, + { + "epoch": 31.34, + "learning_rate": 0.0005, + "loss": 1.5193, + "step": 399600 + }, + { + "epoch": 31.34, + "learning_rate": 0.0005, + "loss": 1.5402, + "step": 399700 + }, + { + "epoch": 31.35, + "learning_rate": 0.0005, + "loss": 1.5366, + "step": 399800 + }, + { + "epoch": 31.36, + "learning_rate": 0.0005, + "loss": 1.5035, + "step": 399900 + }, + { + "epoch": 31.37, + "learning_rate": 0.0005, + "loss": 1.5176, + "step": 400000 + }, + { + "epoch": 31.37, + "eval_gen_len": 18.77999470292222, + "eval_loss": 2.055631637573242, + "eval_rouge1": 35.2942, + "eval_rouge2": 14.3432, + "eval_rougeL": 29.0501, + "eval_rougeLsum": 29.0411, + "eval_runtime": 356.7733, + "eval_samples_per_second": 31.748, + "eval_steps_per_second": 1.984, + "step": 400000 + }, + { + "epoch": 31.38, + "learning_rate": 0.0005, + "loss": 1.5454, + "step": 400100 + }, + { + "epoch": 31.38, + "learning_rate": 0.0005, + "loss": 1.5285, + "step": 400200 + }, + { + "epoch": 31.39, + "learning_rate": 0.0005, + "loss": 1.5118, + "step": 400300 + }, + { + "epoch": 31.4, + "learning_rate": 0.0005, + "loss": 1.5388, + "step": 400400 + }, + { + "epoch": 31.41, + "learning_rate": 0.0005, + "loss": 1.516, + "step": 400500 + }, + { + "epoch": 31.41, + "learning_rate": 0.0005, + "loss": 1.4913, + "step": 400600 + }, + { + "epoch": 31.42, + "learning_rate": 0.0005, + "loss": 1.5452, + "step": 400700 + }, + { + "epoch": 31.43, + "learning_rate": 0.0005, + "loss": 1.5255, + "step": 400800 + }, + { + "epoch": 31.44, + "learning_rate": 0.0005, + "loss": 1.535, + "step": 400900 + }, + { + "epoch": 31.45, + "learning_rate": 0.0005, + "loss": 1.5296, + "step": 401000 + }, + { + "epoch": 31.45, + "learning_rate": 0.0005, + "loss": 1.5089, + "step": 401100 + }, + { + "epoch": 31.46, + "learning_rate": 0.0005, + "loss": 1.5081, + "step": 401200 + }, + { + "epoch": 31.47, + "learning_rate": 0.0005, + "loss": 1.5248, + "step": 401300 + }, + { + "epoch": 31.48, + "learning_rate": 0.0005, + "loss": 1.5038, + "step": 401400 + }, + { + "epoch": 31.49, + "learning_rate": 0.0005, + "loss": 1.5106, + "step": 401500 + }, + { + "epoch": 31.49, + "learning_rate": 0.0005, + "loss": 1.5322, + "step": 401600 + }, + { + "epoch": 31.5, + "learning_rate": 0.0005, + "loss": 1.523, + "step": 401700 + }, + { + "epoch": 31.51, + "learning_rate": 0.0005, + "loss": 1.5027, + "step": 401800 + }, + { + "epoch": 31.52, + "learning_rate": 0.0005, + "loss": 1.5182, + "step": 401900 + }, + { + "epoch": 31.52, + "learning_rate": 0.0005, + "loss": 1.5425, + "step": 402000 + }, + { + "epoch": 31.53, + "learning_rate": 0.0005, + "loss": 1.5213, + "step": 402100 + }, + { + "epoch": 31.54, + "learning_rate": 0.0005, + "loss": 1.494, + "step": 402200 + }, + { + "epoch": 31.55, + "learning_rate": 0.0005, + "loss": 1.5569, + "step": 402300 + }, + { + "epoch": 31.56, + "learning_rate": 0.0005, + "loss": 1.5362, + "step": 402400 + }, + { + "epoch": 31.56, + "learning_rate": 0.0005, + "loss": 1.5118, + "step": 402500 + }, + { + "epoch": 31.57, + "learning_rate": 0.0005, + "loss": 1.5371, + "step": 402600 + }, + { + "epoch": 31.58, + "learning_rate": 0.0005, + "loss": 1.5366, + "step": 402700 + }, + { + "epoch": 31.59, + "learning_rate": 0.0005, + "loss": 1.5466, + "step": 402800 + }, + { + "epoch": 31.6, + "learning_rate": 0.0005, + "loss": 1.5523, + "step": 402900 + }, + { + "epoch": 31.6, + "learning_rate": 0.0005, + "loss": 1.5207, + "step": 403000 + }, + { + "epoch": 31.61, + "learning_rate": 0.0005, + "loss": 1.5294, + "step": 403100 + }, + { + "epoch": 31.62, + "learning_rate": 0.0005, + "loss": 1.5199, + "step": 403200 + }, + { + "epoch": 31.63, + "learning_rate": 0.0005, + "loss": 1.5206, + "step": 403300 + }, + { + "epoch": 31.63, + "learning_rate": 0.0005, + "loss": 1.5636, + "step": 403400 + }, + { + "epoch": 31.64, + "learning_rate": 0.0005, + "loss": 1.5254, + "step": 403500 + }, + { + "epoch": 31.65, + "learning_rate": 0.0005, + "loss": 1.5464, + "step": 403600 + }, + { + "epoch": 31.66, + "learning_rate": 0.0005, + "loss": 1.5271, + "step": 403700 + }, + { + "epoch": 31.67, + "learning_rate": 0.0005, + "loss": 1.5663, + "step": 403800 + }, + { + "epoch": 31.67, + "learning_rate": 0.0005, + "loss": 1.5366, + "step": 403900 + }, + { + "epoch": 31.68, + "learning_rate": 0.0005, + "loss": 1.522, + "step": 404000 + }, + { + "epoch": 31.69, + "learning_rate": 0.0005, + "loss": 1.5086, + "step": 404100 + }, + { + "epoch": 31.7, + "learning_rate": 0.0005, + "loss": 1.5337, + "step": 404200 + }, + { + "epoch": 31.7, + "learning_rate": 0.0005, + "loss": 1.5372, + "step": 404300 + }, + { + "epoch": 31.71, + "learning_rate": 0.0005, + "loss": 1.5346, + "step": 404400 + }, + { + "epoch": 31.72, + "learning_rate": 0.0005, + "loss": 1.5595, + "step": 404500 + }, + { + "epoch": 31.73, + "learning_rate": 0.0005, + "loss": 1.5283, + "step": 404600 + }, + { + "epoch": 31.74, + "learning_rate": 0.0005, + "loss": 1.5531, + "step": 404700 + }, + { + "epoch": 31.74, + "learning_rate": 0.0005, + "loss": 1.5486, + "step": 404800 + }, + { + "epoch": 31.75, + "learning_rate": 0.0005, + "loss": 1.5326, + "step": 404900 + }, + { + "epoch": 31.76, + "learning_rate": 0.0005, + "loss": 1.5473, + "step": 405000 + }, + { + "epoch": 31.77, + "learning_rate": 0.0005, + "loss": 1.5356, + "step": 405100 + }, + { + "epoch": 31.78, + "learning_rate": 0.0005, + "loss": 1.5393, + "step": 405200 + }, + { + "epoch": 31.78, + "learning_rate": 0.0005, + "loss": 1.5449, + "step": 405300 + }, + { + "epoch": 31.79, + "learning_rate": 0.0005, + "loss": 1.5594, + "step": 405400 + }, + { + "epoch": 31.8, + "learning_rate": 0.0005, + "loss": 1.5404, + "step": 405500 + }, + { + "epoch": 31.81, + "learning_rate": 0.0005, + "loss": 1.5441, + "step": 405600 + }, + { + "epoch": 31.81, + "learning_rate": 0.0005, + "loss": 1.5549, + "step": 405700 + }, + { + "epoch": 31.82, + "learning_rate": 0.0005, + "loss": 1.5179, + "step": 405800 + }, + { + "epoch": 31.83, + "learning_rate": 0.0005, + "loss": 1.5099, + "step": 405900 + }, + { + "epoch": 31.84, + "learning_rate": 0.0005, + "loss": 1.5443, + "step": 406000 + }, + { + "epoch": 31.85, + "learning_rate": 0.0005, + "loss": 1.5671, + "step": 406100 + }, + { + "epoch": 31.85, + "learning_rate": 0.0005, + "loss": 1.537, + "step": 406200 + }, + { + "epoch": 31.86, + "learning_rate": 0.0005, + "loss": 1.5413, + "step": 406300 + }, + { + "epoch": 31.87, + "learning_rate": 0.0005, + "loss": 1.548, + "step": 406400 + }, + { + "epoch": 31.88, + "learning_rate": 0.0005, + "loss": 1.5445, + "step": 406500 + }, + { + "epoch": 31.89, + "learning_rate": 0.0005, + "loss": 1.5405, + "step": 406600 + }, + { + "epoch": 31.89, + "learning_rate": 0.0005, + "loss": 1.5748, + "step": 406700 + }, + { + "epoch": 31.9, + "learning_rate": 0.0005, + "loss": 1.5358, + "step": 406800 + }, + { + "epoch": 31.91, + "learning_rate": 0.0005, + "loss": 1.5438, + "step": 406900 + }, + { + "epoch": 31.92, + "learning_rate": 0.0005, + "loss": 1.5229, + "step": 407000 + }, + { + "epoch": 31.92, + "learning_rate": 0.0005, + "loss": 1.546, + "step": 407100 + }, + { + "epoch": 31.93, + "learning_rate": 0.0005, + "loss": 1.5315, + "step": 407200 + }, + { + "epoch": 31.94, + "learning_rate": 0.0005, + "loss": 1.5509, + "step": 407300 + }, + { + "epoch": 31.95, + "learning_rate": 0.0005, + "loss": 1.5527, + "step": 407400 + }, + { + "epoch": 31.96, + "learning_rate": 0.0005, + "loss": 1.5307, + "step": 407500 + }, + { + "epoch": 31.96, + "learning_rate": 0.0005, + "loss": 1.5234, + "step": 407600 + }, + { + "epoch": 31.97, + "learning_rate": 0.0005, + "loss": 1.5626, + "step": 407700 + }, + { + "epoch": 31.98, + "learning_rate": 0.0005, + "loss": 1.5508, + "step": 407800 + }, + { + "epoch": 31.99, + "learning_rate": 0.0005, + "loss": 1.5658, + "step": 407900 + }, + { + "epoch": 31.99, + "learning_rate": 0.0005, + "loss": 1.541, + "step": 408000 + }, + { + "epoch": 32.0, + "learning_rate": 0.0005, + "loss": 1.5188, + "step": 408100 + }, + { + "epoch": 32.01, + "learning_rate": 0.0005, + "loss": 1.4728, + "step": 408200 + }, + { + "epoch": 32.02, + "learning_rate": 0.0005, + "loss": 1.4386, + "step": 408300 + }, + { + "epoch": 32.03, + "learning_rate": 0.0005, + "loss": 1.4807, + "step": 408400 + }, + { + "epoch": 32.03, + "learning_rate": 0.0005, + "loss": 1.4852, + "step": 408500 + }, + { + "epoch": 32.04, + "learning_rate": 0.0005, + "loss": 1.4643, + "step": 408600 + }, + { + "epoch": 32.05, + "learning_rate": 0.0005, + "loss": 1.4934, + "step": 408700 + }, + { + "epoch": 32.06, + "learning_rate": 0.0005, + "loss": 1.4816, + "step": 408800 + }, + { + "epoch": 32.07, + "learning_rate": 0.0005, + "loss": 1.4521, + "step": 408900 + }, + { + "epoch": 32.07, + "learning_rate": 0.0005, + "loss": 1.4754, + "step": 409000 + }, + { + "epoch": 32.08, + "learning_rate": 0.0005, + "loss": 1.4847, + "step": 409100 + }, + { + "epoch": 32.09, + "learning_rate": 0.0005, + "loss": 1.5096, + "step": 409200 + }, + { + "epoch": 32.1, + "learning_rate": 0.0005, + "loss": 1.4838, + "step": 409300 + }, + { + "epoch": 32.1, + "learning_rate": 0.0005, + "loss": 1.4722, + "step": 409400 + }, + { + "epoch": 32.11, + "learning_rate": 0.0005, + "loss": 1.4993, + "step": 409500 + }, + { + "epoch": 32.12, + "learning_rate": 0.0005, + "loss": 1.4843, + "step": 409600 + }, + { + "epoch": 32.13, + "learning_rate": 0.0005, + "loss": 1.4763, + "step": 409700 + }, + { + "epoch": 32.14, + "learning_rate": 0.0005, + "loss": 1.4765, + "step": 409800 + }, + { + "epoch": 32.14, + "learning_rate": 0.0005, + "loss": 1.5052, + "step": 409900 + }, + { + "epoch": 32.15, + "learning_rate": 0.0005, + "loss": 1.4973, + "step": 410000 + }, + { + "epoch": 32.15, + "eval_gen_len": 18.798799329036814, + "eval_loss": 2.0617787837982178, + "eval_rouge1": 35.6624, + "eval_rouge2": 14.4996, + "eval_rougeL": 29.2625, + "eval_rougeLsum": 29.2571, + "eval_runtime": 364.7552, + "eval_samples_per_second": 31.054, + "eval_steps_per_second": 1.941, + "step": 410000 + }, + { + "epoch": 32.16, + "learning_rate": 0.0005, + "loss": 1.4893, + "step": 410100 + }, + { + "epoch": 32.17, + "learning_rate": 0.0005, + "loss": 1.51, + "step": 410200 + }, + { + "epoch": 32.18, + "learning_rate": 0.0005, + "loss": 1.5004, + "step": 410300 + }, + { + "epoch": 32.18, + "learning_rate": 0.0005, + "loss": 1.5074, + "step": 410400 + }, + { + "epoch": 32.19, + "learning_rate": 0.0005, + "loss": 1.4952, + "step": 410500 + }, + { + "epoch": 32.2, + "learning_rate": 0.0005, + "loss": 1.4865, + "step": 410600 + }, + { + "epoch": 32.21, + "learning_rate": 0.0005, + "loss": 1.5009, + "step": 410700 + }, + { + "epoch": 32.21, + "learning_rate": 0.0005, + "loss": 1.4766, + "step": 410800 + }, + { + "epoch": 32.22, + "learning_rate": 0.0005, + "loss": 1.4949, + "step": 410900 + }, + { + "epoch": 32.23, + "learning_rate": 0.0005, + "loss": 1.4874, + "step": 411000 + }, + { + "epoch": 32.24, + "learning_rate": 0.0005, + "loss": 1.4945, + "step": 411100 + }, + { + "epoch": 32.25, + "learning_rate": 0.0005, + "loss": 1.5005, + "step": 411200 + }, + { + "epoch": 32.25, + "learning_rate": 0.0005, + "loss": 1.4743, + "step": 411300 + }, + { + "epoch": 32.26, + "learning_rate": 0.0005, + "loss": 1.4751, + "step": 411400 + }, + { + "epoch": 32.27, + "learning_rate": 0.0005, + "loss": 1.5003, + "step": 411500 + }, + { + "epoch": 32.28, + "learning_rate": 0.0005, + "loss": 1.473, + "step": 411600 + }, + { + "epoch": 32.29, + "learning_rate": 0.0005, + "loss": 1.477, + "step": 411700 + }, + { + "epoch": 32.29, + "learning_rate": 0.0005, + "loss": 1.518, + "step": 411800 + }, + { + "epoch": 32.3, + "learning_rate": 0.0005, + "loss": 1.4561, + "step": 411900 + }, + { + "epoch": 32.31, + "learning_rate": 0.0005, + "loss": 1.4887, + "step": 412000 + }, + { + "epoch": 32.32, + "learning_rate": 0.0005, + "loss": 1.5099, + "step": 412100 + }, + { + "epoch": 32.32, + "learning_rate": 0.0005, + "loss": 1.5178, + "step": 412200 + }, + { + "epoch": 32.33, + "learning_rate": 0.0005, + "loss": 1.5064, + "step": 412300 + }, + { + "epoch": 32.34, + "learning_rate": 0.0005, + "loss": 1.4879, + "step": 412400 + }, + { + "epoch": 32.35, + "learning_rate": 0.0005, + "loss": 1.5057, + "step": 412500 + }, + { + "epoch": 32.36, + "learning_rate": 0.0005, + "loss": 1.5124, + "step": 412600 + }, + { + "epoch": 32.36, + "learning_rate": 0.0005, + "loss": 1.5316, + "step": 412700 + }, + { + "epoch": 32.37, + "learning_rate": 0.0005, + "loss": 1.4788, + "step": 412800 + }, + { + "epoch": 32.38, + "learning_rate": 0.0005, + "loss": 1.5265, + "step": 412900 + }, + { + "epoch": 32.39, + "learning_rate": 0.0005, + "loss": 1.5078, + "step": 413000 + }, + { + "epoch": 32.39, + "learning_rate": 0.0005, + "loss": 1.5052, + "step": 413100 + }, + { + "epoch": 32.4, + "learning_rate": 0.0005, + "loss": 1.4874, + "step": 413200 + }, + { + "epoch": 32.41, + "learning_rate": 0.0005, + "loss": 1.5137, + "step": 413300 + }, + { + "epoch": 32.42, + "learning_rate": 0.0005, + "loss": 1.5306, + "step": 413400 + }, + { + "epoch": 32.43, + "learning_rate": 0.0005, + "loss": 1.4981, + "step": 413500 + }, + { + "epoch": 32.43, + "learning_rate": 0.0005, + "loss": 1.4998, + "step": 413600 + }, + { + "epoch": 32.44, + "learning_rate": 0.0005, + "loss": 1.496, + "step": 413700 + }, + { + "epoch": 32.45, + "learning_rate": 0.0005, + "loss": 1.529, + "step": 413800 + }, + { + "epoch": 32.46, + "learning_rate": 0.0005, + "loss": 1.5146, + "step": 413900 + }, + { + "epoch": 32.47, + "learning_rate": 0.0005, + "loss": 1.4946, + "step": 414000 + }, + { + "epoch": 32.47, + "learning_rate": 0.0005, + "loss": 1.5132, + "step": 414100 + }, + { + "epoch": 32.48, + "learning_rate": 0.0005, + "loss": 1.5192, + "step": 414200 + }, + { + "epoch": 32.49, + "learning_rate": 0.0005, + "loss": 1.498, + "step": 414300 + }, + { + "epoch": 32.5, + "learning_rate": 0.0005, + "loss": 1.5389, + "step": 414400 + }, + { + "epoch": 32.5, + "learning_rate": 0.0005, + "loss": 1.5097, + "step": 414500 + }, + { + "epoch": 32.51, + "learning_rate": 0.0005, + "loss": 1.4973, + "step": 414600 + }, + { + "epoch": 32.52, + "learning_rate": 0.0005, + "loss": 1.5201, + "step": 414700 + }, + { + "epoch": 32.53, + "learning_rate": 0.0005, + "loss": 1.4835, + "step": 414800 + }, + { + "epoch": 32.54, + "learning_rate": 0.0005, + "loss": 1.5017, + "step": 414900 + }, + { + "epoch": 32.54, + "learning_rate": 0.0005, + "loss": 1.5011, + "step": 415000 + }, + { + "epoch": 32.55, + "learning_rate": 0.0005, + "loss": 1.5075, + "step": 415100 + }, + { + "epoch": 32.56, + "learning_rate": 0.0005, + "loss": 1.5067, + "step": 415200 + }, + { + "epoch": 32.57, + "learning_rate": 0.0005, + "loss": 1.5373, + "step": 415300 + }, + { + "epoch": 32.58, + "learning_rate": 0.0005, + "loss": 1.5174, + "step": 415400 + }, + { + "epoch": 32.58, + "learning_rate": 0.0005, + "loss": 1.5264, + "step": 415500 + }, + { + "epoch": 32.59, + "learning_rate": 0.0005, + "loss": 1.5168, + "step": 415600 + }, + { + "epoch": 32.6, + "learning_rate": 0.0005, + "loss": 1.5411, + "step": 415700 + }, + { + "epoch": 32.61, + "learning_rate": 0.0005, + "loss": 1.5051, + "step": 415800 + }, + { + "epoch": 32.61, + "learning_rate": 0.0005, + "loss": 1.5014, + "step": 415900 + }, + { + "epoch": 32.62, + "learning_rate": 0.0005, + "loss": 1.5198, + "step": 416000 + }, + { + "epoch": 32.63, + "learning_rate": 0.0005, + "loss": 1.5211, + "step": 416100 + }, + { + "epoch": 32.64, + "learning_rate": 0.0005, + "loss": 1.5409, + "step": 416200 + }, + { + "epoch": 32.65, + "learning_rate": 0.0005, + "loss": 1.5302, + "step": 416300 + }, + { + "epoch": 32.65, + "learning_rate": 0.0005, + "loss": 1.5224, + "step": 416400 + }, + { + "epoch": 32.66, + "learning_rate": 0.0005, + "loss": 1.519, + "step": 416500 + }, + { + "epoch": 32.67, + "learning_rate": 0.0005, + "loss": 1.5162, + "step": 416600 + }, + { + "epoch": 32.68, + "learning_rate": 0.0005, + "loss": 1.5157, + "step": 416700 + }, + { + "epoch": 32.69, + "learning_rate": 0.0005, + "loss": 1.5368, + "step": 416800 + }, + { + "epoch": 32.69, + "learning_rate": 0.0005, + "loss": 1.513, + "step": 416900 + }, + { + "epoch": 32.7, + "learning_rate": 0.0005, + "loss": 1.5508, + "step": 417000 + }, + { + "epoch": 32.71, + "learning_rate": 0.0005, + "loss": 1.5104, + "step": 417100 + }, + { + "epoch": 32.72, + "learning_rate": 0.0005, + "loss": 1.5185, + "step": 417200 + }, + { + "epoch": 32.72, + "learning_rate": 0.0005, + "loss": 1.492, + "step": 417300 + }, + { + "epoch": 32.73, + "learning_rate": 0.0005, + "loss": 1.5189, + "step": 417400 + }, + { + "epoch": 32.74, + "learning_rate": 0.0005, + "loss": 1.5375, + "step": 417500 + }, + { + "epoch": 32.75, + "learning_rate": 0.0005, + "loss": 1.5309, + "step": 417600 + }, + { + "epoch": 32.76, + "learning_rate": 0.0005, + "loss": 1.5239, + "step": 417700 + }, + { + "epoch": 32.76, + "learning_rate": 0.0005, + "loss": 1.5401, + "step": 417800 + }, + { + "epoch": 32.77, + "learning_rate": 0.0005, + "loss": 1.5296, + "step": 417900 + }, + { + "epoch": 32.78, + "learning_rate": 0.0005, + "loss": 1.5092, + "step": 418000 + }, + { + "epoch": 32.79, + "learning_rate": 0.0005, + "loss": 1.5272, + "step": 418100 + }, + { + "epoch": 32.79, + "learning_rate": 0.0005, + "loss": 1.5192, + "step": 418200 + }, + { + "epoch": 32.8, + "learning_rate": 0.0005, + "loss": 1.5315, + "step": 418300 + }, + { + "epoch": 32.81, + "learning_rate": 0.0005, + "loss": 1.5335, + "step": 418400 + }, + { + "epoch": 32.82, + "learning_rate": 0.0005, + "loss": 1.5279, + "step": 418500 + }, + { + "epoch": 32.83, + "learning_rate": 0.0005, + "loss": 1.5168, + "step": 418600 + }, + { + "epoch": 32.83, + "learning_rate": 0.0005, + "loss": 1.5587, + "step": 418700 + }, + { + "epoch": 32.84, + "learning_rate": 0.0005, + "loss": 1.5549, + "step": 418800 + }, + { + "epoch": 32.85, + "learning_rate": 0.0005, + "loss": 1.5463, + "step": 418900 + }, + { + "epoch": 32.86, + "learning_rate": 0.0005, + "loss": 1.5279, + "step": 419000 + }, + { + "epoch": 32.87, + "learning_rate": 0.0005, + "loss": 1.5431, + "step": 419100 + }, + { + "epoch": 32.87, + "learning_rate": 0.0005, + "loss": 1.5552, + "step": 419200 + }, + { + "epoch": 32.88, + "learning_rate": 0.0005, + "loss": 1.5442, + "step": 419300 + }, + { + "epoch": 32.89, + "learning_rate": 0.0005, + "loss": 1.5202, + "step": 419400 + }, + { + "epoch": 32.9, + "learning_rate": 0.0005, + "loss": 1.543, + "step": 419500 + }, + { + "epoch": 32.9, + "learning_rate": 0.0005, + "loss": 1.5277, + "step": 419600 + }, + { + "epoch": 32.91, + "learning_rate": 0.0005, + "loss": 1.5405, + "step": 419700 + }, + { + "epoch": 32.92, + "learning_rate": 0.0005, + "loss": 1.557, + "step": 419800 + }, + { + "epoch": 32.93, + "learning_rate": 0.0005, + "loss": 1.5291, + "step": 419900 + }, + { + "epoch": 32.94, + "learning_rate": 0.0005, + "loss": 1.5584, + "step": 420000 + }, + { + "epoch": 32.94, + "eval_gen_len": 18.746093405138165, + "eval_loss": 2.043473720550537, + "eval_rouge1": 35.4532, + "eval_rouge2": 14.4774, + "eval_rougeL": 29.1654, + "eval_rougeLsum": 29.1572, + "eval_runtime": 355.0094, + "eval_samples_per_second": 31.906, + "eval_steps_per_second": 1.994, + "step": 420000 + }, + { + "epoch": 32.94, + "learning_rate": 0.0005, + "loss": 1.5445, + "step": 420100 + }, + { + "epoch": 32.95, + "learning_rate": 0.0005, + "loss": 1.5559, + "step": 420200 + }, + { + "epoch": 32.96, + "learning_rate": 0.0005, + "loss": 1.5439, + "step": 420300 + }, + { + "epoch": 32.97, + "learning_rate": 0.0005, + "loss": 1.5413, + "step": 420400 + }, + { + "epoch": 32.98, + "learning_rate": 0.0005, + "loss": 1.5161, + "step": 420500 + }, + { + "epoch": 32.98, + "learning_rate": 0.0005, + "loss": 1.526, + "step": 420600 + }, + { + "epoch": 32.99, + "learning_rate": 0.0005, + "loss": 1.5214, + "step": 420700 + }, + { + "epoch": 33.0, + "learning_rate": 0.0005, + "loss": 1.5649, + "step": 420800 + }, + { + "epoch": 33.01, + "learning_rate": 0.0005, + "loss": 1.479, + "step": 420900 + }, + { + "epoch": 33.01, + "learning_rate": 0.0005, + "loss": 1.4364, + "step": 421000 + }, + { + "epoch": 33.02, + "learning_rate": 0.0005, + "loss": 1.4831, + "step": 421100 + }, + { + "epoch": 33.03, + "learning_rate": 0.0005, + "loss": 1.4527, + "step": 421200 + }, + { + "epoch": 33.04, + "learning_rate": 0.0005, + "loss": 1.4754, + "step": 421300 + }, + { + "epoch": 33.05, + "learning_rate": 0.0005, + "loss": 1.4573, + "step": 421400 + }, + { + "epoch": 33.05, + "learning_rate": 0.0005, + "loss": 1.457, + "step": 421500 + }, + { + "epoch": 33.06, + "learning_rate": 0.0005, + "loss": 1.4525, + "step": 421600 + }, + { + "epoch": 33.07, + "learning_rate": 0.0005, + "loss": 1.4549, + "step": 421700 + }, + { + "epoch": 33.08, + "learning_rate": 0.0005, + "loss": 1.4764, + "step": 421800 + }, + { + "epoch": 33.09, + "learning_rate": 0.0005, + "loss": 1.48, + "step": 421900 + }, + { + "epoch": 33.09, + "learning_rate": 0.0005, + "loss": 1.4471, + "step": 422000 + }, + { + "epoch": 33.1, + "learning_rate": 0.0005, + "loss": 1.4762, + "step": 422100 + }, + { + "epoch": 33.11, + "learning_rate": 0.0005, + "loss": 1.4794, + "step": 422200 + }, + { + "epoch": 33.12, + "learning_rate": 0.0005, + "loss": 1.4909, + "step": 422300 + }, + { + "epoch": 33.12, + "learning_rate": 0.0005, + "loss": 1.4878, + "step": 422400 + }, + { + "epoch": 33.13, + "learning_rate": 0.0005, + "loss": 1.4934, + "step": 422500 + }, + { + "epoch": 33.14, + "learning_rate": 0.0005, + "loss": 1.4961, + "step": 422600 + }, + { + "epoch": 33.15, + "learning_rate": 0.0005, + "loss": 1.4815, + "step": 422700 + }, + { + "epoch": 33.16, + "learning_rate": 0.0005, + "loss": 1.4502, + "step": 422800 + }, + { + "epoch": 33.16, + "learning_rate": 0.0005, + "loss": 1.4715, + "step": 422900 + }, + { + "epoch": 33.17, + "learning_rate": 0.0005, + "loss": 1.4665, + "step": 423000 + }, + { + "epoch": 33.18, + "learning_rate": 0.0005, + "loss": 1.4945, + "step": 423100 + }, + { + "epoch": 33.19, + "learning_rate": 0.0005, + "loss": 1.4853, + "step": 423200 + }, + { + "epoch": 33.19, + "learning_rate": 0.0005, + "loss": 1.4898, + "step": 423300 + }, + { + "epoch": 33.2, + "learning_rate": 0.0005, + "loss": 1.4736, + "step": 423400 + }, + { + "epoch": 33.21, + "learning_rate": 0.0005, + "loss": 1.4931, + "step": 423500 + }, + { + "epoch": 33.22, + "learning_rate": 0.0005, + "loss": 1.508, + "step": 423600 + }, + { + "epoch": 33.23, + "learning_rate": 0.0005, + "loss": 1.459, + "step": 423700 + }, + { + "epoch": 33.23, + "learning_rate": 0.0005, + "loss": 1.4718, + "step": 423800 + }, + { + "epoch": 33.24, + "learning_rate": 0.0005, + "loss": 1.4846, + "step": 423900 + }, + { + "epoch": 33.25, + "learning_rate": 0.0005, + "loss": 1.4906, + "step": 424000 + }, + { + "epoch": 33.26, + "learning_rate": 0.0005, + "loss": 1.4812, + "step": 424100 + }, + { + "epoch": 33.27, + "learning_rate": 0.0005, + "loss": 1.5124, + "step": 424200 + }, + { + "epoch": 33.27, + "learning_rate": 0.0005, + "loss": 1.4844, + "step": 424300 + }, + { + "epoch": 33.28, + "learning_rate": 0.0005, + "loss": 1.4951, + "step": 424400 + }, + { + "epoch": 33.29, + "learning_rate": 0.0005, + "loss": 1.5, + "step": 424500 + }, + { + "epoch": 33.3, + "learning_rate": 0.0005, + "loss": 1.4982, + "step": 424600 + }, + { + "epoch": 33.3, + "learning_rate": 0.0005, + "loss": 1.4761, + "step": 424700 + }, + { + "epoch": 33.31, + "learning_rate": 0.0005, + "loss": 1.4928, + "step": 424800 + }, + { + "epoch": 33.32, + "learning_rate": 0.0005, + "loss": 1.4939, + "step": 424900 + }, + { + "epoch": 33.33, + "learning_rate": 0.0005, + "loss": 1.47, + "step": 425000 + }, + { + "epoch": 33.34, + "learning_rate": 0.0005, + "loss": 1.506, + "step": 425100 + }, + { + "epoch": 33.34, + "learning_rate": 0.0005, + "loss": 1.4834, + "step": 425200 + }, + { + "epoch": 33.35, + "learning_rate": 0.0005, + "loss": 1.4674, + "step": 425300 + }, + { + "epoch": 33.36, + "learning_rate": 0.0005, + "loss": 1.4815, + "step": 425400 + }, + { + "epoch": 33.37, + "learning_rate": 0.0005, + "loss": 1.4808, + "step": 425500 + }, + { + "epoch": 33.38, + "learning_rate": 0.0005, + "loss": 1.528, + "step": 425600 + }, + { + "epoch": 33.38, + "learning_rate": 0.0005, + "loss": 1.4721, + "step": 425700 + }, + { + "epoch": 33.39, + "learning_rate": 0.0005, + "loss": 1.5043, + "step": 425800 + }, + { + "epoch": 33.4, + "learning_rate": 0.0005, + "loss": 1.4849, + "step": 425900 + }, + { + "epoch": 33.41, + "learning_rate": 0.0005, + "loss": 1.5054, + "step": 426000 + }, + { + "epoch": 33.41, + "learning_rate": 0.0005, + "loss": 1.4862, + "step": 426100 + }, + { + "epoch": 33.42, + "learning_rate": 0.0005, + "loss": 1.4764, + "step": 426200 + }, + { + "epoch": 33.43, + "learning_rate": 0.0005, + "loss": 1.4996, + "step": 426300 + }, + { + "epoch": 33.44, + "learning_rate": 0.0005, + "loss": 1.5226, + "step": 426400 + }, + { + "epoch": 33.45, + "learning_rate": 0.0005, + "loss": 1.4969, + "step": 426500 + }, + { + "epoch": 33.45, + "learning_rate": 0.0005, + "loss": 1.5382, + "step": 426600 + }, + { + "epoch": 33.46, + "learning_rate": 0.0005, + "loss": 1.5159, + "step": 426700 + }, + { + "epoch": 33.47, + "learning_rate": 0.0005, + "loss": 1.5188, + "step": 426800 + }, + { + "epoch": 33.48, + "learning_rate": 0.0005, + "loss": 1.5403, + "step": 426900 + }, + { + "epoch": 33.48, + "learning_rate": 0.0005, + "loss": 1.5021, + "step": 427000 + }, + { + "epoch": 33.49, + "learning_rate": 0.0005, + "loss": 1.5213, + "step": 427100 + }, + { + "epoch": 33.5, + "learning_rate": 0.0005, + "loss": 1.5093, + "step": 427200 + }, + { + "epoch": 33.51, + "learning_rate": 0.0005, + "loss": 1.5155, + "step": 427300 + }, + { + "epoch": 33.52, + "learning_rate": 0.0005, + "loss": 1.5006, + "step": 427400 + }, + { + "epoch": 33.52, + "learning_rate": 0.0005, + "loss": 1.5207, + "step": 427500 + }, + { + "epoch": 33.53, + "learning_rate": 0.0005, + "loss": 1.526, + "step": 427600 + }, + { + "epoch": 33.54, + "learning_rate": 0.0005, + "loss": 1.505, + "step": 427700 + }, + { + "epoch": 33.55, + "learning_rate": 0.0005, + "loss": 1.5, + "step": 427800 + }, + { + "epoch": 33.56, + "learning_rate": 0.0005, + "loss": 1.4947, + "step": 427900 + }, + { + "epoch": 33.56, + "learning_rate": 0.0005, + "loss": 1.4942, + "step": 428000 + }, + { + "epoch": 33.57, + "learning_rate": 0.0005, + "loss": 1.5054, + "step": 428100 + }, + { + "epoch": 33.58, + "learning_rate": 0.0005, + "loss": 1.5072, + "step": 428200 + }, + { + "epoch": 33.59, + "learning_rate": 0.0005, + "loss": 1.4926, + "step": 428300 + }, + { + "epoch": 33.59, + "learning_rate": 0.0005, + "loss": 1.4933, + "step": 428400 + }, + { + "epoch": 33.6, + "learning_rate": 0.0005, + "loss": 1.5019, + "step": 428500 + }, + { + "epoch": 33.61, + "learning_rate": 0.0005, + "loss": 1.503, + "step": 428600 + }, + { + "epoch": 33.62, + "learning_rate": 0.0005, + "loss": 1.4991, + "step": 428700 + }, + { + "epoch": 33.63, + "learning_rate": 0.0005, + "loss": 1.4853, + "step": 428800 + }, + { + "epoch": 33.63, + "learning_rate": 0.0005, + "loss": 1.5268, + "step": 428900 + }, + { + "epoch": 33.64, + "learning_rate": 0.0005, + "loss": 1.48, + "step": 429000 + }, + { + "epoch": 33.65, + "learning_rate": 0.0005, + "loss": 1.517, + "step": 429100 + }, + { + "epoch": 33.66, + "learning_rate": 0.0005, + "loss": 1.5225, + "step": 429200 + }, + { + "epoch": 33.67, + "learning_rate": 0.0005, + "loss": 1.5214, + "step": 429300 + }, + { + "epoch": 33.67, + "learning_rate": 0.0005, + "loss": 1.506, + "step": 429400 + }, + { + "epoch": 33.68, + "learning_rate": 0.0005, + "loss": 1.5391, + "step": 429500 + }, + { + "epoch": 33.69, + "learning_rate": 0.0005, + "loss": 1.5118, + "step": 429600 + }, + { + "epoch": 33.7, + "learning_rate": 0.0005, + "loss": 1.4951, + "step": 429700 + }, + { + "epoch": 33.7, + "learning_rate": 0.0005, + "loss": 1.5314, + "step": 429800 + }, + { + "epoch": 33.71, + "learning_rate": 0.0005, + "loss": 1.5346, + "step": 429900 + }, + { + "epoch": 33.72, + "learning_rate": 0.0005, + "loss": 1.5106, + "step": 430000 + }, + { + "epoch": 33.72, + "eval_gen_len": 18.773549924958065, + "eval_loss": 2.0536205768585205, + "eval_rouge1": 35.5461, + "eval_rouge2": 14.4953, + "eval_rougeL": 29.2671, + "eval_rougeLsum": 29.2571, + "eval_runtime": 355.0911, + "eval_samples_per_second": 31.899, + "eval_steps_per_second": 1.994, + "step": 430000 + }, + { + "epoch": 33.73, + "learning_rate": 0.0005, + "loss": 1.5295, + "step": 430100 + }, + { + "epoch": 33.74, + "learning_rate": 0.0005, + "loss": 1.5017, + "step": 430200 + }, + { + "epoch": 33.74, + "learning_rate": 0.0005, + "loss": 1.5423, + "step": 430300 + }, + { + "epoch": 33.75, + "learning_rate": 0.0005, + "loss": 1.4981, + "step": 430400 + }, + { + "epoch": 33.76, + "learning_rate": 0.0005, + "loss": 1.5199, + "step": 430500 + }, + { + "epoch": 33.77, + "learning_rate": 0.0005, + "loss": 1.5, + "step": 430600 + }, + { + "epoch": 33.78, + "learning_rate": 0.0005, + "loss": 1.5106, + "step": 430700 + }, + { + "epoch": 33.78, + "learning_rate": 0.0005, + "loss": 1.5251, + "step": 430800 + }, + { + "epoch": 33.79, + "learning_rate": 0.0005, + "loss": 1.5132, + "step": 430900 + }, + { + "epoch": 33.8, + "learning_rate": 0.0005, + "loss": 1.5032, + "step": 431000 + }, + { + "epoch": 33.81, + "learning_rate": 0.0005, + "loss": 1.5136, + "step": 431100 + }, + { + "epoch": 33.81, + "learning_rate": 0.0005, + "loss": 1.5212, + "step": 431200 + }, + { + "epoch": 33.82, + "learning_rate": 0.0005, + "loss": 1.5235, + "step": 431300 + }, + { + "epoch": 33.83, + "learning_rate": 0.0005, + "loss": 1.4808, + "step": 431400 + }, + { + "epoch": 33.84, + "learning_rate": 0.0005, + "loss": 1.5457, + "step": 431500 + }, + { + "epoch": 33.85, + "learning_rate": 0.0005, + "loss": 1.5044, + "step": 431600 + }, + { + "epoch": 33.85, + "learning_rate": 0.0005, + "loss": 1.5327, + "step": 431700 + }, + { + "epoch": 33.86, + "learning_rate": 0.0005, + "loss": 1.5262, + "step": 431800 + }, + { + "epoch": 33.87, + "learning_rate": 0.0005, + "loss": 1.5198, + "step": 431900 + }, + { + "epoch": 33.88, + "learning_rate": 0.0005, + "loss": 1.5182, + "step": 432000 + }, + { + "epoch": 33.88, + "learning_rate": 0.0005, + "loss": 1.5306, + "step": 432100 + }, + { + "epoch": 33.89, + "learning_rate": 0.0005, + "loss": 1.5222, + "step": 432200 + }, + { + "epoch": 33.9, + "learning_rate": 0.0005, + "loss": 1.488, + "step": 432300 + }, + { + "epoch": 33.91, + "learning_rate": 0.0005, + "loss": 1.5269, + "step": 432400 + }, + { + "epoch": 33.92, + "learning_rate": 0.0005, + "loss": 1.5264, + "step": 432500 + }, + { + "epoch": 33.92, + "learning_rate": 0.0005, + "loss": 1.5207, + "step": 432600 + }, + { + "epoch": 33.93, + "learning_rate": 0.0005, + "loss": 1.5209, + "step": 432700 + }, + { + "epoch": 33.94, + "learning_rate": 0.0005, + "loss": 1.5291, + "step": 432800 + }, + { + "epoch": 33.95, + "learning_rate": 0.0005, + "loss": 1.5382, + "step": 432900 + }, + { + "epoch": 33.96, + "learning_rate": 0.0005, + "loss": 1.5209, + "step": 433000 + }, + { + "epoch": 33.96, + "learning_rate": 0.0005, + "loss": 1.5115, + "step": 433100 + }, + { + "epoch": 33.97, + "learning_rate": 0.0005, + "loss": 1.5056, + "step": 433200 + }, + { + "epoch": 33.98, + "learning_rate": 0.0005, + "loss": 1.5244, + "step": 433300 + }, + { + "epoch": 33.99, + "learning_rate": 0.0005, + "loss": 1.5259, + "step": 433400 + }, + { + "epoch": 33.99, + "learning_rate": 0.0005, + "loss": 1.5489, + "step": 433500 + }, + { + "epoch": 34.0, + "learning_rate": 0.0005, + "loss": 1.4757, + "step": 433600 + }, + { + "epoch": 34.01, + "learning_rate": 0.0005, + "loss": 1.4282, + "step": 433700 + }, + { + "epoch": 34.02, + "learning_rate": 0.0005, + "loss": 1.4352, + "step": 433800 + }, + { + "epoch": 34.03, + "learning_rate": 0.0005, + "loss": 1.4427, + "step": 433900 + }, + { + "epoch": 34.03, + "learning_rate": 0.0005, + "loss": 1.4662, + "step": 434000 + }, + { + "epoch": 34.04, + "learning_rate": 0.0005, + "loss": 1.4606, + "step": 434100 + }, + { + "epoch": 34.05, + "learning_rate": 0.0005, + "loss": 1.4774, + "step": 434200 + }, + { + "epoch": 34.06, + "learning_rate": 0.0005, + "loss": 1.4842, + "step": 434300 + }, + { + "epoch": 34.07, + "learning_rate": 0.0005, + "loss": 1.452, + "step": 434400 + }, + { + "epoch": 34.07, + "learning_rate": 0.0005, + "loss": 1.4619, + "step": 434500 + }, + { + "epoch": 34.08, + "learning_rate": 0.0005, + "loss": 1.4472, + "step": 434600 + }, + { + "epoch": 34.09, + "learning_rate": 0.0005, + "loss": 1.4587, + "step": 434700 + }, + { + "epoch": 34.1, + "learning_rate": 0.0005, + "loss": 1.4629, + "step": 434800 + }, + { + "epoch": 34.1, + "learning_rate": 0.0005, + "loss": 1.4664, + "step": 434900 + }, + { + "epoch": 34.11, + "learning_rate": 0.0005, + "loss": 1.4545, + "step": 435000 + }, + { + "epoch": 34.12, + "learning_rate": 0.0005, + "loss": 1.466, + "step": 435100 + }, + { + "epoch": 34.13, + "learning_rate": 0.0005, + "loss": 1.4531, + "step": 435200 + }, + { + "epoch": 34.14, + "learning_rate": 0.0005, + "loss": 1.4366, + "step": 435300 + }, + { + "epoch": 34.14, + "learning_rate": 0.0005, + "loss": 1.4653, + "step": 435400 + }, + { + "epoch": 34.15, + "learning_rate": 0.0005, + "loss": 1.4787, + "step": 435500 + }, + { + "epoch": 34.16, + "learning_rate": 0.0005, + "loss": 1.4584, + "step": 435600 + }, + { + "epoch": 34.17, + "learning_rate": 0.0005, + "loss": 1.4645, + "step": 435700 + }, + { + "epoch": 34.18, + "learning_rate": 0.0005, + "loss": 1.4765, + "step": 435800 + }, + { + "epoch": 34.18, + "learning_rate": 0.0005, + "loss": 1.4918, + "step": 435900 + }, + { + "epoch": 34.19, + "learning_rate": 0.0005, + "loss": 1.4779, + "step": 436000 + }, + { + "epoch": 34.2, + "learning_rate": 0.0005, + "loss": 1.4586, + "step": 436100 + }, + { + "epoch": 34.21, + "learning_rate": 0.0005, + "loss": 1.4688, + "step": 436200 + }, + { + "epoch": 34.21, + "learning_rate": 0.0005, + "loss": 1.4623, + "step": 436300 + }, + { + "epoch": 34.22, + "learning_rate": 0.0005, + "loss": 1.4582, + "step": 436400 + }, + { + "epoch": 34.23, + "learning_rate": 0.0005, + "loss": 1.4566, + "step": 436500 + }, + { + "epoch": 34.24, + "learning_rate": 0.0005, + "loss": 1.4839, + "step": 436600 + }, + { + "epoch": 34.25, + "learning_rate": 0.0005, + "loss": 1.4723, + "step": 436700 + }, + { + "epoch": 34.25, + "learning_rate": 0.0005, + "loss": 1.4567, + "step": 436800 + }, + { + "epoch": 34.26, + "learning_rate": 0.0005, + "loss": 1.4795, + "step": 436900 + }, + { + "epoch": 34.27, + "learning_rate": 0.0005, + "loss": 1.4614, + "step": 437000 + }, + { + "epoch": 34.28, + "learning_rate": 0.0005, + "loss": 1.4889, + "step": 437100 + }, + { + "epoch": 34.28, + "learning_rate": 0.0005, + "loss": 1.472, + "step": 437200 + }, + { + "epoch": 34.29, + "learning_rate": 0.0005, + "loss": 1.4718, + "step": 437300 + }, + { + "epoch": 34.3, + "learning_rate": 0.0005, + "loss": 1.4689, + "step": 437400 + }, + { + "epoch": 34.31, + "learning_rate": 0.0005, + "loss": 1.4568, + "step": 437500 + }, + { + "epoch": 34.32, + "learning_rate": 0.0005, + "loss": 1.4908, + "step": 437600 + }, + { + "epoch": 34.32, + "learning_rate": 0.0005, + "loss": 1.4646, + "step": 437700 + }, + { + "epoch": 34.33, + "learning_rate": 0.0005, + "loss": 1.4859, + "step": 437800 + }, + { + "epoch": 34.34, + "learning_rate": 0.0005, + "loss": 1.4901, + "step": 437900 + }, + { + "epoch": 34.35, + "learning_rate": 0.0005, + "loss": 1.4836, + "step": 438000 + }, + { + "epoch": 34.36, + "learning_rate": 0.0005, + "loss": 1.4968, + "step": 438100 + }, + { + "epoch": 34.36, + "learning_rate": 0.0005, + "loss": 1.4828, + "step": 438200 + }, + { + "epoch": 34.37, + "learning_rate": 0.0005, + "loss": 1.4885, + "step": 438300 + }, + { + "epoch": 34.38, + "learning_rate": 0.0005, + "loss": 1.4761, + "step": 438400 + }, + { + "epoch": 34.39, + "learning_rate": 0.0005, + "loss": 1.5173, + "step": 438500 + }, + { + "epoch": 34.39, + "learning_rate": 0.0005, + "loss": 1.4909, + "step": 438600 + }, + { + "epoch": 34.4, + "learning_rate": 0.0005, + "loss": 1.4682, + "step": 438700 + }, + { + "epoch": 34.41, + "learning_rate": 0.0005, + "loss": 1.5019, + "step": 438800 + }, + { + "epoch": 34.42, + "learning_rate": 0.0005, + "loss": 1.5032, + "step": 438900 + }, + { + "epoch": 34.43, + "learning_rate": 0.0005, + "loss": 1.5024, + "step": 439000 + }, + { + "epoch": 34.43, + "learning_rate": 0.0005, + "loss": 1.4704, + "step": 439100 + }, + { + "epoch": 34.44, + "learning_rate": 0.0005, + "loss": 1.4926, + "step": 439200 + }, + { + "epoch": 34.45, + "learning_rate": 0.0005, + "loss": 1.4694, + "step": 439300 + }, + { + "epoch": 34.46, + "learning_rate": 0.0005, + "loss": 1.5041, + "step": 439400 + }, + { + "epoch": 34.47, + "learning_rate": 0.0005, + "loss": 1.4666, + "step": 439500 + }, + { + "epoch": 34.47, + "learning_rate": 0.0005, + "loss": 1.5155, + "step": 439600 + }, + { + "epoch": 34.48, + "learning_rate": 0.0005, + "loss": 1.4776, + "step": 439700 + }, + { + "epoch": 34.49, + "learning_rate": 0.0005, + "loss": 1.4833, + "step": 439800 + }, + { + "epoch": 34.5, + "learning_rate": 0.0005, + "loss": 1.4878, + "step": 439900 + }, + { + "epoch": 34.5, + "learning_rate": 0.0005, + "loss": 1.5018, + "step": 440000 + }, + { + "epoch": 34.5, + "eval_gen_len": 18.794385097554517, + "eval_loss": 2.0479230880737305, + "eval_rouge1": 35.8012, + "eval_rouge2": 14.6585, + "eval_rougeL": 29.4215, + "eval_rougeLsum": 29.4088, + "eval_runtime": 359.8484, + "eval_samples_per_second": 31.477, + "eval_steps_per_second": 1.967, + "step": 440000 + }, + { + "epoch": 34.51, + "learning_rate": 0.0005, + "loss": 1.476, + "step": 440100 + }, + { + "epoch": 34.52, + "learning_rate": 0.0005, + "loss": 1.5182, + "step": 440200 + }, + { + "epoch": 34.53, + "learning_rate": 0.0005, + "loss": 1.4924, + "step": 440300 + }, + { + "epoch": 34.54, + "learning_rate": 0.0005, + "loss": 1.482, + "step": 440400 + }, + { + "epoch": 34.54, + "learning_rate": 0.0005, + "loss": 1.4905, + "step": 440500 + }, + { + "epoch": 34.55, + "learning_rate": 0.0005, + "loss": 1.4989, + "step": 440600 + }, + { + "epoch": 34.56, + "learning_rate": 0.0005, + "loss": 1.5047, + "step": 440700 + }, + { + "epoch": 34.57, + "learning_rate": 0.0005, + "loss": 1.5066, + "step": 440800 + }, + { + "epoch": 34.57, + "learning_rate": 0.0005, + "loss": 1.4814, + "step": 440900 + }, + { + "epoch": 34.58, + "learning_rate": 0.0005, + "loss": 1.4962, + "step": 441000 + }, + { + "epoch": 34.59, + "learning_rate": 0.0005, + "loss": 1.4896, + "step": 441100 + }, + { + "epoch": 34.6, + "learning_rate": 0.0005, + "loss": 1.5113, + "step": 441200 + }, + { + "epoch": 34.61, + "learning_rate": 0.0005, + "loss": 1.5018, + "step": 441300 + }, + { + "epoch": 34.61, + "learning_rate": 0.0005, + "loss": 1.5106, + "step": 441400 + }, + { + "epoch": 34.62, + "learning_rate": 0.0005, + "loss": 1.5057, + "step": 441500 + }, + { + "epoch": 34.63, + "learning_rate": 0.0005, + "loss": 1.4933, + "step": 441600 + }, + { + "epoch": 34.64, + "learning_rate": 0.0005, + "loss": 1.4933, + "step": 441700 + }, + { + "epoch": 34.65, + "learning_rate": 0.0005, + "loss": 1.5129, + "step": 441800 + }, + { + "epoch": 34.65, + "learning_rate": 0.0005, + "loss": 1.5032, + "step": 441900 + }, + { + "epoch": 34.66, + "learning_rate": 0.0005, + "loss": 1.5064, + "step": 442000 + }, + { + "epoch": 34.67, + "learning_rate": 0.0005, + "loss": 1.5261, + "step": 442100 + }, + { + "epoch": 34.68, + "learning_rate": 0.0005, + "loss": 1.5168, + "step": 442200 + }, + { + "epoch": 34.68, + "learning_rate": 0.0005, + "loss": 1.4994, + "step": 442300 + }, + { + "epoch": 34.69, + "learning_rate": 0.0005, + "loss": 1.5022, + "step": 442400 + }, + { + "epoch": 34.7, + "learning_rate": 0.0005, + "loss": 1.5251, + "step": 442500 + }, + { + "epoch": 34.71, + "learning_rate": 0.0005, + "loss": 1.5243, + "step": 442600 + }, + { + "epoch": 34.72, + "learning_rate": 0.0005, + "loss": 1.5109, + "step": 442700 + }, + { + "epoch": 34.72, + "learning_rate": 0.0005, + "loss": 1.4982, + "step": 442800 + }, + { + "epoch": 34.73, + "learning_rate": 0.0005, + "loss": 1.5158, + "step": 442900 + }, + { + "epoch": 34.74, + "learning_rate": 0.0005, + "loss": 1.509, + "step": 443000 + }, + { + "epoch": 34.75, + "learning_rate": 0.0005, + "loss": 1.5081, + "step": 443100 + }, + { + "epoch": 34.76, + "learning_rate": 0.0005, + "loss": 1.5148, + "step": 443200 + }, + { + "epoch": 34.76, + "learning_rate": 0.0005, + "loss": 1.4927, + "step": 443300 + }, + { + "epoch": 34.77, + "learning_rate": 0.0005, + "loss": 1.5106, + "step": 443400 + }, + { + "epoch": 34.78, + "learning_rate": 0.0005, + "loss": 1.4789, + "step": 443500 + }, + { + "epoch": 34.79, + "learning_rate": 0.0005, + "loss": 1.5104, + "step": 443600 + }, + { + "epoch": 34.79, + "learning_rate": 0.0005, + "loss": 1.5178, + "step": 443700 + }, + { + "epoch": 34.8, + "learning_rate": 0.0005, + "loss": 1.5068, + "step": 443800 + }, + { + "epoch": 34.81, + "learning_rate": 0.0005, + "loss": 1.4998, + "step": 443900 + }, + { + "epoch": 34.82, + "learning_rate": 0.0005, + "loss": 1.4906, + "step": 444000 + }, + { + "epoch": 34.83, + "learning_rate": 0.0005, + "loss": 1.5059, + "step": 444100 + }, + { + "epoch": 34.83, + "learning_rate": 0.0005, + "loss": 1.4998, + "step": 444200 + }, + { + "epoch": 34.84, + "learning_rate": 0.0005, + "loss": 1.5098, + "step": 444300 + }, + { + "epoch": 34.85, + "learning_rate": 0.0005, + "loss": 1.5164, + "step": 444400 + }, + { + "epoch": 34.86, + "learning_rate": 0.0005, + "loss": 1.5034, + "step": 444500 + }, + { + "epoch": 34.87, + "learning_rate": 0.0005, + "loss": 1.5082, + "step": 444600 + }, + { + "epoch": 34.87, + "learning_rate": 0.0005, + "loss": 1.5303, + "step": 444700 + }, + { + "epoch": 34.88, + "learning_rate": 0.0005, + "loss": 1.5157, + "step": 444800 + }, + { + "epoch": 34.89, + "learning_rate": 0.0005, + "loss": 1.5325, + "step": 444900 + }, + { + "epoch": 34.9, + "learning_rate": 0.0005, + "loss": 1.5343, + "step": 445000 + }, + { + "epoch": 34.9, + "learning_rate": 0.0005, + "loss": 1.516, + "step": 445100 + }, + { + "epoch": 34.91, + "learning_rate": 0.0005, + "loss": 1.5298, + "step": 445200 + }, + { + "epoch": 34.92, + "learning_rate": 0.0005, + "loss": 1.5331, + "step": 445300 + }, + { + "epoch": 34.93, + "learning_rate": 0.0005, + "loss": 1.5028, + "step": 445400 + }, + { + "epoch": 34.94, + "learning_rate": 0.0005, + "loss": 1.5004, + "step": 445500 + }, + { + "epoch": 34.94, + "learning_rate": 0.0005, + "loss": 1.5088, + "step": 445600 + }, + { + "epoch": 34.95, + "learning_rate": 0.0005, + "loss": 1.4988, + "step": 445700 + }, + { + "epoch": 34.96, + "learning_rate": 0.0005, + "loss": 1.49, + "step": 445800 + }, + { + "epoch": 34.97, + "learning_rate": 0.0005, + "loss": 1.5205, + "step": 445900 + }, + { + "epoch": 34.97, + "learning_rate": 0.0005, + "loss": 1.4997, + "step": 446000 + }, + { + "epoch": 34.98, + "learning_rate": 0.0005, + "loss": 1.5047, + "step": 446100 + }, + { + "epoch": 34.99, + "learning_rate": 0.0005, + "loss": 1.5249, + "step": 446200 + }, + { + "epoch": 35.0, + "learning_rate": 0.0005, + "loss": 1.5226, + "step": 446300 + }, + { + "epoch": 35.01, + "learning_rate": 0.0005, + "loss": 1.4523, + "step": 446400 + }, + { + "epoch": 35.01, + "learning_rate": 0.0005, + "loss": 1.4336, + "step": 446500 + }, + { + "epoch": 35.02, + "learning_rate": 0.0005, + "loss": 1.4534, + "step": 446600 + }, + { + "epoch": 35.03, + "learning_rate": 0.0005, + "loss": 1.4535, + "step": 446700 + }, + { + "epoch": 35.04, + "learning_rate": 0.0005, + "loss": 1.4416, + "step": 446800 + }, + { + "epoch": 35.05, + "learning_rate": 0.0005, + "loss": 1.4218, + "step": 446900 + }, + { + "epoch": 35.05, + "learning_rate": 0.0005, + "loss": 1.4596, + "step": 447000 + }, + { + "epoch": 35.06, + "learning_rate": 0.0005, + "loss": 1.4474, + "step": 447100 + }, + { + "epoch": 35.07, + "learning_rate": 0.0005, + "loss": 1.4481, + "step": 447200 + }, + { + "epoch": 35.08, + "learning_rate": 0.0005, + "loss": 1.4636, + "step": 447300 + }, + { + "epoch": 35.08, + "learning_rate": 0.0005, + "loss": 1.4575, + "step": 447400 + }, + { + "epoch": 35.09, + "learning_rate": 0.0005, + "loss": 1.484, + "step": 447500 + }, + { + "epoch": 35.1, + "learning_rate": 0.0005, + "loss": 1.453, + "step": 447600 + }, + { + "epoch": 35.11, + "learning_rate": 0.0005, + "loss": 1.4408, + "step": 447700 + }, + { + "epoch": 35.12, + "learning_rate": 0.0005, + "loss": 1.4777, + "step": 447800 + }, + { + "epoch": 35.12, + "learning_rate": 0.0005, + "loss": 1.4703, + "step": 447900 + }, + { + "epoch": 35.13, + "learning_rate": 0.0005, + "loss": 1.4324, + "step": 448000 + }, + { + "epoch": 35.14, + "learning_rate": 0.0005, + "loss": 1.4664, + "step": 448100 + }, + { + "epoch": 35.15, + "learning_rate": 0.0005, + "loss": 1.4364, + "step": 448200 + }, + { + "epoch": 35.16, + "learning_rate": 0.0005, + "loss": 1.4724, + "step": 448300 + }, + { + "epoch": 35.16, + "learning_rate": 0.0005, + "loss": 1.4512, + "step": 448400 + }, + { + "epoch": 35.17, + "learning_rate": 0.0005, + "loss": 1.4756, + "step": 448500 + }, + { + "epoch": 35.18, + "learning_rate": 0.0005, + "loss": 1.4784, + "step": 448600 + }, + { + "epoch": 35.19, + "learning_rate": 0.0005, + "loss": 1.4541, + "step": 448700 + }, + { + "epoch": 35.19, + "learning_rate": 0.0005, + "loss": 1.4614, + "step": 448800 + }, + { + "epoch": 35.2, + "learning_rate": 0.0005, + "loss": 1.4543, + "step": 448900 + }, + { + "epoch": 35.21, + "learning_rate": 0.0005, + "loss": 1.4481, + "step": 449000 + }, + { + "epoch": 35.22, + "learning_rate": 0.0005, + "loss": 1.4598, + "step": 449100 + }, + { + "epoch": 35.23, + "learning_rate": 0.0005, + "loss": 1.4563, + "step": 449200 + }, + { + "epoch": 35.23, + "learning_rate": 0.0005, + "loss": 1.4464, + "step": 449300 + }, + { + "epoch": 35.24, + "learning_rate": 0.0005, + "loss": 1.4748, + "step": 449400 + }, + { + "epoch": 35.25, + "learning_rate": 0.0005, + "loss": 1.4759, + "step": 449500 + }, + { + "epoch": 35.26, + "learning_rate": 0.0005, + "loss": 1.4241, + "step": 449600 + }, + { + "epoch": 35.27, + "learning_rate": 0.0005, + "loss": 1.4666, + "step": 449700 + }, + { + "epoch": 35.27, + "learning_rate": 0.0005, + "loss": 1.4687, + "step": 449800 + }, + { + "epoch": 35.28, + "learning_rate": 0.0005, + "loss": 1.4675, + "step": 449900 + }, + { + "epoch": 35.29, + "learning_rate": 0.0005, + "loss": 1.455, + "step": 450000 + }, + { + "epoch": 35.29, + "eval_gen_len": 18.781495541626203, + "eval_loss": 2.0648272037506104, + "eval_rouge1": 35.5722, + "eval_rouge2": 14.5725, + "eval_rougeL": 29.2816, + "eval_rougeLsum": 29.2646, + "eval_runtime": 355.571, + "eval_samples_per_second": 31.856, + "eval_steps_per_second": 1.991, + "step": 450000 + }, + { + "epoch": 35.3, + "learning_rate": 0.0005, + "loss": 1.4855, + "step": 450100 + }, + { + "epoch": 35.3, + "learning_rate": 0.0005, + "loss": 1.4581, + "step": 450200 + }, + { + "epoch": 35.31, + "learning_rate": 0.0005, + "loss": 1.4779, + "step": 450300 + }, + { + "epoch": 35.32, + "learning_rate": 0.0005, + "loss": 1.4595, + "step": 450400 + }, + { + "epoch": 35.33, + "learning_rate": 0.0005, + "loss": 1.4915, + "step": 450500 + }, + { + "epoch": 35.34, + "learning_rate": 0.0005, + "loss": 1.4983, + "step": 450600 + }, + { + "epoch": 35.34, + "learning_rate": 0.0005, + "loss": 1.4711, + "step": 450700 + }, + { + "epoch": 35.35, + "learning_rate": 0.0005, + "loss": 1.4428, + "step": 450800 + }, + { + "epoch": 35.36, + "learning_rate": 0.0005, + "loss": 1.4672, + "step": 450900 + }, + { + "epoch": 35.37, + "learning_rate": 0.0005, + "loss": 1.4455, + "step": 451000 + }, + { + "epoch": 35.37, + "learning_rate": 0.0005, + "loss": 1.467, + "step": 451100 + }, + { + "epoch": 35.38, + "learning_rate": 0.0005, + "loss": 1.459, + "step": 451200 + }, + { + "epoch": 35.39, + "learning_rate": 0.0005, + "loss": 1.4694, + "step": 451300 + }, + { + "epoch": 35.4, + "learning_rate": 0.0005, + "loss": 1.4676, + "step": 451400 + }, + { + "epoch": 35.41, + "learning_rate": 0.0005, + "loss": 1.4714, + "step": 451500 + }, + { + "epoch": 35.41, + "learning_rate": 0.0005, + "loss": 1.495, + "step": 451600 + }, + { + "epoch": 35.42, + "learning_rate": 0.0005, + "loss": 1.4767, + "step": 451700 + }, + { + "epoch": 35.43, + "learning_rate": 0.0005, + "loss": 1.4694, + "step": 451800 + }, + { + "epoch": 35.44, + "learning_rate": 0.0005, + "loss": 1.4619, + "step": 451900 + }, + { + "epoch": 35.45, + "learning_rate": 0.0005, + "loss": 1.4719, + "step": 452000 + }, + { + "epoch": 35.45, + "learning_rate": 0.0005, + "loss": 1.4722, + "step": 452100 + }, + { + "epoch": 35.46, + "learning_rate": 0.0005, + "loss": 1.4741, + "step": 452200 + }, + { + "epoch": 35.47, + "learning_rate": 0.0005, + "loss": 1.4872, + "step": 452300 + }, + { + "epoch": 35.48, + "learning_rate": 0.0005, + "loss": 1.4847, + "step": 452400 + }, + { + "epoch": 35.48, + "learning_rate": 0.0005, + "loss": 1.4803, + "step": 452500 + }, + { + "epoch": 35.49, + "learning_rate": 0.0005, + "loss": 1.4905, + "step": 452600 + }, + { + "epoch": 35.5, + "learning_rate": 0.0005, + "loss": 1.4646, + "step": 452700 + }, + { + "epoch": 35.51, + "learning_rate": 0.0005, + "loss": 1.5042, + "step": 452800 + }, + { + "epoch": 35.52, + "learning_rate": 0.0005, + "loss": 1.4808, + "step": 452900 + }, + { + "epoch": 35.52, + "learning_rate": 0.0005, + "loss": 1.4739, + "step": 453000 + }, + { + "epoch": 35.53, + "learning_rate": 0.0005, + "loss": 1.4859, + "step": 453100 + }, + { + "epoch": 35.54, + "learning_rate": 0.0005, + "loss": 1.492, + "step": 453200 + }, + { + "epoch": 35.55, + "learning_rate": 0.0005, + "loss": 1.484, + "step": 453300 + }, + { + "epoch": 35.56, + "learning_rate": 0.0005, + "loss": 1.4576, + "step": 453400 + }, + { + "epoch": 35.56, + "learning_rate": 0.0005, + "loss": 1.4851, + "step": 453500 + }, + { + "epoch": 35.57, + "learning_rate": 0.0005, + "loss": 1.4735, + "step": 453600 + }, + { + "epoch": 35.58, + "learning_rate": 0.0005, + "loss": 1.4744, + "step": 453700 + }, + { + "epoch": 35.59, + "learning_rate": 0.0005, + "loss": 1.4876, + "step": 453800 + }, + { + "epoch": 35.59, + "learning_rate": 0.0005, + "loss": 1.4768, + "step": 453900 + }, + { + "epoch": 35.6, + "learning_rate": 0.0005, + "loss": 1.4672, + "step": 454000 + }, + { + "epoch": 35.61, + "learning_rate": 0.0005, + "loss": 1.4988, + "step": 454100 + }, + { + "epoch": 35.62, + "learning_rate": 0.0005, + "loss": 1.4867, + "step": 454200 + }, + { + "epoch": 35.63, + "learning_rate": 0.0005, + "loss": 1.4815, + "step": 454300 + }, + { + "epoch": 35.63, + "learning_rate": 0.0005, + "loss": 1.4974, + "step": 454400 + }, + { + "epoch": 35.64, + "learning_rate": 0.0005, + "loss": 1.4821, + "step": 454500 + }, + { + "epoch": 35.65, + "learning_rate": 0.0005, + "loss": 1.4884, + "step": 454600 + }, + { + "epoch": 35.66, + "learning_rate": 0.0005, + "loss": 1.4819, + "step": 454700 + }, + { + "epoch": 35.66, + "learning_rate": 0.0005, + "loss": 1.492, + "step": 454800 + }, + { + "epoch": 35.67, + "learning_rate": 0.0005, + "loss": 1.4862, + "step": 454900 + }, + { + "epoch": 35.68, + "learning_rate": 0.0005, + "loss": 1.4751, + "step": 455000 + }, + { + "epoch": 35.69, + "learning_rate": 0.0005, + "loss": 1.4783, + "step": 455100 + }, + { + "epoch": 35.7, + "learning_rate": 0.0005, + "loss": 1.4948, + "step": 455200 + }, + { + "epoch": 35.7, + "learning_rate": 0.0005, + "loss": 1.5175, + "step": 455300 + }, + { + "epoch": 35.71, + "learning_rate": 0.0005, + "loss": 1.5152, + "step": 455400 + }, + { + "epoch": 35.72, + "learning_rate": 0.0005, + "loss": 1.4671, + "step": 455500 + }, + { + "epoch": 35.73, + "learning_rate": 0.0005, + "loss": 1.4956, + "step": 455600 + }, + { + "epoch": 35.74, + "learning_rate": 0.0005, + "loss": 1.5073, + "step": 455700 + }, + { + "epoch": 35.74, + "learning_rate": 0.0005, + "loss": 1.4786, + "step": 455800 + }, + { + "epoch": 35.75, + "learning_rate": 0.0005, + "loss": 1.5124, + "step": 455900 + }, + { + "epoch": 35.76, + "learning_rate": 0.0005, + "loss": 1.4967, + "step": 456000 + }, + { + "epoch": 35.77, + "learning_rate": 0.0005, + "loss": 1.4846, + "step": 456100 + }, + { + "epoch": 35.77, + "learning_rate": 0.0005, + "loss": 1.4982, + "step": 456200 + }, + { + "epoch": 35.78, + "learning_rate": 0.0005, + "loss": 1.5005, + "step": 456300 + }, + { + "epoch": 35.79, + "learning_rate": 0.0005, + "loss": 1.504, + "step": 456400 + }, + { + "epoch": 35.8, + "learning_rate": 0.0005, + "loss": 1.4895, + "step": 456500 + }, + { + "epoch": 35.81, + "learning_rate": 0.0005, + "loss": 1.4705, + "step": 456600 + }, + { + "epoch": 35.81, + "learning_rate": 0.0005, + "loss": 1.4976, + "step": 456700 + }, + { + "epoch": 35.82, + "learning_rate": 0.0005, + "loss": 1.4959, + "step": 456800 + }, + { + "epoch": 35.83, + "learning_rate": 0.0005, + "loss": 1.4716, + "step": 456900 + }, + { + "epoch": 35.84, + "learning_rate": 0.0005, + "loss": 1.4709, + "step": 457000 + }, + { + "epoch": 35.85, + "learning_rate": 0.0005, + "loss": 1.5104, + "step": 457100 + }, + { + "epoch": 35.85, + "learning_rate": 0.0005, + "loss": 1.5258, + "step": 457200 + }, + { + "epoch": 35.86, + "learning_rate": 0.0005, + "loss": 1.4968, + "step": 457300 + }, + { + "epoch": 35.87, + "learning_rate": 0.0005, + "loss": 1.5232, + "step": 457400 + }, + { + "epoch": 35.88, + "learning_rate": 0.0005, + "loss": 1.5145, + "step": 457500 + }, + { + "epoch": 35.88, + "learning_rate": 0.0005, + "loss": 1.5133, + "step": 457600 + }, + { + "epoch": 35.89, + "learning_rate": 0.0005, + "loss": 1.4737, + "step": 457700 + }, + { + "epoch": 35.9, + "learning_rate": 0.0005, + "loss": 1.513, + "step": 457800 + }, + { + "epoch": 35.91, + "learning_rate": 0.0005, + "loss": 1.5141, + "step": 457900 + }, + { + "epoch": 35.92, + "learning_rate": 0.0005, + "loss": 1.5234, + "step": 458000 + }, + { + "epoch": 35.92, + "learning_rate": 0.0005, + "loss": 1.5103, + "step": 458100 + }, + { + "epoch": 35.93, + "learning_rate": 0.0005, + "loss": 1.5067, + "step": 458200 + }, + { + "epoch": 35.94, + "learning_rate": 0.0005, + "loss": 1.4814, + "step": 458300 + }, + { + "epoch": 35.95, + "learning_rate": 0.0005, + "loss": 1.527, + "step": 458400 + }, + { + "epoch": 35.96, + "learning_rate": 0.0005, + "loss": 1.4959, + "step": 458500 + }, + { + "epoch": 35.96, + "learning_rate": 0.0005, + "loss": 1.48, + "step": 458600 + }, + { + "epoch": 35.97, + "learning_rate": 0.0005, + "loss": 1.499, + "step": 458700 + }, + { + "epoch": 35.98, + "learning_rate": 0.0005, + "loss": 1.5181, + "step": 458800 + }, + { + "epoch": 35.99, + "learning_rate": 0.0005, + "loss": 1.5087, + "step": 458900 + }, + { + "epoch": 35.99, + "learning_rate": 0.0005, + "loss": 1.5377, + "step": 459000 + }, + { + "epoch": 36.0, + "learning_rate": 0.0005, + "loss": 1.5162, + "step": 459100 + }, + { + "epoch": 36.01, + "learning_rate": 0.0005, + "loss": 1.4301, + "step": 459200 + }, + { + "epoch": 36.02, + "learning_rate": 0.0005, + "loss": 1.466, + "step": 459300 + }, + { + "epoch": 36.03, + "learning_rate": 0.0005, + "loss": 1.435, + "step": 459400 + }, + { + "epoch": 36.03, + "learning_rate": 0.0005, + "loss": 1.4635, + "step": 459500 + }, + { + "epoch": 36.04, + "learning_rate": 0.0005, + "loss": 1.4322, + "step": 459600 + }, + { + "epoch": 36.05, + "learning_rate": 0.0005, + "loss": 1.4237, + "step": 459700 + }, + { + "epoch": 36.06, + "learning_rate": 0.0005, + "loss": 1.4357, + "step": 459800 + }, + { + "epoch": 36.06, + "learning_rate": 0.0005, + "loss": 1.4613, + "step": 459900 + }, + { + "epoch": 36.07, + "learning_rate": 0.0005, + "loss": 1.4209, + "step": 460000 + }, + { + "epoch": 36.07, + "eval_gen_len": 18.764368323474883, + "eval_loss": 2.0837438106536865, + "eval_rouge1": 35.6324, + "eval_rouge2": 14.6282, + "eval_rougeL": 29.3399, + "eval_rougeLsum": 29.3299, + "eval_runtime": 358.8308, + "eval_samples_per_second": 31.566, + "eval_steps_per_second": 1.973, + "step": 460000 + }, + { + "epoch": 36.08, + "learning_rate": 0.0005, + "loss": 1.4281, + "step": 460100 + }, + { + "epoch": 36.09, + "learning_rate": 0.0005, + "loss": 1.4115, + "step": 460200 + }, + { + "epoch": 36.1, + "learning_rate": 0.0005, + "loss": 1.4535, + "step": 460300 + }, + { + "epoch": 36.1, + "learning_rate": 0.0005, + "loss": 1.4266, + "step": 460400 + }, + { + "epoch": 36.11, + "learning_rate": 0.0005, + "loss": 1.446, + "step": 460500 + }, + { + "epoch": 36.12, + "learning_rate": 0.0005, + "loss": 1.4321, + "step": 460600 + }, + { + "epoch": 36.13, + "learning_rate": 0.0005, + "loss": 1.4412, + "step": 460700 + }, + { + "epoch": 36.14, + "learning_rate": 0.0005, + "loss": 1.4506, + "step": 460800 + }, + { + "epoch": 36.14, + "learning_rate": 0.0005, + "loss": 1.4445, + "step": 460900 + }, + { + "epoch": 36.15, + "learning_rate": 0.0005, + "loss": 1.454, + "step": 461000 + }, + { + "epoch": 36.16, + "learning_rate": 0.0005, + "loss": 1.4436, + "step": 461100 + }, + { + "epoch": 36.17, + "learning_rate": 0.0005, + "loss": 1.445, + "step": 461200 + }, + { + "epoch": 36.17, + "learning_rate": 0.0005, + "loss": 1.4234, + "step": 461300 + }, + { + "epoch": 36.18, + "learning_rate": 0.0005, + "loss": 1.425, + "step": 461400 + }, + { + "epoch": 36.19, + "learning_rate": 0.0005, + "loss": 1.4417, + "step": 461500 + }, + { + "epoch": 36.2, + "learning_rate": 0.0005, + "loss": 1.4736, + "step": 461600 + }, + { + "epoch": 36.21, + "learning_rate": 0.0005, + "loss": 1.4525, + "step": 461700 + }, + { + "epoch": 36.21, + "learning_rate": 0.0005, + "loss": 1.4606, + "step": 461800 + }, + { + "epoch": 36.22, + "learning_rate": 0.0005, + "loss": 1.4473, + "step": 461900 + }, + { + "epoch": 36.23, + "learning_rate": 0.0005, + "loss": 1.4424, + "step": 462000 + }, + { + "epoch": 36.24, + "learning_rate": 0.0005, + "loss": 1.4631, + "step": 462100 + }, + { + "epoch": 36.25, + "learning_rate": 0.0005, + "loss": 1.4441, + "step": 462200 + }, + { + "epoch": 36.25, + "learning_rate": 0.0005, + "loss": 1.4454, + "step": 462300 + }, + { + "epoch": 36.26, + "learning_rate": 0.0005, + "loss": 1.4588, + "step": 462400 + }, + { + "epoch": 36.27, + "learning_rate": 0.0005, + "loss": 1.4737, + "step": 462500 + }, + { + "epoch": 36.28, + "learning_rate": 0.0005, + "loss": 1.4574, + "step": 462600 + }, + { + "epoch": 36.28, + "learning_rate": 0.0005, + "loss": 1.4646, + "step": 462700 + }, + { + "epoch": 36.29, + "learning_rate": 0.0005, + "loss": 1.4646, + "step": 462800 + }, + { + "epoch": 36.3, + "learning_rate": 0.0005, + "loss": 1.4558, + "step": 462900 + }, + { + "epoch": 36.31, + "learning_rate": 0.0005, + "loss": 1.4674, + "step": 463000 + }, + { + "epoch": 36.32, + "learning_rate": 0.0005, + "loss": 1.4519, + "step": 463100 + }, + { + "epoch": 36.32, + "learning_rate": 0.0005, + "loss": 1.4642, + "step": 463200 + }, + { + "epoch": 36.33, + "learning_rate": 0.0005, + "loss": 1.4722, + "step": 463300 + }, + { + "epoch": 36.34, + "learning_rate": 0.0005, + "loss": 1.4422, + "step": 463400 + }, + { + "epoch": 36.35, + "learning_rate": 0.0005, + "loss": 1.4698, + "step": 463500 + }, + { + "epoch": 36.36, + "learning_rate": 0.0005, + "loss": 1.4571, + "step": 463600 + }, + { + "epoch": 36.36, + "learning_rate": 0.0005, + "loss": 1.4748, + "step": 463700 + }, + { + "epoch": 36.37, + "learning_rate": 0.0005, + "loss": 1.4678, + "step": 463800 + }, + { + "epoch": 36.38, + "learning_rate": 0.0005, + "loss": 1.4802, + "step": 463900 + }, + { + "epoch": 36.39, + "learning_rate": 0.0005, + "loss": 1.4493, + "step": 464000 + }, + { + "epoch": 36.39, + "learning_rate": 0.0005, + "loss": 1.4593, + "step": 464100 + }, + { + "epoch": 36.4, + "learning_rate": 0.0005, + "loss": 1.4984, + "step": 464200 + }, + { + "epoch": 36.41, + "learning_rate": 0.0005, + "loss": 1.456, + "step": 464300 + }, + { + "epoch": 36.42, + "learning_rate": 0.0005, + "loss": 1.4591, + "step": 464400 + }, + { + "epoch": 36.43, + "learning_rate": 0.0005, + "loss": 1.4638, + "step": 464500 + }, + { + "epoch": 36.43, + "learning_rate": 0.0005, + "loss": 1.4558, + "step": 464600 + }, + { + "epoch": 36.44, + "learning_rate": 0.0005, + "loss": 1.465, + "step": 464700 + }, + { + "epoch": 36.45, + "learning_rate": 0.0005, + "loss": 1.4669, + "step": 464800 + }, + { + "epoch": 36.46, + "learning_rate": 0.0005, + "loss": 1.4679, + "step": 464900 + }, + { + "epoch": 36.46, + "learning_rate": 0.0005, + "loss": 1.473, + "step": 465000 + }, + { + "epoch": 36.47, + "learning_rate": 0.0005, + "loss": 1.4825, + "step": 465100 + }, + { + "epoch": 36.48, + "learning_rate": 0.0005, + "loss": 1.4514, + "step": 465200 + }, + { + "epoch": 36.49, + "learning_rate": 0.0005, + "loss": 1.4839, + "step": 465300 + }, + { + "epoch": 36.5, + "learning_rate": 0.0005, + "loss": 1.4616, + "step": 465400 + }, + { + "epoch": 36.5, + "learning_rate": 0.0005, + "loss": 1.4662, + "step": 465500 + }, + { + "epoch": 36.51, + "learning_rate": 0.0005, + "loss": 1.4781, + "step": 465600 + }, + { + "epoch": 36.52, + "learning_rate": 0.0005, + "loss": 1.4787, + "step": 465700 + }, + { + "epoch": 36.53, + "learning_rate": 0.0005, + "loss": 1.4712, + "step": 465800 + }, + { + "epoch": 36.54, + "learning_rate": 0.0005, + "loss": 1.4667, + "step": 465900 + }, + { + "epoch": 36.54, + "learning_rate": 0.0005, + "loss": 1.4688, + "step": 466000 + }, + { + "epoch": 36.55, + "learning_rate": 0.0005, + "loss": 1.4499, + "step": 466100 + }, + { + "epoch": 36.56, + "learning_rate": 0.0005, + "loss": 1.4675, + "step": 466200 + }, + { + "epoch": 36.57, + "learning_rate": 0.0005, + "loss": 1.4699, + "step": 466300 + }, + { + "epoch": 36.57, + "learning_rate": 0.0005, + "loss": 1.4869, + "step": 466400 + }, + { + "epoch": 36.58, + "learning_rate": 0.0005, + "loss": 1.4657, + "step": 466500 + }, + { + "epoch": 36.59, + "learning_rate": 0.0005, + "loss": 1.46, + "step": 466600 + }, + { + "epoch": 36.6, + "learning_rate": 0.0005, + "loss": 1.4743, + "step": 466700 + }, + { + "epoch": 36.61, + "learning_rate": 0.0005, + "loss": 1.4902, + "step": 466800 + }, + { + "epoch": 36.61, + "learning_rate": 0.0005, + "loss": 1.463, + "step": 466900 + }, + { + "epoch": 36.62, + "learning_rate": 0.0005, + "loss": 1.4619, + "step": 467000 + }, + { + "epoch": 36.63, + "learning_rate": 0.0005, + "loss": 1.4819, + "step": 467100 + }, + { + "epoch": 36.64, + "learning_rate": 0.0005, + "loss": 1.4381, + "step": 467200 + }, + { + "epoch": 36.65, + "learning_rate": 0.0005, + "loss": 1.4876, + "step": 467300 + }, + { + "epoch": 36.65, + "learning_rate": 0.0005, + "loss": 1.4934, + "step": 467400 + }, + { + "epoch": 36.66, + "learning_rate": 0.0005, + "loss": 1.4673, + "step": 467500 + }, + { + "epoch": 36.67, + "learning_rate": 0.0005, + "loss": 1.4811, + "step": 467600 + }, + { + "epoch": 36.68, + "learning_rate": 0.0005, + "loss": 1.4584, + "step": 467700 + }, + { + "epoch": 36.68, + "learning_rate": 0.0005, + "loss": 1.4824, + "step": 467800 + }, + { + "epoch": 36.69, + "learning_rate": 0.0005, + "loss": 1.5057, + "step": 467900 + }, + { + "epoch": 36.7, + "learning_rate": 0.0005, + "loss": 1.4781, + "step": 468000 + }, + { + "epoch": 36.71, + "learning_rate": 0.0005, + "loss": 1.4791, + "step": 468100 + }, + { + "epoch": 36.72, + "learning_rate": 0.0005, + "loss": 1.507, + "step": 468200 + }, + { + "epoch": 36.72, + "learning_rate": 0.0005, + "loss": 1.4879, + "step": 468300 + }, + { + "epoch": 36.73, + "learning_rate": 0.0005, + "loss": 1.4758, + "step": 468400 + }, + { + "epoch": 36.74, + "learning_rate": 0.0005, + "loss": 1.4805, + "step": 468500 + }, + { + "epoch": 36.75, + "learning_rate": 0.0005, + "loss": 1.5019, + "step": 468600 + }, + { + "epoch": 36.76, + "learning_rate": 0.0005, + "loss": 1.4877, + "step": 468700 + }, + { + "epoch": 36.76, + "learning_rate": 0.0005, + "loss": 1.4739, + "step": 468800 + }, + { + "epoch": 36.77, + "learning_rate": 0.0005, + "loss": 1.456, + "step": 468900 + }, + { + "epoch": 36.78, + "learning_rate": 0.0005, + "loss": 1.4748, + "step": 469000 + }, + { + "epoch": 36.79, + "learning_rate": 0.0005, + "loss": 1.4974, + "step": 469100 + }, + { + "epoch": 36.79, + "learning_rate": 0.0005, + "loss": 1.4883, + "step": 469200 + }, + { + "epoch": 36.8, + "learning_rate": 0.0005, + "loss": 1.4729, + "step": 469300 + }, + { + "epoch": 36.81, + "learning_rate": 0.0005, + "loss": 1.485, + "step": 469400 + }, + { + "epoch": 36.82, + "learning_rate": 0.0005, + "loss": 1.4733, + "step": 469500 + }, + { + "epoch": 36.83, + "learning_rate": 0.0005, + "loss": 1.4927, + "step": 469600 + }, + { + "epoch": 36.83, + "learning_rate": 0.0005, + "loss": 1.4826, + "step": 469700 + }, + { + "epoch": 36.84, + "learning_rate": 0.0005, + "loss": 1.5009, + "step": 469800 + }, + { + "epoch": 36.85, + "learning_rate": 0.0005, + "loss": 1.4938, + "step": 469900 + }, + { + "epoch": 36.86, + "learning_rate": 0.0005, + "loss": 1.4894, + "step": 470000 + }, + { + "epoch": 36.86, + "eval_gen_len": 18.78617462699744, + "eval_loss": 2.0626096725463867, + "eval_rouge1": 35.7293, + "eval_rouge2": 14.5534, + "eval_rougeL": 29.39, + "eval_rougeLsum": 29.3792, + "eval_runtime": 354.5618, + "eval_samples_per_second": 31.946, + "eval_steps_per_second": 1.997, + "step": 470000 + }, + { + "epoch": 36.86, + "learning_rate": 0.0005, + "loss": 1.506, + "step": 470100 + }, + { + "epoch": 36.87, + "learning_rate": 0.0005, + "loss": 1.4744, + "step": 470200 + }, + { + "epoch": 36.88, + "learning_rate": 0.0005, + "loss": 1.4895, + "step": 470300 + }, + { + "epoch": 36.89, + "learning_rate": 0.0005, + "loss": 1.5299, + "step": 470400 + }, + { + "epoch": 36.9, + "learning_rate": 0.0005, + "loss": 1.4758, + "step": 470500 + }, + { + "epoch": 36.9, + "learning_rate": 0.0005, + "loss": 1.4826, + "step": 470600 + }, + { + "epoch": 36.91, + "learning_rate": 0.0005, + "loss": 1.4854, + "step": 470700 + }, + { + "epoch": 36.92, + "learning_rate": 0.0005, + "loss": 1.4919, + "step": 470800 + }, + { + "epoch": 36.93, + "learning_rate": 0.0005, + "loss": 1.4969, + "step": 470900 + }, + { + "epoch": 36.94, + "learning_rate": 0.0005, + "loss": 1.5361, + "step": 471000 + }, + { + "epoch": 36.94, + "learning_rate": 0.0005, + "loss": 1.4955, + "step": 471100 + }, + { + "epoch": 36.95, + "learning_rate": 0.0005, + "loss": 1.4918, + "step": 471200 + }, + { + "epoch": 36.96, + "learning_rate": 0.0005, + "loss": 1.4706, + "step": 471300 + }, + { + "epoch": 36.97, + "learning_rate": 0.0005, + "loss": 1.508, + "step": 471400 + }, + { + "epoch": 36.97, + "learning_rate": 0.0005, + "loss": 1.4943, + "step": 471500 + }, + { + "epoch": 36.98, + "learning_rate": 0.0005, + "loss": 1.5189, + "step": 471600 + }, + { + "epoch": 36.99, + "learning_rate": 0.0005, + "loss": 1.5013, + "step": 471700 + }, + { + "epoch": 37.0, + "learning_rate": 0.0005, + "loss": 1.4928, + "step": 471800 + }, + { + "epoch": 37.01, + "learning_rate": 0.0005, + "loss": 1.4571, + "step": 471900 + }, + { + "epoch": 37.01, + "learning_rate": 0.0005, + "loss": 1.422, + "step": 472000 + }, + { + "epoch": 37.02, + "learning_rate": 0.0005, + "loss": 1.3969, + "step": 472100 + }, + { + "epoch": 37.03, + "learning_rate": 0.0005, + "loss": 1.4296, + "step": 472200 + }, + { + "epoch": 37.04, + "learning_rate": 0.0005, + "loss": 1.4178, + "step": 472300 + }, + { + "epoch": 37.05, + "learning_rate": 0.0005, + "loss": 1.4167, + "step": 472400 + }, + { + "epoch": 37.05, + "learning_rate": 0.0005, + "loss": 1.4105, + "step": 472500 + }, + { + "epoch": 37.06, + "learning_rate": 0.0005, + "loss": 1.4229, + "step": 472600 + }, + { + "epoch": 37.07, + "learning_rate": 0.0005, + "loss": 1.427, + "step": 472700 + }, + { + "epoch": 37.08, + "learning_rate": 0.0005, + "loss": 1.4127, + "step": 472800 + }, + { + "epoch": 37.08, + "learning_rate": 0.0005, + "loss": 1.3988, + "step": 472900 + }, + { + "epoch": 37.09, + "learning_rate": 0.0005, + "loss": 1.4349, + "step": 473000 + }, + { + "epoch": 37.1, + "learning_rate": 0.0005, + "loss": 1.383, + "step": 473100 + }, + { + "epoch": 37.11, + "learning_rate": 0.0005, + "loss": 1.4186, + "step": 473200 + }, + { + "epoch": 37.12, + "learning_rate": 0.0005, + "loss": 1.4331, + "step": 473300 + }, + { + "epoch": 37.12, + "learning_rate": 0.0005, + "loss": 1.4529, + "step": 473400 + }, + { + "epoch": 37.13, + "learning_rate": 0.0005, + "loss": 1.4127, + "step": 473500 + }, + { + "epoch": 37.14, + "learning_rate": 0.0005, + "loss": 1.4333, + "step": 473600 + }, + { + "epoch": 37.15, + "learning_rate": 0.0005, + "loss": 1.4442, + "step": 473700 + }, + { + "epoch": 37.15, + "learning_rate": 0.0005, + "loss": 1.4151, + "step": 473800 + }, + { + "epoch": 37.16, + "learning_rate": 0.0005, + "loss": 1.4192, + "step": 473900 + }, + { + "epoch": 37.17, + "learning_rate": 0.0005, + "loss": 1.4437, + "step": 474000 + }, + { + "epoch": 37.18, + "learning_rate": 0.0005, + "loss": 1.4698, + "step": 474100 + }, + { + "epoch": 37.19, + "learning_rate": 0.0005, + "loss": 1.4343, + "step": 474200 + }, + { + "epoch": 37.19, + "learning_rate": 0.0005, + "loss": 1.456, + "step": 474300 + }, + { + "epoch": 37.2, + "learning_rate": 0.0005, + "loss": 1.4433, + "step": 474400 + }, + { + "epoch": 37.21, + "learning_rate": 0.0005, + "loss": 1.4381, + "step": 474500 + }, + { + "epoch": 37.22, + "learning_rate": 0.0005, + "loss": 1.4324, + "step": 474600 + }, + { + "epoch": 37.23, + "learning_rate": 0.0005, + "loss": 1.4325, + "step": 474700 + }, + { + "epoch": 37.23, + "learning_rate": 0.0005, + "loss": 1.4719, + "step": 474800 + }, + { + "epoch": 37.24, + "learning_rate": 0.0005, + "loss": 1.4383, + "step": 474900 + }, + { + "epoch": 37.25, + "learning_rate": 0.0005, + "loss": 1.4518, + "step": 475000 + }, + { + "epoch": 37.26, + "learning_rate": 0.0005, + "loss": 1.457, + "step": 475100 + }, + { + "epoch": 37.26, + "learning_rate": 0.0005, + "loss": 1.4485, + "step": 475200 + }, + { + "epoch": 37.27, + "learning_rate": 0.0005, + "loss": 1.4477, + "step": 475300 + }, + { + "epoch": 37.28, + "learning_rate": 0.0005, + "loss": 1.4299, + "step": 475400 + }, + { + "epoch": 37.29, + "learning_rate": 0.0005, + "loss": 1.4411, + "step": 475500 + }, + { + "epoch": 37.3, + "learning_rate": 0.0005, + "loss": 1.4588, + "step": 475600 + }, + { + "epoch": 37.3, + "learning_rate": 0.0005, + "loss": 1.453, + "step": 475700 + }, + { + "epoch": 37.31, + "learning_rate": 0.0005, + "loss": 1.4495, + "step": 475800 + }, + { + "epoch": 37.32, + "learning_rate": 0.0005, + "loss": 1.4252, + "step": 475900 + }, + { + "epoch": 37.33, + "learning_rate": 0.0005, + "loss": 1.4523, + "step": 476000 + }, + { + "epoch": 37.34, + "learning_rate": 0.0005, + "loss": 1.434, + "step": 476100 + }, + { + "epoch": 37.34, + "learning_rate": 0.0005, + "loss": 1.4625, + "step": 476200 + }, + { + "epoch": 37.35, + "learning_rate": 0.0005, + "loss": 1.4547, + "step": 476300 + }, + { + "epoch": 37.36, + "learning_rate": 0.0005, + "loss": 1.4441, + "step": 476400 + }, + { + "epoch": 37.37, + "learning_rate": 0.0005, + "loss": 1.4308, + "step": 476500 + }, + { + "epoch": 37.37, + "learning_rate": 0.0005, + "loss": 1.4641, + "step": 476600 + }, + { + "epoch": 37.38, + "learning_rate": 0.0005, + "loss": 1.4441, + "step": 476700 + }, + { + "epoch": 37.39, + "learning_rate": 0.0005, + "loss": 1.4377, + "step": 476800 + }, + { + "epoch": 37.4, + "learning_rate": 0.0005, + "loss": 1.4943, + "step": 476900 + }, + { + "epoch": 37.41, + "learning_rate": 0.0005, + "loss": 1.4712, + "step": 477000 + }, + { + "epoch": 37.41, + "learning_rate": 0.0005, + "loss": 1.4618, + "step": 477100 + }, + { + "epoch": 37.42, + "learning_rate": 0.0005, + "loss": 1.4653, + "step": 477200 + }, + { + "epoch": 37.43, + "learning_rate": 0.0005, + "loss": 1.4633, + "step": 477300 + }, + { + "epoch": 37.44, + "learning_rate": 0.0005, + "loss": 1.4464, + "step": 477400 + }, + { + "epoch": 37.45, + "learning_rate": 0.0005, + "loss": 1.4462, + "step": 477500 + }, + { + "epoch": 37.45, + "learning_rate": 0.0005, + "loss": 1.4831, + "step": 477600 + }, + { + "epoch": 37.46, + "learning_rate": 0.0005, + "loss": 1.4498, + "step": 477700 + }, + { + "epoch": 37.47, + "learning_rate": 0.0005, + "loss": 1.4278, + "step": 477800 + }, + { + "epoch": 37.48, + "learning_rate": 0.0005, + "loss": 1.4742, + "step": 477900 + }, + { + "epoch": 37.48, + "learning_rate": 0.0005, + "loss": 1.4649, + "step": 478000 + }, + { + "epoch": 37.49, + "learning_rate": 0.0005, + "loss": 1.4472, + "step": 478100 + }, + { + "epoch": 37.5, + "learning_rate": 0.0005, + "loss": 1.4375, + "step": 478200 + }, + { + "epoch": 37.51, + "learning_rate": 0.0005, + "loss": 1.4675, + "step": 478300 + }, + { + "epoch": 37.52, + "learning_rate": 0.0005, + "loss": 1.4489, + "step": 478400 + }, + { + "epoch": 37.52, + "learning_rate": 0.0005, + "loss": 1.476, + "step": 478500 + }, + { + "epoch": 37.53, + "learning_rate": 0.0005, + "loss": 1.495, + "step": 478600 + }, + { + "epoch": 37.54, + "learning_rate": 0.0005, + "loss": 1.4534, + "step": 478700 + }, + { + "epoch": 37.55, + "learning_rate": 0.0005, + "loss": 1.4523, + "step": 478800 + }, + { + "epoch": 37.55, + "learning_rate": 0.0005, + "loss": 1.4639, + "step": 478900 + }, + { + "epoch": 37.56, + "learning_rate": 0.0005, + "loss": 1.464, + "step": 479000 + }, + { + "epoch": 37.57, + "learning_rate": 0.0005, + "loss": 1.4707, + "step": 479100 + }, + { + "epoch": 37.58, + "learning_rate": 0.0005, + "loss": 1.4779, + "step": 479200 + }, + { + "epoch": 37.59, + "learning_rate": 0.0005, + "loss": 1.4698, + "step": 479300 + }, + { + "epoch": 37.59, + "learning_rate": 0.0005, + "loss": 1.459, + "step": 479400 + }, + { + "epoch": 37.6, + "learning_rate": 0.0005, + "loss": 1.474, + "step": 479500 + }, + { + "epoch": 37.61, + "learning_rate": 0.0005, + "loss": 1.4644, + "step": 479600 + }, + { + "epoch": 37.62, + "learning_rate": 0.0005, + "loss": 1.4517, + "step": 479700 + }, + { + "epoch": 37.63, + "learning_rate": 0.0005, + "loss": 1.4844, + "step": 479800 + }, + { + "epoch": 37.63, + "learning_rate": 0.0005, + "loss": 1.4627, + "step": 479900 + }, + { + "epoch": 37.64, + "learning_rate": 0.0005, + "loss": 1.4801, + "step": 480000 + }, + { + "epoch": 37.64, + "eval_gen_len": 18.763485477178424, + "eval_loss": 2.0635464191436768, + "eval_rouge1": 35.7102, + "eval_rouge2": 14.5234, + "eval_rougeL": 29.4028, + "eval_rougeLsum": 29.3844, + "eval_runtime": 356.8998, + "eval_samples_per_second": 31.737, + "eval_steps_per_second": 1.984, + "step": 480000 + }, + { + "epoch": 37.65, + "learning_rate": 0.0005, + "loss": 1.4629, + "step": 480100 + }, + { + "epoch": 37.66, + "learning_rate": 0.0005, + "loss": 1.4369, + "step": 480200 + }, + { + "epoch": 37.66, + "learning_rate": 0.0005, + "loss": 1.5003, + "step": 480300 + }, + { + "epoch": 37.67, + "learning_rate": 0.0005, + "loss": 1.4944, + "step": 480400 + }, + { + "epoch": 37.68, + "learning_rate": 0.0005, + "loss": 1.4683, + "step": 480500 + }, + { + "epoch": 37.69, + "learning_rate": 0.0005, + "loss": 1.4545, + "step": 480600 + }, + { + "epoch": 37.7, + "learning_rate": 0.0005, + "loss": 1.4789, + "step": 480700 + }, + { + "epoch": 37.7, + "learning_rate": 0.0005, + "loss": 1.4754, + "step": 480800 + }, + { + "epoch": 37.71, + "learning_rate": 0.0005, + "loss": 1.4473, + "step": 480900 + }, + { + "epoch": 37.72, + "learning_rate": 0.0005, + "loss": 1.4698, + "step": 481000 + }, + { + "epoch": 37.73, + "learning_rate": 0.0005, + "loss": 1.4793, + "step": 481100 + }, + { + "epoch": 37.74, + "learning_rate": 0.0005, + "loss": 1.4657, + "step": 481200 + }, + { + "epoch": 37.74, + "learning_rate": 0.0005, + "loss": 1.4849, + "step": 481300 + }, + { + "epoch": 37.75, + "learning_rate": 0.0005, + "loss": 1.4639, + "step": 481400 + }, + { + "epoch": 37.76, + "learning_rate": 0.0005, + "loss": 1.4791, + "step": 481500 + }, + { + "epoch": 37.77, + "learning_rate": 0.0005, + "loss": 1.4802, + "step": 481600 + }, + { + "epoch": 37.77, + "learning_rate": 0.0005, + "loss": 1.4676, + "step": 481700 + }, + { + "epoch": 37.78, + "learning_rate": 0.0005, + "loss": 1.4727, + "step": 481800 + }, + { + "epoch": 37.79, + "learning_rate": 0.0005, + "loss": 1.4757, + "step": 481900 + }, + { + "epoch": 37.8, + "learning_rate": 0.0005, + "loss": 1.5054, + "step": 482000 + }, + { + "epoch": 37.81, + "learning_rate": 0.0005, + "loss": 1.4704, + "step": 482100 + }, + { + "epoch": 37.81, + "learning_rate": 0.0005, + "loss": 1.4794, + "step": 482200 + }, + { + "epoch": 37.82, + "learning_rate": 0.0005, + "loss": 1.4873, + "step": 482300 + }, + { + "epoch": 37.83, + "learning_rate": 0.0005, + "loss": 1.4662, + "step": 482400 + }, + { + "epoch": 37.84, + "learning_rate": 0.0005, + "loss": 1.4757, + "step": 482500 + }, + { + "epoch": 37.85, + "learning_rate": 0.0005, + "loss": 1.4867, + "step": 482600 + }, + { + "epoch": 37.85, + "learning_rate": 0.0005, + "loss": 1.4748, + "step": 482700 + }, + { + "epoch": 37.86, + "learning_rate": 0.0005, + "loss": 1.5006, + "step": 482800 + }, + { + "epoch": 37.87, + "learning_rate": 0.0005, + "loss": 1.4998, + "step": 482900 + }, + { + "epoch": 37.88, + "learning_rate": 0.0005, + "loss": 1.4715, + "step": 483000 + }, + { + "epoch": 37.88, + "learning_rate": 0.0005, + "loss": 1.4772, + "step": 483100 + }, + { + "epoch": 37.89, + "learning_rate": 0.0005, + "loss": 1.477, + "step": 483200 + }, + { + "epoch": 37.9, + "learning_rate": 0.0005, + "loss": 1.4816, + "step": 483300 + }, + { + "epoch": 37.91, + "learning_rate": 0.0005, + "loss": 1.4715, + "step": 483400 + }, + { + "epoch": 37.92, + "learning_rate": 0.0005, + "loss": 1.4821, + "step": 483500 + }, + { + "epoch": 37.92, + "learning_rate": 0.0005, + "loss": 1.5009, + "step": 483600 + }, + { + "epoch": 37.93, + "learning_rate": 0.0005, + "loss": 1.4916, + "step": 483700 + }, + { + "epoch": 37.94, + "learning_rate": 0.0005, + "loss": 1.5157, + "step": 483800 + }, + { + "epoch": 37.95, + "learning_rate": 0.0005, + "loss": 1.4821, + "step": 483900 + }, + { + "epoch": 37.95, + "learning_rate": 0.0005, + "loss": 1.4669, + "step": 484000 + }, + { + "epoch": 37.96, + "learning_rate": 0.0005, + "loss": 1.4763, + "step": 484100 + }, + { + "epoch": 37.97, + "learning_rate": 0.0005, + "loss": 1.4832, + "step": 484200 + }, + { + "epoch": 37.98, + "learning_rate": 0.0005, + "loss": 1.4875, + "step": 484300 + }, + { + "epoch": 37.99, + "learning_rate": 0.0005, + "loss": 1.5, + "step": 484400 + }, + { + "epoch": 37.99, + "learning_rate": 0.0005, + "loss": 1.4921, + "step": 484500 + }, + { + "epoch": 38.0, + "learning_rate": 0.0005, + "loss": 1.4853, + "step": 484600 + }, + { + "epoch": 38.01, + "learning_rate": 0.0005, + "loss": 1.4102, + "step": 484700 + }, + { + "epoch": 38.02, + "learning_rate": 0.0005, + "loss": 1.4333, + "step": 484800 + }, + { + "epoch": 38.03, + "learning_rate": 0.0005, + "loss": 1.4171, + "step": 484900 + }, + { + "epoch": 38.03, + "learning_rate": 0.0005, + "loss": 1.4047, + "step": 485000 + }, + { + "epoch": 38.04, + "learning_rate": 0.0005, + "loss": 1.4108, + "step": 485100 + }, + { + "epoch": 38.05, + "learning_rate": 0.0005, + "loss": 1.3879, + "step": 485200 + }, + { + "epoch": 38.06, + "learning_rate": 0.0005, + "loss": 1.4144, + "step": 485300 + }, + { + "epoch": 38.06, + "learning_rate": 0.0005, + "loss": 1.4304, + "step": 485400 + }, + { + "epoch": 38.07, + "learning_rate": 0.0005, + "loss": 1.3913, + "step": 485500 + }, + { + "epoch": 38.08, + "learning_rate": 0.0005, + "loss": 1.4303, + "step": 485600 + }, + { + "epoch": 38.09, + "learning_rate": 0.0005, + "loss": 1.3798, + "step": 485700 + }, + { + "epoch": 38.1, + "learning_rate": 0.0005, + "loss": 1.4202, + "step": 485800 + }, + { + "epoch": 38.1, + "learning_rate": 0.0005, + "loss": 1.4066, + "step": 485900 + }, + { + "epoch": 38.11, + "learning_rate": 0.0005, + "loss": 1.4153, + "step": 486000 + }, + { + "epoch": 38.12, + "learning_rate": 0.0005, + "loss": 1.4256, + "step": 486100 + }, + { + "epoch": 38.13, + "learning_rate": 0.0005, + "loss": 1.4468, + "step": 486200 + }, + { + "epoch": 38.14, + "learning_rate": 0.0005, + "loss": 1.3938, + "step": 486300 + }, + { + "epoch": 38.14, + "learning_rate": 0.0005, + "loss": 1.4367, + "step": 486400 + }, + { + "epoch": 38.15, + "learning_rate": 0.0005, + "loss": 1.4248, + "step": 486500 + }, + { + "epoch": 38.16, + "learning_rate": 0.0005, + "loss": 1.4351, + "step": 486600 + }, + { + "epoch": 38.17, + "learning_rate": 0.0005, + "loss": 1.4488, + "step": 486700 + }, + { + "epoch": 38.17, + "learning_rate": 0.0005, + "loss": 1.4195, + "step": 486800 + }, + { + "epoch": 38.18, + "learning_rate": 0.0005, + "loss": 1.4355, + "step": 486900 + }, + { + "epoch": 38.19, + "learning_rate": 0.0005, + "loss": 1.4374, + "step": 487000 + }, + { + "epoch": 38.2, + "learning_rate": 0.0005, + "loss": 1.4443, + "step": 487100 + }, + { + "epoch": 38.21, + "learning_rate": 0.0005, + "loss": 1.4462, + "step": 487200 + }, + { + "epoch": 38.21, + "learning_rate": 0.0005, + "loss": 1.423, + "step": 487300 + }, + { + "epoch": 38.22, + "learning_rate": 0.0005, + "loss": 1.4214, + "step": 487400 + }, + { + "epoch": 38.23, + "learning_rate": 0.0005, + "loss": 1.4318, + "step": 487500 + }, + { + "epoch": 38.24, + "learning_rate": 0.0005, + "loss": 1.4522, + "step": 487600 + }, + { + "epoch": 38.24, + "learning_rate": 0.0005, + "loss": 1.4219, + "step": 487700 + }, + { + "epoch": 38.25, + "learning_rate": 0.0005, + "loss": 1.4298, + "step": 487800 + }, + { + "epoch": 38.26, + "learning_rate": 0.0005, + "loss": 1.4482, + "step": 487900 + }, + { + "epoch": 38.27, + "learning_rate": 0.0005, + "loss": 1.4304, + "step": 488000 + }, + { + "epoch": 38.28, + "learning_rate": 0.0005, + "loss": 1.4545, + "step": 488100 + }, + { + "epoch": 38.28, + "learning_rate": 0.0005, + "loss": 1.4368, + "step": 488200 + }, + { + "epoch": 38.29, + "learning_rate": 0.0005, + "loss": 1.4392, + "step": 488300 + }, + { + "epoch": 38.3, + "learning_rate": 0.0005, + "loss": 1.435, + "step": 488400 + }, + { + "epoch": 38.31, + "learning_rate": 0.0005, + "loss": 1.4098, + "step": 488500 + }, + { + "epoch": 38.32, + "learning_rate": 0.0005, + "loss": 1.4293, + "step": 488600 + }, + { + "epoch": 38.32, + "learning_rate": 0.0005, + "loss": 1.4606, + "step": 488700 + }, + { + "epoch": 38.33, + "learning_rate": 0.0005, + "loss": 1.4354, + "step": 488800 + }, + { + "epoch": 38.34, + "learning_rate": 0.0005, + "loss": 1.4521, + "step": 488900 + }, + { + "epoch": 38.35, + "learning_rate": 0.0005, + "loss": 1.4369, + "step": 489000 + }, + { + "epoch": 38.35, + "learning_rate": 0.0005, + "loss": 1.4305, + "step": 489100 + }, + { + "epoch": 38.36, + "learning_rate": 0.0005, + "loss": 1.4358, + "step": 489200 + }, + { + "epoch": 38.37, + "learning_rate": 0.0005, + "loss": 1.4482, + "step": 489300 + }, + { + "epoch": 38.38, + "learning_rate": 0.0005, + "loss": 1.4456, + "step": 489400 + }, + { + "epoch": 38.39, + "learning_rate": 0.0005, + "loss": 1.4476, + "step": 489500 + }, + { + "epoch": 38.39, + "learning_rate": 0.0005, + "loss": 1.4623, + "step": 489600 + }, + { + "epoch": 38.4, + "learning_rate": 0.0005, + "loss": 1.4577, + "step": 489700 + }, + { + "epoch": 38.41, + "learning_rate": 0.0005, + "loss": 1.4286, + "step": 489800 + }, + { + "epoch": 38.42, + "learning_rate": 0.0005, + "loss": 1.4479, + "step": 489900 + }, + { + "epoch": 38.43, + "learning_rate": 0.0005, + "loss": 1.4553, + "step": 490000 + }, + { + "epoch": 38.43, + "eval_gen_len": 18.818486801447868, + "eval_loss": 2.0702340602874756, + "eval_rouge1": 35.6617, + "eval_rouge2": 14.5328, + "eval_rougeL": 29.2952, + "eval_rougeLsum": 29.2949, + "eval_runtime": 360.1467, + "eval_samples_per_second": 31.451, + "eval_steps_per_second": 1.966, + "step": 490000 + }, + { + "epoch": 38.43, + "learning_rate": 0.0005, + "loss": 1.4381, + "step": 490100 + }, + { + "epoch": 38.44, + "learning_rate": 0.0005, + "loss": 1.4515, + "step": 490200 + }, + { + "epoch": 38.45, + "learning_rate": 0.0005, + "loss": 1.4214, + "step": 490300 + }, + { + "epoch": 38.46, + "learning_rate": 0.0005, + "loss": 1.4283, + "step": 490400 + }, + { + "epoch": 38.46, + "learning_rate": 0.0005, + "loss": 1.435, + "step": 490500 + }, + { + "epoch": 38.47, + "learning_rate": 0.0005, + "loss": 1.4338, + "step": 490600 + }, + { + "epoch": 38.48, + "learning_rate": 0.0005, + "loss": 1.4418, + "step": 490700 + }, + { + "epoch": 38.49, + "learning_rate": 0.0005, + "loss": 1.4445, + "step": 490800 + }, + { + "epoch": 38.5, + "learning_rate": 0.0005, + "loss": 1.4408, + "step": 490900 + }, + { + "epoch": 38.5, + "learning_rate": 0.0005, + "loss": 1.4541, + "step": 491000 + }, + { + "epoch": 38.51, + "learning_rate": 0.0005, + "loss": 1.4602, + "step": 491100 + }, + { + "epoch": 38.52, + "learning_rate": 0.0005, + "loss": 1.4278, + "step": 491200 + }, + { + "epoch": 38.53, + "learning_rate": 0.0005, + "loss": 1.4641, + "step": 491300 + }, + { + "epoch": 38.54, + "learning_rate": 0.0005, + "loss": 1.4253, + "step": 491400 + }, + { + "epoch": 38.54, + "learning_rate": 0.0005, + "loss": 1.4791, + "step": 491500 + }, + { + "epoch": 38.55, + "learning_rate": 0.0005, + "loss": 1.4497, + "step": 491600 + }, + { + "epoch": 38.56, + "learning_rate": 0.0005, + "loss": 1.4517, + "step": 491700 + }, + { + "epoch": 38.57, + "learning_rate": 0.0005, + "loss": 1.4639, + "step": 491800 + }, + { + "epoch": 38.57, + "learning_rate": 0.0005, + "loss": 1.4773, + "step": 491900 + }, + { + "epoch": 38.58, + "learning_rate": 0.0005, + "loss": 1.4668, + "step": 492000 + }, + { + "epoch": 38.59, + "learning_rate": 0.0005, + "loss": 1.4547, + "step": 492100 + }, + { + "epoch": 38.6, + "learning_rate": 0.0005, + "loss": 1.4635, + "step": 492200 + }, + { + "epoch": 38.61, + "learning_rate": 0.0005, + "loss": 1.4822, + "step": 492300 + }, + { + "epoch": 38.61, + "learning_rate": 0.0005, + "loss": 1.4691, + "step": 492400 + }, + { + "epoch": 38.62, + "learning_rate": 0.0005, + "loss": 1.4819, + "step": 492500 + }, + { + "epoch": 38.63, + "learning_rate": 0.0005, + "loss": 1.4483, + "step": 492600 + }, + { + "epoch": 38.64, + "learning_rate": 0.0005, + "loss": 1.4592, + "step": 492700 + }, + { + "epoch": 38.64, + "learning_rate": 0.0005, + "loss": 1.4713, + "step": 492800 + }, + { + "epoch": 38.65, + "learning_rate": 0.0005, + "loss": 1.4678, + "step": 492900 + }, + { + "epoch": 38.66, + "learning_rate": 0.0005, + "loss": 1.461, + "step": 493000 + }, + { + "epoch": 38.67, + "learning_rate": 0.0005, + "loss": 1.4566, + "step": 493100 + }, + { + "epoch": 38.68, + "learning_rate": 0.0005, + "loss": 1.4501, + "step": 493200 + }, + { + "epoch": 38.68, + "learning_rate": 0.0005, + "loss": 1.4737, + "step": 493300 + }, + { + "epoch": 38.69, + "learning_rate": 0.0005, + "loss": 1.4579, + "step": 493400 + }, + { + "epoch": 38.7, + "learning_rate": 0.0005, + "loss": 1.4737, + "step": 493500 + }, + { + "epoch": 38.71, + "learning_rate": 0.0005, + "loss": 1.4571, + "step": 493600 + }, + { + "epoch": 38.72, + "learning_rate": 0.0005, + "loss": 1.4711, + "step": 493700 + }, + { + "epoch": 38.72, + "learning_rate": 0.0005, + "loss": 1.4522, + "step": 493800 + }, + { + "epoch": 38.73, + "learning_rate": 0.0005, + "loss": 1.4809, + "step": 493900 + }, + { + "epoch": 38.74, + "learning_rate": 0.0005, + "loss": 1.4816, + "step": 494000 + }, + { + "epoch": 38.75, + "learning_rate": 0.0005, + "loss": 1.4372, + "step": 494100 + }, + { + "epoch": 38.75, + "learning_rate": 0.0005, + "loss": 1.4837, + "step": 494200 + }, + { + "epoch": 38.76, + "learning_rate": 0.0005, + "loss": 1.4849, + "step": 494300 + }, + { + "epoch": 38.77, + "learning_rate": 0.0005, + "loss": 1.4664, + "step": 494400 + }, + { + "epoch": 38.78, + "learning_rate": 0.0005, + "loss": 1.4442, + "step": 494500 + }, + { + "epoch": 38.79, + "learning_rate": 0.0005, + "loss": 1.4493, + "step": 494600 + }, + { + "epoch": 38.79, + "learning_rate": 0.0005, + "loss": 1.45, + "step": 494700 + }, + { + "epoch": 38.8, + "learning_rate": 0.0005, + "loss": 1.4717, + "step": 494800 + }, + { + "epoch": 38.81, + "learning_rate": 0.0005, + "loss": 1.4816, + "step": 494900 + }, + { + "epoch": 38.82, + "learning_rate": 0.0005, + "loss": 1.4556, + "step": 495000 + }, + { + "epoch": 38.83, + "learning_rate": 0.0005, + "loss": 1.4619, + "step": 495100 + }, + { + "epoch": 38.83, + "learning_rate": 0.0005, + "loss": 1.4585, + "step": 495200 + }, + { + "epoch": 38.84, + "learning_rate": 0.0005, + "loss": 1.4512, + "step": 495300 + }, + { + "epoch": 38.85, + "learning_rate": 0.0005, + "loss": 1.4606, + "step": 495400 + }, + { + "epoch": 38.86, + "learning_rate": 0.0005, + "loss": 1.4713, + "step": 495500 + }, + { + "epoch": 38.86, + "learning_rate": 0.0005, + "loss": 1.4917, + "step": 495600 + }, + { + "epoch": 38.87, + "learning_rate": 0.0005, + "loss": 1.4603, + "step": 495700 + }, + { + "epoch": 38.88, + "learning_rate": 0.0005, + "loss": 1.4885, + "step": 495800 + }, + { + "epoch": 38.89, + "learning_rate": 0.0005, + "loss": 1.4555, + "step": 495900 + }, + { + "epoch": 38.9, + "learning_rate": 0.0005, + "loss": 1.4674, + "step": 496000 + }, + { + "epoch": 38.9, + "learning_rate": 0.0005, + "loss": 1.4614, + "step": 496100 + }, + { + "epoch": 38.91, + "learning_rate": 0.0005, + "loss": 1.4815, + "step": 496200 + }, + { + "epoch": 38.92, + "learning_rate": 0.0005, + "loss": 1.4637, + "step": 496300 + }, + { + "epoch": 38.93, + "learning_rate": 0.0005, + "loss": 1.475, + "step": 496400 + }, + { + "epoch": 38.94, + "learning_rate": 0.0005, + "loss": 1.4696, + "step": 496500 + }, + { + "epoch": 38.94, + "learning_rate": 0.0005, + "loss": 1.4896, + "step": 496600 + }, + { + "epoch": 38.95, + "learning_rate": 0.0005, + "loss": 1.4555, + "step": 496700 + }, + { + "epoch": 38.96, + "learning_rate": 0.0005, + "loss": 1.4873, + "step": 496800 + }, + { + "epoch": 38.97, + "learning_rate": 0.0005, + "loss": 1.4663, + "step": 496900 + }, + { + "epoch": 38.97, + "learning_rate": 0.0005, + "loss": 1.4983, + "step": 497000 + }, + { + "epoch": 38.98, + "learning_rate": 0.0005, + "loss": 1.4715, + "step": 497100 + }, + { + "epoch": 38.99, + "learning_rate": 0.0005, + "loss": 1.5052, + "step": 497200 + }, + { + "epoch": 39.0, + "learning_rate": 0.0005, + "loss": 1.4862, + "step": 497300 + }, + { + "epoch": 39.01, + "learning_rate": 0.0005, + "loss": 1.412, + "step": 497400 + }, + { + "epoch": 39.01, + "learning_rate": 0.0005, + "loss": 1.3872, + "step": 497500 + }, + { + "epoch": 39.02, + "learning_rate": 0.0005, + "loss": 1.3971, + "step": 497600 + }, + { + "epoch": 39.03, + "learning_rate": 0.0005, + "loss": 1.4171, + "step": 497700 + }, + { + "epoch": 39.04, + "learning_rate": 0.0005, + "loss": 1.4026, + "step": 497800 + }, + { + "epoch": 39.04, + "learning_rate": 0.0005, + "loss": 1.39, + "step": 497900 + }, + { + "epoch": 39.05, + "learning_rate": 0.0005, + "loss": 1.3829, + "step": 498000 + }, + { + "epoch": 39.06, + "learning_rate": 0.0005, + "loss": 1.3924, + "step": 498100 + }, + { + "epoch": 39.07, + "learning_rate": 0.0005, + "loss": 1.396, + "step": 498200 + }, + { + "epoch": 39.08, + "learning_rate": 0.0005, + "loss": 1.386, + "step": 498300 + }, + { + "epoch": 39.08, + "learning_rate": 0.0005, + "loss": 1.3992, + "step": 498400 + }, + { + "epoch": 39.09, + "learning_rate": 0.0005, + "loss": 1.4199, + "step": 498500 + }, + { + "epoch": 39.1, + "learning_rate": 0.0005, + "loss": 1.3924, + "step": 498600 + }, + { + "epoch": 39.11, + "learning_rate": 0.0005, + "loss": 1.402, + "step": 498700 + }, + { + "epoch": 39.12, + "learning_rate": 0.0005, + "loss": 1.4241, + "step": 498800 + }, + { + "epoch": 39.12, + "learning_rate": 0.0005, + "loss": 1.4045, + "step": 498900 + }, + { + "epoch": 39.13, + "learning_rate": 0.0005, + "loss": 1.4163, + "step": 499000 + }, + { + "epoch": 39.14, + "learning_rate": 0.0005, + "loss": 1.4071, + "step": 499100 + }, + { + "epoch": 39.15, + "learning_rate": 0.0005, + "loss": 1.4081, + "step": 499200 + }, + { + "epoch": 39.15, + "learning_rate": 0.0005, + "loss": 1.4432, + "step": 499300 + }, + { + "epoch": 39.16, + "learning_rate": 0.0005, + "loss": 1.4187, + "step": 499400 + }, + { + "epoch": 39.17, + "learning_rate": 0.0005, + "loss": 1.4159, + "step": 499500 + }, + { + "epoch": 39.18, + "learning_rate": 0.0005, + "loss": 1.4294, + "step": 499600 + }, + { + "epoch": 39.19, + "learning_rate": 0.0005, + "loss": 1.4187, + "step": 499700 + }, + { + "epoch": 39.19, + "learning_rate": 0.0005, + "loss": 1.4207, + "step": 499800 + }, + { + "epoch": 39.2, + "learning_rate": 0.0005, + "loss": 1.429, + "step": 499900 + }, + { + "epoch": 39.21, + "learning_rate": 0.0005, + "loss": 1.4295, + "step": 500000 + }, + { + "epoch": 39.21, + "eval_gen_len": 18.78025955681116, + "eval_loss": 2.1033854484558105, + "eval_rouge1": 35.8815, + "eval_rouge2": 14.7785, + "eval_rougeL": 29.503, + "eval_rougeLsum": 29.4979, + "eval_runtime": 355.4667, + "eval_samples_per_second": 31.865, + "eval_steps_per_second": 1.992, + "step": 500000 + }, + { + "epoch": 39.22, + "learning_rate": 0.0005, + "loss": 1.4241, + "step": 500100 + }, + { + "epoch": 39.23, + "learning_rate": 0.0005, + "loss": 1.4048, + "step": 500200 + }, + { + "epoch": 39.23, + "learning_rate": 0.0005, + "loss": 1.4098, + "step": 500300 + }, + { + "epoch": 39.24, + "learning_rate": 0.0005, + "loss": 1.3955, + "step": 500400 + }, + { + "epoch": 39.25, + "learning_rate": 0.0005, + "loss": 1.4262, + "step": 500500 + }, + { + "epoch": 39.26, + "learning_rate": 0.0005, + "loss": 1.4024, + "step": 500600 + }, + { + "epoch": 39.26, + "learning_rate": 0.0005, + "loss": 1.4432, + "step": 500700 + }, + { + "epoch": 39.27, + "learning_rate": 0.0005, + "loss": 1.4488, + "step": 500800 + }, + { + "epoch": 39.28, + "learning_rate": 0.0005, + "loss": 1.4148, + "step": 500900 + }, + { + "epoch": 39.29, + "learning_rate": 0.0005, + "loss": 1.4618, + "step": 501000 + }, + { + "epoch": 39.3, + "learning_rate": 0.0005, + "loss": 1.4333, + "step": 501100 + }, + { + "epoch": 39.3, + "learning_rate": 0.0005, + "loss": 1.4268, + "step": 501200 + }, + { + "epoch": 39.31, + "learning_rate": 0.0005, + "loss": 1.4125, + "step": 501300 + }, + { + "epoch": 39.32, + "learning_rate": 0.0005, + "loss": 1.4449, + "step": 501400 + }, + { + "epoch": 39.33, + "learning_rate": 0.0005, + "loss": 1.4244, + "step": 501500 + }, + { + "epoch": 39.34, + "learning_rate": 0.0005, + "loss": 1.4344, + "step": 501600 + }, + { + "epoch": 39.34, + "learning_rate": 0.0005, + "loss": 1.4306, + "step": 501700 + }, + { + "epoch": 39.35, + "learning_rate": 0.0005, + "loss": 1.447, + "step": 501800 + }, + { + "epoch": 39.36, + "learning_rate": 0.0005, + "loss": 1.4367, + "step": 501900 + }, + { + "epoch": 39.37, + "learning_rate": 0.0005, + "loss": 1.4568, + "step": 502000 + }, + { + "epoch": 39.37, + "learning_rate": 0.0005, + "loss": 1.4365, + "step": 502100 + }, + { + "epoch": 39.38, + "learning_rate": 0.0005, + "loss": 1.449, + "step": 502200 + }, + { + "epoch": 39.39, + "learning_rate": 0.0005, + "loss": 1.4194, + "step": 502300 + }, + { + "epoch": 39.4, + "learning_rate": 0.0005, + "loss": 1.4658, + "step": 502400 + }, + { + "epoch": 39.41, + "learning_rate": 0.0005, + "loss": 1.4472, + "step": 502500 + }, + { + "epoch": 39.41, + "learning_rate": 0.0005, + "loss": 1.4311, + "step": 502600 + }, + { + "epoch": 39.42, + "learning_rate": 0.0005, + "loss": 1.448, + "step": 502700 + }, + { + "epoch": 39.43, + "learning_rate": 0.0005, + "loss": 1.4346, + "step": 502800 + }, + { + "epoch": 39.44, + "learning_rate": 0.0005, + "loss": 1.4348, + "step": 502900 + }, + { + "epoch": 39.44, + "learning_rate": 0.0005, + "loss": 1.4081, + "step": 503000 + }, + { + "epoch": 39.45, + "learning_rate": 0.0005, + "loss": 1.4255, + "step": 503100 + }, + { + "epoch": 39.46, + "learning_rate": 0.0005, + "loss": 1.4441, + "step": 503200 + }, + { + "epoch": 39.47, + "learning_rate": 0.0005, + "loss": 1.4332, + "step": 503300 + }, + { + "epoch": 39.48, + "learning_rate": 0.0005, + "loss": 1.4609, + "step": 503400 + }, + { + "epoch": 39.48, + "learning_rate": 0.0005, + "loss": 1.452, + "step": 503500 + }, + { + "epoch": 39.49, + "learning_rate": 0.0005, + "loss": 1.4605, + "step": 503600 + }, + { + "epoch": 39.5, + "learning_rate": 0.0005, + "loss": 1.4327, + "step": 503700 + }, + { + "epoch": 39.51, + "learning_rate": 0.0005, + "loss": 1.4194, + "step": 503800 + }, + { + "epoch": 39.52, + "learning_rate": 0.0005, + "loss": 1.4626, + "step": 503900 + }, + { + "epoch": 39.52, + "learning_rate": 0.0005, + "loss": 1.4482, + "step": 504000 + }, + { + "epoch": 39.53, + "learning_rate": 0.0005, + "loss": 1.4396, + "step": 504100 + }, + { + "epoch": 39.54, + "learning_rate": 0.0005, + "loss": 1.4632, + "step": 504200 + }, + { + "epoch": 39.55, + "learning_rate": 0.0005, + "loss": 1.4469, + "step": 504300 + }, + { + "epoch": 39.55, + "learning_rate": 0.0005, + "loss": 1.4564, + "step": 504400 + }, + { + "epoch": 39.56, + "learning_rate": 0.0005, + "loss": 1.4522, + "step": 504500 + }, + { + "epoch": 39.57, + "learning_rate": 0.0005, + "loss": 1.4582, + "step": 504600 + }, + { + "epoch": 39.58, + "learning_rate": 0.0005, + "loss": 1.4384, + "step": 504700 + }, + { + "epoch": 39.59, + "learning_rate": 0.0005, + "loss": 1.4638, + "step": 504800 + }, + { + "epoch": 39.59, + "learning_rate": 0.0005, + "loss": 1.4518, + "step": 504900 + }, + { + "epoch": 39.6, + "learning_rate": 0.0005, + "loss": 1.4322, + "step": 505000 + }, + { + "epoch": 39.61, + "learning_rate": 0.0005, + "loss": 1.4654, + "step": 505100 + }, + { + "epoch": 39.62, + "learning_rate": 0.0005, + "loss": 1.4446, + "step": 505200 + }, + { + "epoch": 39.63, + "learning_rate": 0.0005, + "loss": 1.4646, + "step": 505300 + }, + { + "epoch": 39.63, + "learning_rate": 0.0005, + "loss": 1.4356, + "step": 505400 + }, + { + "epoch": 39.64, + "learning_rate": 0.0005, + "loss": 1.4614, + "step": 505500 + }, + { + "epoch": 39.65, + "learning_rate": 0.0005, + "loss": 1.4351, + "step": 505600 + }, + { + "epoch": 39.66, + "learning_rate": 0.0005, + "loss": 1.4379, + "step": 505700 + }, + { + "epoch": 39.66, + "learning_rate": 0.0005, + "loss": 1.4683, + "step": 505800 + }, + { + "epoch": 39.67, + "learning_rate": 0.0005, + "loss": 1.4412, + "step": 505900 + }, + { + "epoch": 39.68, + "learning_rate": 0.0005, + "loss": 1.4549, + "step": 506000 + }, + { + "epoch": 39.69, + "learning_rate": 0.0005, + "loss": 1.4474, + "step": 506100 + }, + { + "epoch": 39.7, + "learning_rate": 0.0005, + "loss": 1.4656, + "step": 506200 + }, + { + "epoch": 39.7, + "learning_rate": 0.0005, + "loss": 1.4564, + "step": 506300 + }, + { + "epoch": 39.71, + "learning_rate": 0.0005, + "loss": 1.4471, + "step": 506400 + }, + { + "epoch": 39.72, + "learning_rate": 0.0005, + "loss": 1.4516, + "step": 506500 + }, + { + "epoch": 39.73, + "learning_rate": 0.0005, + "loss": 1.4619, + "step": 506600 + }, + { + "epoch": 39.73, + "learning_rate": 0.0005, + "loss": 1.4515, + "step": 506700 + }, + { + "epoch": 39.74, + "learning_rate": 0.0005, + "loss": 1.4363, + "step": 506800 + }, + { + "epoch": 39.75, + "learning_rate": 0.0005, + "loss": 1.4689, + "step": 506900 + }, + { + "epoch": 39.76, + "learning_rate": 0.0005, + "loss": 1.4752, + "step": 507000 + }, + { + "epoch": 39.77, + "learning_rate": 0.0005, + "loss": 1.4471, + "step": 507100 + }, + { + "epoch": 39.77, + "learning_rate": 0.0005, + "loss": 1.4483, + "step": 507200 + }, + { + "epoch": 39.78, + "learning_rate": 0.0005, + "loss": 1.43, + "step": 507300 + }, + { + "epoch": 39.79, + "learning_rate": 0.0005, + "loss": 1.4408, + "step": 507400 + }, + { + "epoch": 39.8, + "learning_rate": 0.0005, + "loss": 1.4258, + "step": 507500 + }, + { + "epoch": 39.81, + "learning_rate": 0.0005, + "loss": 1.4326, + "step": 507600 + }, + { + "epoch": 39.81, + "learning_rate": 0.0005, + "loss": 1.5031, + "step": 507700 + }, + { + "epoch": 39.82, + "learning_rate": 0.0005, + "loss": 1.461, + "step": 507800 + }, + { + "epoch": 39.83, + "learning_rate": 0.0005, + "loss": 1.4695, + "step": 507900 + }, + { + "epoch": 39.84, + "learning_rate": 0.0005, + "loss": 1.491, + "step": 508000 + }, + { + "epoch": 39.84, + "learning_rate": 0.0005, + "loss": 1.4485, + "step": 508100 + }, + { + "epoch": 39.85, + "learning_rate": 0.0005, + "loss": 1.4509, + "step": 508200 + }, + { + "epoch": 39.86, + "learning_rate": 0.0005, + "loss": 1.4386, + "step": 508300 + }, + { + "epoch": 39.87, + "learning_rate": 0.0005, + "loss": 1.4672, + "step": 508400 + }, + { + "epoch": 39.88, + "learning_rate": 0.0005, + "loss": 1.4693, + "step": 508500 + }, + { + "epoch": 39.88, + "learning_rate": 0.0005, + "loss": 1.4867, + "step": 508600 + }, + { + "epoch": 39.89, + "learning_rate": 0.0005, + "loss": 1.4633, + "step": 508700 + }, + { + "epoch": 39.9, + "learning_rate": 0.0005, + "loss": 1.4794, + "step": 508800 + }, + { + "epoch": 39.91, + "learning_rate": 0.0005, + "loss": 1.4676, + "step": 508900 + }, + { + "epoch": 39.92, + "learning_rate": 0.0005, + "loss": 1.4694, + "step": 509000 + }, + { + "epoch": 39.92, + "learning_rate": 0.0005, + "loss": 1.4547, + "step": 509100 + }, + { + "epoch": 39.93, + "learning_rate": 0.0005, + "loss": 1.4533, + "step": 509200 + }, + { + "epoch": 39.94, + "learning_rate": 0.0005, + "loss": 1.4713, + "step": 509300 + }, + { + "epoch": 39.95, + "learning_rate": 0.0005, + "loss": 1.455, + "step": 509400 + }, + { + "epoch": 39.95, + "learning_rate": 0.0005, + "loss": 1.4833, + "step": 509500 + }, + { + "epoch": 39.96, + "learning_rate": 0.0005, + "loss": 1.4749, + "step": 509600 + }, + { + "epoch": 39.97, + "learning_rate": 0.0005, + "loss": 1.4435, + "step": 509700 + }, + { + "epoch": 39.98, + "learning_rate": 0.0005, + "loss": 1.4589, + "step": 509800 + }, + { + "epoch": 39.99, + "learning_rate": 0.0005, + "loss": 1.469, + "step": 509900 + }, + { + "epoch": 39.99, + "learning_rate": 0.0005, + "loss": 1.4541, + "step": 510000 + }, + { + "epoch": 39.99, + "eval_gen_len": 18.759071245696123, + "eval_loss": 2.0739943981170654, + "eval_rouge1": 35.8683, + "eval_rouge2": 14.7103, + "eval_rougeL": 29.5012, + "eval_rougeLsum": 29.4958, + "eval_runtime": 358.0238, + "eval_samples_per_second": 31.638, + "eval_steps_per_second": 1.978, + "step": 510000 + } + ], + "max_steps": 637600, + "num_train_epochs": 50, + "total_flos": 2.1607039565631652e+18, + "trial_name": null, + "trial_params": null +}