|
{ |
|
"best_metric": 2.255647659301758, |
|
"best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_sfd_memsum_30/checkpoint-43", |
|
"epoch": 29.217391304347824, |
|
"eval_steps": 500, |
|
"global_step": 420, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.001, |
|
"loss": 3.2355, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.001, |
|
"loss": 3.0785, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.001, |
|
"loss": 3.0423, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.001, |
|
"loss": 2.9059, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.001, |
|
"loss": 2.777, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.001, |
|
"loss": 2.6848, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.001, |
|
"loss": 2.6697, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 2.4168484210968018, |
|
"eval_runtime": 14.3646, |
|
"eval_samples_per_second": 23.53, |
|
"eval_steps_per_second": 2.993, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 0.001, |
|
"loss": 2.4413, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.3287, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 0.001, |
|
"loss": 2.3054, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 0.001, |
|
"loss": 2.3458, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.001, |
|
"loss": 2.3942, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"learning_rate": 0.001, |
|
"loss": 2.3186, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"learning_rate": 0.001, |
|
"loss": 2.2272, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_loss": 2.264381170272827, |
|
"eval_runtime": 14.3433, |
|
"eval_samples_per_second": 23.565, |
|
"eval_steps_per_second": 2.998, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 0.001, |
|
"loss": 2.0385, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 0.001, |
|
"loss": 1.9493, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.001, |
|
"loss": 1.9811, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"learning_rate": 0.001, |
|
"loss": 1.9783, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.001, |
|
"loss": 1.9534, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"learning_rate": 0.001, |
|
"loss": 1.9822, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 0.001, |
|
"loss": 1.9024, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_loss": 2.255647659301758, |
|
"eval_runtime": 14.349, |
|
"eval_samples_per_second": 23.556, |
|
"eval_steps_per_second": 2.997, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 0.001, |
|
"loss": 1.7788, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.001, |
|
"loss": 1.6545, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"learning_rate": 0.001, |
|
"loss": 1.633, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"learning_rate": 0.001, |
|
"loss": 1.6763, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"learning_rate": 0.001, |
|
"loss": 1.6571, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.001, |
|
"loss": 1.6333, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"learning_rate": 0.001, |
|
"loss": 1.6554, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"eval_loss": 2.4007155895233154, |
|
"eval_runtime": 14.3209, |
|
"eval_samples_per_second": 23.602, |
|
"eval_steps_per_second": 3.003, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.001, |
|
"loss": 1.5927, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"learning_rate": 0.001, |
|
"loss": 1.3606, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"learning_rate": 0.001, |
|
"loss": 1.3576, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"learning_rate": 0.001, |
|
"loss": 1.4079, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"learning_rate": 0.001, |
|
"loss": 1.4133, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.001, |
|
"loss": 1.3712, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"learning_rate": 0.001, |
|
"loss": 1.3619, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"eval_loss": 2.423330545425415, |
|
"eval_runtime": 14.334, |
|
"eval_samples_per_second": 23.58, |
|
"eval_steps_per_second": 3.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"learning_rate": 0.001, |
|
"loss": 1.3071, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"learning_rate": 0.001, |
|
"loss": 1.0472, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"learning_rate": 0.001, |
|
"loss": 1.1038, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.001, |
|
"loss": 1.1355, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"learning_rate": 0.001, |
|
"loss": 1.1239, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.001, |
|
"loss": 1.1464, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"learning_rate": 0.001, |
|
"loss": 1.1564, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"learning_rate": 0.001, |
|
"loss": 1.1577, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"eval_loss": 2.679687976837158, |
|
"eval_runtime": 14.3396, |
|
"eval_samples_per_second": 23.571, |
|
"eval_steps_per_second": 2.999, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"learning_rate": 0.001, |
|
"loss": 0.9042, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"learning_rate": 0.001, |
|
"loss": 0.882, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.001, |
|
"loss": 0.9122, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"learning_rate": 0.001, |
|
"loss": 0.8572, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"learning_rate": 0.001, |
|
"loss": 0.8929, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"learning_rate": 0.001, |
|
"loss": 0.9316, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"learning_rate": 0.001, |
|
"loss": 0.9584, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"eval_loss": 2.8448543548583984, |
|
"eval_runtime": 14.3357, |
|
"eval_samples_per_second": 23.578, |
|
"eval_steps_per_second": 3.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.001, |
|
"loss": 0.7378, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.7519, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.001, |
|
"loss": 0.6793, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"learning_rate": 0.001, |
|
"loss": 0.7177, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"learning_rate": 0.001, |
|
"loss": 0.7523, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"learning_rate": 0.001, |
|
"loss": 0.7479, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.7197, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 3.025547504425049, |
|
"eval_runtime": 14.3269, |
|
"eval_samples_per_second": 23.592, |
|
"eval_steps_per_second": 3.001, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 0.001, |
|
"loss": 0.6231, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"learning_rate": 0.001, |
|
"loss": 0.5361, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"learning_rate": 0.001, |
|
"loss": 0.5926, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"learning_rate": 0.001, |
|
"loss": 0.5732, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"learning_rate": 0.001, |
|
"loss": 0.6658, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"learning_rate": 0.001, |
|
"loss": 0.803, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"learning_rate": 0.001, |
|
"loss": 0.5756, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"eval_loss": 3.146686553955078, |
|
"eval_runtime": 14.3236, |
|
"eval_samples_per_second": 23.597, |
|
"eval_steps_per_second": 3.002, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"learning_rate": 0.001, |
|
"loss": 0.5221, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"learning_rate": 0.001, |
|
"loss": 0.4466, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"learning_rate": 0.001, |
|
"loss": 0.4909, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"learning_rate": 0.001, |
|
"loss": 0.4707, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"learning_rate": 0.001, |
|
"loss": 0.4588, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"learning_rate": 0.001, |
|
"loss": 0.4693, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"learning_rate": 0.001, |
|
"loss": 0.485, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"eval_loss": 3.2976272106170654, |
|
"eval_runtime": 14.3283, |
|
"eval_samples_per_second": 23.59, |
|
"eval_steps_per_second": 3.001, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"learning_rate": 0.001, |
|
"loss": 0.4447, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"learning_rate": 0.001, |
|
"loss": 0.3447, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 10.3, |
|
"learning_rate": 0.001, |
|
"loss": 0.3563, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"learning_rate": 0.001, |
|
"loss": 0.3578, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 10.57, |
|
"learning_rate": 0.001, |
|
"loss": 0.3623, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 10.71, |
|
"learning_rate": 0.001, |
|
"loss": 0.383, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 10.85, |
|
"learning_rate": 0.001, |
|
"loss": 0.3717, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 10.99, |
|
"learning_rate": 0.001, |
|
"loss": 0.4027, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 10.99, |
|
"eval_loss": 3.8110954761505127, |
|
"eval_runtime": 14.3426, |
|
"eval_samples_per_second": 23.566, |
|
"eval_steps_per_second": 2.998, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"learning_rate": 0.001, |
|
"loss": 0.2821, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"learning_rate": 0.001, |
|
"loss": 0.2735, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 11.41, |
|
"learning_rate": 0.001, |
|
"loss": 0.3073, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 11.55, |
|
"learning_rate": 0.001, |
|
"loss": 0.3386, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 11.69, |
|
"learning_rate": 0.001, |
|
"loss": 0.3426, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"learning_rate": 0.001, |
|
"loss": 0.3111, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 11.97, |
|
"learning_rate": 0.001, |
|
"loss": 0.2938, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 11.97, |
|
"eval_loss": 3.7330400943756104, |
|
"eval_runtime": 14.3561, |
|
"eval_samples_per_second": 23.544, |
|
"eval_steps_per_second": 2.995, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"learning_rate": 0.001, |
|
"loss": 0.2443, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"learning_rate": 0.001, |
|
"loss": 0.2312, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 12.38, |
|
"learning_rate": 0.001, |
|
"loss": 0.2253, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 12.52, |
|
"learning_rate": 0.001, |
|
"loss": 0.243, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 12.66, |
|
"learning_rate": 0.001, |
|
"loss": 0.2582, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.001, |
|
"loss": 0.2449, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"learning_rate": 0.001, |
|
"loss": 0.2665, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"eval_loss": 4.141729831695557, |
|
"eval_runtime": 14.3523, |
|
"eval_samples_per_second": 23.55, |
|
"eval_steps_per_second": 2.996, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 13.08, |
|
"learning_rate": 0.001, |
|
"loss": 0.2176, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 13.22, |
|
"learning_rate": 0.001, |
|
"loss": 0.1728, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 13.36, |
|
"learning_rate": 0.001, |
|
"loss": 0.1807, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.2002, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 13.63, |
|
"learning_rate": 0.001, |
|
"loss": 0.1975, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"learning_rate": 0.001, |
|
"loss": 0.1973, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 13.91, |
|
"learning_rate": 0.001, |
|
"loss": 0.2019, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"eval_loss": 4.03155517578125, |
|
"eval_runtime": 14.3514, |
|
"eval_samples_per_second": 23.552, |
|
"eval_steps_per_second": 2.996, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 14.05, |
|
"learning_rate": 0.001, |
|
"loss": 0.19, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"learning_rate": 0.001, |
|
"loss": 0.1599, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 14.33, |
|
"learning_rate": 0.001, |
|
"loss": 0.1647, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"learning_rate": 0.001, |
|
"loss": 0.1506, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 14.61, |
|
"learning_rate": 0.001, |
|
"loss": 0.1551, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 14.75, |
|
"learning_rate": 0.001, |
|
"loss": 0.1628, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 14.89, |
|
"learning_rate": 0.001, |
|
"loss": 0.1706, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 14.96, |
|
"eval_loss": 4.135679244995117, |
|
"eval_runtime": 14.348, |
|
"eval_samples_per_second": 23.557, |
|
"eval_steps_per_second": 2.997, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 15.03, |
|
"learning_rate": 0.001, |
|
"loss": 0.1729, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 15.17, |
|
"learning_rate": 0.001, |
|
"loss": 0.1421, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 15.3, |
|
"learning_rate": 0.001, |
|
"loss": 0.1398, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 15.44, |
|
"learning_rate": 0.001, |
|
"loss": 0.1377, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 15.58, |
|
"learning_rate": 0.001, |
|
"loss": 0.1307, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 15.72, |
|
"learning_rate": 0.001, |
|
"loss": 0.1284, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"learning_rate": 0.001, |
|
"loss": 0.1384, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 0.001, |
|
"loss": 0.1418, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 4.10220193862915, |
|
"eval_runtime": 14.3626, |
|
"eval_samples_per_second": 23.533, |
|
"eval_steps_per_second": 2.994, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 16.14, |
|
"learning_rate": 0.001, |
|
"loss": 0.1085, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 16.28, |
|
"learning_rate": 0.001, |
|
"loss": 0.1024, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 16.42, |
|
"learning_rate": 0.001, |
|
"loss": 0.1073, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"learning_rate": 0.001, |
|
"loss": 0.1075, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 16.7, |
|
"learning_rate": 0.001, |
|
"loss": 0.1193, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"learning_rate": 0.001, |
|
"loss": 0.1191, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 16.97, |
|
"learning_rate": 0.001, |
|
"loss": 0.1286, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 16.97, |
|
"eval_loss": 4.119844436645508, |
|
"eval_runtime": 14.3458, |
|
"eval_samples_per_second": 23.561, |
|
"eval_steps_per_second": 2.997, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 17.11, |
|
"learning_rate": 0.001, |
|
"loss": 0.1073, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 17.25, |
|
"learning_rate": 0.001, |
|
"loss": 0.0944, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 17.39, |
|
"learning_rate": 0.001, |
|
"loss": 0.0973, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"learning_rate": 0.001, |
|
"loss": 0.0922, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 17.67, |
|
"learning_rate": 0.001, |
|
"loss": 0.0969, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 17.81, |
|
"learning_rate": 0.001, |
|
"loss": 0.096, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 17.95, |
|
"learning_rate": 0.001, |
|
"loss": 0.1022, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 17.95, |
|
"eval_loss": 4.186199188232422, |
|
"eval_runtime": 14.357, |
|
"eval_samples_per_second": 23.543, |
|
"eval_steps_per_second": 2.995, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 18.09, |
|
"learning_rate": 0.001, |
|
"loss": 0.0902, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.083, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 18.37, |
|
"learning_rate": 0.001, |
|
"loss": 0.0826, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"learning_rate": 0.001, |
|
"loss": 0.0955, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 18.64, |
|
"learning_rate": 0.001, |
|
"loss": 0.0997, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 18.78, |
|
"learning_rate": 0.001, |
|
"loss": 0.1227, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 18.92, |
|
"learning_rate": 0.001, |
|
"loss": 0.1122, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 18.99, |
|
"eval_loss": 4.638622760772705, |
|
"eval_runtime": 14.3606, |
|
"eval_samples_per_second": 23.537, |
|
"eval_steps_per_second": 2.994, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 19.06, |
|
"learning_rate": 0.001, |
|
"loss": 0.0904, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"learning_rate": 0.001, |
|
"loss": 0.0891, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 19.34, |
|
"learning_rate": 0.001, |
|
"loss": 0.0887, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 19.48, |
|
"learning_rate": 0.001, |
|
"loss": 0.0976, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 19.62, |
|
"learning_rate": 0.001, |
|
"loss": 0.1121, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 19.76, |
|
"learning_rate": 0.001, |
|
"loss": 0.09, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 19.9, |
|
"learning_rate": 0.001, |
|
"loss": 0.093, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 19.97, |
|
"eval_loss": 4.682866096496582, |
|
"eval_runtime": 14.3499, |
|
"eval_samples_per_second": 23.554, |
|
"eval_steps_per_second": 2.997, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 20.03, |
|
"learning_rate": 0.001, |
|
"loss": 0.0847, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 20.17, |
|
"learning_rate": 0.001, |
|
"loss": 0.0754, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 20.31, |
|
"learning_rate": 0.001, |
|
"loss": 0.0757, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 20.45, |
|
"learning_rate": 0.001, |
|
"loss": 0.0688, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 20.59, |
|
"learning_rate": 0.001, |
|
"loss": 0.0723, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 20.73, |
|
"learning_rate": 0.001, |
|
"loss": 0.079, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 20.87, |
|
"learning_rate": 0.001, |
|
"loss": 0.0783, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 20.94, |
|
"eval_loss": 4.663684844970703, |
|
"eval_runtime": 14.3504, |
|
"eval_samples_per_second": 23.553, |
|
"eval_steps_per_second": 2.996, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 21.01, |
|
"learning_rate": 0.001, |
|
"loss": 0.0779, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 21.15, |
|
"learning_rate": 0.001, |
|
"loss": 0.0648, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 21.29, |
|
"learning_rate": 0.001, |
|
"loss": 0.0594, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 21.43, |
|
"learning_rate": 0.001, |
|
"loss": 0.0644, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 21.57, |
|
"learning_rate": 0.001, |
|
"loss": 0.0682, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 21.7, |
|
"learning_rate": 0.001, |
|
"loss": 0.0702, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 21.84, |
|
"learning_rate": 0.001, |
|
"loss": 0.0686, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 21.98, |
|
"learning_rate": 0.001, |
|
"loss": 0.0698, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 21.98, |
|
"eval_loss": 4.718987941741943, |
|
"eval_runtime": 14.3394, |
|
"eval_samples_per_second": 23.571, |
|
"eval_steps_per_second": 2.999, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 22.12, |
|
"learning_rate": 0.001, |
|
"loss": 0.0702, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 22.26, |
|
"learning_rate": 0.001, |
|
"loss": 0.0713, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 22.4, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 22.54, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 22.68, |
|
"learning_rate": 0.001, |
|
"loss": 0.0652, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 22.82, |
|
"learning_rate": 0.001, |
|
"loss": 0.0643, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 22.96, |
|
"learning_rate": 0.001, |
|
"loss": 0.0688, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 22.96, |
|
"eval_loss": 5.020025253295898, |
|
"eval_runtime": 14.3306, |
|
"eval_samples_per_second": 23.586, |
|
"eval_steps_per_second": 3.001, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 23.1, |
|
"learning_rate": 0.001, |
|
"loss": 0.0584, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 23.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.0565, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 23.37, |
|
"learning_rate": 0.001, |
|
"loss": 0.0551, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 23.51, |
|
"learning_rate": 0.001, |
|
"loss": 0.056, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 23.65, |
|
"learning_rate": 0.001, |
|
"loss": 0.0585, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 23.79, |
|
"learning_rate": 0.001, |
|
"loss": 0.0589, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 23.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.0633, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 4.757626056671143, |
|
"eval_runtime": 14.3331, |
|
"eval_samples_per_second": 23.582, |
|
"eval_steps_per_second": 3.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 24.07, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 24.21, |
|
"learning_rate": 0.001, |
|
"loss": 0.0524, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 24.35, |
|
"learning_rate": 0.001, |
|
"loss": 0.0506, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 24.49, |
|
"learning_rate": 0.001, |
|
"loss": 0.0482, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 24.63, |
|
"learning_rate": 0.001, |
|
"loss": 0.0543, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 24.77, |
|
"learning_rate": 0.001, |
|
"loss": 0.0563, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 24.9, |
|
"learning_rate": 0.001, |
|
"loss": 0.0609, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 24.97, |
|
"eval_loss": 4.7804856300354, |
|
"eval_runtime": 14.3386, |
|
"eval_samples_per_second": 23.573, |
|
"eval_steps_per_second": 2.999, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 25.04, |
|
"learning_rate": 0.001, |
|
"loss": 0.054, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 25.18, |
|
"learning_rate": 0.001, |
|
"loss": 0.0577, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 25.32, |
|
"learning_rate": 0.001, |
|
"loss": 0.052, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 25.46, |
|
"learning_rate": 0.001, |
|
"loss": 0.0472, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 25.6, |
|
"learning_rate": 0.001, |
|
"loss": 0.0527, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 25.74, |
|
"learning_rate": 0.001, |
|
"loss": 0.0516, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 25.88, |
|
"learning_rate": 0.001, |
|
"loss": 0.0553, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 25.95, |
|
"eval_loss": 4.733786582946777, |
|
"eval_runtime": 14.3354, |
|
"eval_samples_per_second": 23.578, |
|
"eval_steps_per_second": 3.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 26.02, |
|
"learning_rate": 0.001, |
|
"loss": 0.0575, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 26.16, |
|
"learning_rate": 0.001, |
|
"loss": 0.0579, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 26.3, |
|
"learning_rate": 0.001, |
|
"loss": 0.0598, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 26.43, |
|
"learning_rate": 0.001, |
|
"loss": 0.0647, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 26.57, |
|
"learning_rate": 0.001, |
|
"loss": 0.0486, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 26.71, |
|
"learning_rate": 0.001, |
|
"loss": 0.0434, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 26.85, |
|
"learning_rate": 0.001, |
|
"loss": 0.0523, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 26.99, |
|
"learning_rate": 0.001, |
|
"loss": 0.0503, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 26.99, |
|
"eval_loss": 5.140863418579102, |
|
"eval_runtime": 14.327, |
|
"eval_samples_per_second": 23.592, |
|
"eval_steps_per_second": 3.001, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 27.13, |
|
"learning_rate": 0.001, |
|
"loss": 0.0393, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 27.27, |
|
"learning_rate": 0.001, |
|
"loss": 0.038, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 27.41, |
|
"learning_rate": 0.001, |
|
"loss": 0.0407, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 27.55, |
|
"learning_rate": 0.001, |
|
"loss": 0.0417, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 27.69, |
|
"learning_rate": 0.001, |
|
"loss": 0.0437, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 27.83, |
|
"learning_rate": 0.001, |
|
"loss": 0.0448, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 27.97, |
|
"learning_rate": 0.001, |
|
"loss": 0.0471, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 27.97, |
|
"eval_loss": 5.14629602432251, |
|
"eval_runtime": 14.3389, |
|
"eval_samples_per_second": 23.572, |
|
"eval_steps_per_second": 2.999, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 28.1, |
|
"learning_rate": 0.001, |
|
"loss": 0.041, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 28.24, |
|
"learning_rate": 0.001, |
|
"loss": 0.0391, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 28.38, |
|
"learning_rate": 0.001, |
|
"loss": 0.0429, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 28.52, |
|
"learning_rate": 0.001, |
|
"loss": 0.0472, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 28.66, |
|
"learning_rate": 0.001, |
|
"loss": 0.0458, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 28.8, |
|
"learning_rate": 0.001, |
|
"loss": 0.0451, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 28.94, |
|
"learning_rate": 0.001, |
|
"loss": 0.0472, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 28.94, |
|
"eval_loss": 5.163626194000244, |
|
"eval_runtime": 14.3465, |
|
"eval_samples_per_second": 23.56, |
|
"eval_steps_per_second": 2.997, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 29.08, |
|
"learning_rate": 0.001, |
|
"loss": 0.0403, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 29.22, |
|
"learning_rate": 0.001, |
|
"loss": 0.0376, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 29.22, |
|
"eval_loss": 5.132218360900879, |
|
"eval_runtime": 14.3004, |
|
"eval_samples_per_second": 23.636, |
|
"eval_steps_per_second": 3.007, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 29.22, |
|
"step": 420, |
|
"total_flos": 8.206795539988808e+17, |
|
"train_loss": 0.5600467140298514, |
|
"train_runtime": 18789.4517, |
|
"train_samples_per_second": 5.864, |
|
"train_steps_per_second": 0.022 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 420, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"total_flos": 8.206795539988808e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|