# Fine-tuning hyperparameters handed to the Hugging Face Trainer.
# `output_dir` is the directory the run writes its outputs to; BATCH_SIZE
# must already be defined before this statement executes.
training_args = TrainingArguments(
    output_dir='pegasus-base-wikilarge-newsela-with-domain-adaptation',
    num_train_epochs=20,
    warmup_steps=250,
    per_device_train_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    learning_rate=2e-4,
    # fp16=True,  # NOTE(review): mixed precision was left disabled for this run; confirm it is stable for this model before enabling
    optim="adafactor",
)
Step Training Loss | |
500 4.391800 | |
1000 3.994400 | |
1500 3.009300 | |
2000 2.596300 | |
2500 2.389600 | |
3000 2.328100 | |
3500 2.272900 | |
4000 2.125300 | |
4500 2.019500 | |
5000 2.005500 | |
5500 1.994400 | |
6000 1.853300 | |
6500 1.838700 | |
7000 1.840200 | |
7500 1.822700 | |
8000 1.716600 | |
8500 1.734100 | |
9000 1.739500 | |
9500 1.696000 | |
10000 1.661700 | |
10500 1.672500 | |
11000 1.666800 | |
11500 1.617700 | |
12000 1.611400 | |
12500 1.616300 | |
13000 1.625800 | |
13500 1.567700 | |
14000 1.584600 | |
14500 1.589800 | |
15000 1.574600 | |
15500 1.548300 | |
16000 1.559800 | |
16500 1.562100 | |
17000 1.541600 | |
17500 1.533500 | |
18000 1.538400 | |
18500 1.545700 | |
19000 1.510100 | |
19500 1.522600 | |
20000 1.529100 | |
20500 1.520900 | |
21000 1.501100 | |
21500 1.508400 | |
22000 1.515400 | |
22500 1.500100 | |
23000 1.496700 | |
23500 1.495900 | |
24000 1.505200 | |
24500 1.484400 | |
25000 1.483000 | |
25500 1.491200 | |
26000 1.491900 | |
26500 1.468600 | |
27000 1.479800 | |
27500 1.479600 | |
28000 1.474400 | |
28500 1.466000 | |
29000 1.471700 | |
29500 1.473100 | |
30000 1.463900 | |
30500 1.459300 | |
31000 1.463200 | |
31500 1.466800 | |
32000 1.455300 | |
32500 1.458600 | |
33000 1.455200 | |
33500 1.455500 | |
34000 1.449100 | |
34500 1.451400 | |
35000 1.451200 | |
35500 1.449000 | |
36000 1.443700 | |
36500 1.447600 | |
37000 1.447300 | |
TrainOutput(global_step=37100, training_loss=1.7110925547967382, metrics={'train_runtime': 33376.534, 'train_samples_per_second': 8.889, 'train_steps_per_second': 1.112, 'total_flos': 0.0, 'train_loss': 1.7110925547967382, 'epoch': 20.0}) |