# Hyperparameters for the fine-tuning run; passed to the Trainer elsewhere
# (presumably -- the Trainer construction is not visible in this chunk).
training_args = TrainingArguments(
    # Directory that receives checkpoints and other run outputs.
    output_dir='bart-base-newsela-biendata-with-domain-adaptation',

    # Run length and per-device batch size (BATCH_SIZE is defined elsewhere).
    num_train_epochs=20,
    per_device_train_batch_size=BATCH_SIZE,

    # Optimizer selection and its schedule/regularization settings.
    optim="adafactor",
    learning_rate=2e-4,
    warmup_steps=250,
    weight_decay=0.01,

    # Enable fp16 mixed-precision training.
    fp16=True,
)
Step    Training Loss
500     5.677000
1000    2.361900
1500    1.826000
2000    1.672900
2500    1.597900
3000    1.555700
3500    1.520600
4000    1.496300
4500    1.476800

TrainOutput(global_step=4640, training_loss=2.1116079396214977, metrics={'train_runtime': 1059.6025, 'train_samples_per_second': 279.992, 'train_steps_per_second': 4.379, 'total_flos': 0.0, 'train_loss': 2.1116079396214977, 'epoch': 20.0})