training_args = TrainingArguments( output_dir='t5-small-newsela-biendata-with-domain-adaptation', num_train_epochs=20, warmup_steps=250, per_device_train_batch_size=BATCH_SIZE, weight_decay=0.01, learning_rate=2e-4, fp16=True, optim="adafactor", )
Step Training Loss 500 35.466600 1000 25.795400 1500 10.923200 2000 4.515500 TrainOutput(global_step=2320, training_loss=16.92537920721646, metrics={'train_runtime': 628.0033, 'train_samples_per_second': 472.418, 'train_steps_per_second': 3.694, 'total_flos': 0.0, 'train_loss': 16.92537920721646, 'epoch': 20.0})