Fine tuning model Google Colab t4-GPU
Hello!
Is it possible to fine tune the model in Google Colab?
I'm trying to use Lora and Peft to tune an English to Spanish translation model
My code is the following
code
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
raw_datasets = load_dataset('csv', data_files='Dataset.csv', delimiter=',')
model_checkpoint = "facebook/nllb-200-distilled-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang="en", tgt_lang="es")
def preprocess_function(examples):
inputs = [ex for ex in examples["sourceString"]]
targets = [ex for ex in examples["targetString"]]
model_inputs = tokenizer(inputs, max_length=128, truncation=True)
# Setup the tokenizer for targets
with tokenizer.as_target_tokenizer():
labels = tokenizer(targets, max_length=128, truncation=True)
# Provide information for the decoder
model_inputs["labels"] = labels["input_ids"]
model_inputs["decoder_input_ids"] = labels["input_ids"]
return model_inputs
Tokenize Data
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
Fine Tune Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
Fine Tune Model
Load model with quantization
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_checkpoint,
load_in_4bit=True,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16,
device_map="auto"
)
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.1,
r=64,
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, peft_config)
args = Seq2SeqTrainingArguments(
f"Model-{model_checkpoint}",
#evaluation_strategy = "steps",
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=15,
predict_with_generate=True,
logging_steps=200,#50,
warmup_steps=500,#100,
fp16 = True,
label_smoothing_factor = 0.1,
logging_first_step = True
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
#eval_dataset=tokenized_datasets["test"],
data_collator=data_collator,
tokenizer=tokenizer
)
trainer.train()
and I get the error
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
I assume it is a problem with quantization, because I have tried this code with other models and it does not give me this problem.
If anyone has a collaborative notebook that they can share with me, I would greatly appreciate it.