Hello!

Is it possible to fine tune the model in Google Colab?

I'm trying to use Lora and Peft to tune an English to Spanish translation model

My code is the following

code
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

raw_datasets = load_dataset('csv', data_files='Dataset.csv', delimiter=',')

model_checkpoint = "facebook/nllb-200-distilled-1.3B"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, src_lang="en", tgt_lang="es")

def preprocess_function(examples):
inputs = [ex for ex in examples["sourceString"]]
targets = [ex for ex in examples["targetString"]]
model_inputs = tokenizer(inputs, max_length=128, truncation=True)

# Setup the tokenizer for targets
with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=128, truncation=True)

# Provide information for the decoder
model_inputs["labels"] = labels["input_ids"]
model_inputs["decoder_input_ids"] = labels["input_ids"]

return model_inputs

Tokenize Data

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Fine Tune Model

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Fine Tune Model

Load model with quantization

bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=False,
)

model = AutoModelForSeq2SeqLM.from_pretrained(
model_checkpoint,
load_in_4bit=True,
quantization_config=bnb_config,
torch_dtype=torch.bfloat16,
device_map="auto"
)

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.1,
r=64,
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, peft_config)

args = Seq2SeqTrainingArguments(
f"Model-{model_checkpoint}",
#evaluation_strategy = "steps",
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
weight_decay=0.01,
save_total_limit=3,
num_train_epochs=15,
predict_with_generate=True,
logging_steps=200,#50,
warmup_steps=500,#100,
fp16 = True,
label_smoothing_factor = 0.1,
logging_first_step = True
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
model,
args,
train_dataset=tokenized_datasets["train"],
#eval_dataset=tokenized_datasets["test"],
data_collator=data_collator,
tokenizer=tokenizer
)

trainer.train()

and I get the error

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

I assume it is a problem with quantization, because I have tried this code with other models and it does not give me this problem.

If anyone has a collaborative notebook that they can share with me, I would greatly appreciate it.

facebook
/

nllb-200-3.3B

Fine tuning model Google Colab t4-GPU

Tokenize Data

Fine Tune Model

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Fine Tune Model

Load model with quantization