File size: 2,557 Bytes
ece0628 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
# Per PyTorch's MPS environment-variable documentation, a high-watermark ratio
# of 0.0 removes the upper limit on MPS memory allocations.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
class PromptDataset(Dataset):
    """Tokenized prompt prefixes read from a ``[PAD]``-delimited text file.

    Each non-blank line that splits into at least three ``[PAD]``-separated
    fields contributes the text before its first ``[PAD]``: that text is
    stripped, the tokenizer's EOS token is appended, and the result is
    encoded. Encoded sequences longer than ``block_size`` are cut into
    consecutive chunks of at most ``block_size`` tokens, and every chunk is
    stored as its own example.
    """

    def __init__(self, file_path, tokenizer, block_size=512):
        """Read ``file_path`` and build the list of token-id tensors.

        Args:
            file_path: Path of the text file, read as UTF-8 with undecodable
                bytes replaced.
            tokenizer: Object providing ``eos_token`` and
                ``encode(text, add_special_tokens=...)``.
            block_size: Maximum number of tokens per stored example.
        """
        self.input_examples = []
        with open(file_path, 'r', encoding="utf-8", errors="replace") as handle:
            raw_lines = handle.read().splitlines()

        for raw_line in raw_lines:
            if not raw_line.strip():
                continue  # blank line
            fields = raw_line.split('[PAD]')
            if len(fields) < 3:
                continue  # fewer than two [PAD] markers on this line
            # Only the text before the first [PAD] is kept.
            prompt = fields[0].strip() + tokenizer.eos_token
            token_ids = tokenizer.encode(prompt, add_special_tokens=True)
            # One tensor per block_size-sized window over the encoded prompt.
            self.input_examples.extend(
                torch.tensor(token_ids[start:start + block_size])
                for start in range(0, len(token_ids), block_size)
            )

    def __len__(self):
        """Return the number of stored examples (chunks)."""
        return len(self.input_examples)

    def __getitem__(self, i):
        """Return the token-id tensor of example ``i``."""
        return self.input_examples[i]
# Load the "gpt2-large" tokenizer. Its EOS token is printed for inspection and
# then reused as the pad token, so pad_token_id equals eos_token_id from here on.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
print(tokenizer.eos_token)
tokenizer.pad_token = tokenizer.eos_token

# Build the training examples from the prompt file and report how many there are.
dataset = PromptDataset("batch_ds_v2.txt", tokenizer)
print(f"Number of examples: {len(dataset)}")

model = GPT2LMHeadModel.from_pretrained("gpt2-large")
# Prefer the Apple MPS backend, but fall back to the CPU when it is not
# available so the script does not fail at model.to() on other hosts.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    run_name="large-1of3_v2",
    output_dir="./v2/large",
    overwrite_output_dir=True,
    # Optimisation schedule.
    max_steps=500,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,
    # Fixed batch size; auto_find_batch_size is deliberately not enabled.
    per_device_train_batch_size=2,
    # Log every step and write a checkpoint every 50 steps.
    logging_steps=1,
    save_steps=50,
)
def data_collator(features):
    """Pad a list of 1-D token-id tensors into one causal-LM training batch.

    Args:
        features: Sequence of 1-D tensors of token ids, one per example.

    Returns:
        A dict with ``"input_ids"`` (the examples padded at the end with
        ``tokenizer.pad_token_id`` to the longest length in the batch) and
        ``"labels"`` (a copy of ``input_ids`` with every padding position
        set to -100 so it is ignored in the loss calculation).
    """
    input_ids = torch.nn.utils.rnn.pad_sequence(
        features, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    # Locate padding by sequence length, not by token id: the pad token is
    # aliased to EOS at module level, so matching on pad_token_id would also
    # mask the genuine EOS that ends each example and the model would never
    # be trained to emit it.
    lengths = torch.tensor([len(seq) for seq in features], device=input_ids.device)
    positions = torch.arange(input_ids.size(1), device=input_ids.device)
    is_padding = positions.unsqueeze(0) >= lengths.unsqueeze(1)
    labels = input_ids.clone()
    labels[is_padding] = -100
    return {"input_ids": input_ids, "labels": labels}
# Assemble the Trainer from the module-level model, arguments, dataset and
# collator, then run the training loop.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()

# Save the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./v2/large")
|