import os

import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Lift the MPS memory high-watermark limit so PyTorch can use all available unified memory.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
class StoryAbilitiesDataset(Dataset):
    """Builds (story + abilities) training sequences from a line-per-example text file."""

    def __init__(self, file_path, tokenizer, block_size=256):
        self.examples = []
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read()
        lines = text.splitlines()
        print1 = True  # print the first combined example once as a sanity check
        for line in lines:
            if not line.strip():
                continue
            # Drop everything before the first <|endoftext|>, then split the remainder
            # on [PAD]: segment 0 is the story, segment 2 is the abilities.
            parts = line.split('<|endoftext|>')
            if len(parts) > 1:
                story_part = parts[1].split('[PAD]')
                if len(story_part) >= 3:
                    story = story_part[0].strip()      # extract the story
                    abilities = story_part[2].strip()  # extract the abilities
                    story_with_end = story + " <|endoftext|>"
                    combined = story_with_end + " " + abilities + "<|endoftext|>"
                    if print1:
                        print1 = False
                        print("\n" + combined + "\n\n")
                    # Cap each example at block_size tokens so nothing exceeds the model context.
                    tokenized_combined = tokenizer.encode(
                        combined, add_special_tokens=True,
                        truncation=True, max_length=block_size,
                    )
                    self.examples.append(torch.tensor(tokenized_combined))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]
# GPT-2 ships without a pad token, so reuse <|endoftext|> for padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

dataset = StoryAbilitiesDataset("batch_ds_v2.txt", tokenizer)
print(f"Number of examples: {len(dataset)}")

model = GPT2LMHeadModel.from_pretrained("gpt2")
# Train on Apple Silicon (MPS) when available, otherwise fall back to CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)
training_args = TrainingArguments(
    output_dir="./v2/abilities/small",
    overwrite_output_dir=True,
    run_name="small-3of4",
    num_train_epochs=15,
    max_steps=5000,             # max_steps takes precedence over num_train_epochs
    save_steps=1000,
    auto_find_batch_size=True,  # retry with a smaller batch size on out-of-memory
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,
    logging_steps=1,
)
def data_collator(features):
    # Right-pad every sequence in the batch to the length of the longest example.
    input_ids = torch.nn.utils.rnn.pad_sequence(features, batch_first=True, padding_value=tokenizer.pad_token_id)
    # Labels mirror the inputs; padding positions (pad == eos here) also contribute to the loss.
    labels = input_ids.clone()
    return {"input_ids": input_ids, "labels": labels}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
trainer.train()
model.save_pretrained("./v2/abilities/small")
tokenizer.save_pretrained("./v2/abilities/small")
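
# Optional sanity check: generate an abilities continuation with the model fine-tuned above.
# A minimal sketch; the prompt string is a hypothetical placeholder, so adjust it to match
# the story format in batch_ds_v2.txt.
sample_prompt = "Once upon a time, a young knight befriended a dragon. <|endoftext|>"
sample_inputs = tokenizer(sample_prompt, return_tensors="pt").to(device)
sample_output = model.generate(
    **sample_inputs,
    max_new_tokens=64,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=False))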