|
""" |
|
use this file to train the model |
|
|
|
working: |
|
- imports various dependencies first, and then loads the training data

- tokenizes it on a per-character basis

- loads the required hyper-parameters and the model file

- trains the model until 'max_iters', saves the model state, and generates outputs
|
|
|
with the current configuration, the model can reach up to ~60 million parameters

and ~99% accuracy on next-token prediction
|
""" |
|
|
|
import torch |
|
import json |
|
import os |
|
# run relative to this file's directory so the dataset and config paths resolve
current_directory = os.path.dirname(os.path.abspath(__file__))
|
os.chdir(current_directory) |
|
device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
|
|
# load the raw training text
with open('../datasets/wiki_176m.txt', 'r', encoding='utf-8') as file:
|
data = file.read() |
|
|
|
print(f"{(len(data)/1e6):.2f} million letters") |
|
|
|
from tokenizer import Tokenizer |
|
|
|
# character-level tokenizer (see tokenizer.py); get_vocab() is used below as the vocabulary size
tokenizer = Tokenizer()

vocab_size = tokenizer.get_vocab()
|
|
|
|
|
# encode the full corpus to token ids; the last 10% is held out for validation
data = torch.tensor(tokenizer.encode(data), dtype=torch.long)
|
n = int(0.9*len(data)) |
|
train_data = data[:n] |
|
val_data = data[n:] |
|
|
|
with open('config.json', 'r', encoding='utf-8') as file: |
|
params = json.load(file) |
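
# config.json must provide at least the keys read below: batch_size, block_size,
# and learning_rate. an illustrative (not prescriptive) example:
# {
#     "batch_size": 64,
#     "block_size": 256,
#     "learning_rate": 3e-4
# }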
|
|
|
|
|
batch_size = params['batch_size'] |
|
block_size = params['block_size'] |
|
# loop settings defined here; the remaining hyper-parameters come from config.json
max_iters = 1000

eval_interval = 100

eval_iters = 200
|
learning_rate = params['learning_rate'] |
|
|
|
torch.manual_seed(1400) |
|
|
|
# sample a random batch of contiguous token sequences from the chosen split
def get_batch(split):
|
|
|
data = train_data if split == 'train' else val_data |
|
ix = torch.randint(len(data) - block_size, (batch_size,)) |
|
x = torch.stack([data[i:i+block_size] for i in ix]) |
|
y = torch.stack([data[i+1:i+block_size+1] for i in ix]) |
|
x, y = x.to(device), y.to(device) |
|
return x, y |
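
# a batch from get_batch has x.shape == y.shape == (batch_size, block_size);
# y is x shifted one position ahead, giving the next-token targets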
|
|
|
# average the loss over eval_iters random batches for each split, without tracking gradients
@torch.no_grad()
|
def estimate_loss(): |
|
out = {} |
|
model.eval() |
|
for split in ['train', 'val']: |
|
losses = torch.zeros(eval_iters) |
|
for k in range(eval_iters): |
|
X, Y = get_batch(split) |
|
logits, loss = model(X, Y) |
|
losses[k] = loss.item() |
|
out[split] = losses.mean() |
|
model.train() |
|
return out |
|
|
|
# build the Transformer defined in model.py and move it to the selected device
from model import Transformer
|
model = Transformer(vocab_size) |
|
m = model.to(device) |
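
# to resume from a previously saved checkpoint instead of training from scratch,
# something like the following can be used (the filename here is hypothetical;
# match it to whatever torch.save() produced at the bottom of this file):
# model.load_state_dict(torch.load('enigma_60m.pth', map_location=device))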
|
|
|
|
|
# report the parameter count in millions
n_param = sum(p.numel() for p in m.parameters())/1e6

print(f"{n_param:.2f} million parameters")
|
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) |
|
steps = [] |
|
train_losses = [] |
|
val_losses = [] |
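
# these lists record the loss curve at each evaluation point (e.g. for plotting later)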
|
|
|
# main training loop: evaluate periodically, then take one optimization step per iteration
for iter in range(max_iters):
|
|
|
if iter % eval_interval == 0 or iter == max_iters - 1: |
|
losses = estimate_loss() |
|
print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") |
|
|
|
steps.append(iter) |
|
train_losses.append(losses['train']) |
|
val_losses.append(losses['val']) |
|
|
|
xb, yb = get_batch('train') |
|
logits, loss = model(xb, yb) |
|
optimizer.zero_grad(set_to_none=True) |
|
loss.backward() |
|
optimizer.step() |
|
|
|
# save the trained weights, named by parameter count (e.g. enigma_60m.pth)
torch.save(model.state_dict(), f'enigma_{n_param:.0f}m.pth')
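

# optional: generate a short sample with the trained model, as mentioned in the
# module docstring. this is a minimal sketch and assumes a nanoGPT-style
# generate(idx, max_new_tokens) method on the Transformer, a decode() method on
# the Tokenizer, and token id 0 as a neutral prompt; all three are assumptions
# about model.py / tokenizer.py, hence the guard
if hasattr(m, 'generate') and hasattr(tokenizer, 'decode'):
    m.eval()
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    with torch.no_grad():
        print(tokenizer.decode(m.generate(context, max_new_tokens=500)[0].tolist()))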