In [1]:
import datasets
import transformers
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

In [2]:
# Checkpoint name of the base model that the rest of the notebook fine-tunes.
MODEL = "EleutherAI/pythia-125m-deduped"

# Config, tokenizer and causal-LM weights are all loaded from the same checkpoint name.
# `config` is read later for `max_position_embeddings` when checking example lengths.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL)

In [3]:
# @title Extend model

# Register "<|STK_SP|>" as the tokenizer's separator token, then resize the model's
# token embeddings to the new tokenizer length so the added token has an embedding.
num_added_tokens = tokenizer.add_special_tokens({"sep_token": "<|STK_SP|>"})
print(f"Added {num_added_tokens} tokens!")
model.resize_token_embeddings(len(tokenizer))

# TODO: confirm that reusing EOS as the padding token is intended. The training
# data is later cut into fixed-size blocks, so padding may never actually be used.
tokenizer.pad_token = tokenizer.eos_token

# Sanity check: the separator token must be the one registered above.
assert tokenizer.sep_token == "<|STK_SP|>"

Added 1 tokens!


In [4]:
# @title Load in the dataset

from datasets import load_dataset

# Local JSON-lines files, one per split. The tokenization cell reads the
# "input" and "output" fields of every record and drops the remaining columns.
data_files = {
    "train": "./dataset-r1/train.jsonl",
    "validation": "./dataset-r1/valid.jsonl",
}

# The "json" builder returns a dataset dict keyed by the split names given above.
raw_datasets = load_dataset(
    "json",
    data_files=data_files,
)

Using custom data configuration default-b39c74bc29b6f917
Found cached dataset json (C:/Users/lego-/.cache/huggingface/datasets/json/default-b39c74bc29b6f917/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# @title Tokenize the dataset
# Every record is turned into a single training string: input, then output, then
# the EOS token. All original columns are dropped so only tokenizer output remains.
tokenized_datasets = raw_datasets.map(
    lambda e: tokenizer(e["input"] + e["output"] + tokenizer.eos_token),
    #batched=True,
    #num_proc=4,
    remove_columns=["input", "output", "coder", "system", "god", "user", "ai", "topic"]
)

# Report every example whose token count exceeds the model's position limit.
for split_name in ("train", "validation"):
    split = tokenized_datasets[split_name]
    for i in range(len(split)):
        if len(split[i]["input_ids"]) > config.max_position_embeddings:
            print(f"Error in {i} of {split_name}")

# [tokenized_datasets["train"][1], tokenized_datasets["validation"][1]]
split_sizes = [len(tokenized_datasets[name]) for name in ("train", "validation")]
print("Total processed datasets sizes are ", *split_sizes)

Loading cached processed dataset at C:\Users\lego-\.cache\huggingface\datasets\json\default-b39c74bc29b6f917\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-d06df8923a2befa8.arrow
Loading cached processed dataset at C:\Users\lego-\.cache\huggingface\datasets\json\default-b39c74bc29b6f917\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-847113bf21349cf9.arrow


Total processed datasets sizes are  2755 150


In [6]:
# TODO: maybe group?

from itertools import chain

# Number of tokens per training block.
block_size = 2048
def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. ``"input_ids"``,
            ``"attention_mask"``) to a list of per-example token lists, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with the same columns, where every entry is a list of blocks of
        exactly ``block_size`` tokens, plus a ``"labels"`` column that is a copy
        of the ``"input_ids"`` blocks. Tokens that do not fill a whole block are
        dropped; a batch with fewer than ``block_size`` tokens in total yields
        empty lists for every column.
    """
    # Flatten every column into one long token stream.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Always round down to a multiple of block_size. Previously a batch shorter
    # than block_size slipped through as one short block, which breaks the
    # fixed-length assumption of the stacking collator used for training.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Causal LM training: the labels are the inputs (the shift happens in the model).
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk both splits with group_texts. batched=True hands the function a whole
# batch of examples at once so it can concatenate across example boundaries.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    # num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=False,  # always recompute rather than reuse a cached result
    desc=f"Grouping texts in chunks of {block_size}",
)

# After grouping, each row is one block rather than one original example.
print("Total LM datasets sizes are ", len(lm_datasets["train"]), len(lm_datasets["validation"]))

Grouping texts in chunks of 2048:   0%|          | 0/3 [00:00<?, ?ba/s]

Grouping texts in chunks of 2048:   0%|          | 0/1 [00:00<?, ?ba/s]

Total LM datasets sizes are  628 31


In [7]:
from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding
from transformers.trainer_pt_utils import get_parameter_names
import evaluate

import bitsandbytes as bnb
from bitsandbytes.optim import GlobalOptimManager

def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw model outputs to predicted token ids before metrics are computed.

    Args:
        logits: Either the logits tensor itself or a tuple whose first element
            is the logits tensor (extra outputs such as past_key_values may follow).
        labels: Unused; accepted because the Trainer passes it to this hook.

    Returns:
        The index of the largest logit along the last dimension.
    """
    # When the model returns several tensors, the logits always come first.
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)

# Accuracy metric shared by every evaluation pass.
metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute next-token accuracy from predicted token ids and labels.

    Args:
        eval_preds: Pair ``(predictions, labels)``. The predictions are the
            argmax token ids produced by ``preprocess_logits_for_metrics`` and
            have the same shape as the labels.

    Returns:
        Whatever ``metric.compute`` returns for the flattened, aligned arrays.
    """
    predicted_ids, label_ids = eval_preds
    # The prediction at position t is for the token at position t + 1, so drop
    # the first label and the last prediction to line the two arrays up.
    shifted_labels = label_ids[:, 1:].reshape(-1)
    shifted_predictions = predicted_ids[:, :-1].reshape(-1)
    return metric.compute(predictions=shifted_predictions, references=shifted_labels)

# Disable the generation cache for training (presumably because gradient
# checkpointing is enabled in the training arguments - TODO confirm).
model.config.use_cache = False

#data_collator_pad = DataCollatorWithPadding(tokenizer)
def data_collator(data_):
    """Collate a list of features into a batch whose labels mirror the input ids.

    Args:
        data_: List of feature dicts, as handed over by the Trainer's data loader.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors;
        ``labels`` is a fresh tensor built from the batch's ``input_ids``.
    """
    batch = default_data_collator(data_)
    token_ids = batch['input_ids']
    # Re-stacking the rows yields new tensors, so labels do not alias input_ids.
    return {
        'input_ids': torch.stack(list(token_ids)),
        'attention_mask': torch.stack(list(batch['attention_mask'])),
        'labels': torch.stack(list(token_ids)),
    }

# Settings for the Trainer. The first positional argument is the directory that
# receives checkpoints and the final model.
training_args = TrainingArguments(
    "./openchatgpt-neox-r1.1/",
    do_train=True, 
    do_eval=True,
    
    push_to_hub=False,

    # Pulled from examples
    # Evaluate once per epoch; learning rate and weight decay are left at the
    # TrainingArguments defaults because the overrides below are commented out.
    evaluation_strategy="epoch",
    #learning_rate=2e-5,
    #weight_decay=0.01,

    # Write a checkpoint every 300 optimizer steps.
    save_steps=300,

    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # Two accumulated micro-batches of size 1 give a total train batch size of 2.
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,

    fp16=True,
)

# Optimizer class used below: the 8-bit Adam implementation from bitsandbytes.
optim = bnb.optim.Adam8bit
def set_optim_to_run_embedding_in_fp32(model):
    """Register a 32-bit optimizer override for every embedding layer of ``model``.

    Walks all submodules and, for each ``torch.nn.Embedding``, registers an
    override on its ``weight`` parameter with ``{'optim_bits': 32}`` through the
    bitsandbytes ``GlobalOptimManager``. Modules of other types are untouched.

    Args:
        model: A ``torch.nn.Module`` whose submodules are inspected.
    """
    embedding_layers = (
        layer for layer in model.modules() if isinstance(layer, torch.nn.Embedding)
    )
    for layer in embedding_layers:
        GlobalOptimManager.get_instance().register_module_override(
            layer, 'weight', {'optim_bits': 32}
        )
# Register the embedding overrides before the optimizer is constructed.
set_optim_to_run_embedding_in_fp32(model)
# model.cuda()

# Names of parameters that receive weight decay: everything outside LayerNorm
# modules whose name does not contain "bias".
decay_parameters = [
    name
    for name in get_parameter_names(model, [torch.nn.LayerNorm])
    if "bias" not in name
]

# Split the parameters into the two groups in a single pass, keeping model order.
params_with_decay, params_without_decay = [], []
for param_name, param in model.named_parameters():
    if param_name in decay_parameters:
        params_with_decay.append(param)
    else:
        params_without_decay.append(param)

optimizer_grouped_parameters = [
    {"params": params_with_decay, "weight_decay": training_args.weight_decay},
    {"params": params_without_decay, "weight_decay": 0.0},
]

# Build the optimizer with the Adam hyperparameters held by training_args.
adam_bnb_optim = optim(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)

# Wire the model, the block-grouped datasets, the custom collator, the metric
# hooks and the hand-built optimizer into a single Trainer.
trainer = Trainer(
    model=model,
    # Train on the fixed-size blocks, not on the per-example tokenization.
    #train_dataset=tokenized_datasets["train"],
    #eval_dataset=tokenized_datasets["validation"],
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    tokenizer=tokenizer,

    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,

    # Earlier inline version of the collator, kept for reference.
    # data_collator=lambda data: {'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),
    #     'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in data]),
    #     'labels': torch.stack([torch.tensor(f['input_ids']) for f in data])},

    args=training_args,

    # Custom optimizer only; None in the scheduler slot leaves the learning-rate
    # scheduler to the Trainer.
    optimizers=(adam_bnb_optim, None),
)

Using magick windows DLL!
CUDA SETUP: Loading binary d:\projects\python\distilchatgpt2\venv\lib\site-packages\bitsandbytes\libbitsandbytes_cudaall.dll...


Using cuda_amp half precision backend


In [8]:
# @title Get last model checkpoint if any...

import os

from transformers.trainer_utils import get_last_checkpoint

# Look in the directory the Trainer actually writes to, so this cannot drift
# from the training arguments.
checkpoint_dir = training_args.output_dir

# get_last_checkpoint lists the directory and fails if it does not exist yet
# (first run on a clean machine), so only call it when the directory is there.
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
if last_checkpoint is None:
    print("No last checkpoint detected!")

No last checkpoint detected!


In [9]:
# Run training, passing along the checkpoint path found by the previous cell
# (None when no checkpoint was detected).
trainer.train(resume_from_checkpoint=last_checkpoint)

***** Running training *****
  Num examples = 628
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 942
  Number of trainable parameters = 162283008


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.881487,0.7871
2,0.811800,0.871694,0.791922
3,0.811800,0.896573,0.792001


Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-300
Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-300\config.json
Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-300\pytorch_model.bin
tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-300\tokenizer_config.json
Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-300\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 31
  Batch size = 1
Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-600
Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-600\config.json
Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-600\pytorch_model.bin
tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-600\tokenizer_config.json
Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-600\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 31
  Batch size = 1
Saving model checkpoint to ./openchatgpt-neox-r1.1/check

TrainOutput(global_step=942, training_loss=0.6499279856428726, metrics={'train_runtime': 5481.9853, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.172, 'total_flos': 2863022229946368.0, 'train_loss': 0.6499279856428726, 'epoch': 3.0})

In [10]:
import math

# Perplexity is exp(mean evaluation loss).
eval_results = trainer.evaluate()
try:
    perplexity = math.exp(eval_results['eval_loss'])
except OverflowError:
    # A diverged run can produce a loss too large for exp(); report infinity
    # instead of crashing the cell.
    perplexity = float("inf")
print(f"Perplexity: {perplexity:.2f}")

***** Running Evaluation *****
  Num examples = 31
  Batch size = 1


Perplexity: 2.45


In [11]:
# Persist the results of the run: trainer state, a model card that records the
# base checkpoint and dataset name, and finally the model itself.
trainer.save_state()
trainer.create_model_card(tasks="text-generation", finetuned_from=MODEL, dataset="openchatgpt safe-r1")
trainer.save_model()

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7920008824873537}]}
Saving model checkpoint to ./openchatgpt-neox-r1.1/
Configuration saved in ./openchatgpt-neox-r1.1/config.json
Model weights saved in ./openchatgpt-neox-r1.1/pytorch_model.bin
tokenizer config file saved in ./openchatgpt-neox-r1.1/tokenizer_config.json
Special tokens file saved in ./openchatgpt-neox-r1.1/special_tokens_map.json
