Initial commit

Browse files

Files changed (13) hide show

README.md +28 -0
backup/do_backup.py +21 -0
backup/step-1-10300.bin +3 -0
config.json +22 -0
demo.py +15 -0
generation_config.json +7 -0
pytorch_model.bin +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +31 -0
train.ipynb +777 -0
valid.py +52 -0

README.md CHANGED Viewed

@@ -1,3 +1,31 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
 ---
+This is a first version of recreating roneneldan/TinyStories-1M but using Llama architecture.
+* Full training process is included in the notebook train.ipynb. Recreating it as simple as downloading
+TinyStoriesV2-GPT4-train.txt and TinyStoriesV2-GPT4-valid.txt in the same folder with the notebook and running
+the cells. Validation content is not used by the script so you put anythin in
+* Backup directory has a script do_backup that I used to copy weights from remote machine to local.
+Weight are generated too quickly, so by the time script copied weihgt N+1
+* This is extremely PoC version. Training truncates stories that are longer than context size and doesn't use
+any sliding window to train story not from the start
+* Training took approximately 9 hours (3 hours per epoch) on 40GB A100. ~30GB VRAM was used
+* I use tokenizer from open_llama_3b. However I had troubles with it locally(https://github.com/openlm-research/open_llama/issues/69).
+I had no troubles on the cloud machine with preninstalled libraries.
+* Demo script is demo.py
+* Validation script is provided: valid.py. use it like `python valid.py path/to/TinyStoriesV2-GPT4-valid.txt [optional-model-id-or-path]`:
+After training I decided that it's not necessary to beat validation into chunks
+* Also this version uses very stupid caching mechinsm to shuffle stories for training: it keeps cache of N recently loaded chunks
+so if random shuffle asks for a story, it may use cache or load chunk.
+Training dataset is too small, so in next versions I will get rid of it.
+from transformers import AutoModelForCausalLM, AutoTokenizer

backup/do_backup.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import shutil
+from pathlib import Path
+import time
+copied = []
+while True:
+    existing = [x.name for x in Path(".").glob("*.bin")]
+    copy_from = [x for x in Path("/home/fella/mnt/selectel/tiny-llama/").glob("*.bin")]
+    for file in copy_from:
+        if file.name not in existing:
+            print(file)
+            try:
+                shutil.copy(file, file.name)
+                copied.append(file.name)
+                if len(copied) > 6:
+                    delete_me = copied.pop(0)
+                    Path(delete_me).unlink()
+            except Exception as e:
+                print(f"Skipping {file.name}: {e}")
+                pass
+    time.sleep(15)

backup/step-1-10300.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6ec1c440a393eb0cce4c215de321efa232fb77a549cc85bd98e64a635ca28d9
+size 9275989

config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 64,
+  "initializer_range": 0.02,
+  "intermediate_size": 256,
+  "max_position_embeddings": 2048,
+  "model_type": "llama",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 8,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-06,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.30.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}

demo.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from transformers import AutoModelForCausalLM, AutoTokenizer
+import sys
+import os
+model_id = os.getcwd()
+if len(sys.argv) > 1:
+    model_id = sys.argv[1]
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id).cuda().bfloat16()
+prompt = "Lily picked up a flower."
+inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False).to('cuda')
+out = model.generate(**inputs, max_new_tokens=80).ravel()
+out = tokenizer.decode(out)
+print(out)

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.30.2"
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae959aaff509d66f9dd85c53f16481463286950a21e0349c7793f8412fc4a094
+size 9269994

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab1b681ec7fc02fed5edd3026687d7a692a918c4dd8e150ca2e3994a6229843b
+size 534194

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 2048,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

train.ipynb ADDED Viewed

	@@ -0,0 +1,777 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f41486ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "NVIDIA A100-PCIE-40GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# step 0. Preliminary\n",
+    "import torch\n",
+    "# check that cuda doesn't crash on us\n",
+    "print(torch.cuda.get_device_name())\n",
+    "# check that transformers installed\n",
+    "import transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ffd19cfb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "EPOCHS=3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3a91ef1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1. Preparing the training\n",
+    "# First ensure that required files are here\n",
+    "from pathlib import Path\n",
+    "assert Path(\"TinyStoriesV2-GPT4-train.txt\").exists()\n",
+    "assert Path(\"TinyStoriesV2-GPT4-valid.txt\").exists()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "56b046d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Then prepare directories\n",
+    "Path(\"chunks.txt/train\").mkdir(parents=True, exist_ok=True)\n",
+    "Path(\"chunks.tensors/train\").mkdir(parents=True, exist_ok=True)\n",
+    "Path(\"chunks.txt/valid\").mkdir(parents=True, exist_ok=True)\n",
+    "Path(\"chunks.tensors/valid\").mkdir(parents=True, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "1bddb2ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Then prepare method to split one text to several\n",
+    "from multiprocessing.pool import Pool\n",
+    "from tqdm.contrib.concurrent import process_map\n",
+    "import os\n",
+    "_chunk_me = None\n",
+    "def extract_chunk(chunk):\n",
+    "    split, i, chunk_from, chunk_to = chunk\n",
+    "    chunk = _chunk_me[chunk_from:chunk_to].strip()    \n",
+    "    name = f\"chunks.txt/{split}/chunk-{i+1}.txt\"\n",
+    "    with open(name, \"w\") as f:\n",
+    "        f.write(chunk)\n",
+    "    return name\n",
+    "\n",
+    "def split_to_text_chunks(split:str, chunk_size = 16*1024*1024, max_workers=None):\n",
+    "    global _chunk_me #text is too chunky to pass as argument. storing as global so fork() can take care of it\n",
+    "    print(f\"reading {split}\")\n",
+    "    text = _chunk_me = Path(f\"./TinyStoriesV2-GPT4-{split}.txt\").read_text()\n",
+    "    offsets = []    \n",
+    "    delimiter = \"<|endoftext|>\"\n",
+    "    i=0\n",
+    "    while i < len(text):    \n",
+    "        offsets.append(i)\n",
+    "        i += chunk_size\n",
+    "        i = text.find(delimiter, i)\n",
+    "        if i < 0:\n",
+    "            break\n",
+    "        i += len(delimiter)\n",
+    "    offsets.append(len(text))\n",
+    "    chunks = [(split, i, start,end) for (i, (start, end)) in enumerate(zip(offsets[:-1], offsets[1:]))]\n",
+    "    \n",
+    "    print(\"writing\")\n",
+    "    process_map(extract_chunk, chunks, max_workers=max_workers)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e60017ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Assuming split has finished already\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Prepare text of train split\n",
+    "if not Path(\"chunks.txt/train/chunk-133.txt\").exists():\n",
+    "    split_to_text_chunks(\"train\")\n",
+    "else:\n",
+    "    print(\"Assuming split has finished already\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "e9b7effe",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Assuming split has finished already\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Prepare text of valid split\n",
+    "if not Path(\"chunks.txt/valid/chunk-2.txt\").exists():\n",
+    "    split_to_text_chunks(\"valid\")    \n",
+    "else:\n",
+    "    print(\"Assuming split has finished already\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b4706f24",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2. Prepare OpenLLAMA tokenizer. \n",
+    "#Needed to be done once(TODO: add code to load tokenizer?)\n",
+    "from transformers import AutoTokenizer\n",
+    "import os\n",
+    "if not Path('tokenizer.json').exists():    \n",
+    "    try:\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(\"openlm-research/open_llama_3b\")\n",
+    "    except:\n",
+    "        os.environ[\"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION\"]=\"python\"    \n",
+    "        tokenizer = AutoTokenizer.from_pretrained(\"openlm-research/open_llama_3b\")\n",
+    "        tokenizer.save_pretrained(\".\")\n",
+    "        del os.environ[\"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION\"]\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\".\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "f9c935b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 3. Preparing to tokenize each text chunk\n",
+    "from tqdm.contrib.concurrent import process_map\n",
+    "def tokenize_file(filename:Path):\n",
+    "    text = Path.read_text(filename)\n",
+    "    stories = text.split(\"<|endoftext|>\")\n",
+    "    result = []\n",
+    "    while stories:\n",
+    "        story = stories.pop(0).strip()\n",
+    "        tokenized = tokenizer(story, max_length=None).input_ids\n",
+    "        tokenized.append(tokenizer.eos_token_id)\n",
+    "        result.append(torch.tensor(tokenized))\n",
+    "    output_name = str(filename).replace(\".txt\", \".tensors\")\n",
+    "    torch.save(result, output_name)\n",
+    "\n",
+    "def tokenize_split(split, max_workers=None):\n",
+    "    to_process = list(Path(f\"chunks.txt/{split}\").glob(\"*\"))    \n",
+    "    process_map(tokenize_file, to_process, max_workers=max_workers)\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "95257f12",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Assuming train was tokenized already\n"
+     ]
+    }
+   ],
+   "source": [
+    "# processing train(this can take several minutes)\n",
+    "if not Path(\"chunks.tensors/train/chunk-133.tensors\").exists():\n",
+    "    tokenize_split(\"train\")\n",
+    "else:\n",
+    "    print(\"Assuming train was tokenized already\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "bbbe4599",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Assuming valid was tokenized already\n"
+     ]
+    }
+   ],
+   "source": [
+    "# processing valid(this can take one minutes)\n",
+    "if not Path(\"chunks.tensors/valid/chunk-2.tensors\").exists():\n",
+    "    tokenize_split(\"valid\")\n",
+    "else:\n",
+    "    print(\"Assuming valid was tokenized already\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "a31a4aa7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Resetting [PAD] to [EOS]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step 4. Training. \n",
+    "# Step 4.1 Preparing tokenizer and setting pad token if it is not set(it is not set)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\".\")\n",
+    "if not tokenizer.pad_token_id:\n",
+    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "    print(\"Resetting [PAD] to [EOS]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "f677c9c0",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Step 4.2. Preparing model\n",
+    "from transformers.models.llama.modeling_llama import LlamaConfig, LlamaForCausalLM\n",
+    "\n",
+    "tiny_llama = LlamaConfig(\n",
+    "    hidden_size=64, \n",
+    "    vocab_size=tokenizer.vocab_size,\n",
+    "    intermediate_size=256, \n",
+    "    num_attention_heads=16, \n",
+    "    num_hidden_layers=8)\n",
+    "\n",
+    "torch.manual_seed(11010)\n",
+    "torch.cuda.manual_seed(11010)\n",
+    "model = LlamaForCausalLM(tiny_llama).cuda().bfloat16()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "aad9620b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import functools\n",
+    "import torch.nn.functional as F\n",
+    "from tqdm.contrib.concurrent import process_map\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "# Step 4.3 Preparing dataset class\n",
+    "def get_file_data_len(filename):\n",
+    "    data = torch.load(filename)\n",
+    "    return (filename, len(data))\n",
+    "from datasets import Dataset\n",
+    "\n",
+    "CACHE_SIZE = 2000 # There are ~150 train splits. We can fit them in memory, so let's do it\n",
+    "\n",
+    "class TinyDataset(Dataset):\n",
+    "    def __init__(self, split: str, populate_cache=True):\n",
+    "        print(f\"Reading dataset {split} data\")\n",
+    "        self.file_lens = process_map(\n",
+    "            get_file_data_len,\n",
+    "            list(Path(f\"chunks.tensors/{split}\").glob(\"*\")))\n",
+    "        self.file_lens.sort()\n",
+    "        if populate_cache:\n",
+    "            print(\"Populating a cache\")\n",
+    "            for filename, _ in tqdm(self.file_lens):\n",
+    "                self.load_tensor_file(filename)\n",
+    "\n",
+    "    @functools.lru_cache(maxsize=CACHE_SIZE)\n",
+    "    def load_tensor_file(self, filename):\n",
+    "        return torch.load(filename)\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return sum(x[1] for x in self.file_lens)\n",
+    "\n",
+    "    def global_index_to_local(self, i):\n",
+    "        for (file, length) in self.file_lens:\n",
+    "            if i < length:\n",
+    "                return (file, i)\n",
+    "            i -= length\n",
+    "        raise IndexError(f\"{i} is out-of-bonds, have {len(self)} sample\")\n",
+    "\n",
+    "    def __getitem__(self, index):\n",
+    "        if torch.is_tensor(index):\n",
+    "            index = index.tolist()\n",
+    "        if isinstance(index, int):\n",
+    "            filename, local_index = self.global_index_to_local(index)\n",
+    "            tensors = self.load_tensor_file(filename)\n",
+    "            return {\n",
+    "                'input_ids': tensors[local_index]\n",
+    "            }\n",
+    "        if isinstance(index, list):\n",
+    "            data = []\n",
+    "            indices = index\n",
+    "            for index in indices:\n",
+    "                filename, local_index = self.global_index_to_local(index)\n",
+    "                tensors = self.load_tensor_file(filename)\n",
+    "                data.append(tensors[local_index])\n",
+    "\n",
+    "            return {'input_ids': data}\n",
+    "\n",
+    "        raise TypeError(f'Invaldi index type {type(index)}')\n",
+    "        \n",
+    "def batch_collate(data: list[torch.Tensor]):\n",
+    "    max_len = max(len(datum[\"input_ids\"]) for datum in data)\n",
+    "    inputs = []\n",
+    "    attentions = []\n",
+    "    for row in data:\n",
+    "        input_ids = row[\"input_ids\"]\n",
+    "        attention_mask = torch.ones_like(input_ids)\n",
+    "        attention_mask[-1] = 0  # don't care about EOS\n",
+    "        # Manual padding\n",
+    "        to_pad = max_len - len(input_ids)\n",
+    "        is_left_pad = tokenizer.padding_side == \"left\"\n",
+    "        padding = (is_left_pad * to_pad, (1 - is_left_pad) * to_pad)\n",
+    "        input_ids = F.pad(input_ids, padding, value=tokenizer.pad_token_id)\n",
+    "        attention_mask = F.pad(attention_mask, padding, value=0)\n",
+    "        inputs.append(input_ids)\n",
+    "        attentions.append(attention_mask)\n",
+    "\n",
+    "    attention_masks = torch.stack(attentions)\n",
+    "    input_ids = torch.stack(inputs)\n",
+    "    labels = input_ids.clone()\n",
+    "\n",
+    "    # disable prediction of the padding\n",
+    "    labels[attention_masks == 0] = -100\n",
+    "    # enable prediction of an actual EOS\n",
+    "    labels[:, -1] = tokenizer.eos_token_id\n",
+    "\n",
+    "    return {\n",
+    "        'input_ids': input_ids,\n",
+    "        'attention_mask': attention_masks,\n",
+    "        'labels': labels\n",
+    "    }\n",
+    "\n",
+    "def get_max_story_length(ds):    \n",
+    "    return max(file_len[1] for file_len in ds.file_lens)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "2e828afe",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading dataset train data\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8ca542afc1694073af6dcf9ce5f7e13a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/133 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Populating a cache\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8035a75107e84a54870a8c6f15c4100a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/133 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "AssertionError",
+     "evalue": "WARNIING: split long stories",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[17], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m tokenizer\u001b[38;5;241m.\u001b[39mpadding_side \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mleft\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mright\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m      2\u001b[0m train_ds \u001b[38;5;241m=\u001b[39m TinyDataset(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m get_max_story_length(train_ds) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mmodel_max_length, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWARNIING: split long stories\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
+      "\u001b[0;31mAssertionError\u001b[0m: WARNIING: split long stories"
+     ]
+    }
+   ],
+   "source": [
+    "assert tokenizer.padding_side in [\"left\", \"right\"]\n",
+    "train_ds = TinyDataset(\"train\")\n",
+    "assert get_max_story_length(train_ds) <= tokenizer.model_max_length, \"WARNIING: split long stories\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "6412e7c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import DataLoader\n",
+    "torch.manual_seed(11010)\n",
+    "torch.cuda.manual_seed(11010)\n",
+    "train_dl = DataLoader(train_ds, 16, True, collate_fn=batch_collate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "f3ff5a66",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mggg4\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.15.5"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/home/mayk/tiny-llama/wandb/run-20230707_181234-rilt4m6f</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/ggg4/training-tiny-llama-preview/runs/rilt4m6f' target=\"_blank\">grateful-jazz-4</a></strong> to <a href='https://wandb.ai/ggg4/training-tiny-llama-preview' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/ggg4/training-tiny-llama-preview' target=\"_blank\">https://wandb.ai/ggg4/training-tiny-llama-preview</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/ggg4/training-tiny-llama-preview/runs/rilt4m6f' target=\"_blank\">https://wandb.ai/ggg4/training-tiny-llama-preview/runs/rilt4m6f</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/ggg4/training-tiny-llama-preview/runs/rilt4m6f?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
+      ],
+      "text/plain": [
+       "<wandb.sdk.wandb_run.Run at 0x7f6af8170b50>"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# prepare wandb\n",
+    "import wandb\n",
+    "wandb.init(\n",
+    "    project=\"training-tiny-llama-preview\",\n",
+    "    config={\n",
+    "    \"architecture\": \"llama\",\n",
+    "    \"dataset\": \"tiny-stories\",\n",
+    "    \"epochs\": EPOCHS,\n",
+    "    }   \n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aed7b7a4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "166a4a27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tqdm.auto import tqdm\n",
+    "def save_imm(epoch, step, saved=[]):\n",
+    "    fname = f\"step-{epoch}-{step}.bin\"\n",
+    "    torch.save(model.state_dict(), f\"step-{epoch}-{step}.bin\")\n",
+    "    saved.append(fname)\n",
+    "    if len(saved) > 5:\n",
+    "        delete_me = saved.pop(0)\n",
+    "        Path(delete_me).unlink(missing_ok=True)\n",
+    "\n",
+    "def epoch_step(epoch, opt):\n",
+    "    for i, batch in enumerate(bar := tqdm(train_dl)):\n",
+    "        for k in batch:\n",
+    "            batch[k] = batch[k].to(device=model.lm_head.weight.device)\n",
+    "        \n",
+    "        n_batch, n_seq = batch[\"input_ids\"].shape\n",
+    "        if n_seq > tokenizer.model_max_length:\n",
+    "            assert tokenizer.padding_side == \"right\", \"Left-pad truncation only supported[as model should not see >2k token anyway]\"\n",
+    "            batch[\"input_ids\"] = batch[\"input_ids\"][:, -tokenizer.model_max_length]\n",
+    "            batch[\"labels\"] = batch[\"labels\"][:, -tokenizer.model_max_length]\n",
+    "            batch[\"attention_mask\"] = batch[\"attention_mask\"][:, -tokenizer.model_max_length]\n",
+    "            \n",
+    "        \n",
+    "        loss = model(**batch).loss\n",
+    "        loss.backward()\n",
+    "        opt.step()\n",
+    "        opt.zero_grad()\n",
+    "        bar.set_description(f'L:{loss.item():.4f}')\n",
+    "        wandb.log({\"loss\": loss.item()})\n",
+    "        if (i+1) % 100 == 0:\n",
+    "            save_imm(epoch, i+1)\n",
+    "        \n",
+    "    torch.save(model.state_dict(), f\"epoch-{epoch}.bin\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "ec4943c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "opt = torch.optim.AdamW(model.parameters(), fused=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "daae9020",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f7ab6fe3b99546f49acb0d43888b7ceb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/169865 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "for e in range(EPOCHS):\n",
+    "    epoch_step(e+1, opt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "87988cf5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "               total        used        free      shared  buff/cache   available\r\n",
+      "Mem:            85Gi       1.5Gi        72Gi       8.0Mi        11Gi        83Gi\r\n",
+      "Swap:             0B          0B          0B\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!free -h"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "e43eb9f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fri Jul  7 17:44:05 2023       \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |\n",
+      "|-------------------------------+----------------------+----------------------+\n",
+      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                               |                      |               MIG M. |\n",
+      "|===============================+======================+======================|\n",
+      "|   0  NVIDIA A100-PCI...  On   | 00000000:05:00.0 Off |                    0 |\n",
+      "| N/A   30C    P0    34W / 250W |   5739MiB / 40960MiB |      0%      Default |\n",
+      "|                               |                      |             Disabled |\n",
+      "+-------------------------------+----------------------+----------------------+\n",
+      "                                                                               \n",
+      "+-----------------------------------------------------------------------------+\n",
+      "| Processes:                                                                  |\n",
+      "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
+      "|        ID   ID                                                   Usage      |\n",
+      "|=============================================================================|\n",
+      "|    0   N/A  N/A     13768      C   /opt/conda/bin/python            5736MiB |\n",
+      "+-----------------------------------------------------------------------------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "0351f57f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Parameter containing:\n",
+       "tensor([[ 8.3618e-03,  3.8330e-02, -5.9204e-03,  ...,  2.0752e-02,\n",
+       "          4.4861e-03,  1.2512e-02],\n",
+       "        [ 3.9978e-03,  2.1118e-02, -3.5645e-02,  ..., -1.6846e-02,\n",
+       "          5.0659e-03, -3.8818e-02],\n",
+       "        [-1.6928e-05, -1.2756e-02, -1.1536e-02,  ..., -1.6235e-02,\n",
+       "          4.8218e-03, -1.4099e-02],\n",
+       "        ...,\n",
+       "        [-9.8267e-03, -6.8665e-03,  1.0864e-02,  ..., -1.0864e-02,\n",
+       "         -2.4170e-02, -5.6076e-04],\n",
+       "        [-9.5749e-04,  7.3853e-03,  4.9438e-03,  ...,  1.2390e-02,\n",
+       "         -2.1606e-02, -9.2163e-03],\n",
+       "        [ 5.1758e-02,  2.1484e-02, -1.5381e-02,  ..., -2.4292e-02,\n",
+       "         -3.4912e-02,  3.0823e-03]], device='cuda:0', dtype=torch.bfloat16,\n",
+       "       requires_grad=True)"
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ace72db5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

valid.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import sys
+import os
+from pathlib import Path
+from tqdm.auto import tqdm
+model_id = os.getcwd()
+if len(sys.argv) == 2:
+    filename = sys.argv[1]
+elif len(sys.argv) == 3:
+    filename = sys.argv[1]
+    model_id = sys.argv[2]
+else:
+    raise Exception("use valid.py <path-to-text> [model-id]")
+text = Path(filename).read_text()
+stories = text.split("<|endoftext|>")
+print(len(stories))
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id).cuda().bfloat16()
+ctx_size = tokenizer.model_max_length
+sliding_window = ctx_size // 2
+total_loss = 0.0
+measurements = 0
+model.eval()
+for story in (bar := tqdm(stories)):
+    story = story.strip()
+    tokens = tokenizer(story, add_special_tokens=False).input_ids + [tokenizer.eos_token_id]
+    i = 0
+    while i < len(tokens):
+        current_window = tokens[i:i+ctx_size-1]
+        part_tokens = [tokenizer.bos_token_id] + current_window
+        input_ids = torch.tensor(part_tokens, device="cuda")[None]
+        labels = input_ids.clone()
+        if i:
+            # disable seen tokens
+            labels[:, :-sliding_window] = -100
+        with torch.no_grad():
+            loss = model(input_ids, labels=labels).loss
+            total_loss += loss.item()
+            measurements += 1
+        i += len(current_window)
+        bar.set_description(f"L {total_loss/measurements:.4f}")
+print(f"FINAL LOSS: {total_loss/measurements:.4f}")