barc0
/

google_cloude_test_20k_transduction-gpt4omini_lr1e-5_epoch2_seed25

+---
+library_name: transformers
+license: llama3.1
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+tags:
+- trl
+- sft
+- generated_from_trainer
+model-index:
+- name: google_cloude_test_20k_transduction-gpt4omini_lr1e-5_epoch2_seed25
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# google_cloude_test_20k_transduction-gpt4omini_lr1e-5_epoch2_seed25
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.0608
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 8
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 128
+- total_eval_batch_size: 32
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 2
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.0683        | 0.9967 | 150  | 0.0725          |
+| 0.0482        | 1.9934 | 300  | 0.0608          |
+### Framework versions
+- Transformers 4.45.0.dev0
+- Pytorch 2.4.0+cu121
+- Datasets 3.0.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.9933554817275747,
+    "total_flos": 1.1631033437569024e+17,
+    "train_loss": 0.07701069086790086,
+    "train_runtime": 4589.0959,
+    "train_samples": 19216,
+    "train_samples_per_second": 8.375,
+    "train_steps_per_second": 0.065
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.45.0.dev0"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.9933554817275747,
+    "total_flos": 1.1631033437569024e+17,
+    "train_loss": 0.07701069086790086,
+    "train_runtime": 4589.0959,
+    "train_samples": 19216,
+    "train_samples_per_second": 8.375,
+    "train_steps_per_second": 0.065
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2158 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9933554817275747,
+  "eval_steps": 500,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006644518272425249,
+      "grad_norm": 10.145524315941286,
+      "learning_rate": 3.3333333333333335e-07,
+      "loss": 0.2861,
+      "step": 1
+    },
+    {
+      "epoch": 0.013289036544850499,
+      "grad_norm": 8.915211114203709,
+      "learning_rate": 6.666666666666667e-07,
+      "loss": 0.2616,
+      "step": 2
+    },
+    {
+      "epoch": 0.019933554817275746,
+      "grad_norm": 8.777900140983958,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.2835,
+      "step": 3
+    },
+    {
+      "epoch": 0.026578073089700997,
+      "grad_norm": 8.285488875547399,
+      "learning_rate": 1.3333333333333334e-06,
+      "loss": 0.2231,
+      "step": 4
+    },
+    {
+      "epoch": 0.03322259136212625,
+      "grad_norm": 7.810022519562283,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 0.2467,
+      "step": 5
+    },
+    {
+      "epoch": 0.03986710963455149,
+      "grad_norm": 6.560410496844325,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.241,
+      "step": 6
+    },
+    {
+      "epoch": 0.046511627906976744,
+      "grad_norm": 4.7170552839011775,
+      "learning_rate": 2.3333333333333336e-06,
+      "loss": 0.2127,
+      "step": 7
+    },
+    {
+      "epoch": 0.053156146179401995,
+      "grad_norm": 4.183129818287662,
+      "learning_rate": 2.666666666666667e-06,
+      "loss": 0.2308,
+      "step": 8
+    },
+    {
+      "epoch": 0.059800664451827246,
+      "grad_norm": 3.1610581594177942,
+      "learning_rate": 3e-06,
+      "loss": 0.1254,
+      "step": 9
+    },
+    {
+      "epoch": 0.0664451827242525,
+      "grad_norm": 2.538388610366679,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 0.178,
+      "step": 10
+    },
+    {
+      "epoch": 0.07308970099667775,
+      "grad_norm": 1.6417414573531897,
+      "learning_rate": 3.6666666666666666e-06,
+      "loss": 0.1305,
+      "step": 11
+    },
+    {
+      "epoch": 0.07973421926910298,
+      "grad_norm": 5.9298971216598115,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 0.152,
+      "step": 12
+    },
+    {
+      "epoch": 0.08637873754152824,
+      "grad_norm": 3.0446940552354858,
+      "learning_rate": 4.333333333333334e-06,
+      "loss": 0.1014,
+      "step": 13
+    },
+    {
+      "epoch": 0.09302325581395349,
+      "grad_norm": 2.2106089020081106,
+      "learning_rate": 4.666666666666667e-06,
+      "loss": 0.1222,
+      "step": 14
+    },
+    {
+      "epoch": 0.09966777408637874,
+      "grad_norm": 2.547231850213915,
+      "learning_rate": 5e-06,
+      "loss": 0.1536,
+      "step": 15
+    },
+    {
+      "epoch": 0.10631229235880399,
+      "grad_norm": 1.9791098486011387,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 0.1499,
+      "step": 16
+    },
+    {
+      "epoch": 0.11295681063122924,
+      "grad_norm": 1.5927564141097508,
+      "learning_rate": 5.666666666666667e-06,
+      "loss": 0.0975,
+      "step": 17
+    },
+    {
+      "epoch": 0.11960132890365449,
+      "grad_norm": 1.242283265980379,
+      "learning_rate": 6e-06,
+      "loss": 0.0974,
+      "step": 18
+    },
+    {
+      "epoch": 0.12624584717607973,
+      "grad_norm": 2.0819749599134756,
+      "learning_rate": 6.333333333333333e-06,
+      "loss": 0.105,
+      "step": 19
+    },
+    {
+      "epoch": 0.132890365448505,
+      "grad_norm": 3.1481612101400835,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 0.1413,
+      "step": 20
+    },
+    {
+      "epoch": 0.13953488372093023,
+      "grad_norm": 1.329108390047055,
+      "learning_rate": 7e-06,
+      "loss": 0.082,
+      "step": 21
+    },
+    {
+      "epoch": 0.1461794019933555,
+      "grad_norm": 2.205626574560857,
+      "learning_rate": 7.333333333333333e-06,
+      "loss": 0.098,
+      "step": 22
+    },
+    {
+      "epoch": 0.15282392026578073,
+      "grad_norm": 1.323315317507035,
+      "learning_rate": 7.666666666666667e-06,
+      "loss": 0.0828,
+      "step": 23
+    },
+    {
+      "epoch": 0.15946843853820597,
+      "grad_norm": 1.47807340385477,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 0.1228,
+      "step": 24
+    },
+    {
+      "epoch": 0.16611295681063123,
+      "grad_norm": 1.4244378567157914,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.093,
+      "step": 25
+    },
+    {
+      "epoch": 0.17275747508305647,
+      "grad_norm": 1.6965216293403562,
+      "learning_rate": 8.666666666666668e-06,
+      "loss": 0.1138,
+      "step": 26
+    },
+    {
+      "epoch": 0.17940199335548174,
+      "grad_norm": 2.4560093523672966,
+      "learning_rate": 9e-06,
+      "loss": 0.1197,
+      "step": 27
+    },
+    {
+      "epoch": 0.18604651162790697,
+      "grad_norm": 1.920641163538201,
+      "learning_rate": 9.333333333333334e-06,
+      "loss": 0.1118,
+      "step": 28
+    },
+    {
+      "epoch": 0.19269102990033224,
+      "grad_norm": 2.4139931857560577,
+      "learning_rate": 9.666666666666667e-06,
+      "loss": 0.1143,
+      "step": 29
+    },
+    {
+      "epoch": 0.19933554817275748,
+      "grad_norm": 1.212761719437112,
+      "learning_rate": 1e-05,
+      "loss": 0.1061,
+      "step": 30
+    },
+    {
+      "epoch": 0.2059800664451827,
+      "grad_norm": 2.647502675755485,
+      "learning_rate": 9.999661540018812e-06,
+      "loss": 0.115,
+      "step": 31
+    },
+    {
+      "epoch": 0.21262458471760798,
+      "grad_norm": 1.446671287082578,
+      "learning_rate": 9.99864620589731e-06,
+      "loss": 0.1063,
+      "step": 32
+    },
+    {
+      "epoch": 0.21926910299003322,
+      "grad_norm": 1.1989767107611067,
+      "learning_rate": 9.99695413509548e-06,
+      "loss": 0.0783,
+      "step": 33
+    },
+    {
+      "epoch": 0.22591362126245848,
+      "grad_norm": 1.5133023884865615,
+      "learning_rate": 9.994585556692624e-06,
+      "loss": 0.0914,
+      "step": 34
+    },
+    {
+      "epoch": 0.23255813953488372,
+      "grad_norm": 1.4282681068359147,
+      "learning_rate": 9.991540791356342e-06,
+      "loss": 0.1023,
+      "step": 35
+    },
+    {
+      "epoch": 0.23920265780730898,
+      "grad_norm": 1.5675540747452452,
+      "learning_rate": 9.987820251299121e-06,
+      "loss": 0.0737,
+      "step": 36
+    },
+    {
+      "epoch": 0.24584717607973422,
+      "grad_norm": 1.4699450317353462,
+      "learning_rate": 9.98342444022253e-06,
+      "loss": 0.1156,
+      "step": 37
+    },
+    {
+      "epoch": 0.25249169435215946,
+      "grad_norm": 2.2211547112198873,
+      "learning_rate": 9.978353953249023e-06,
+      "loss": 0.1114,
+      "step": 38
+    },
+    {
+      "epoch": 0.2591362126245847,
+      "grad_norm": 1.1602597727805404,
+      "learning_rate": 9.972609476841368e-06,
+      "loss": 0.1032,
+      "step": 39
+    },
+    {
+      "epoch": 0.26578073089701,
+      "grad_norm": 0.9637077068719486,
+      "learning_rate": 9.966191788709716e-06,
+      "loss": 0.0681,
+      "step": 40
+    },
+    {
+      "epoch": 0.2724252491694352,
+      "grad_norm": 1.6293839524249614,
+      "learning_rate": 9.959101757706308e-06,
+      "loss": 0.115,
+      "step": 41
+    },
+    {
+      "epoch": 0.27906976744186046,
+      "grad_norm": 0.9732726248145027,
+      "learning_rate": 9.951340343707852e-06,
+      "loss": 0.0975,
+      "step": 42
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 1.9370817624388006,
+      "learning_rate": 9.942908597485558e-06,
+      "loss": 0.1148,
+      "step": 43
+    },
+    {
+      "epoch": 0.292358803986711,
+      "grad_norm": 1.6614214400661733,
+      "learning_rate": 9.933807660562898e-06,
+      "loss": 0.1163,
+      "step": 44
+    },
+    {
+      "epoch": 0.29900332225913623,
+      "grad_norm": 1.1978664226300189,
+      "learning_rate": 9.924038765061042e-06,
+      "loss": 0.0984,
+      "step": 45
+    },
+    {
+      "epoch": 0.30564784053156147,
+      "grad_norm": 1.3623012344086183,
+      "learning_rate": 9.913603233532067e-06,
+      "loss": 0.0993,
+      "step": 46
+    },
+    {
+      "epoch": 0.3122923588039867,
+      "grad_norm": 0.987057722774455,
+      "learning_rate": 9.902502478779897e-06,
+      "loss": 0.0554,
+      "step": 47
+    },
+    {
+      "epoch": 0.31893687707641194,
+      "grad_norm": 1.748369907296719,
+      "learning_rate": 9.890738003669029e-06,
+      "loss": 0.1239,
+      "step": 48
+    },
+    {
+      "epoch": 0.32558139534883723,
+      "grad_norm": 2.491434672974267,
+      "learning_rate": 9.878311400921072e-06,
+      "loss": 0.1029,
+      "step": 49
+    },
+    {
+      "epoch": 0.33222591362126247,
+      "grad_norm": 1.4006160903691325,
+      "learning_rate": 9.86522435289912e-06,
+      "loss": 0.0996,
+      "step": 50
+    },
+    {
+      "epoch": 0.3388704318936877,
+      "grad_norm": 2.2756678251100726,
+      "learning_rate": 9.851478631379982e-06,
+      "loss": 0.1372,
+      "step": 51
+    },
+    {
+      "epoch": 0.34551495016611294,
+      "grad_norm": 1.3072647570291376,
+      "learning_rate": 9.83707609731432e-06,
+      "loss": 0.0867,
+      "step": 52
+    },
+    {
+      "epoch": 0.3521594684385382,
+      "grad_norm": 1.8929458494674647,
+      "learning_rate": 9.822018700574696e-06,
+      "loss": 0.131,
+      "step": 53
+    },
+    {
+      "epoch": 0.3588039867109635,
+      "grad_norm": 1.3900822961021344,
+      "learning_rate": 9.806308479691595e-06,
+      "loss": 0.0702,
+      "step": 54
+    },
+    {
+      "epoch": 0.3654485049833887,
+      "grad_norm": 0.7898540133946153,
+      "learning_rate": 9.789947561577445e-06,
+      "loss": 0.0857,
+      "step": 55
+    },
+    {
+      "epoch": 0.37209302325581395,
+      "grad_norm": 1.8444881635021309,
+      "learning_rate": 9.77293816123866e-06,
+      "loss": 0.1006,
+      "step": 56
+    },
+    {
+      "epoch": 0.3787375415282392,
+      "grad_norm": 1.1604374728958515,
+      "learning_rate": 9.755282581475769e-06,
+      "loss": 0.077,
+      "step": 57
+    },
+    {
+      "epoch": 0.3853820598006645,
+      "grad_norm": 1.361789017794424,
+      "learning_rate": 9.736983212571646e-06,
+      "loss": 0.0755,
+      "step": 58
+    },
+    {
+      "epoch": 0.3920265780730897,
+      "grad_norm": 1.6989317046427177,
+      "learning_rate": 9.718042531967918e-06,
+      "loss": 0.1073,
+      "step": 59
+    },
+    {
+      "epoch": 0.39867109634551495,
+      "grad_norm": 0.5576923228958391,
+      "learning_rate": 9.698463103929542e-06,
+      "loss": 0.0563,
+      "step": 60
+    },
+    {
+      "epoch": 0.4053156146179402,
+      "grad_norm": 1.197144883948249,
+      "learning_rate": 9.678247579197658e-06,
+      "loss": 0.0835,
+      "step": 61
+    },
+    {
+      "epoch": 0.4119601328903654,
+      "grad_norm": 0.7800865622678685,
+      "learning_rate": 9.657398694630713e-06,
+      "loss": 0.084,
+      "step": 62
+    },
+    {
+      "epoch": 0.4186046511627907,
+      "grad_norm": 1.332148923735981,
+      "learning_rate": 9.635919272833938e-06,
+      "loss": 0.0965,
+      "step": 63
+    },
+    {
+      "epoch": 0.42524916943521596,
+      "grad_norm": 0.9537850616235446,
+      "learning_rate": 9.613812221777212e-06,
+      "loss": 0.0784,
+      "step": 64
+    },
+    {
+      "epoch": 0.4318936877076412,
+      "grad_norm": 0.9773777024805813,
+      "learning_rate": 9.591080534401371e-06,
+      "loss": 0.094,
+      "step": 65
+    },
+    {
+      "epoch": 0.43853820598006643,
+      "grad_norm": 0.9622850200719331,
+      "learning_rate": 9.567727288213005e-06,
+      "loss": 0.0682,
+      "step": 66
+    },
+    {
+      "epoch": 0.44518272425249167,
+      "grad_norm": 1.1749521195263826,
+      "learning_rate": 9.543755644867823e-06,
+      "loss": 0.1174,
+      "step": 67
+    },
+    {
+      "epoch": 0.45182724252491696,
+      "grad_norm": 0.9900437964371273,
+      "learning_rate": 9.519168849742603e-06,
+      "loss": 0.0631,
+      "step": 68
+    },
+    {
+      "epoch": 0.4584717607973422,
+      "grad_norm": 1.1369496264217245,
+      "learning_rate": 9.493970231495836e-06,
+      "loss": 0.091,
+      "step": 69
+    },
+    {
+      "epoch": 0.46511627906976744,
+      "grad_norm": 0.7596879303837656,
+      "learning_rate": 9.468163201617063e-06,
+      "loss": 0.0737,
+      "step": 70
+    },
+    {
+      "epoch": 0.4717607973421927,
+      "grad_norm": 2.091451100542592,
+      "learning_rate": 9.441751253965022e-06,
+      "loss": 0.1005,
+      "step": 71
+    },
+    {
+      "epoch": 0.47840531561461797,
+      "grad_norm": 1.093842023272068,
+      "learning_rate": 9.414737964294636e-06,
+      "loss": 0.0777,
+      "step": 72
+    },
+    {
+      "epoch": 0.4850498338870432,
+      "grad_norm": 0.9952555604898301,
+      "learning_rate": 9.38712698977291e-06,
+      "loss": 0.0877,
+      "step": 73
+    },
+    {
+      "epoch": 0.49169435215946844,
+      "grad_norm": 1.1620904184349503,
+      "learning_rate": 9.358922068483813e-06,
+      "loss": 0.0944,
+      "step": 74
+    },
+    {
+      "epoch": 0.4983388704318937,
+      "grad_norm": 0.5355560711239664,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 0.0655,
+      "step": 75
+    },
+    {
+      "epoch": 0.5049833887043189,
+      "grad_norm": 1.0977107361921403,
+      "learning_rate": 9.30074573947683e-06,
+      "loss": 0.0837,
+      "step": 76
+    },
+    {
+      "epoch": 0.5116279069767442,
+      "grad_norm": 0.5247471981497749,
+      "learning_rate": 9.27078220790263e-06,
+      "loss": 0.0593,
+      "step": 77
+    },
+    {
+      "epoch": 0.5182724252491694,
+      "grad_norm": 0.5701004570093748,
+      "learning_rate": 9.24024048078213e-06,
+      "loss": 0.0459,
+      "step": 78
+    },
+    {
+      "epoch": 0.5249169435215947,
+      "grad_norm": 1.0366232450474129,
+      "learning_rate": 9.209124692976287e-06,
+      "loss": 0.0876,
+      "step": 79
+    },
+    {
+      "epoch": 0.53156146179402,
+      "grad_norm": 0.6364682974892603,
+      "learning_rate": 9.177439057064684e-06,
+      "loss": 0.0763,
+      "step": 80
+    },
+    {
+      "epoch": 0.5382059800664452,
+      "grad_norm": 1.0568589661747034,
+      "learning_rate": 9.145187862775208e-06,
+      "loss": 0.093,
+      "step": 81
+    },
+    {
+      "epoch": 0.5448504983388704,
+      "grad_norm": 0.9386306410175308,
+      "learning_rate": 9.112375476403313e-06,
+      "loss": 0.0891,
+      "step": 82
+    },
+    {
+      "epoch": 0.5514950166112956,
+      "grad_norm": 0.8643230084646484,
+      "learning_rate": 9.079006340220862e-06,
+      "loss": 0.096,
+      "step": 83
+    },
+    {
+      "epoch": 0.5581395348837209,
+      "grad_norm": 0.6462165981943299,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 0.0595,
+      "step": 84
+    },
+    {
+      "epoch": 0.5647840531561462,
+      "grad_norm": 0.7897404631218589,
+      "learning_rate": 9.01061596377522e-06,
+      "loss": 0.0731,
+      "step": 85
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.0476593444440303,
+      "learning_rate": 8.97560398247424e-06,
+      "loss": 0.0705,
+      "step": 86
+    },
+    {
+      "epoch": 0.5780730897009967,
+      "grad_norm": 0.7346838336829987,
+      "learning_rate": 8.94005376803361e-06,
+      "loss": 0.0749,
+      "step": 87
+    },
+    {
+      "epoch": 0.584717607973422,
+      "grad_norm": 1.1640993902217747,
+      "learning_rate": 8.903970133383297e-06,
+      "loss": 0.0993,
+      "step": 88
+    },
+    {
+      "epoch": 0.5913621262458472,
+      "grad_norm": 0.8668509799741243,
+      "learning_rate": 8.867357963669821e-06,
+      "loss": 0.0781,
+      "step": 89
+    },
+    {
+      "epoch": 0.5980066445182725,
+      "grad_norm": 0.8019269925738027,
+      "learning_rate": 8.83022221559489e-06,
+      "loss": 0.0627,
+      "step": 90
+    },
+    {
+      "epoch": 0.6046511627906976,
+      "grad_norm": 0.9309523320951085,
+      "learning_rate": 8.792567916744346e-06,
+      "loss": 0.0689,
+      "step": 91
+    },
+    {
+      "epoch": 0.6112956810631229,
+      "grad_norm": 0.9960236498367079,
+      "learning_rate": 8.754400164907496e-06,
+      "loss": 0.0817,
+      "step": 92
+    },
+    {
+      "epoch": 0.6179401993355482,
+      "grad_norm": 1.0124963384933403,
+      "learning_rate": 8.715724127386971e-06,
+      "loss": 0.0917,
+      "step": 93
+    },
+    {
+      "epoch": 0.6245847176079734,
+      "grad_norm": 0.9034419295571391,
+      "learning_rate": 8.676545040299145e-06,
+      "loss": 0.0913,
+      "step": 94
+    },
+    {
+      "epoch": 0.6312292358803987,
+      "grad_norm": 0.7241319583554866,
+      "learning_rate": 8.636868207865244e-06,
+      "loss": 0.0758,
+      "step": 95
+    },
+    {
+      "epoch": 0.6378737541528239,
+      "grad_norm": 0.9291976143638696,
+      "learning_rate": 8.596699001693257e-06,
+      "loss": 0.0993,
+      "step": 96
+    },
+    {
+      "epoch": 0.6445182724252492,
+      "grad_norm": 0.7063222766334012,
+      "learning_rate": 8.556042860050686e-06,
+      "loss": 0.087,
+      "step": 97
+    },
+    {
+      "epoch": 0.6511627906976745,
+      "grad_norm": 0.7828811245723147,
+      "learning_rate": 8.51490528712831e-06,
+      "loss": 0.0748,
+      "step": 98
+    },
+    {
+      "epoch": 0.6578073089700996,
+      "grad_norm": 0.7179259078532849,
+      "learning_rate": 8.473291852294986e-06,
+      "loss": 0.0708,
+      "step": 99
+    },
+    {
+      "epoch": 0.6644518272425249,
+      "grad_norm": 1.0029460084879072,
+      "learning_rate": 8.43120818934367e-06,
+      "loss": 0.0851,
+      "step": 100
+    },
+    {
+      "epoch": 0.6710963455149501,
+      "grad_norm": 0.8100677095349954,
+      "learning_rate": 8.388659995728662e-06,
+      "loss": 0.0965,
+      "step": 101
+    },
+    {
+      "epoch": 0.6777408637873754,
+      "grad_norm": 0.7387893774671513,
+      "learning_rate": 8.345653031794292e-06,
+      "loss": 0.0887,
+      "step": 102
+    },
+    {
+      "epoch": 0.6843853820598007,
+      "grad_norm": 0.8786303341962616,
+      "learning_rate": 8.302193119995038e-06,
+      "loss": 0.0833,
+      "step": 103
+    },
+    {
+      "epoch": 0.6910299003322259,
+      "grad_norm": 0.6498104624184341,
+      "learning_rate": 8.258286144107277e-06,
+      "loss": 0.0664,
+      "step": 104
+    },
+    {
+      "epoch": 0.6976744186046512,
+      "grad_norm": 0.8053829136759763,
+      "learning_rate": 8.213938048432697e-06,
+      "loss": 0.0773,
+      "step": 105
+    },
+    {
+      "epoch": 0.7043189368770764,
+      "grad_norm": 0.7440146521214637,
+      "learning_rate": 8.16915483699355e-06,
+      "loss": 0.084,
+      "step": 106
+    },
+    {
+      "epoch": 0.7109634551495017,
+      "grad_norm": 0.8430477204164517,
+      "learning_rate": 8.123942572719801e-06,
+      "loss": 0.0754,
+      "step": 107
+    },
+    {
+      "epoch": 0.717607973421927,
+      "grad_norm": 0.7844336038759876,
+      "learning_rate": 8.078307376628292e-06,
+      "loss": 0.0681,
+      "step": 108
+    },
+    {
+      "epoch": 0.7242524916943521,
+      "grad_norm": 0.6865608703175325,
+      "learning_rate": 8.032255426994069e-06,
+      "loss": 0.0808,
+      "step": 109
+    },
+    {
+      "epoch": 0.7308970099667774,
+      "grad_norm": 0.8626677757680601,
+      "learning_rate": 7.985792958513932e-06,
+      "loss": 0.1014,
+      "step": 110
+    },
+    {
+      "epoch": 0.7375415282392026,
+      "grad_norm": 0.6458902559836811,
+      "learning_rate": 7.938926261462366e-06,
+      "loss": 0.0727,
+      "step": 111
+    },
+    {
+      "epoch": 0.7441860465116279,
+      "grad_norm": 0.67871461818545,
+      "learning_rate": 7.891661680839932e-06,
+      "loss": 0.0658,
+      "step": 112
+    },
+    {
+      "epoch": 0.7508305647840532,
+      "grad_norm": 0.7537117403397503,
+      "learning_rate": 7.84400561551426e-06,
+      "loss": 0.0826,
+      "step": 113
+    },
+    {
+      "epoch": 0.7574750830564784,
+      "grad_norm": 0.849811513648714,
+      "learning_rate": 7.795964517353734e-06,
+      "loss": 0.0795,
+      "step": 114
+    },
+    {
+      "epoch": 0.7641196013289037,
+      "grad_norm": 0.6434504848546716,
+      "learning_rate": 7.747544890354031e-06,
+      "loss": 0.0661,
+      "step": 115
+    },
+    {
+      "epoch": 0.770764119601329,
+      "grad_norm": 0.8079608133403346,
+      "learning_rate": 7.698753289757565e-06,
+      "loss": 0.075,
+      "step": 116
+    },
+    {
+      "epoch": 0.7774086378737541,
+      "grad_norm": 0.9390957260974234,
+      "learning_rate": 7.649596321166024e-06,
+      "loss": 0.0649,
+      "step": 117
+    },
+    {
+      "epoch": 0.7840531561461794,
+      "grad_norm": 0.8258259729749267,
+      "learning_rate": 7.600080639646077e-06,
+      "loss": 0.076,
+      "step": 118
+    },
+    {
+      "epoch": 0.7906976744186046,
+      "grad_norm": 0.8344165033408021,
+      "learning_rate": 7.550212948828377e-06,
+      "loss": 0.0658,
+      "step": 119
+    },
+    {
+      "epoch": 0.7973421926910299,
+      "grad_norm": 0.7468613999721869,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.0731,
+      "step": 120
+    },
+    {
+      "epoch": 0.8039867109634552,
+      "grad_norm": 0.7375954465469966,
+      "learning_rate": 7.449448591190436e-06,
+      "loss": 0.0775,
+      "step": 121
+    },
+    {
+      "epoch": 0.8106312292358804,
+      "grad_norm": 0.5770159108524059,
+      "learning_rate": 7.398565566251232e-06,
+      "loss": 0.0766,
+      "step": 122
+    },
+    {
+      "epoch": 0.8172757475083057,
+      "grad_norm": 0.6213954523156613,
+      "learning_rate": 7.347357813929455e-06,
+      "loss": 0.0684,
+      "step": 123
+    },
+    {
+      "epoch": 0.8239202657807309,
+      "grad_norm": 0.6319841085682676,
+      "learning_rate": 7.295832266935059e-06,
+      "loss": 0.0508,
+      "step": 124
+    },
+    {
+      "epoch": 0.8305647840531561,
+      "grad_norm": 0.5561773891072708,
+      "learning_rate": 7.243995901002312e-06,
+      "loss": 0.0416,
+      "step": 125
+    },
+    {
+      "epoch": 0.8372093023255814,
+      "grad_norm": 0.6768768843907513,
+      "learning_rate": 7.191855733945388e-06,
+      "loss": 0.1001,
+      "step": 126
+    },
+    {
+      "epoch": 0.8438538205980066,
+      "grad_norm": 0.8870781137917316,
+      "learning_rate": 7.1394188247082715e-06,
+      "loss": 0.096,
+      "step": 127
+    },
+    {
+      "epoch": 0.8504983388704319,
+      "grad_norm": 0.6199813951371215,
+      "learning_rate": 7.08669227240909e-06,
+      "loss": 0.058,
+      "step": 128
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.6258631411773975,
+      "learning_rate": 7.033683215379002e-06,
+      "loss": 0.0797,
+      "step": 129
+    },
+    {
+      "epoch": 0.8637873754152824,
+      "grad_norm": 0.7141241764483236,
+      "learning_rate": 6.980398830195785e-06,
+      "loss": 0.0616,
+      "step": 130
+    },
+    {
+      "epoch": 0.8704318936877077,
+      "grad_norm": 0.6539814755788471,
+      "learning_rate": 6.9268463307122425e-06,
+      "loss": 0.0543,
+      "step": 131
+    },
+    {
+      "epoch": 0.8770764119601329,
+      "grad_norm": 0.6696475606516166,
+      "learning_rate": 6.873032967079562e-06,
+      "loss": 0.0922,
+      "step": 132
+    },
+    {
+      "epoch": 0.8837209302325582,
+      "grad_norm": 0.7256752618037919,
+      "learning_rate": 6.818966024765758e-06,
+      "loss": 0.0723,
+      "step": 133
+    },
+    {
+      "epoch": 0.8903654485049833,
+      "grad_norm": 0.7949722457642716,
+      "learning_rate": 6.7646528235693445e-06,
+      "loss": 0.0919,
+      "step": 134
+    },
+    {
+      "epoch": 0.8970099667774086,
+      "grad_norm": 0.595409116890567,
+      "learning_rate": 6.710100716628345e-06,
+      "loss": 0.0585,
+      "step": 135
+    },
+    {
+      "epoch": 0.9036544850498339,
+      "grad_norm": 1.0033697089937976,
+      "learning_rate": 6.655317089424791e-06,
+      "loss": 0.1081,
+      "step": 136
+    },
+    {
+      "epoch": 0.9102990033222591,
+      "grad_norm": 0.710716471651743,
+      "learning_rate": 6.600309358784858e-06,
+      "loss": 0.0761,
+      "step": 137
+    },
+    {
+      "epoch": 0.9169435215946844,
+      "grad_norm": 0.576448742099724,
+      "learning_rate": 6.545084971874738e-06,
+      "loss": 0.0614,
+      "step": 138
+    },
+    {
+      "epoch": 0.9235880398671097,
+      "grad_norm": 0.8348391733283355,
+      "learning_rate": 6.48965140519241e-06,
+      "loss": 0.063,
+      "step": 139
+    },
+    {
+      "epoch": 0.9302325581395349,
+      "grad_norm": 1.1013542545275912,
+      "learning_rate": 6.434016163555452e-06,
+      "loss": 0.0734,
+      "step": 140
+    },
+    {
+      "epoch": 0.9368770764119602,
+      "grad_norm": 0.779239872251012,
+      "learning_rate": 6.378186779084996e-06,
+      "loss": 0.0721,
+      "step": 141
+    },
+    {
+      "epoch": 0.9435215946843853,
+      "grad_norm": 0.6235545197422645,
+      "learning_rate": 6.322170810186013e-06,
+      "loss": 0.0623,
+      "step": 142
+    },
+    {
+      "epoch": 0.9501661129568106,
+      "grad_norm": 0.7535118680118036,
+      "learning_rate": 6.26597584052401e-06,
+      "loss": 0.0847,
+      "step": 143
+    },
+    {
+      "epoch": 0.9568106312292359,
+      "grad_norm": 0.7025166191190397,
+      "learning_rate": 6.209609477998339e-06,
+      "loss": 0.0625,
+      "step": 144
+    },
+    {
+      "epoch": 0.9634551495016611,
+      "grad_norm": 0.6972936781122409,
+      "learning_rate": 6.153079353712201e-06,
+      "loss": 0.1023,
+      "step": 145
+    },
+    {
+      "epoch": 0.9700996677740864,
+      "grad_norm": 0.6793230837700109,
+      "learning_rate": 6.0963931209395165e-06,
+      "loss": 0.0638,
+      "step": 146
+    },
+    {
+      "epoch": 0.9767441860465116,
+      "grad_norm": 0.7859561569735135,
+      "learning_rate": 6.039558454088796e-06,
+      "loss": 0.0921,
+      "step": 147
+    },
+    {
+      "epoch": 0.9833887043189369,
+      "grad_norm": 0.5714141172905004,
+      "learning_rate": 5.982583047664151e-06,
+      "loss": 0.0731,
+      "step": 148
+    },
+    {
+      "epoch": 0.9900332225913622,
+      "grad_norm": 0.7332537893766391,
+      "learning_rate": 5.925474615223573e-06,
+      "loss": 0.0673,
+      "step": 149
+    },
+    {
+      "epoch": 0.9966777408637874,
+      "grad_norm": 0.7492221124159332,
+      "learning_rate": 5.8682408883346535e-06,
+      "loss": 0.0683,
+      "step": 150
+    },
+    {
+      "epoch": 0.9966777408637874,
+      "eval_loss": 0.07248932123184204,
+      "eval_runtime": 32.808,
+      "eval_samples_per_second": 30.846,
+      "eval_steps_per_second": 0.975,
+      "step": 150
+    },
+    {
+      "epoch": 1.0033222591362125,
+      "grad_norm": 0.869985165126094,
+      "learning_rate": 5.810889615527839e-06,
+      "loss": 0.0832,
+      "step": 151
+    },
+    {
+      "epoch": 1.0099667774086378,
+      "grad_norm": 0.7265007952878254,
+      "learning_rate": 5.753428561247416e-06,
+      "loss": 0.0603,
+      "step": 152
+    },
+    {
+      "epoch": 1.0166112956810631,
+      "grad_norm": 0.5118692934892353,
+      "learning_rate": 5.695865504800328e-06,
+      "loss": 0.0556,
+      "step": 153
+    },
+    {
+      "epoch": 1.0232558139534884,
+      "grad_norm": 0.6621056974289653,
+      "learning_rate": 5.638208239302975e-06,
+      "loss": 0.1026,
+      "step": 154
+    },
+    {
+      "epoch": 1.0299003322259137,
+      "grad_norm": 0.5532826508404691,
+      "learning_rate": 5.5804645706261515e-06,
+      "loss": 0.0689,
+      "step": 155
+    },
+    {
+      "epoch": 1.0365448504983388,
+      "grad_norm": 0.7370728155757319,
+      "learning_rate": 5.522642316338268e-06,
+      "loss": 0.0507,
+      "step": 156
+    },
+    {
+      "epoch": 1.043189368770764,
+      "grad_norm": 0.608945548724636,
+      "learning_rate": 5.464749304646963e-06,
+      "loss": 0.0541,
+      "step": 157
+    },
+    {
+      "epoch": 1.0498338870431894,
+      "grad_norm": 0.694396430915162,
+      "learning_rate": 5.406793373339292e-06,
+      "loss": 0.0794,
+      "step": 158
+    },
+    {
+      "epoch": 1.0564784053156147,
+      "grad_norm": 0.9492041254202869,
+      "learning_rate": 5.348782368720627e-06,
+      "loss": 0.0907,
+      "step": 159
+    },
+    {
+      "epoch": 1.06312292358804,
+      "grad_norm": 0.7429952070523816,
+      "learning_rate": 5.290724144552379e-06,
+      "loss": 0.0586,
+      "step": 160
+    },
+    {
+      "epoch": 1.069767441860465,
+      "grad_norm": 0.640375361353487,
+      "learning_rate": 5.232626560988735e-06,
+      "loss": 0.0675,
+      "step": 161
+    },
+    {
+      "epoch": 1.0764119601328903,
+      "grad_norm": 0.753445536719332,
+      "learning_rate": 5.174497483512506e-06,
+      "loss": 0.0666,
+      "step": 162
+    },
+    {
+      "epoch": 1.0830564784053156,
+      "grad_norm": 0.6292170840779705,
+      "learning_rate": 5.116344781870282e-06,
+      "loss": 0.0594,
+      "step": 163
+    },
+    {
+      "epoch": 1.089700996677741,
+      "grad_norm": 0.6346627170439836,
+      "learning_rate": 5.0581763290069865e-06,
+      "loss": 0.0523,
+      "step": 164
+    },
+    {
+      "epoch": 1.0963455149501662,
+      "grad_norm": 0.6561801470391864,
+      "learning_rate": 5e-06,
+      "loss": 0.0661,
+      "step": 165
+    },
+    {
+      "epoch": 1.1029900332225913,
+      "grad_norm": 0.6948146032383066,
+      "learning_rate": 4.941823670993016e-06,
+      "loss": 0.0706,
+      "step": 166
+    },
+    {
+      "epoch": 1.1096345514950166,
+      "grad_norm": 0.7613914644690916,
+      "learning_rate": 4.883655218129719e-06,
+      "loss": 0.0649,
+      "step": 167
+    },
+    {
+      "epoch": 1.1162790697674418,
+      "grad_norm": 0.45992106011709505,
+      "learning_rate": 4.825502516487497e-06,
+      "loss": 0.0557,
+      "step": 168
+    },
+    {
+      "epoch": 1.1229235880398671,
+      "grad_norm": 0.6249306843166101,
+      "learning_rate": 4.767373439011267e-06,
+      "loss": 0.0562,
+      "step": 169
+    },
+    {
+      "epoch": 1.1295681063122924,
+      "grad_norm": 0.5678874358035478,
+      "learning_rate": 4.7092758554476215e-06,
+      "loss": 0.0627,
+      "step": 170
+    },
+    {
+      "epoch": 1.1362126245847177,
+      "grad_norm": 0.37893696528451193,
+      "learning_rate": 4.651217631279374e-06,
+      "loss": 0.0411,
+      "step": 171
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.6653453617589765,
+      "learning_rate": 4.59320662666071e-06,
+      "loss": 0.0634,
+      "step": 172
+    },
+    {
+      "epoch": 1.149501661129568,
+      "grad_norm": 0.8688586555661224,
+      "learning_rate": 4.53525069535304e-06,
+      "loss": 0.0455,
+      "step": 173
+    },
+    {
+      "epoch": 1.1561461794019934,
+      "grad_norm": 0.8273055275277121,
+      "learning_rate": 4.477357683661734e-06,
+      "loss": 0.0732,
+      "step": 174
+    },
+    {
+      "epoch": 1.1627906976744187,
+      "grad_norm": 0.4300505630395071,
+      "learning_rate": 4.4195354293738484e-06,
+      "loss": 0.0521,
+      "step": 175
+    },
+    {
+      "epoch": 1.169435215946844,
+      "grad_norm": 0.4719570261098666,
+      "learning_rate": 4.361791760697027e-06,
+      "loss": 0.0421,
+      "step": 176
+    },
+    {
+      "epoch": 1.176079734219269,
+      "grad_norm": 0.7484964051921864,
+      "learning_rate": 4.304134495199675e-06,
+      "loss": 0.0724,
+      "step": 177
+    },
+    {
+      "epoch": 1.1827242524916943,
+      "grad_norm": 0.46240257112979466,
+      "learning_rate": 4.246571438752585e-06,
+      "loss": 0.0406,
+      "step": 178
+    },
+    {
+      "epoch": 1.1893687707641196,
+      "grad_norm": 0.41112129356312116,
+      "learning_rate": 4.189110384472164e-06,
+      "loss": 0.0538,
+      "step": 179
+    },
+    {
+      "epoch": 1.196013289036545,
+      "grad_norm": 0.38333151957355116,
+      "learning_rate": 4.131759111665349e-06,
+      "loss": 0.054,
+      "step": 180
+    },
+    {
+      "epoch": 1.2026578073089702,
+      "grad_norm": 0.5908193567344284,
+      "learning_rate": 4.074525384776428e-06,
+      "loss": 0.0731,
+      "step": 181
+    },
+    {
+      "epoch": 1.2093023255813953,
+      "grad_norm": 0.5085203609671776,
+      "learning_rate": 4.017416952335849e-06,
+      "loss": 0.0563,
+      "step": 182
+    },
+    {
+      "epoch": 1.2159468438538206,
+      "grad_norm": 0.6307549929539201,
+      "learning_rate": 3.960441545911205e-06,
+      "loss": 0.0807,
+      "step": 183
+    },
+    {
+      "epoch": 1.2225913621262459,
+      "grad_norm": 0.3759705322994287,
+      "learning_rate": 3.903606879060483e-06,
+      "loss": 0.043,
+      "step": 184
+    },
+    {
+      "epoch": 1.2292358803986712,
+      "grad_norm": 0.5833166358506473,
+      "learning_rate": 3.8469206462878e-06,
+      "loss": 0.0361,
+      "step": 185
+    },
+    {
+      "epoch": 1.2358803986710964,
+      "grad_norm": 0.38785628492735885,
+      "learning_rate": 3.790390522001662e-06,
+      "loss": 0.0394,
+      "step": 186
+    },
+    {
+      "epoch": 1.2425249169435215,
+      "grad_norm": 0.5384756114916416,
+      "learning_rate": 3.7340241594759917e-06,
+      "loss": 0.0664,
+      "step": 187
+    },
+    {
+      "epoch": 1.2491694352159468,
+      "grad_norm": 0.46563888733253733,
+      "learning_rate": 3.6778291898139907e-06,
+      "loss": 0.0488,
+      "step": 188
+    },
+    {
+      "epoch": 1.255813953488372,
+      "grad_norm": 0.8009617857561557,
+      "learning_rate": 3.6218132209150047e-06,
+      "loss": 0.0784,
+      "step": 189
+    },
+    {
+      "epoch": 1.2624584717607974,
+      "grad_norm": 0.5704636583609043,
+      "learning_rate": 3.5659838364445505e-06,
+      "loss": 0.043,
+      "step": 190
+    },
+    {
+      "epoch": 1.2691029900332227,
+      "grad_norm": 0.4704454938004665,
+      "learning_rate": 3.51034859480759e-06,
+      "loss": 0.0511,
+      "step": 191
+    },
+    {
+      "epoch": 1.2757475083056478,
+      "grad_norm": 0.41034160768316386,
+      "learning_rate": 3.4549150281252635e-06,
+      "loss": 0.0548,
+      "step": 192
+    },
+    {
+      "epoch": 1.282392026578073,
+      "grad_norm": 0.5724269024421317,
+      "learning_rate": 3.399690641215142e-06,
+      "loss": 0.0595,
+      "step": 193
+    },
+    {
+      "epoch": 1.2890365448504983,
+      "grad_norm": 0.5846464592698902,
+      "learning_rate": 3.3446829105752103e-06,
+      "loss": 0.0503,
+      "step": 194
+    },
+    {
+      "epoch": 1.2956810631229236,
+      "grad_norm": 0.5118811966547737,
+      "learning_rate": 3.289899283371657e-06,
+      "loss": 0.0524,
+      "step": 195
+    },
+    {
+      "epoch": 1.302325581395349,
+      "grad_norm": 0.8890139228230102,
+      "learning_rate": 3.2353471764306567e-06,
+      "loss": 0.0517,
+      "step": 196
+    },
+    {
+      "epoch": 1.308970099667774,
+      "grad_norm": 0.6318848948033396,
+      "learning_rate": 3.1810339752342446e-06,
+      "loss": 0.0614,
+      "step": 197
+    },
+    {
+      "epoch": 1.3156146179401993,
+      "grad_norm": 0.7335844136817455,
+      "learning_rate": 3.12696703292044e-06,
+      "loss": 0.0667,
+      "step": 198
+    },
+    {
+      "epoch": 1.3222591362126246,
+      "grad_norm": 0.6548135189722719,
+      "learning_rate": 3.0731536692877596e-06,
+      "loss": 0.0562,
+      "step": 199
+    },
+    {
+      "epoch": 1.3289036544850499,
+      "grad_norm": 0.46580064823387207,
+      "learning_rate": 3.019601169804216e-06,
+      "loss": 0.0483,
+      "step": 200
+    },
+    {
+      "epoch": 1.3355481727574752,
+      "grad_norm": 0.8221711324426894,
+      "learning_rate": 2.966316784621e-06,
+      "loss": 0.0658,
+      "step": 201
+    },
+    {
+      "epoch": 1.3421926910299002,
+      "grad_norm": 0.901723556867836,
+      "learning_rate": 2.9133077275909112e-06,
+      "loss": 0.0846,
+      "step": 202
+    },
+    {
+      "epoch": 1.3488372093023255,
+      "grad_norm": 0.5521866482755017,
+      "learning_rate": 2.86058117529173e-06,
+      "loss": 0.0516,
+      "step": 203
+    },
+    {
+      "epoch": 1.3554817275747508,
+      "grad_norm": 0.4587332626482284,
+      "learning_rate": 2.8081442660546126e-06,
+      "loss": 0.0632,
+      "step": 204
+    },
+    {
+      "epoch": 1.3621262458471761,
+      "grad_norm": 0.7702320166873419,
+      "learning_rate": 2.7560040989976894e-06,
+      "loss": 0.0908,
+      "step": 205
+    },
+    {
+      "epoch": 1.3687707641196014,
+      "grad_norm": 0.47296016709400723,
+      "learning_rate": 2.7041677330649408e-06,
+      "loss": 0.0537,
+      "step": 206
+    },
+    {
+      "epoch": 1.3754152823920265,
+      "grad_norm": 0.4347511906822839,
+      "learning_rate": 2.6526421860705474e-06,
+      "loss": 0.0471,
+      "step": 207
+    },
+    {
+      "epoch": 1.3820598006644518,
+      "grad_norm": 0.4808252094789194,
+      "learning_rate": 2.601434433748771e-06,
+      "loss": 0.0564,
+      "step": 208
+    },
+    {
+      "epoch": 1.388704318936877,
+      "grad_norm": 0.39354003895746387,
+      "learning_rate": 2.550551408809566e-06,
+      "loss": 0.0566,
+      "step": 209
+    },
+    {
+      "epoch": 1.3953488372093024,
+      "grad_norm": 0.4911701503722221,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 0.0621,
+      "step": 210
+    },
+    {
+      "epoch": 1.4019933554817277,
+      "grad_norm": 0.46092436668003706,
+      "learning_rate": 2.4497870511716237e-06,
+      "loss": 0.0607,
+      "step": 211
+    },
+    {
+      "epoch": 1.4086378737541527,
+      "grad_norm": 0.5878983217004033,
+      "learning_rate": 2.3999193603539234e-06,
+      "loss": 0.061,
+      "step": 212
+    },
+    {
+      "epoch": 1.415282392026578,
+      "grad_norm": 0.522361230864803,
+      "learning_rate": 2.3504036788339763e-06,
+      "loss": 0.072,
+      "step": 213
+    },
+    {
+      "epoch": 1.4219269102990033,
+      "grad_norm": 0.4092827869499798,
+      "learning_rate": 2.3012467102424373e-06,
+      "loss": 0.0457,
+      "step": 214
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.6368421322210067,
+      "learning_rate": 2.2524551096459703e-06,
+      "loss": 0.0758,
+      "step": 215
+    },
+    {
+      "epoch": 1.435215946843854,
+      "grad_norm": 0.6403778504814651,
+      "learning_rate": 2.204035482646267e-06,
+      "loss": 0.0678,
+      "step": 216
+    },
+    {
+      "epoch": 1.441860465116279,
+      "grad_norm": 0.5188080009509758,
+      "learning_rate": 2.155994384485742e-06,
+      "loss": 0.0598,
+      "step": 217
+    },
+    {
+      "epoch": 1.4485049833887043,
+      "grad_norm": 0.48165858744860335,
+      "learning_rate": 2.1083383191600676e-06,
+      "loss": 0.0533,
+      "step": 218
+    },
+    {
+      "epoch": 1.4551495016611296,
+      "grad_norm": 0.5327724082626948,
+      "learning_rate": 2.061073738537635e-06,
+      "loss": 0.0522,
+      "step": 219
+    },
+    {
+      "epoch": 1.4617940199335548,
+      "grad_norm": 0.7299533338476814,
+      "learning_rate": 2.0142070414860704e-06,
+      "loss": 0.0648,
+      "step": 220
+    },
+    {
+      "epoch": 1.4684385382059801,
+      "grad_norm": 0.6399049060963062,
+      "learning_rate": 1.9677445730059348e-06,
+      "loss": 0.0646,
+      "step": 221
+    },
+    {
+      "epoch": 1.4750830564784052,
+      "grad_norm": 0.6611530351876729,
+      "learning_rate": 1.9216926233717087e-06,
+      "loss": 0.0553,
+      "step": 222
+    },
+    {
+      "epoch": 1.4817275747508305,
+      "grad_norm": 0.5364895163115652,
+      "learning_rate": 1.8760574272802002e-06,
+      "loss": 0.068,
+      "step": 223
+    },
+    {
+      "epoch": 1.4883720930232558,
+      "grad_norm": 0.5151753951352734,
+      "learning_rate": 1.8308451630064484e-06,
+      "loss": 0.0727,
+      "step": 224
+    },
+    {
+      "epoch": 1.495016611295681,
+      "grad_norm": 0.4311368423319901,
+      "learning_rate": 1.7860619515673034e-06,
+      "loss": 0.0423,
+      "step": 225
+    },
+    {
+      "epoch": 1.5016611295681064,
+      "grad_norm": 0.4919258897689397,
+      "learning_rate": 1.7417138558927244e-06,
+      "loss": 0.0405,
+      "step": 226
+    },
+    {
+      "epoch": 1.5083056478405314,
+      "grad_norm": 0.5521587113661647,
+      "learning_rate": 1.6978068800049624e-06,
+      "loss": 0.07,
+      "step": 227
+    },
+    {
+      "epoch": 1.514950166112957,
+      "grad_norm": 0.5044366493415394,
+      "learning_rate": 1.6543469682057105e-06,
+      "loss": 0.0565,
+      "step": 228
+    },
+    {
+      "epoch": 1.521594684385382,
+      "grad_norm": 0.39516777310730944,
+      "learning_rate": 1.611340004271339e-06,
+      "loss": 0.0513,
+      "step": 229
+    },
+    {
+      "epoch": 1.5282392026578073,
+      "grad_norm": 0.5908802744488714,
+      "learning_rate": 1.5687918106563326e-06,
+      "loss": 0.0505,
+      "step": 230
+    },
+    {
+      "epoch": 1.5348837209302326,
+      "grad_norm": 0.5993938538262349,
+      "learning_rate": 1.5267081477050132e-06,
+      "loss": 0.061,
+      "step": 231
+    },
+    {
+      "epoch": 1.5415282392026577,
+      "grad_norm": 0.5387186847707379,
+      "learning_rate": 1.4850947128716914e-06,
+      "loss": 0.0642,
+      "step": 232
+    },
+    {
+      "epoch": 1.5481727574750832,
+      "grad_norm": 0.615519559618303,
+      "learning_rate": 1.4439571399493146e-06,
+      "loss": 0.0547,
+      "step": 233
+    },
+    {
+      "epoch": 1.5548172757475083,
+      "grad_norm": 0.610572822487856,
+      "learning_rate": 1.4033009983067454e-06,
+      "loss": 0.0638,
+      "step": 234
+    },
+    {
+      "epoch": 1.5614617940199336,
+      "grad_norm": 0.48741840749475485,
+      "learning_rate": 1.3631317921347564e-06,
+      "loss": 0.051,
+      "step": 235
+    },
+    {
+      "epoch": 1.5681063122923589,
+      "grad_norm": 0.540406507264975,
+      "learning_rate": 1.3234549597008572e-06,
+      "loss": 0.0596,
+      "step": 236
+    },
+    {
+      "epoch": 1.574750830564784,
+      "grad_norm": 0.4514556860955898,
+      "learning_rate": 1.2842758726130283e-06,
+      "loss": 0.0562,
+      "step": 237
+    },
+    {
+      "epoch": 1.5813953488372094,
+      "grad_norm": 0.4493200533821844,
+      "learning_rate": 1.2455998350925042e-06,
+      "loss": 0.0306,
+      "step": 238
+    },
+    {
+      "epoch": 1.5880398671096345,
+      "grad_norm": 0.5263235492360758,
+      "learning_rate": 1.2074320832556558e-06,
+      "loss": 0.0602,
+      "step": 239
+    },
+    {
+      "epoch": 1.5946843853820598,
+      "grad_norm": 0.5740721455957148,
+      "learning_rate": 1.1697777844051105e-06,
+      "loss": 0.0586,
+      "step": 240
+    },
+    {
+      "epoch": 1.601328903654485,
+      "grad_norm": 0.6202865595174548,
+      "learning_rate": 1.132642036330181e-06,
+      "loss": 0.0693,
+      "step": 241
+    },
+    {
+      "epoch": 1.6079734219269102,
+      "grad_norm": 0.5998297967904946,
+      "learning_rate": 1.096029866616704e-06,
+      "loss": 0.0588,
+      "step": 242
+    },
+    {
+      "epoch": 1.6146179401993357,
+      "grad_norm": 0.45039052826506404,
+      "learning_rate": 1.0599462319663906e-06,
+      "loss": 0.0404,
+      "step": 243
+    },
+    {
+      "epoch": 1.6212624584717608,
+      "grad_norm": 0.45387478927844527,
+      "learning_rate": 1.0243960175257605e-06,
+      "loss": 0.0466,
+      "step": 244
+    },
+    {
+      "epoch": 1.627906976744186,
+      "grad_norm": 0.7727461157878296,
+      "learning_rate": 9.893840362247809e-07,
+      "loss": 0.0512,
+      "step": 245
+    },
+    {
+      "epoch": 1.6345514950166113,
+      "grad_norm": 0.4674812446077405,
+      "learning_rate": 9.549150281252633e-07,
+      "loss": 0.0621,
+      "step": 246
+    },
+    {
+      "epoch": 1.6411960132890364,
+      "grad_norm": 0.5112435475561715,
+      "learning_rate": 9.209936597791407e-07,
+      "loss": 0.0568,
+      "step": 247
+    },
+    {
+      "epoch": 1.647840531561462,
+      "grad_norm": 0.48326632808802955,
+      "learning_rate": 8.876245235966884e-07,
+      "loss": 0.0354,
+      "step": 248
+    },
+    {
+      "epoch": 1.654485049833887,
+      "grad_norm": 0.399153045603439,
+      "learning_rate": 8.54812137224792e-07,
+      "loss": 0.0462,
+      "step": 249
+    },
+    {
+      "epoch": 1.6611295681063123,
+      "grad_norm": 0.4550505862487184,
+      "learning_rate": 8.225609429353187e-07,
+      "loss": 0.0435,
+      "step": 250
+    },
+    {
+      "epoch": 1.6677740863787376,
+      "grad_norm": 0.5263233387777809,
+      "learning_rate": 7.908753070237124e-07,
+      "loss": 0.0692,
+      "step": 251
+    },
+    {
+      "epoch": 1.6744186046511627,
+      "grad_norm": 0.439125675201105,
+      "learning_rate": 7.597595192178702e-07,
+      "loss": 0.0526,
+      "step": 252
+    },
+    {
+      "epoch": 1.6810631229235882,
+      "grad_norm": 0.3876135531333277,
+      "learning_rate": 7.292177920973726e-07,
+      "loss": 0.0464,
+      "step": 253
+    },
+    {
+      "epoch": 1.6877076411960132,
+      "grad_norm": 0.3784518342351669,
+      "learning_rate": 6.992542605231739e-07,
+      "loss": 0.0565,
+      "step": 254
+    },
+    {
+      "epoch": 1.6943521594684385,
+      "grad_norm": 0.4466792986377153,
+      "learning_rate": 6.698729810778065e-07,
+      "loss": 0.0516,
+      "step": 255
+    },
+    {
+      "epoch": 1.7009966777408638,
+      "grad_norm": 0.75150863120178,
+      "learning_rate": 6.410779315161885e-07,
+      "loss": 0.065,
+      "step": 256
+    },
+    {
+      "epoch": 1.707641196013289,
+      "grad_norm": 0.41970908980773713,
+      "learning_rate": 6.128730102270897e-07,
+      "loss": 0.0403,
+      "step": 257
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.42761564066364777,
+      "learning_rate": 5.852620357053651e-07,
+      "loss": 0.0545,
+      "step": 258
+    },
+    {
+      "epoch": 1.7209302325581395,
+      "grad_norm": 0.40501611642435215,
+      "learning_rate": 5.582487460349806e-07,
+      "loss": 0.0426,
+      "step": 259
+    },
+    {
+      "epoch": 1.7275747508305648,
+      "grad_norm": 0.5175574150905407,
+      "learning_rate": 5.318367983829393e-07,
+      "loss": 0.0666,
+      "step": 260
+    },
+    {
+      "epoch": 1.73421926910299,
+      "grad_norm": 0.36925931545961604,
+      "learning_rate": 5.06029768504166e-07,
+      "loss": 0.0414,
+      "step": 261
+    },
+    {
+      "epoch": 1.7408637873754151,
+      "grad_norm": 0.5763761416847274,
+      "learning_rate": 4.808311502573976e-07,
+      "loss": 0.0638,
+      "step": 262
+    },
+    {
+      "epoch": 1.7475083056478407,
+      "grad_norm": 0.3316025388394529,
+      "learning_rate": 4.562443551321788e-07,
+      "loss": 0.04,
+      "step": 263
+    },
+    {
+      "epoch": 1.7541528239202657,
+      "grad_norm": 0.7139336516221676,
+      "learning_rate": 4.322727117869951e-07,
+      "loss": 0.0605,
+      "step": 264
+    },
+    {
+      "epoch": 1.760797342192691,
+      "grad_norm": 0.5156748893460643,
+      "learning_rate": 4.089194655986306e-07,
+      "loss": 0.0591,
+      "step": 265
+    },
+    {
+      "epoch": 1.7674418604651163,
+      "grad_norm": 0.7283638596556546,
+      "learning_rate": 3.8618777822278854e-07,
+      "loss": 0.0798,
+      "step": 266
+    },
+    {
+      "epoch": 1.7740863787375414,
+      "grad_norm": 0.5421800539182494,
+      "learning_rate": 3.6408072716606346e-07,
+      "loss": 0.0571,
+      "step": 267
+    },
+    {
+      "epoch": 1.780730897009967,
+      "grad_norm": 0.5090117089066726,
+      "learning_rate": 3.426013053692878e-07,
+      "loss": 0.058,
+      "step": 268
+    },
+    {
+      "epoch": 1.787375415282392,
+      "grad_norm": 0.49158590728514556,
+      "learning_rate": 3.2175242080234314e-07,
+      "loss": 0.0457,
+      "step": 269
+    },
+    {
+      "epoch": 1.7940199335548173,
+      "grad_norm": 0.4810283287867908,
+      "learning_rate": 3.015368960704584e-07,
+      "loss": 0.0504,
+      "step": 270
+    },
+    {
+      "epoch": 1.8006644518272426,
+      "grad_norm": 0.6267357418173619,
+      "learning_rate": 2.819574680320825e-07,
+      "loss": 0.0826,
+      "step": 271
+    },
+    {
+      "epoch": 1.8073089700996676,
+      "grad_norm": 0.42142389112996576,
+      "learning_rate": 2.63016787428354e-07,
+      "loss": 0.0432,
+      "step": 272
+    },
+    {
+      "epoch": 1.8139534883720931,
+      "grad_norm": 0.5002929091372561,
+      "learning_rate": 2.447174185242324e-07,
+      "loss": 0.0604,
+      "step": 273
+    },
+    {
+      "epoch": 1.8205980066445182,
+      "grad_norm": 0.42068361332599374,
+      "learning_rate": 2.2706183876134047e-07,
+      "loss": 0.0505,
+      "step": 274
+    },
+    {
+      "epoch": 1.8272425249169435,
+      "grad_norm": 0.4524190111477669,
+      "learning_rate": 2.1005243842255552e-07,
+      "loss": 0.048,
+      "step": 275
+    },
+    {
+      "epoch": 1.8338870431893688,
+      "grad_norm": 0.45609655951471145,
+      "learning_rate": 1.9369152030840553e-07,
+      "loss": 0.0592,
+      "step": 276
+    },
+    {
+      "epoch": 1.8405315614617939,
+      "grad_norm": 0.6678751099330384,
+      "learning_rate": 1.779812994253055e-07,
+      "loss": 0.0406,
+      "step": 277
+    },
+    {
+      "epoch": 1.8471760797342194,
+      "grad_norm": 0.4540634884040343,
+      "learning_rate": 1.6292390268568103e-07,
+      "loss": 0.0535,
+      "step": 278
+    },
+    {
+      "epoch": 1.8538205980066444,
+      "grad_norm": 0.3803108748809768,
+      "learning_rate": 1.4852136862001766e-07,
+      "loss": 0.0543,
+      "step": 279
+    },
+    {
+      "epoch": 1.8604651162790697,
+      "grad_norm": 1.0694423086901395,
+      "learning_rate": 1.3477564710088097e-07,
+      "loss": 0.0556,
+      "step": 280
+    },
+    {
+      "epoch": 1.867109634551495,
+      "grad_norm": 2.7488037869138098,
+      "learning_rate": 1.2168859907892904e-07,
+      "loss": 0.0941,
+      "step": 281
+    },
+    {
+      "epoch": 1.87375415282392,
+      "grad_norm": 0.5891173565335839,
+      "learning_rate": 1.0926199633097156e-07,
+      "loss": 0.0555,
+      "step": 282
+    },
+    {
+      "epoch": 1.8803986710963456,
+      "grad_norm": 0.41004921093494356,
+      "learning_rate": 9.749752122010347e-08,
+      "loss": 0.0432,
+      "step": 283
+    },
+    {
+      "epoch": 1.8870431893687707,
+      "grad_norm": 0.4128102542974997,
+      "learning_rate": 8.639676646793382e-08,
+      "loss": 0.0452,
+      "step": 284
+    },
+    {
+      "epoch": 1.893687707641196,
+      "grad_norm": 0.5446352118383607,
+      "learning_rate": 7.59612349389599e-08,
+      "loss": 0.0672,
+      "step": 285
+    },
+    {
+      "epoch": 1.9003322259136213,
+      "grad_norm": 0.5663671001216136,
+      "learning_rate": 6.61923394371039e-08,
+      "loss": 0.0599,
+      "step": 286
+    },
+    {
+      "epoch": 1.9069767441860463,
+      "grad_norm": 0.7575229176051346,
+      "learning_rate": 5.709140251444201e-08,
+      "loss": 0.0564,
+      "step": 287
+    },
+    {
+      "epoch": 1.9136212624584719,
+      "grad_norm": 0.41591234765048585,
+      "learning_rate": 4.865965629214819e-08,
+      "loss": 0.0582,
+      "step": 288
+    },
+    {
+      "epoch": 1.920265780730897,
+      "grad_norm": 0.3650616439731819,
+      "learning_rate": 4.0898242293691546e-08,
+      "loss": 0.0386,
+      "step": 289
+    },
+    {
+      "epoch": 1.9269102990033222,
+      "grad_norm": 0.6937484049329661,
+      "learning_rate": 3.3808211290284886e-08,
+      "loss": 0.0779,
+      "step": 290
+    },
+    {
+      "epoch": 1.9335548172757475,
+      "grad_norm": 0.3706387763514699,
+      "learning_rate": 2.7390523158633552e-08,
+      "loss": 0.0456,
+      "step": 291
+    },
+    {
+      "epoch": 1.9401993355481728,
+      "grad_norm": 0.40696852019057694,
+      "learning_rate": 2.1646046750978255e-08,
+      "loss": 0.0368,
+      "step": 292
+    },
+    {
+      "epoch": 1.946843853820598,
+      "grad_norm": 0.5221037977959876,
+      "learning_rate": 1.657555977746972e-08,
+      "loss": 0.0455,
+      "step": 293
+    },
+    {
+      "epoch": 1.9534883720930232,
+      "grad_norm": 0.4283204104270534,
+      "learning_rate": 1.2179748700879013e-08,
+      "loss": 0.0569,
+      "step": 294
+    },
+    {
+      "epoch": 1.9601328903654485,
+      "grad_norm": 0.45877963879091327,
+      "learning_rate": 8.459208643659122e-09,
+      "loss": 0.0523,
+      "step": 295
+    },
+    {
+      "epoch": 1.9667774086378738,
+      "grad_norm": 0.4089943640130188,
+      "learning_rate": 5.414443307377171e-09,
+      "loss": 0.0558,
+      "step": 296
+    },
+    {
+      "epoch": 1.973421926910299,
+      "grad_norm": 0.4565605936691547,
+      "learning_rate": 3.0458649045211897e-09,
+      "loss": 0.0462,
+      "step": 297
+    },
+    {
+      "epoch": 1.9800664451827243,
+      "grad_norm": 0.7146498443290045,
+      "learning_rate": 1.3537941026914302e-09,
+      "loss": 0.0413,
+      "step": 298
+    },
+    {
+      "epoch": 1.9867109634551494,
+      "grad_norm": 0.4588515242529337,
+      "learning_rate": 3.384599811889766e-10,
+      "loss": 0.0567,
+      "step": 299
+    },
+    {
+      "epoch": 1.9933554817275747,
+      "grad_norm": 0.6455401534739803,
+      "learning_rate": 0.0,
+      "loss": 0.0482,
+      "step": 300
+    },
+    {
+      "epoch": 1.9933554817275747,
+      "eval_loss": 0.060801174491643906,
+      "eval_runtime": 30.1206,
+      "eval_samples_per_second": 33.598,
+      "eval_steps_per_second": 1.062,
+      "step": 300
+    },
+    {
+      "epoch": 1.9933554817275747,
+      "step": 300,
+      "total_flos": 1.1631033437569024e+17,
+      "train_loss": 0.07701069086790086,
+      "train_runtime": 4589.0959,
+      "train_samples_per_second": 8.375,
+      "train_steps_per_second": 0.065
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 300,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1631033437569024e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}