Model save

Browse files

Files changed (6) hide show

README.md +76 -0
adapter_model.safetensors +1 -1
all_results.json +9 -0
runs/Aug30_16-06-25_938RL43/events.out.tfevents.1725027260.938RL43.754039.0 +2 -2
train_results.json +9 -0
trainer_state.json +1566 -0

README.md ADDED Viewed

	@@ -0,0 +1,76 @@

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+datasets:
+- generator
+library_name: peft
+license: llama3.1
+tags:
+- trl
+- sft
+- generated_from_trainer
+model-index:
+- name: hansken_human_hql
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# hansken_human_hql
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the generator dataset.
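+It achieves the following results on the evaluation set (measured at the final logged step, 1022; the lowest validation loss, 0.2362, was reached earlier, at step 511):
+- Loss: 0.2892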
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 20 (configured value; the logged run ends after about 10 epochs, at step 1022)
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.4508        | 0.9976 | 102  | 0.4433          |
+| 0.302         | 1.9951 | 204  | 0.3140          |
+| 0.2692        | 2.9927 | 306  | 0.2616          |
+| 0.177         | 4.0    | 409  | 0.2431          |
+| 0.1616        | 4.9976 | 511  | 0.2362          |
+| 0.1358        | 5.9951 | 613  | 0.2394          |
+| 0.1199        | 6.9927 | 715  | 0.2474          |
+| 0.1051        | 8.0    | 818  | 0.2625          |
+| 0.0945        | 8.9976 | 920  | 0.2797          |
+| 0.0843        | 9.9951 | 1022 | 0.2892          |
+### Framework versions
+- PEFT 0.12.0
+- Transformers 4.44.0
+- PyTorch 2.1.2+cu121
+- Datasets 2.20.0
+- Tokenizers 0.19.1

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:99abb8987ba0d50dcd3a79157a55b3e5b45fe847e6566de5e799885d119153dd
 size 167832240

 version https://git-lfs.github.com/spec/v1
+oid sha256:22336ad6031f16fcf852a9fc6e36c81ef128033fc8de69ca86c761c7f6007c1c
 size 167832240

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 9.995110024449877,
+    "total_flos": 7.585797735459062e+17,
+    "train_loss": 0.2326957560244605,
+    "train_runtime": 28787.7929,
+    "train_samples": 11408,
+    "train_samples_per_second": 0.568,
+    "train_steps_per_second": 0.071
+}

runs/Aug30_16-06-25_938RL43/events.out.tfevents.1725027260.938RL43.754039.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6f285c3e48bb7d4a8db0c6e40d780051e05dab0f9b444efdaf97170cb9b70da0
-size 48090

 version https://git-lfs.github.com/spec/v1
+oid sha256:e6c6b183c0496ca525244a9f8029c075a0bdb7065b3d65c833c78b0b7f3100b9
+size 52935

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 9.995110024449877,
+    "total_flos": 7.585797735459062e+17,
+    "train_loss": 0.2326957560244605,
+    "train_runtime": 28787.7929,
+    "train_samples": 11408,
+    "train_samples_per_second": 0.568,
+    "train_steps_per_second": 0.071
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1566 @@

+{
+  "best_metric": 0.23624150454998016,
+  "best_model_checkpoint": "data/hansken_human_hql/checkpoint-511",
+  "epoch": 9.995110024449877,
+  "eval_steps": 500,
+  "global_step": 1022,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009779951100244499,
+      "grad_norm": 1.0709587335586548,
+      "learning_rate": 9.80392156862745e-07,
+      "loss": 1.4707,
+      "step": 1
+    },
+    {
+      "epoch": 0.0488997555012225,
+      "grad_norm": 1.1860318183898926,
+      "learning_rate": 4.901960784313726e-06,
+      "loss": 1.4227,
+      "step": 5
+    },
+    {
+      "epoch": 0.097799511002445,
+      "grad_norm": 1.145372986793518,
+      "learning_rate": 9.803921568627451e-06,
+      "loss": 1.444,
+      "step": 10
+    },
+    {
+      "epoch": 0.1466992665036675,
+      "grad_norm": 1.0051456689834595,
+      "learning_rate": 1.4705882352941177e-05,
+      "loss": 1.3414,
+      "step": 15
+    },
+    {
+      "epoch": 0.19559902200489,
+      "grad_norm": 0.5133008360862732,
+      "learning_rate": 1.9607843137254903e-05,
+      "loss": 1.2116,
+      "step": 20
+    },
+    {
+      "epoch": 0.24449877750611246,
+      "grad_norm": 0.41832494735717773,
+      "learning_rate": 2.4509803921568626e-05,
+      "loss": 1.1424,
+      "step": 25
+    },
+    {
+      "epoch": 0.293398533007335,
+      "grad_norm": 0.41829216480255127,
+      "learning_rate": 2.9411764705882354e-05,
+      "loss": 1.143,
+      "step": 30
+    },
+    {
+      "epoch": 0.3422982885085575,
+      "grad_norm": 0.35495856404304504,
+      "learning_rate": 3.431372549019608e-05,
+      "loss": 1.0472,
+      "step": 35
+    },
+    {
+      "epoch": 0.39119804400978,
+      "grad_norm": 0.40399229526519775,
+      "learning_rate": 3.9215686274509805e-05,
+      "loss": 0.9879,
+      "step": 40
+    },
+    {
+      "epoch": 0.4400977995110024,
+      "grad_norm": 0.31430941820144653,
+      "learning_rate": 4.411764705882353e-05,
+      "loss": 0.9467,
+      "step": 45
+    },
+    {
+      "epoch": 0.4889975550122249,
+      "grad_norm": 0.29712405800819397,
+      "learning_rate": 4.901960784313725e-05,
+      "loss": 0.885,
+      "step": 50
+    },
+    {
+      "epoch": 0.5378973105134475,
+      "grad_norm": 0.40078112483024597,
+      "learning_rate": 5.392156862745098e-05,
+      "loss": 0.7973,
+      "step": 55
+    },
+    {
+      "epoch": 0.58679706601467,
+      "grad_norm": 0.34199750423431396,
+      "learning_rate": 5.882352941176471e-05,
+      "loss": 0.776,
+      "step": 60
+    },
+    {
+      "epoch": 0.6356968215158925,
+      "grad_norm": 0.4243955910205841,
+      "learning_rate": 6.372549019607843e-05,
+      "loss": 0.651,
+      "step": 65
+    },
+    {
+      "epoch": 0.684596577017115,
+      "grad_norm": 0.30432572960853577,
+      "learning_rate": 6.862745098039216e-05,
+      "loss": 0.5769,
+      "step": 70
+    },
+    {
+      "epoch": 0.7334963325183375,
+      "grad_norm": 0.27279356122016907,
+      "learning_rate": 7.352941176470589e-05,
+      "loss": 0.5436,
+      "step": 75
+    },
+    {
+      "epoch": 0.78239608801956,
+      "grad_norm": 0.2576221823692322,
+      "learning_rate": 7.843137254901961e-05,
+      "loss": 0.501,
+      "step": 80
+    },
+    {
+      "epoch": 0.8312958435207825,
+      "grad_norm": 0.22290439903736115,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 0.4915,
+      "step": 85
+    },
+    {
+      "epoch": 0.8801955990220048,
+      "grad_norm": 0.21740856766700745,
+      "learning_rate": 8.823529411764706e-05,
+      "loss": 0.4487,
+      "step": 90
+    },
+    {
+      "epoch": 0.9290953545232273,
+      "grad_norm": 0.21560043096542358,
+      "learning_rate": 9.313725490196079e-05,
+      "loss": 0.4351,
+      "step": 95
+    },
+    {
+      "epoch": 0.9779951100244498,
+      "grad_norm": 0.2607389986515045,
+      "learning_rate": 9.80392156862745e-05,
+      "loss": 0.4508,
+      "step": 100
+    },
+    {
+      "epoch": 0.9975550122249389,
+      "eval_loss": 0.44326770305633545,
+      "eval_runtime": 398.4802,
+      "eval_samples_per_second": 1.029,
+      "eval_steps_per_second": 1.029,
+      "step": 102
+    },
+    {
+      "epoch": 1.0268948655256724,
+      "grad_norm": 0.21888603270053864,
+      "learning_rate": 0.00010294117647058823,
+      "loss": 0.3968,
+      "step": 105
+    },
+    {
+      "epoch": 1.075794621026895,
+      "grad_norm": 0.21742066740989685,
+      "learning_rate": 0.00010784313725490196,
+      "loss": 0.3785,
+      "step": 110
+    },
+    {
+      "epoch": 1.1246943765281174,
+      "grad_norm": 0.2523965537548065,
+      "learning_rate": 0.0001127450980392157,
+      "loss": 0.4034,
+      "step": 115
+    },
+    {
+      "epoch": 1.17359413202934,
+      "grad_norm": 0.2155005782842636,
+      "learning_rate": 0.00011764705882352942,
+      "loss": 0.3766,
+      "step": 120
+    },
+    {
+      "epoch": 1.2224938875305624,
+      "grad_norm": 0.25576308369636536,
+      "learning_rate": 0.00012254901960784316,
+      "loss": 0.364,
+      "step": 125
+    },
+    {
+      "epoch": 1.271393643031785,
+      "grad_norm": 0.2288295179605484,
+      "learning_rate": 0.00012745098039215687,
+      "loss": 0.3451,
+      "step": 130
+    },
+    {
+      "epoch": 1.3202933985330074,
+      "grad_norm": 0.2045079469680786,
+      "learning_rate": 0.0001323529411764706,
+      "loss": 0.3425,
+      "step": 135
+    },
+    {
+      "epoch": 1.36919315403423,
+      "grad_norm": 0.2297014445066452,
+      "learning_rate": 0.0001372549019607843,
+      "loss": 0.3658,
+      "step": 140
+    },
+    {
+      "epoch": 1.4180929095354524,
+      "grad_norm": 0.2170581817626953,
+      "learning_rate": 0.00014215686274509804,
+      "loss": 0.3482,
+      "step": 145
+    },
+    {
+      "epoch": 1.466992665036675,
+      "grad_norm": 0.2250969409942627,
+      "learning_rate": 0.00014705882352941178,
+      "loss": 0.3353,
+      "step": 150
+    },
+    {
+      "epoch": 1.5158924205378974,
+      "grad_norm": 0.23191578686237335,
+      "learning_rate": 0.00015196078431372549,
+      "loss": 0.3271,
+      "step": 155
+    },
+    {
+      "epoch": 1.56479217603912,
+      "grad_norm": 0.2477528601884842,
+      "learning_rate": 0.00015686274509803922,
+      "loss": 0.3549,
+      "step": 160
+    },
+    {
+      "epoch": 1.6136919315403424,
+      "grad_norm": 0.20846064388751984,
+      "learning_rate": 0.00016176470588235295,
+      "loss": 0.3171,
+      "step": 165
+    },
+    {
+      "epoch": 1.662591687041565,
+      "grad_norm": 0.21829602122306824,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.3642,
+      "step": 170
+    },
+    {
+      "epoch": 1.7114914425427874,
+      "grad_norm": 0.22842282056808472,
+      "learning_rate": 0.0001715686274509804,
+      "loss": 0.3116,
+      "step": 175
+    },
+    {
+      "epoch": 1.76039119804401,
+      "grad_norm": 0.24106037616729736,
+      "learning_rate": 0.00017647058823529413,
+      "loss": 0.3066,
+      "step": 180
+    },
+    {
+      "epoch": 1.8092909535452324,
+      "grad_norm": 0.25696486234664917,
+      "learning_rate": 0.00018137254901960786,
+      "loss": 0.3053,
+      "step": 185
+    },
+    {
+      "epoch": 1.858190709046455,
+      "grad_norm": 0.22010771930217743,
+      "learning_rate": 0.00018627450980392157,
+      "loss": 0.3233,
+      "step": 190
+    },
+    {
+      "epoch": 1.9070904645476774,
+      "grad_norm": 0.2373352199792862,
+      "learning_rate": 0.0001911764705882353,
+      "loss": 0.3102,
+      "step": 195
+    },
+    {
+      "epoch": 1.9559902200488999,
+      "grad_norm": 0.21177123486995697,
+      "learning_rate": 0.000196078431372549,
+      "loss": 0.302,
+      "step": 200
+    },
+    {
+      "epoch": 1.9951100244498776,
+      "eval_loss": 0.3139691650867462,
+      "eval_runtime": 387.4792,
+      "eval_samples_per_second": 1.058,
+      "eval_steps_per_second": 1.058,
+      "step": 204
+    },
+    {
+      "epoch": 2.0048899755501224,
+      "grad_norm": 0.2193712592124939,
+      "learning_rate": 0.00019999985360565867,
+      "loss": 0.2813,
+      "step": 205
+    },
+    {
+      "epoch": 2.053789731051345,
+      "grad_norm": 0.3371932804584503,
+      "learning_rate": 0.00019999472984871732,
+      "loss": 0.2844,
+      "step": 210
+    },
+    {
+      "epoch": 2.1026894865525674,
+      "grad_norm": 0.23578821122646332,
+      "learning_rate": 0.00019998228680332932,
+      "loss": 0.263,
+      "step": 215
+    },
+    {
+      "epoch": 2.15158924205379,
+      "grad_norm": 0.27435311675071716,
+      "learning_rate": 0.00019996252538028507,
+      "loss": 0.2752,
+      "step": 220
+    },
+    {
+      "epoch": 2.2004889975550124,
+      "grad_norm": 0.24362725019454956,
+      "learning_rate": 0.00019993544702605638,
+      "loss": 0.2572,
+      "step": 225
+    },
+    {
+      "epoch": 2.249388753056235,
+      "grad_norm": 0.24360118806362152,
+      "learning_rate": 0.0001999010537226905,
+      "loss": 0.3191,
+      "step": 230
+    },
+    {
+      "epoch": 2.2982885085574574,
+      "grad_norm": 0.2612737715244293,
+      "learning_rate": 0.0001998593479876652,
+      "loss": 0.2506,
+      "step": 235
+    },
+    {
+      "epoch": 2.34718826405868,
+      "grad_norm": 0.21556636691093445,
+      "learning_rate": 0.00019981033287370443,
+      "loss": 0.2416,
+      "step": 240
+    },
+    {
+      "epoch": 2.3960880195599024,
+      "grad_norm": 0.22406277060508728,
+      "learning_rate": 0.00019975401196855482,
+      "loss": 0.273,
+      "step": 245
+    },
+    {
+      "epoch": 2.444987775061125,
+      "grad_norm": 0.3020350933074951,
+      "learning_rate": 0.00019969038939472315,
+      "loss": 0.2457,
+      "step": 250
+    },
+    {
+      "epoch": 2.4938875305623474,
+      "grad_norm": 0.20698243379592896,
+      "learning_rate": 0.00019961946980917456,
+      "loss": 0.2569,
+      "step": 255
+    },
+    {
+      "epoch": 2.54278728606357,
+      "grad_norm": 0.4294751286506653,
+      "learning_rate": 0.00019954125840299165,
+      "loss": 0.2246,
+      "step": 260
+    },
+    {
+      "epoch": 2.591687041564792,
+      "grad_norm": 0.37185847759246826,
+      "learning_rate": 0.00019945576090099452,
+      "loss": 0.229,
+      "step": 265
+    },
+    {
+      "epoch": 2.640586797066015,
+      "grad_norm": 0.2863105237483978,
+      "learning_rate": 0.00019936298356132176,
+      "loss": 0.2338,
+      "step": 270
+    },
+    {
+      "epoch": 2.689486552567237,
+      "grad_norm": 0.19301028549671173,
+      "learning_rate": 0.00019926293317497245,
+      "loss": 0.2167,
+      "step": 275
+    },
+    {
+      "epoch": 2.73838630806846,
+      "grad_norm": 0.22075964510440826,
+      "learning_rate": 0.00019915561706530883,
+      "loss": 0.2367,
+      "step": 280
+    },
+    {
+      "epoch": 2.787286063569682,
+      "grad_norm": 0.22829142212867737,
+      "learning_rate": 0.0001990410430875205,
+      "loss": 0.245,
+      "step": 285
+    },
+    {
+      "epoch": 2.836185819070905,
+      "grad_norm": 0.1982724666595459,
+      "learning_rate": 0.00019891921962804943,
+      "loss": 0.217,
+      "step": 290
+    },
+    {
+      "epoch": 2.885085574572127,
+      "grad_norm": 0.23672354221343994,
+      "learning_rate": 0.00019879015560397587,
+      "loss": 0.2298,
+      "step": 295
+    },
+    {
+      "epoch": 2.93398533007335,
+      "grad_norm": 0.21391943097114563,
+      "learning_rate": 0.00019865386046236596,
+      "loss": 0.2326,
+      "step": 300
+    },
+    {
+      "epoch": 2.982885085574572,
+      "grad_norm": 0.19821615517139435,
+      "learning_rate": 0.00019851034417958,
+      "loss": 0.2692,
+      "step": 305
+    },
+    {
+      "epoch": 2.9926650366748166,
+      "eval_loss": 0.2616053521633148,
+      "eval_runtime": 387.9813,
+      "eval_samples_per_second": 1.057,
+      "eval_steps_per_second": 1.057,
+      "step": 306
+    },
+    {
+      "epoch": 3.031784841075795,
+      "grad_norm": 0.22108572721481323,
+      "learning_rate": 0.0001983596172605423,
+      "loss": 0.2104,
+      "step": 310
+    },
+    {
+      "epoch": 3.0806845965770173,
+      "grad_norm": 0.24487629532814026,
+      "learning_rate": 0.00019820169073797228,
+      "loss": 0.1942,
+      "step": 315
+    },
+    {
+      "epoch": 3.12958435207824,
+      "grad_norm": 0.2110164612531662,
+      "learning_rate": 0.0001980365761715769,
+      "loss": 0.1833,
+      "step": 320
+    },
+    {
+      "epoch": 3.178484107579462,
+      "grad_norm": 0.20861805975437164,
+      "learning_rate": 0.0001978642856472045,
+      "loss": 0.1981,
+      "step": 325
+    },
+    {
+      "epoch": 3.227383863080685,
+      "grad_norm": 0.1969948559999466,
+      "learning_rate": 0.0001976848317759601,
+      "loss": 0.1868,
+      "step": 330
+    },
+    {
+      "epoch": 3.276283618581907,
+      "grad_norm": 0.19443638622760773,
+      "learning_rate": 0.0001974982276932824,
+      "loss": 0.1902,
+      "step": 335
+    },
+    {
+      "epoch": 3.32518337408313,
+      "grad_norm": 0.30058181285858154,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 0.2241,
+      "step": 340
+    },
+    {
+      "epoch": 3.374083129584352,
+      "grad_norm": 0.21647138893604279,
+      "learning_rate": 0.00019710362405124334,
+      "loss": 0.1838,
+      "step": 345
+    },
+    {
+      "epoch": 3.422982885085575,
+      "grad_norm": 0.19676022231578827,
+      "learning_rate": 0.00019689565337558288,
+      "loss": 0.1961,
+      "step": 350
+    },
+    {
+      "epoch": 3.471882640586797,
+      "grad_norm": 0.2371009737253189,
+      "learning_rate": 0.00019668059025377703,
+      "loss": 0.2052,
+      "step": 355
+    },
+    {
+      "epoch": 3.52078239608802,
+      "grad_norm": 0.21014133095741272,
+      "learning_rate": 0.00019645845042774553,
+      "loss": 0.1987,
+      "step": 360
+    },
+    {
+      "epoch": 3.569682151589242,
+      "grad_norm": 0.2178957760334015,
+      "learning_rate": 0.00019622925015739997,
+      "loss": 0.1903,
+      "step": 365
+    },
+    {
+      "epoch": 3.618581907090465,
+      "grad_norm": 0.21497862040996552,
+      "learning_rate": 0.0001959930062194534,
+      "loss": 0.2,
+      "step": 370
+    },
+    {
+      "epoch": 3.667481662591687,
+      "grad_norm": 0.22582682967185974,
+      "learning_rate": 0.00019574973590619243,
+      "loss": 0.1868,
+      "step": 375
+    },
+    {
+      "epoch": 3.71638141809291,
+      "grad_norm": 0.1827058047056198,
+      "learning_rate": 0.00019549945702421144,
+      "loss": 0.2172,
+      "step": 380
+    },
+    {
+      "epoch": 3.765281173594132,
+      "grad_norm": 0.19827835261821747,
+      "learning_rate": 0.00019524218789310912,
+      "loss": 0.1785,
+      "step": 385
+    },
+    {
+      "epoch": 3.8141809290953548,
+      "grad_norm": 0.2121572494506836,
+      "learning_rate": 0.0001949779473441478,
+      "loss": 0.1795,
+      "step": 390
+    },
+    {
+      "epoch": 3.863080684596577,
+      "grad_norm": 0.20197761058807373,
+      "learning_rate": 0.0001947067547188747,
+      "loss": 0.19,
+      "step": 395
+    },
+    {
+      "epoch": 3.9119804400977998,
+      "grad_norm": 0.21854069828987122,
+      "learning_rate": 0.00019442862986770646,
+      "loss": 0.1886,
+      "step": 400
+    },
+    {
+      "epoch": 3.960880195599022,
+      "grad_norm": 0.20974552631378174,
+      "learning_rate": 0.0001941435931484761,
+      "loss": 0.177,
+      "step": 405
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.24309584498405457,
+      "eval_runtime": 399.7773,
+      "eval_samples_per_second": 1.026,
+      "eval_steps_per_second": 1.026,
+      "step": 409
+    },
+    {
+      "epoch": 4.009779951100245,
+      "grad_norm": 0.19041913747787476,
+      "learning_rate": 0.0001938516654249428,
+      "loss": 0.1709,
+      "step": 410
+    },
+    {
+      "epoch": 4.058679706601467,
+      "grad_norm": 0.22610776126384735,
+      "learning_rate": 0.00019355286806526493,
+      "loss": 0.158,
+      "step": 415
+    },
+    {
+      "epoch": 4.10757946210269,
+      "grad_norm": 0.2044234424829483,
+      "learning_rate": 0.00019324722294043558,
+      "loss": 0.1522,
+      "step": 420
+    },
+    {
+      "epoch": 4.156479217603912,
+      "grad_norm": 0.2402704805135727,
+      "learning_rate": 0.00019293475242268223,
+      "loss": 0.1509,
+      "step": 425
+    },
+    {
+      "epoch": 4.205378973105135,
+      "grad_norm": 0.20224688947200775,
+      "learning_rate": 0.0001926154793838288,
+      "loss": 0.1565,
+      "step": 430
+    },
+    {
+      "epoch": 4.254278728606357,
+      "grad_norm": 0.21887710690498352,
+      "learning_rate": 0.00019228942719362143,
+      "loss": 0.1551,
+      "step": 435
+    },
+    {
+      "epoch": 4.30317848410758,
+      "grad_norm": 0.20886527001857758,
+      "learning_rate": 0.00019195661971801827,
+      "loss": 0.1568,
+      "step": 440
+    },
+    {
+      "epoch": 4.352078239608802,
+      "grad_norm": 0.21612216532230377,
+      "learning_rate": 0.00019161708131744222,
+      "loss": 0.1516,
+      "step": 445
+    },
+    {
+      "epoch": 4.400977995110025,
+      "grad_norm": 0.20036669075489044,
+      "learning_rate": 0.00019127083684499806,
+      "loss": 0.1529,
+      "step": 450
+    },
+    {
+      "epoch": 4.449877750611247,
+      "grad_norm": 0.3197900950908661,
+      "learning_rate": 0.00019091791164465305,
+      "loss": 0.1854,
+      "step": 455
+    },
+    {
+      "epoch": 4.49877750611247,
+      "grad_norm": 0.18851010501384735,
+      "learning_rate": 0.00019055833154938207,
+      "loss": 0.1574,
+      "step": 460
+    },
+    {
+      "epoch": 4.547677261613692,
+      "grad_norm": 0.214978888630867,
+      "learning_rate": 0.00019019212287927663,
+      "loss": 0.1555,
+      "step": 465
+    },
+    {
+      "epoch": 4.596577017114915,
+      "grad_norm": 0.21155217289924622,
+      "learning_rate": 0.00018981931243961824,
+      "loss": 0.176,
+      "step": 470
+    },
+    {
+      "epoch": 4.645476772616137,
+      "grad_norm": 0.18137674033641815,
+      "learning_rate": 0.00018943992751891653,
+      "loss": 0.1575,
+      "step": 475
+    },
+    {
+      "epoch": 4.69437652811736,
+      "grad_norm": 0.24663567543029785,
+      "learning_rate": 0.00018905399588691163,
+      "loss": 0.1568,
+      "step": 480
+    },
+    {
+      "epoch": 4.743276283618582,
+      "grad_norm": 0.19319510459899902,
+      "learning_rate": 0.0001886615457925417,
+      "loss": 0.1547,
+      "step": 485
+    },
+    {
+      "epoch": 4.792176039119805,
+      "grad_norm": 0.18611547350883484,
+      "learning_rate": 0.00018826260596187505,
+      "loss": 0.1755,
+      "step": 490
+    },
+    {
+      "epoch": 4.841075794621027,
+      "grad_norm": 0.47814473509788513,
+      "learning_rate": 0.00018785720559600752,
+      "loss": 0.1647,
+      "step": 495
+    },
+    {
+      "epoch": 4.88997555012225,
+      "grad_norm": 0.19350242614746094,
+      "learning_rate": 0.00018744537436892516,
+      "loss": 0.155,
+      "step": 500
+    },
+    {
+      "epoch": 4.938875305623472,
+      "grad_norm": 0.19956329464912415,
+      "learning_rate": 0.00018702714242533204,
+      "loss": 0.156,
+      "step": 505
+    },
+    {
+      "epoch": 4.987775061124695,
+      "grad_norm": 0.20709875226020813,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.1616,
+      "step": 510
+    },
+    {
+      "epoch": 4.997555012224939,
+      "eval_loss": 0.23624150454998016,
+      "eval_runtime": 387.7068,
+      "eval_samples_per_second": 1.058,
+      "eval_steps_per_second": 1.058,
+      "step": 511
+    },
+    {
+      "epoch": 5.036674816625917,
+      "grad_norm": 0.22790652513504028,
+      "learning_rate": 0.00018617159930774715,
+      "loss": 0.1377,
+      "step": 515
+    },
+    {
+      "epoch": 5.08557457212714,
+      "grad_norm": 0.21796418726444244,
+      "learning_rate": 0.00018573435075672424,
+      "loss": 0.1326,
+      "step": 520
+    },
+    {
+      "epoch": 5.134474327628362,
+      "grad_norm": 0.19105204939842224,
+      "learning_rate": 0.00018529082673054457,
+      "loss": 0.1303,
+      "step": 525
+    },
+    {
+      "epoch": 5.183374083129585,
+      "grad_norm": 0.2682870328426361,
+      "learning_rate": 0.00018484105969372182,
+      "loss": 0.1316,
+      "step": 530
+    },
+    {
+      "epoch": 5.232273838630807,
+      "grad_norm": 0.18370023369789124,
+      "learning_rate": 0.00018438508256773785,
+      "loss": 0.1323,
+      "step": 535
+    },
+    {
+      "epoch": 5.28117359413203,
+      "grad_norm": 0.24072639644145966,
+      "learning_rate": 0.00018392292872863267,
+      "loss": 0.1332,
+      "step": 540
+    },
+    {
+      "epoch": 5.330073349633252,
+      "grad_norm": 0.19523735344409943,
+      "learning_rate": 0.00018345463200456164,
+      "loss": 0.1344,
+      "step": 545
+    },
+    {
+      "epoch": 5.378973105134475,
+      "grad_norm": 0.24865508079528809,
+      "learning_rate": 0.0001829802266733193,
+      "loss": 0.1359,
+      "step": 550
+    },
+    {
+      "epoch": 5.427872860635697,
+      "grad_norm": 0.2039840966463089,
+      "learning_rate": 0.00018249974745983023,
+      "loss": 0.1337,
+      "step": 555
+    },
+    {
+      "epoch": 5.47677261613692,
+      "grad_norm": 0.20024679601192474,
+      "learning_rate": 0.00018201322953360758,
+      "loss": 0.154,
+      "step": 560
+    },
+    {
+      "epoch": 5.525672371638142,
+      "grad_norm": 0.1976476013660431,
+      "learning_rate": 0.0001815207085061784,
+      "loss": 0.1353,
+      "step": 565
+    },
+    {
+      "epoch": 5.574572127139365,
+      "grad_norm": 0.1974327266216278,
+      "learning_rate": 0.00018102222042847737,
+      "loss": 0.1373,
+      "step": 570
+    },
+    {
+      "epoch": 5.623471882640587,
+      "grad_norm": 0.27005520462989807,
+      "learning_rate": 0.00018051780178820765,
+      "loss": 0.1437,
+      "step": 575
+    },
+    {
+      "epoch": 5.67237163814181,
+      "grad_norm": 0.20781448483467102,
+      "learning_rate": 0.00018000748950717038,
+      "loss": 0.1322,
+      "step": 580
+    },
+    {
+      "epoch": 5.721271393643032,
+      "grad_norm": 0.20179703831672668,
+      "learning_rate": 0.000179491320938562,
+      "loss": 0.1378,
+      "step": 585
+    },
+    {
+      "epoch": 5.770171149144255,
+      "grad_norm": 0.22105282545089722,
+      "learning_rate": 0.00017896933386423998,
+      "loss": 0.136,
+      "step": 590
+    },
+    {
+      "epoch": 5.819070904645477,
+      "grad_norm": 0.4113224446773529,
+      "learning_rate": 0.00017844156649195759,
+      "loss": 0.1495,
+      "step": 595
+    },
+    {
+      "epoch": 5.8679706601467,
+      "grad_norm": 0.20451286435127258,
+      "learning_rate": 0.00017790805745256704,
+      "loss": 0.1318,
+      "step": 600
+    },
+    {
+      "epoch": 5.916870415647922,
+      "grad_norm": 0.18566569685935974,
+      "learning_rate": 0.0001773688457971919,
+      "loss": 0.1359,
+      "step": 605
+    },
+    {
+      "epoch": 5.965770171149144,
+      "grad_norm": 0.1862591803073883,
+      "learning_rate": 0.0001768239709943686,
+      "loss": 0.1358,
+      "step": 610
+    },
+    {
+      "epoch": 5.995110024449878,
+      "eval_loss": 0.23938237130641937,
+      "eval_runtime": 387.8478,
+      "eval_samples_per_second": 1.057,
+      "eval_steps_per_second": 1.057,
+      "step": 613
+    },
+    {
+      "epoch": 6.014669926650367,
+      "grad_norm": 0.16670842468738556,
+      "learning_rate": 0.0001762734729271575,
+      "loss": 0.1275,
+      "step": 615
+    },
+    {
+      "epoch": 6.06356968215159,
+      "grad_norm": 0.23901741206645966,
+      "learning_rate": 0.00017571739189022365,
+      "loss": 0.1113,
+      "step": 620
+    },
+    {
+      "epoch": 6.112469437652812,
+      "grad_norm": 0.19317218661308289,
+      "learning_rate": 0.00017515576858688722,
+      "loss": 0.1101,
+      "step": 625
+    },
+    {
+      "epoch": 6.161369193154035,
+      "grad_norm": 0.21369099617004395,
+      "learning_rate": 0.00017458864412614434,
+      "loss": 0.1122,
+      "step": 630
+    },
+    {
+      "epoch": 6.210268948655257,
+      "grad_norm": 0.21011659502983093,
+      "learning_rate": 0.00017401606001965782,
+      "loss": 0.1136,
+      "step": 635
+    },
+    {
+      "epoch": 6.25916870415648,
+      "grad_norm": 0.1860456019639969,
+      "learning_rate": 0.00017343805817871886,
+      "loss": 0.1305,
+      "step": 640
+    },
+    {
+      "epoch": 6.308068459657702,
+      "grad_norm": 0.23417602479457855,
+      "learning_rate": 0.00017285468091117904,
+      "loss": 0.1165,
+      "step": 645
+    },
+    {
+      "epoch": 6.356968215158924,
+      "grad_norm": 0.189472958445549,
+      "learning_rate": 0.00017226597091835378,
+      "loss": 0.119,
+      "step": 650
+    },
+    {
+      "epoch": 6.405867970660147,
+      "grad_norm": 0.2460348904132843,
+      "learning_rate": 0.00017167197129189652,
+      "loss": 0.1188,
+      "step": 655
+    },
+    {
+      "epoch": 6.45476772616137,
+      "grad_norm": 0.20059679448604584,
+      "learning_rate": 0.00017107272551064473,
+      "loss": 0.1194,
+      "step": 660
+    },
+    {
+      "epoch": 6.503667481662592,
+      "grad_norm": 0.19838838279247284,
+      "learning_rate": 0.00017046827743743726,
+      "loss": 0.1165,
+      "step": 665
+    },
+    {
+      "epoch": 6.552567237163814,
+      "grad_norm": 0.20280085504055023,
+      "learning_rate": 0.00016985867131590383,
+      "loss": 0.1168,
+      "step": 670
+    },
+    {
+      "epoch": 6.601466992665037,
+      "grad_norm": 0.27974265813827515,
+      "learning_rate": 0.00016924395176722647,
+      "loss": 0.122,
+      "step": 675
+    },
+    {
+      "epoch": 6.65036674816626,
+      "grad_norm": 0.1994495540857315,
+      "learning_rate": 0.0001686241637868734,
+      "loss": 0.1173,
+      "step": 680
+    },
+    {
+      "epoch": 6.699266503667482,
+      "grad_norm": 0.20043040812015533,
+      "learning_rate": 0.00016799935274130546,
+      "loss": 0.1183,
+      "step": 685
+    },
+    {
+      "epoch": 6.748166259168704,
+      "grad_norm": 0.19184747338294983,
+      "learning_rate": 0.00016736956436465573,
+      "loss": 0.1192,
+      "step": 690
+    },
+    {
+      "epoch": 6.797066014669927,
+      "grad_norm": 0.20747938752174377,
+      "learning_rate": 0.00016673484475538146,
+      "loss": 0.1188,
+      "step": 695
+    },
+    {
+      "epoch": 6.84596577017115,
+      "grad_norm": 0.19285354018211365,
+      "learning_rate": 0.00016609524037289019,
+      "loss": 0.117,
+      "step": 700
+    },
+    {
+      "epoch": 6.894865525672372,
+      "grad_norm": 0.18242338299751282,
+      "learning_rate": 0.00016545079803413892,
+      "loss": 0.1208,
+      "step": 705
+    },
+    {
+      "epoch": 6.943765281173594,
+      "grad_norm": 0.19887416064739227,
+      "learning_rate": 0.00016480156491020727,
+      "loss": 0.1227,
+      "step": 710
+    },
+    {
+      "epoch": 6.992665036674817,
+      "grad_norm": 0.19773922860622406,
+      "learning_rate": 0.00016414758852284478,
+      "loss": 0.1199,
+      "step": 715
+    },
+    {
+      "epoch": 6.992665036674817,
+      "eval_loss": 0.24741248786449432,
+      "eval_runtime": 387.335,
+      "eval_samples_per_second": 1.059,
+      "eval_steps_per_second": 1.059,
+      "step": 715
+    },
+    {
+      "epoch": 7.041564792176039,
+      "grad_norm": 0.5106807351112366,
+      "learning_rate": 0.0001634889167409923,
+      "loss": 0.1051,
+      "step": 720
+    },
+    {
+      "epoch": 7.090464547677262,
+      "grad_norm": 0.18619847297668457,
+      "learning_rate": 0.0001628255977772784,
+      "loss": 0.0979,
+      "step": 725
+    },
+    {
+      "epoch": 7.139364303178484,
+      "grad_norm": 0.18676620721817017,
+      "learning_rate": 0.00016215768018449012,
+      "loss": 0.1009,
+      "step": 730
+    },
+    {
+      "epoch": 7.188264058679707,
+      "grad_norm": 0.2054695338010788,
+      "learning_rate": 0.00016148521285201927,
+      "loss": 0.1002,
+      "step": 735
+    },
+    {
+      "epoch": 7.237163814180929,
+      "grad_norm": 0.20496530830860138,
+      "learning_rate": 0.00016080824500228367,
+      "loss": 0.1011,
+      "step": 740
+    },
+    {
+      "epoch": 7.286063569682152,
+      "grad_norm": 0.18679122626781464,
+      "learning_rate": 0.0001601268261871244,
+      "loss": 0.1052,
+      "step": 745
+    },
+    {
+      "epoch": 7.334963325183374,
+      "grad_norm": 0.20614224672317505,
+      "learning_rate": 0.00015944100628417868,
+      "loss": 0.1021,
+      "step": 750
+    },
+    {
+      "epoch": 7.383863080684597,
+      "grad_norm": 0.20026642084121704,
+      "learning_rate": 0.00015875083549322908,
+      "loss": 0.1019,
+      "step": 755
+    },
+    {
+      "epoch": 7.432762836185819,
+      "grad_norm": 0.1852520853281021,
+      "learning_rate": 0.00015805636433252891,
+      "loss": 0.1028,
+      "step": 760
+    },
+    {
+      "epoch": 7.481662591687042,
+      "grad_norm": 0.19096429646015167,
+      "learning_rate": 0.0001573576436351046,
+      "loss": 0.1031,
+      "step": 765
+    },
+    {
+      "epoch": 7.530562347188264,
+      "grad_norm": 0.18263529241085052,
+      "learning_rate": 0.00015665472454503483,
+      "loss": 0.1033,
+      "step": 770
+    },
+    {
+      "epoch": 7.579462102689487,
+      "grad_norm": 0.1884106546640396,
+      "learning_rate": 0.00015594765851370684,
+      "loss": 0.1063,
+      "step": 775
+    },
+    {
+      "epoch": 7.628361858190709,
+      "grad_norm": 0.2005338817834854,
+      "learning_rate": 0.0001552364972960506,
+      "loss": 0.1054,
+      "step": 780
+    },
+    {
+      "epoch": 7.677261613691932,
+      "grad_norm": 0.184016153216362,
+      "learning_rate": 0.0001545212929467503,
+      "loss": 0.1048,
+      "step": 785
+    },
+    {
+      "epoch": 7.726161369193154,
+      "grad_norm": 0.19765067100524902,
+      "learning_rate": 0.0001538020978164341,
+      "loss": 0.1044,
+      "step": 790
+    },
+    {
+      "epoch": 7.775061124694377,
+      "grad_norm": 0.18265607953071594,
+      "learning_rate": 0.0001530789645478426,
+      "loss": 0.1051,
+      "step": 795
+    },
+    {
+      "epoch": 7.823960880195599,
+      "grad_norm": 0.19815443456172943,
+      "learning_rate": 0.00015235194607197508,
+      "loss": 0.1081,
+      "step": 800
+    },
+    {
+      "epoch": 7.872860635696822,
+      "grad_norm": 0.22219662368297577,
+      "learning_rate": 0.0001516210956042153,
+      "loss": 0.1071,
+      "step": 805
+    },
+    {
+      "epoch": 7.921760391198044,
+      "grad_norm": 0.20078670978546143,
+      "learning_rate": 0.0001508864666404365,
+      "loss": 0.1075,
+      "step": 810
+    },
+    {
+      "epoch": 7.970660146699267,
+      "grad_norm": 0.17794115841388702,
+      "learning_rate": 0.00015014811295308543,
+      "loss": 0.1051,
+      "step": 815
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.2625426948070526,
+      "eval_runtime": 387.4946,
+      "eval_samples_per_second": 1.058,
+      "eval_steps_per_second": 1.058,
+      "step": 818
+    },
+    {
+      "epoch": 8.01955990220049,
+      "grad_norm": 0.1608039289712906,
+      "learning_rate": 0.0001494060885872464,
+      "loss": 0.0994,
+      "step": 820
+    },
+    {
+      "epoch": 8.06845965770171,
+      "grad_norm": 0.2323434203863144,
+      "learning_rate": 0.00014866044785668563,
+      "loss": 0.0895,
+      "step": 825
+    },
+    {
+      "epoch": 8.117359413202934,
+      "grad_norm": 0.17606528103351593,
+      "learning_rate": 0.0001479112453398753,
+      "loss": 0.0849,
+      "step": 830
+    },
+    {
+      "epoch": 8.166259168704157,
+      "grad_norm": 0.19025173783302307,
+      "learning_rate": 0.0001471585358759987,
+      "loss": 0.0886,
+      "step": 835
+    },
+    {
+      "epoch": 8.21515892420538,
+      "grad_norm": 0.1990627497434616,
+      "learning_rate": 0.00014640237456093634,
+      "loss": 0.0905,
+      "step": 840
+    },
+    {
+      "epoch": 8.2640586797066,
+      "grad_norm": 0.1725684553384781,
+      "learning_rate": 0.00014564281674323297,
+      "loss": 0.0899,
+      "step": 845
+    },
+    {
+      "epoch": 8.312958435207824,
+      "grad_norm": 0.18845060467720032,
+      "learning_rate": 0.00014487991802004623,
+      "loss": 0.0886,
+      "step": 850
+    },
+    {
+      "epoch": 8.361858190709047,
+      "grad_norm": 0.23856212198734283,
+      "learning_rate": 0.00014411373423307714,
+      "loss": 0.0924,
+      "step": 855
+    },
+    {
+      "epoch": 8.41075794621027,
+      "grad_norm": 0.18084120750427246,
+      "learning_rate": 0.00014334432146448272,
+      "loss": 0.0918,
+      "step": 860
+    },
+    {
+      "epoch": 8.45965770171149,
+      "grad_norm": 0.18600909411907196,
+      "learning_rate": 0.00014257173603277095,
+      "loss": 0.0913,
+      "step": 865
+    },
+    {
+      "epoch": 8.508557457212714,
+      "grad_norm": 0.1851680874824524,
+      "learning_rate": 0.00014179603448867835,
+      "loss": 0.0912,
+      "step": 870
+    },
+    {
+      "epoch": 8.557457212713937,
+      "grad_norm": 0.1818709820508957,
+      "learning_rate": 0.00014101727361103076,
+      "loss": 0.0903,
+      "step": 875
+    },
+    {
+      "epoch": 8.60635696821516,
+      "grad_norm": 0.19458520412445068,
+      "learning_rate": 0.00014023551040258725,
+      "loss": 0.0916,
+      "step": 880
+    },
+    {
+      "epoch": 8.65525672371638,
+      "grad_norm": 0.17777447402477264,
+      "learning_rate": 0.00013945080208586775,
+      "loss": 0.0928,
+      "step": 885
+    },
+    {
+      "epoch": 8.704156479217604,
+      "grad_norm": 0.20647075772285461,
+      "learning_rate": 0.00013866320609896447,
+      "loss": 0.0926,
+      "step": 890
+    },
+    {
+      "epoch": 8.753056234718827,
+      "grad_norm": 0.18589670956134796,
+      "learning_rate": 0.00013787278009133776,
+      "loss": 0.0934,
+      "step": 895
+    },
+    {
+      "epoch": 8.80195599022005,
+      "grad_norm": 0.19582615792751312,
+      "learning_rate": 0.00013707958191959608,
+      "loss": 0.0954,
+      "step": 900
+    },
+    {
+      "epoch": 8.85085574572127,
+      "grad_norm": 0.19688870012760162,
+      "learning_rate": 0.00013628366964326153,
+      "loss": 0.0925,
+      "step": 905
+    },
+    {
+      "epoch": 8.899755501222494,
+      "grad_norm": 0.1874823123216629,
+      "learning_rate": 0.00013548510152051963,
+      "loss": 0.0939,
+      "step": 910
+    },
+    {
+      "epoch": 8.948655256723717,
+      "grad_norm": 0.1876133382320404,
+      "learning_rate": 0.00013468393600395525,
+      "loss": 0.097,
+      "step": 915
+    },
+    {
+      "epoch": 8.99755501222494,
+      "grad_norm": 0.1735718548297882,
+      "learning_rate": 0.00013388023173627414,
+      "loss": 0.0945,
+      "step": 920
+    },
+    {
+      "epoch": 8.99755501222494,
+      "eval_loss": 0.27974453568458557,
+      "eval_runtime": 387.9073,
+      "eval_samples_per_second": 1.057,
+      "eval_steps_per_second": 1.057,
+      "step": 920
+    },
+    {
+      "epoch": 9.04645476772616,
+      "grad_norm": 0.1655295491218567,
+      "learning_rate": 0.00013307404754601013,
+      "loss": 0.0806,
+      "step": 925
+    },
+    {
+      "epoch": 9.095354523227384,
+      "grad_norm": 0.19395217299461365,
+      "learning_rate": 0.0001322654424432195,
+      "loss": 0.0788,
+      "step": 930
+    },
+    {
+      "epoch": 9.144254278728607,
+      "grad_norm": 0.18941174447536469,
+      "learning_rate": 0.00013145447561516138,
+      "loss": 0.0793,
+      "step": 935
+    },
+    {
+      "epoch": 9.19315403422983,
+      "grad_norm": 0.20010443031787872,
+      "learning_rate": 0.00013064120642196548,
+      "loss": 0.0807,
+      "step": 940
+    },
+    {
+      "epoch": 9.24205378973105,
+      "grad_norm": 0.20777645707130432,
+      "learning_rate": 0.00012982569439228713,
+      "loss": 0.08,
+      "step": 945
+    },
+    {
+      "epoch": 9.290953545232274,
+      "grad_norm": 0.173665389418602,
+      "learning_rate": 0.00012900799921895003,
+      "loss": 0.0808,
+      "step": 950
+    },
+    {
+      "epoch": 9.339853300733497,
+      "grad_norm": 0.20865468680858612,
+      "learning_rate": 0.0001281881807545769,
+      "loss": 0.0808,
+      "step": 955
+    },
+    {
+      "epoch": 9.38875305623472,
+      "grad_norm": 0.18372130393981934,
+      "learning_rate": 0.0001273662990072083,
+      "loss": 0.0804,
+      "step": 960
+    },
+    {
+      "epoch": 9.43765281173594,
+      "grad_norm": 0.1785283237695694,
+      "learning_rate": 0.00012654241413591054,
+      "loss": 0.0812,
+      "step": 965
+    },
+    {
+      "epoch": 9.486552567237164,
+      "grad_norm": 0.17695043981075287,
+      "learning_rate": 0.000125716586446372,
+      "loss": 0.0827,
+      "step": 970
+    },
+    {
+      "epoch": 9.535452322738386,
+      "grad_norm": 0.18287776410579681,
+      "learning_rate": 0.00012488887638648907,
+      "loss": 0.083,
+      "step": 975
+    },
+    {
+      "epoch": 9.58435207823961,
+      "grad_norm": 0.20748884975910187,
+      "learning_rate": 0.00012405934454194146,
+      "loss": 0.0816,
+      "step": 980
+    },
+    {
+      "epoch": 9.63325183374083,
+      "grad_norm": 0.18160052597522736,
+      "learning_rate": 0.00012322805163175762,
+      "loss": 0.0823,
+      "step": 985
+    },
+    {
+      "epoch": 9.682151589242054,
+      "grad_norm": 0.17889925837516785,
+      "learning_rate": 0.0001223950585038703,
+      "loss": 0.0822,
+      "step": 990
+    },
+    {
+      "epoch": 9.731051344743276,
+      "grad_norm": 0.1896965056657791,
+      "learning_rate": 0.00012156042613066258,
+      "loss": 0.0839,
+      "step": 995
+    },
+    {
+      "epoch": 9.7799511002445,
+      "grad_norm": 0.19203361868858337,
+      "learning_rate": 0.00012072421560450497,
+      "loss": 0.0828,
+      "step": 1000
+    },
+    {
+      "epoch": 9.82885085574572,
+      "grad_norm": 0.18262554705142975,
+      "learning_rate": 0.00011988648813328367,
+      "loss": 0.0838,
+      "step": 1005
+    },
+    {
+      "epoch": 9.877750611246944,
+      "grad_norm": 0.18471267819404602,
+      "learning_rate": 0.0001190473050359203,
+      "loss": 0.084,
+      "step": 1010
+    },
+    {
+      "epoch": 9.926650366748166,
+      "grad_norm": 0.18675756454467773,
+      "learning_rate": 0.00011820672773788353,
+      "loss": 0.0835,
+      "step": 1015
+    },
+    {
+      "epoch": 9.97555012224939,
+      "grad_norm": 0.17983846366405487,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.0843,
+      "step": 1020
+    },
+    {
+      "epoch": 9.995110024449877,
+      "eval_loss": 0.2892283499240875,
+      "eval_runtime": 389.1699,
+      "eval_samples_per_second": 1.054,
+      "eval_steps_per_second": 1.054,
+      "step": 1022
+    },
+    {
+      "epoch": 9.995110024449877,
+      "step": 1022,
+      "total_flos": 7.585797735459062e+17,
+      "train_loss": 0.2326957560244605,
+      "train_runtime": 28787.7929,
+      "train_samples_per_second": 0.568,
+      "train_steps_per_second": 0.071
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2040,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 5,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.585797735459062e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}