Model save

Browse files

Files changed (5) hide show

README.md +70 -0
adapter_model.safetensors +1 -1
all_results.json +9 -0
train_results.json +9 -0
trainer_state.json +2682 -0

README.md ADDED Viewed

	@@ -0,0 +1,70 @@

+---
+license: llama3
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: meta-llama/Meta-Llama-3-8B-Instruct
+datasets:
+- generator
+model-index:
+- name: Meta-Llama-3-8B-Instruct
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/statking/huggingface/runs/1d81qcom)
+# Meta-Llama-3-8B-Instruct
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.1189
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 8
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.1206        | 0.9997 | 1877 | 1.1189          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.41.0.dev0
+- Pytorch 2.3.0+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c761c6247d33033738085a83b9a8a43555f98ec1b1d6b5da9ac7b7cc643b0a3
 size 2185327392

 version https://git-lfs.github.com/spec/v1
+oid sha256:38de2d88d470a960573d4f858dc17b44a1a293b92f99ddf486879bf39164c766
 size 2185327392

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997336884154461,
+    "total_flos": 2.979798729936077e+16,
+    "train_loss": 1.1429596533467938,
+    "train_runtime": 55739.5179,
+    "train_samples": 207864,
+    "train_samples_per_second": 2.156,
+    "train_steps_per_second": 0.034
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9997336884154461,
+    "total_flos": 2.979798729936077e+16,
+    "train_loss": 1.1429596533467938,
+    "train_runtime": 55739.5179,
+    "train_samples": 207864,
+    "train_samples_per_second": 2.156,
+    "train_steps_per_second": 0.034
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2682 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997336884154461,
+  "eval_steps": 500,
+  "global_step": 1877,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005326231691078562,
+      "grad_norm": 0.5528136747380069,
+      "learning_rate": 1.0638297872340427e-06,
+      "loss": 1.7189,
+      "step": 1
+    },
+    {
+      "epoch": 0.002663115845539281,
+      "grad_norm": 0.5731888996610293,
+      "learning_rate": 5.319148936170213e-06,
+      "loss": 1.7582,
+      "step": 5
+    },
+    {
+      "epoch": 0.005326231691078562,
+      "grad_norm": 0.5597088455341374,
+      "learning_rate": 1.0638297872340426e-05,
+      "loss": 1.7331,
+      "step": 10
+    },
+    {
+      "epoch": 0.007989347536617843,
+      "grad_norm": 0.6373028746703387,
+      "learning_rate": 1.595744680851064e-05,
+      "loss": 1.7083,
+      "step": 15
+    },
+    {
+      "epoch": 0.010652463382157125,
+      "grad_norm": 0.6190956100247592,
+      "learning_rate": 2.1276595744680852e-05,
+      "loss": 1.6453,
+      "step": 20
+    },
+    {
+      "epoch": 0.013315579227696404,
+      "grad_norm": 0.5346124338518666,
+      "learning_rate": 2.6595744680851064e-05,
+      "loss": 1.5715,
+      "step": 25
+    },
+    {
+      "epoch": 0.015978695073235686,
+      "grad_norm": 0.25993843509864667,
+      "learning_rate": 3.191489361702128e-05,
+      "loss": 1.4359,
+      "step": 30
+    },
+    {
+      "epoch": 0.018641810918774968,
+      "grad_norm": 0.45126098872878045,
+      "learning_rate": 3.723404255319149e-05,
+      "loss": 1.4122,
+      "step": 35
+    },
+    {
+      "epoch": 0.02130492676431425,
+      "grad_norm": 0.16268004246261306,
+      "learning_rate": 4.2553191489361704e-05,
+      "loss": 1.3378,
+      "step": 40
+    },
+    {
+      "epoch": 0.023968042609853527,
+      "grad_norm": 0.16755798993478008,
+      "learning_rate": 4.787234042553192e-05,
+      "loss": 1.3315,
+      "step": 45
+    },
+    {
+      "epoch": 0.02663115845539281,
+      "grad_norm": 0.14666977328343594,
+      "learning_rate": 5.319148936170213e-05,
+      "loss": 1.3259,
+      "step": 50
+    },
+    {
+      "epoch": 0.02929427430093209,
+      "grad_norm": 0.10629659674680719,
+      "learning_rate": 5.851063829787234e-05,
+      "loss": 1.288,
+      "step": 55
+    },
+    {
+      "epoch": 0.03195739014647137,
+      "grad_norm": 0.11263809733941188,
+      "learning_rate": 6.382978723404256e-05,
+      "loss": 1.2763,
+      "step": 60
+    },
+    {
+      "epoch": 0.03462050599201065,
+      "grad_norm": 0.14445654795383106,
+      "learning_rate": 6.914893617021277e-05,
+      "loss": 1.256,
+      "step": 65
+    },
+    {
+      "epoch": 0.037283621837549935,
+      "grad_norm": 0.07679775841947081,
+      "learning_rate": 7.446808510638298e-05,
+      "loss": 1.2481,
+      "step": 70
+    },
+    {
+      "epoch": 0.03994673768308921,
+      "grad_norm": 0.07097172061630208,
+      "learning_rate": 7.978723404255319e-05,
+      "loss": 1.2417,
+      "step": 75
+    },
+    {
+      "epoch": 0.0426098535286285,
+      "grad_norm": 0.07545648577598915,
+      "learning_rate": 8.510638297872341e-05,
+      "loss": 1.2048,
+      "step": 80
+    },
+    {
+      "epoch": 0.045272969374167776,
+      "grad_norm": 0.06657701317469632,
+      "learning_rate": 9.042553191489363e-05,
+      "loss": 1.2138,
+      "step": 85
+    },
+    {
+      "epoch": 0.047936085219707054,
+      "grad_norm": 0.08276972517588223,
+      "learning_rate": 9.574468085106384e-05,
+      "loss": 1.2267,
+      "step": 90
+    },
+    {
+      "epoch": 0.05059920106524634,
+      "grad_norm": 0.14086227463755532,
+      "learning_rate": 0.00010106382978723406,
+      "loss": 1.2185,
+      "step": 95
+    },
+    {
+      "epoch": 0.05326231691078562,
+      "grad_norm": 0.08222057468849275,
+      "learning_rate": 0.00010638297872340425,
+      "loss": 1.2021,
+      "step": 100
+    },
+    {
+      "epoch": 0.0559254327563249,
+      "grad_norm": 0.06741247738810993,
+      "learning_rate": 0.00011170212765957446,
+      "loss": 1.1957,
+      "step": 105
+    },
+    {
+      "epoch": 0.05858854860186418,
+      "grad_norm": 0.07692927491859038,
+      "learning_rate": 0.00011702127659574468,
+      "loss": 1.1901,
+      "step": 110
+    },
+    {
+      "epoch": 0.06125166444740346,
+      "grad_norm": 0.08953054401601632,
+      "learning_rate": 0.0001223404255319149,
+      "loss": 1.2002,
+      "step": 115
+    },
+    {
+      "epoch": 0.06391478029294274,
+      "grad_norm": 0.066045987418387,
+      "learning_rate": 0.00012765957446808513,
+      "loss": 1.2086,
+      "step": 120
+    },
+    {
+      "epoch": 0.06657789613848203,
+      "grad_norm": 0.06476207146640194,
+      "learning_rate": 0.00013297872340425532,
+      "loss": 1.215,
+      "step": 125
+    },
+    {
+      "epoch": 0.0692410119840213,
+      "grad_norm": 0.07334327374343644,
+      "learning_rate": 0.00013829787234042554,
+      "loss": 1.181,
+      "step": 130
+    },
+    {
+      "epoch": 0.07190412782956059,
+      "grad_norm": 0.071815699820189,
+      "learning_rate": 0.00014361702127659576,
+      "loss": 1.1795,
+      "step": 135
+    },
+    {
+      "epoch": 0.07456724367509987,
+      "grad_norm": 0.08268574123602224,
+      "learning_rate": 0.00014893617021276596,
+      "loss": 1.1892,
+      "step": 140
+    },
+    {
+      "epoch": 0.07723035952063914,
+      "grad_norm": 0.07585606469879155,
+      "learning_rate": 0.00015425531914893618,
+      "loss": 1.173,
+      "step": 145
+    },
+    {
+      "epoch": 0.07989347536617843,
+      "grad_norm": 0.06900850276332868,
+      "learning_rate": 0.00015957446808510637,
+      "loss": 1.1889,
+      "step": 150
+    },
+    {
+      "epoch": 0.08255659121171771,
+      "grad_norm": 0.07702070923317432,
+      "learning_rate": 0.00016489361702127662,
+      "loss": 1.1705,
+      "step": 155
+    },
+    {
+      "epoch": 0.085219707057257,
+      "grad_norm": 0.0724505634260966,
+      "learning_rate": 0.00017021276595744682,
+      "loss": 1.1801,
+      "step": 160
+    },
+    {
+      "epoch": 0.08788282290279627,
+      "grad_norm": 0.07762044850846143,
+      "learning_rate": 0.000175531914893617,
+      "loss": 1.1863,
+      "step": 165
+    },
+    {
+      "epoch": 0.09054593874833555,
+      "grad_norm": 0.0825988729050522,
+      "learning_rate": 0.00018085106382978726,
+      "loss": 1.1687,
+      "step": 170
+    },
+    {
+      "epoch": 0.09320905459387484,
+      "grad_norm": 0.08362625552402488,
+      "learning_rate": 0.00018617021276595746,
+      "loss": 1.1875,
+      "step": 175
+    },
+    {
+      "epoch": 0.09587217043941411,
+      "grad_norm": 0.07968629276225715,
+      "learning_rate": 0.00019148936170212768,
+      "loss": 1.1629,
+      "step": 180
+    },
+    {
+      "epoch": 0.0985352862849534,
+      "grad_norm": 0.09010747938874886,
+      "learning_rate": 0.00019680851063829787,
+      "loss": 1.1682,
+      "step": 185
+    },
+    {
+      "epoch": 0.10119840213049268,
+      "grad_norm": 0.10276104499280464,
+      "learning_rate": 0.00019999930805760402,
+      "loss": 1.1618,
+      "step": 190
+    },
+    {
+      "epoch": 0.10386151797603196,
+      "grad_norm": 0.07684956533429005,
+      "learning_rate": 0.00019999152381561955,
+      "loss": 1.1902,
+      "step": 195
+    },
+    {
+      "epoch": 0.10652463382157124,
+      "grad_norm": 0.09365634557072464,
+      "learning_rate": 0.0001999750910791767,
+      "loss": 1.1673,
+      "step": 200
+    },
+    {
+      "epoch": 0.10918774966711052,
+      "grad_norm": 0.08320439951882774,
+      "learning_rate": 0.00019995001126958025,
+      "loss": 1.1845,
+      "step": 205
+    },
+    {
+      "epoch": 0.1118508655126498,
+      "grad_norm": 0.08595161594886752,
+      "learning_rate": 0.00019991628655604003,
+      "loss": 1.1444,
+      "step": 210
+    },
+    {
+      "epoch": 0.11451398135818908,
+      "grad_norm": 0.07678856693871118,
+      "learning_rate": 0.00019987391985548328,
+      "loss": 1.1724,
+      "step": 215
+    },
+    {
+      "epoch": 0.11717709720372836,
+      "grad_norm": 0.07433277571881601,
+      "learning_rate": 0.0001998229148323023,
+      "loss": 1.1469,
+      "step": 220
+    },
+    {
+      "epoch": 0.11984021304926765,
+      "grad_norm": 0.11001095605643386,
+      "learning_rate": 0.00019976327589803767,
+      "loss": 1.1383,
+      "step": 225
+    },
+    {
+      "epoch": 0.12250332889480692,
+      "grad_norm": 0.0784178760835021,
+      "learning_rate": 0.0001996950082109965,
+      "loss": 1.1818,
+      "step": 230
+    },
+    {
+      "epoch": 0.12516644474034622,
+      "grad_norm": 0.08047194112395492,
+      "learning_rate": 0.00019961811767580648,
+      "loss": 1.1445,
+      "step": 235
+    },
+    {
+      "epoch": 0.1278295605858855,
+      "grad_norm": 0.0670667235371544,
+      "learning_rate": 0.0001995326109429049,
+      "loss": 1.1741,
+      "step": 240
+    },
+    {
+      "epoch": 0.13049267643142476,
+      "grad_norm": 0.07072589446768075,
+      "learning_rate": 0.00019943849540796375,
+      "loss": 1.157,
+      "step": 245
+    },
+    {
+      "epoch": 0.13315579227696406,
+      "grad_norm": 0.07466892570841129,
+      "learning_rate": 0.0001993357792112498,
+      "loss": 1.125,
+      "step": 250
+    },
+    {
+      "epoch": 0.13581890812250333,
+      "grad_norm": 0.07302104613788317,
+      "learning_rate": 0.0001992244712369207,
+      "loss": 1.1615,
+      "step": 255
+    },
+    {
+      "epoch": 0.1384820239680426,
+      "grad_norm": 0.07211635352591637,
+      "learning_rate": 0.00019910458111225646,
+      "loss": 1.1441,
+      "step": 260
+    },
+    {
+      "epoch": 0.1411451398135819,
+      "grad_norm": 0.07103357444221702,
+      "learning_rate": 0.00019897611920682677,
+      "loss": 1.1493,
+      "step": 265
+    },
+    {
+      "epoch": 0.14380825565912117,
+      "grad_norm": 0.0698227187710226,
+      "learning_rate": 0.00019883909663159424,
+      "loss": 1.1568,
+      "step": 270
+    },
+    {
+      "epoch": 0.14647137150466044,
+      "grad_norm": 0.07137557168765225,
+      "learning_rate": 0.0001986935252379532,
+      "loss": 1.171,
+      "step": 275
+    },
+    {
+      "epoch": 0.14913448735019974,
+      "grad_norm": 0.07605080544337586,
+      "learning_rate": 0.00019853941761670483,
+      "loss": 1.1623,
+      "step": 280
+    },
+    {
+      "epoch": 0.151797603195739,
+      "grad_norm": 0.09532848101140429,
+      "learning_rate": 0.00019837678709696798,
+      "loss": 1.1888,
+      "step": 285
+    },
+    {
+      "epoch": 0.15446071904127828,
+      "grad_norm": 0.07485256895909924,
+      "learning_rate": 0.00019820564774502644,
+      "loss": 1.1483,
+      "step": 290
+    },
+    {
+      "epoch": 0.15712383488681758,
+      "grad_norm": 0.07483378156117482,
+      "learning_rate": 0.0001980260143631122,
+      "loss": 1.1375,
+      "step": 295
+    },
+    {
+      "epoch": 0.15978695073235685,
+      "grad_norm": 0.07954155407009747,
+      "learning_rate": 0.00019783790248812533,
+      "loss": 1.1696,
+      "step": 300
+    },
+    {
+      "epoch": 0.16245006657789615,
+      "grad_norm": 0.08502452471103343,
+      "learning_rate": 0.00019764132839029,
+      "loss": 1.168,
+      "step": 305
+    },
+    {
+      "epoch": 0.16511318242343542,
+      "grad_norm": 0.08384910068571033,
+      "learning_rate": 0.00019743630907174725,
+      "loss": 1.1659,
+      "step": 310
+    },
+    {
+      "epoch": 0.1677762982689747,
+      "grad_norm": 0.06905463640642404,
+      "learning_rate": 0.0001972228622650846,
+      "loss": 1.1612,
+      "step": 315
+    },
+    {
+      "epoch": 0.170439414114514,
+      "grad_norm": 0.19257912301232658,
+      "learning_rate": 0.0001970010064318021,
+      "loss": 1.1517,
+      "step": 320
+    },
+    {
+      "epoch": 0.17310252996005326,
+      "grad_norm": 0.0793114626498931,
+      "learning_rate": 0.00019677076076071566,
+      "loss": 1.1385,
+      "step": 325
+    },
+    {
+      "epoch": 0.17576564580559254,
+      "grad_norm": 0.07393026070000318,
+      "learning_rate": 0.00019653214516629735,
+      "loss": 1.1426,
+      "step": 330
+    },
+    {
+      "epoch": 0.17842876165113183,
+      "grad_norm": 0.08179432509124362,
+      "learning_rate": 0.00019628518028695307,
+      "loss": 1.1104,
+      "step": 335
+    },
+    {
+      "epoch": 0.1810918774966711,
+      "grad_norm": 0.09735291608528279,
+      "learning_rate": 0.00019602988748323717,
+      "loss": 1.1563,
+      "step": 340
+    },
+    {
+      "epoch": 0.18375499334221038,
+      "grad_norm": 0.06743724715009518,
+      "learning_rate": 0.00019576628883600535,
+      "loss": 1.1406,
+      "step": 345
+    },
+    {
+      "epoch": 0.18641810918774968,
+      "grad_norm": 0.075326384879952,
+      "learning_rate": 0.00019549440714450444,
+      "loss": 1.1572,
+      "step": 350
+    },
+    {
+      "epoch": 0.18908122503328895,
+      "grad_norm": 0.07438689728031705,
+      "learning_rate": 0.00019521426592440072,
+      "loss": 1.1479,
+      "step": 355
+    },
+    {
+      "epoch": 0.19174434087882822,
+      "grad_norm": 0.07277611336304127,
+      "learning_rate": 0.00019492588940574586,
+      "loss": 1.1549,
+      "step": 360
+    },
+    {
+      "epoch": 0.19440745672436752,
+      "grad_norm": 0.0695324135241875,
+      "learning_rate": 0.0001946293025308813,
+      "loss": 1.1435,
+      "step": 365
+    },
+    {
+      "epoch": 0.1970705725699068,
+      "grad_norm": 0.06685927618032904,
+      "learning_rate": 0.00019432453095228076,
+      "loss": 1.1641,
+      "step": 370
+    },
+    {
+      "epoch": 0.19973368841544606,
+      "grad_norm": 0.0680367135740568,
+      "learning_rate": 0.00019401160103033174,
+      "loss": 1.1261,
+      "step": 375
+    },
+    {
+      "epoch": 0.20239680426098536,
+      "grad_norm": 0.08027336453756874,
+      "learning_rate": 0.00019369053983105532,
+      "loss": 1.1368,
+      "step": 380
+    },
+    {
+      "epoch": 0.20505992010652463,
+      "grad_norm": 0.0707161713953054,
+      "learning_rate": 0.00019336137512376532,
+      "loss": 1.1588,
+      "step": 385
+    },
+    {
+      "epoch": 0.20772303595206393,
+      "grad_norm": 0.07189527593634382,
+      "learning_rate": 0.00019302413537866642,
+      "loss": 1.1552,
+      "step": 390
+    },
+    {
+      "epoch": 0.2103861517976032,
+      "grad_norm": 0.0716934364253126,
+      "learning_rate": 0.0001926788497643916,
+      "loss": 1.1577,
+      "step": 395
+    },
+    {
+      "epoch": 0.21304926764314247,
+      "grad_norm": 0.065943892133018,
+      "learning_rate": 0.00019232554814547953,
+      "loss": 1.1203,
+      "step": 400
+    },
+    {
+      "epoch": 0.21571238348868177,
+      "grad_norm": 0.07352621386091099,
+      "learning_rate": 0.00019196426107979128,
+      "loss": 1.1266,
+      "step": 405
+    },
+    {
+      "epoch": 0.21837549933422104,
+      "grad_norm": 0.07441803674470306,
+      "learning_rate": 0.00019159501981586737,
+      "loss": 1.1432,
+      "step": 410
+    },
+    {
+      "epoch": 0.2210386151797603,
+      "grad_norm": 0.07291702193187057,
+      "learning_rate": 0.00019121785629022501,
+      "loss": 1.1344,
+      "step": 415
+    },
+    {
+      "epoch": 0.2237017310252996,
+      "grad_norm": 0.07094925179230635,
+      "learning_rate": 0.00019083280312459593,
+      "loss": 1.1137,
+      "step": 420
+    },
+    {
+      "epoch": 0.22636484687083888,
+      "grad_norm": 0.07399044805064979,
+      "learning_rate": 0.0001904398936231047,
+      "loss": 1.1533,
+      "step": 425
+    },
+    {
+      "epoch": 0.22902796271637815,
+      "grad_norm": 0.07782197426798759,
+      "learning_rate": 0.00019003916176938836,
+      "loss": 1.1458,
+      "step": 430
+    },
+    {
+      "epoch": 0.23169107856191745,
+      "grad_norm": 0.06822071830212563,
+      "learning_rate": 0.00018963064222365694,
+      "loss": 1.1448,
+      "step": 435
+    },
+    {
+      "epoch": 0.23435419440745672,
+      "grad_norm": 0.06944246120343146,
+      "learning_rate": 0.00018921437031969558,
+      "loss": 1.1577,
+      "step": 440
+    },
+    {
+      "epoch": 0.237017310252996,
+      "grad_norm": 0.07108688216307608,
+      "learning_rate": 0.0001887903820618087,
+      "loss": 1.1526,
+      "step": 445
+    },
+    {
+      "epoch": 0.2396804260985353,
+      "grad_norm": 0.08455610060485116,
+      "learning_rate": 0.00018835871412170563,
+      "loss": 1.1517,
+      "step": 450
+    },
+    {
+      "epoch": 0.24234354194407456,
+      "grad_norm": 0.06664786445884358,
+      "learning_rate": 0.0001879194038353289,
+      "loss": 1.1537,
+      "step": 455
+    },
+    {
+      "epoch": 0.24500665778961384,
+      "grad_norm": 0.07089581724112333,
+      "learning_rate": 0.00018747248919962498,
+      "loss": 1.1409,
+      "step": 460
+    },
+    {
+      "epoch": 0.24766977363515313,
+      "grad_norm": 0.07242825833109466,
+      "learning_rate": 0.00018701800886925782,
+      "loss": 1.1303,
+      "step": 465
+    },
+    {
+      "epoch": 0.25033288948069243,
+      "grad_norm": 0.06598593287452807,
+      "learning_rate": 0.00018655600215326546,
+      "loss": 1.1401,
+      "step": 470
+    },
+    {
+      "epoch": 0.2529960053262317,
+      "grad_norm": 0.07020789015379635,
+      "learning_rate": 0.00018608650901166032,
+      "loss": 1.1542,
+      "step": 475
+    },
+    {
+      "epoch": 0.255659121171771,
+      "grad_norm": 0.06441793150662321,
+      "learning_rate": 0.0001856095700519726,
+      "loss": 1.1276,
+      "step": 480
+    },
+    {
+      "epoch": 0.2583222370173103,
+      "grad_norm": 0.07254719498292789,
+      "learning_rate": 0.0001851252265257384,
+      "loss": 1.1212,
+      "step": 485
+    },
+    {
+      "epoch": 0.2609853528628495,
+      "grad_norm": 0.06917909155716108,
+      "learning_rate": 0.0001846335203249316,
+      "loss": 1.1298,
+      "step": 490
+    },
+    {
+      "epoch": 0.2636484687083888,
+      "grad_norm": 0.07470942417701212,
+      "learning_rate": 0.00018413449397834051,
+      "loss": 1.1456,
+      "step": 495
+    },
+    {
+      "epoch": 0.2663115845539281,
+      "grad_norm": 0.0693858861935873,
+      "learning_rate": 0.00018362819064788956,
+      "loss": 1.1327,
+      "step": 500
+    },
+    {
+      "epoch": 0.26897470039946736,
+      "grad_norm": 0.07182079092553902,
+      "learning_rate": 0.00018311465412490608,
+      "loss": 1.1628,
+      "step": 505
+    },
+    {
+      "epoch": 0.27163781624500666,
+      "grad_norm": 0.06682954118119949,
+      "learning_rate": 0.00018259392882633265,
+      "loss": 1.1528,
+      "step": 510
+    },
+    {
+      "epoch": 0.27430093209054596,
+      "grad_norm": 0.07248673749669132,
+      "learning_rate": 0.00018206605979088542,
+      "loss": 1.156,
+      "step": 515
+    },
+    {
+      "epoch": 0.2769640479360852,
+      "grad_norm": 0.06950216959497392,
+      "learning_rate": 0.0001815310926751586,
+      "loss": 1.119,
+      "step": 520
+    },
+    {
+      "epoch": 0.2796271637816245,
+      "grad_norm": 0.07067673018407011,
+      "learning_rate": 0.00018098907374967555,
+      "loss": 1.1211,
+      "step": 525
+    },
+    {
+      "epoch": 0.2822902796271638,
+      "grad_norm": 0.06820842392733384,
+      "learning_rate": 0.00018044004989488664,
+      "loss": 1.1281,
+      "step": 530
+    },
+    {
+      "epoch": 0.28495339547270304,
+      "grad_norm": 0.07418230217074875,
+      "learning_rate": 0.00017988406859711456,
+      "loss": 1.1409,
+      "step": 535
+    },
+    {
+      "epoch": 0.28761651131824234,
+      "grad_norm": 0.07009876259688716,
+      "learning_rate": 0.00017932117794444713,
+      "loss": 1.1381,
+      "step": 540
+    },
+    {
+      "epoch": 0.29027962716378164,
+      "grad_norm": 0.07129309605598672,
+      "learning_rate": 0.00017875142662257786,
+      "loss": 1.1387,
+      "step": 545
+    },
+    {
+      "epoch": 0.2929427430093209,
+      "grad_norm": 0.07830622678131702,
+      "learning_rate": 0.00017817486391059532,
+      "loss": 1.1165,
+      "step": 550
+    },
+    {
+      "epoch": 0.2956058588548602,
+      "grad_norm": 0.0709756673443606,
+      "learning_rate": 0.0001775915396767205,
+      "loss": 1.129,
+      "step": 555
+    },
+    {
+      "epoch": 0.2982689747003995,
+      "grad_norm": 0.06710174636010342,
+      "learning_rate": 0.00017700150437399405,
+      "loss": 1.1183,
+      "step": 560
+    },
+    {
+      "epoch": 0.3009320905459387,
+      "grad_norm": 0.07321620332053846,
+      "learning_rate": 0.0001764048090359121,
+      "loss": 1.1502,
+      "step": 565
+    },
+    {
+      "epoch": 0.303595206391478,
+      "grad_norm": 0.07613131980579707,
+      "learning_rate": 0.00017580150527201241,
+      "loss": 1.1322,
+      "step": 570
+    },
+    {
+      "epoch": 0.3062583222370173,
+      "grad_norm": 0.07480921806539248,
+      "learning_rate": 0.0001751916452634105,
+      "loss": 1.1269,
+      "step": 575
+    },
+    {
+      "epoch": 0.30892143808255657,
+      "grad_norm": 0.07386122393966031,
+      "learning_rate": 0.0001745752817582865,
+      "loss": 1.1528,
+      "step": 580
+    },
+    {
+      "epoch": 0.31158455392809586,
+      "grad_norm": 0.07032971968151802,
+      "learning_rate": 0.00017395246806732267,
+      "loss": 1.1642,
+      "step": 585
+    },
+    {
+      "epoch": 0.31424766977363516,
+      "grad_norm": 0.07910944378906298,
+      "learning_rate": 0.00017332325805909256,
+      "loss": 1.1328,
+      "step": 590
+    },
+    {
+      "epoch": 0.3169107856191744,
+      "grad_norm": 0.06775943214366806,
+      "learning_rate": 0.00017268770615540177,
+      "loss": 1.1142,
+      "step": 595
+    },
+    {
+      "epoch": 0.3195739014647137,
+      "grad_norm": 0.0858014191359942,
+      "learning_rate": 0.00017204586732658087,
+      "loss": 1.1393,
+      "step": 600
+    },
+    {
+      "epoch": 0.322237017310253,
+      "grad_norm": 0.06968407560583738,
+      "learning_rate": 0.00017139779708673085,
+      "loss": 1.1428,
+      "step": 605
+    },
+    {
+      "epoch": 0.3249001331557923,
+      "grad_norm": 0.06812443512073688,
+      "learning_rate": 0.00017074355148892167,
+      "loss": 1.1592,
+      "step": 610
+    },
+    {
+      "epoch": 0.32756324900133155,
+      "grad_norm": 0.07150170839574509,
+      "learning_rate": 0.00017008318712034403,
+      "loss": 1.1018,
+      "step": 615
+    },
+    {
+      "epoch": 0.33022636484687085,
+      "grad_norm": 0.06906369302490485,
+      "learning_rate": 0.00016941676109741508,
+      "loss": 1.1442,
+      "step": 620
+    },
+    {
+      "epoch": 0.33288948069241014,
+      "grad_norm": 0.07869503084625909,
+      "learning_rate": 0.00016874433106083814,
+      "loss": 1.1132,
+      "step": 625
+    },
+    {
+      "epoch": 0.3355525965379494,
+      "grad_norm": 0.07767900677929127,
+      "learning_rate": 0.00016806595517061744,
+      "loss": 1.1362,
+      "step": 630
+    },
+    {
+      "epoch": 0.3382157123834887,
+      "grad_norm": 0.06780573276986938,
+      "learning_rate": 0.00016738169210102764,
+      "loss": 1.1382,
+      "step": 635
+    },
+    {
+      "epoch": 0.340878828229028,
+      "grad_norm": 0.07855904717914698,
+      "learning_rate": 0.00016669160103553884,
+      "loss": 1.1146,
+      "step": 640
+    },
+    {
+      "epoch": 0.34354194407456723,
+      "grad_norm": 0.06976051466447154,
+      "learning_rate": 0.00016599574166169782,
+      "loss": 1.1156,
+      "step": 645
+    },
+    {
+      "epoch": 0.34620505992010653,
+      "grad_norm": 0.0659729198246215,
+      "learning_rate": 0.0001652941741659655,
+      "loss": 1.1636,
+      "step": 650
+    },
+    {
+      "epoch": 0.3488681757656458,
+      "grad_norm": 0.06820327345322129,
+      "learning_rate": 0.00016458695922851125,
+      "loss": 1.1272,
+      "step": 655
+    },
+    {
+      "epoch": 0.35153129161118507,
+      "grad_norm": 0.0706423611601847,
+      "learning_rate": 0.0001638741580179645,
+      "loss": 1.15,
+      "step": 660
+    },
+    {
+      "epoch": 0.35419440745672437,
+      "grad_norm": 0.07009862917994238,
+      "learning_rate": 0.0001631558321861241,
+      "loss": 1.1133,
+      "step": 665
+    },
+    {
+      "epoch": 0.35685752330226367,
+      "grad_norm": 0.10252097837226529,
+      "learning_rate": 0.00016243204386262616,
+      "loss": 1.1275,
+      "step": 670
+    },
+    {
+      "epoch": 0.3595206391478029,
+      "grad_norm": 0.0677270369109815,
+      "learning_rate": 0.0001617028556495699,
+      "loss": 1.1463,
+      "step": 675
+    },
+    {
+      "epoch": 0.3621837549933422,
+      "grad_norm": 0.07081566637081647,
+      "learning_rate": 0.00016096833061610336,
+      "loss": 1.1557,
+      "step": 680
+    },
+    {
+      "epoch": 0.3648468708388815,
+      "grad_norm": 0.07606309640077409,
+      "learning_rate": 0.0001602285322929684,
+      "loss": 1.1279,
+      "step": 685
+    },
+    {
+      "epoch": 0.36750998668442075,
+      "grad_norm": 0.06926585358652293,
+      "learning_rate": 0.00015948352466700562,
+      "loss": 1.1058,
+      "step": 690
+    },
+    {
+      "epoch": 0.37017310252996005,
+      "grad_norm": 0.0768394516058797,
+      "learning_rate": 0.00015873337217562012,
+      "loss": 1.1451,
+      "step": 695
+    },
+    {
+      "epoch": 0.37283621837549935,
+      "grad_norm": 0.07574776045146851,
+      "learning_rate": 0.00015797813970120806,
+      "loss": 1.1529,
+      "step": 700
+    },
+    {
+      "epoch": 0.3754993342210386,
+      "grad_norm": 0.08825811667362324,
+      "learning_rate": 0.00015721789256554493,
+      "loss": 1.1427,
+      "step": 705
+    },
+    {
+      "epoch": 0.3781624500665779,
+      "grad_norm": 0.07195501325596203,
+      "learning_rate": 0.00015645269652413572,
+      "loss": 1.1348,
+      "step": 710
+    },
+    {
+      "epoch": 0.3808255659121172,
+      "grad_norm": 0.07470988656942844,
+      "learning_rate": 0.00015568261776052747,
+      "loss": 1.1389,
+      "step": 715
+    },
+    {
+      "epoch": 0.38348868175765644,
+      "grad_norm": 0.07234292608880714,
+      "learning_rate": 0.0001549077228805851,
+      "loss": 1.1265,
+      "step": 720
+    },
+    {
+      "epoch": 0.38615179760319573,
+      "grad_norm": 0.07603813138240277,
+      "learning_rate": 0.00015412807890673012,
+      "loss": 1.0975,
+      "step": 725
+    },
+    {
+      "epoch": 0.38881491344873503,
+      "grad_norm": 0.06914740850103318,
+      "learning_rate": 0.00015334375327214435,
+      "loss": 1.1656,
+      "step": 730
+    },
+    {
+      "epoch": 0.3914780292942743,
+      "grad_norm": 0.0723373355088882,
+      "learning_rate": 0.00015255481381493686,
+      "loss": 1.1235,
+      "step": 735
+    },
+    {
+      "epoch": 0.3941411451398136,
+      "grad_norm": 0.07469762097501292,
+      "learning_rate": 0.00015176132877227672,
+      "loss": 1.1401,
+      "step": 740
+    },
+    {
+      "epoch": 0.3968042609853529,
+      "grad_norm": 0.06845354027517625,
+      "learning_rate": 0.00015096336677449123,
+      "loss": 1.1299,
+      "step": 745
+    },
+    {
+      "epoch": 0.3994673768308921,
+      "grad_norm": 0.07857096344059177,
+      "learning_rate": 0.0001501609968391295,
+      "loss": 1.1362,
+      "step": 750
+    },
+    {
+      "epoch": 0.4021304926764314,
+      "grad_norm": 0.07079465135436822,
+      "learning_rate": 0.00014935428836499332,
+      "loss": 1.1268,
+      "step": 755
+    },
+    {
+      "epoch": 0.4047936085219707,
+      "grad_norm": 0.07113035589654983,
+      "learning_rate": 0.0001485433111261346,
+      "loss": 1.1357,
+      "step": 760
+    },
+    {
+      "epoch": 0.40745672436750996,
+      "grad_norm": 0.0703269232774503,
+      "learning_rate": 0.0001477281352658203,
+      "loss": 1.1239,
+      "step": 765
+    },
+    {
+      "epoch": 0.41011984021304926,
+      "grad_norm": 0.07059355929742223,
+      "learning_rate": 0.00014690883129046584,
+      "loss": 1.1442,
+      "step": 770
+    },
+    {
+      "epoch": 0.41278295605858856,
+      "grad_norm": 0.07289277380542494,
+      "learning_rate": 0.0001460854700635366,
+      "loss": 1.1267,
+      "step": 775
+    },
+    {
+      "epoch": 0.41544607190412786,
+      "grad_norm": 0.06984202720886337,
+      "learning_rate": 0.00014525812279941896,
+      "loss": 1.1258,
+      "step": 780
+    },
+    {
+      "epoch": 0.4181091877496671,
+      "grad_norm": 0.07422048925652114,
+      "learning_rate": 0.00014442686105726067,
+      "loss": 1.1193,
+      "step": 785
+    },
+    {
+      "epoch": 0.4207723035952064,
+      "grad_norm": 0.0718716738772194,
+      "learning_rate": 0.00014359175673478162,
+      "loss": 1.133,
+      "step": 790
+    },
+    {
+      "epoch": 0.4234354194407457,
+      "grad_norm": 0.07204344165616748,
+      "learning_rate": 0.00014275288206205524,
+      "loss": 1.0967,
+      "step": 795
+    },
+    {
+      "epoch": 0.42609853528628494,
+      "grad_norm": 0.07353413673583019,
+      "learning_rate": 0.00014191030959526105,
+      "loss": 1.1261,
+      "step": 800
+    },
+    {
+      "epoch": 0.42876165113182424,
+      "grad_norm": 0.0707040442967912,
+      "learning_rate": 0.00014106411221040933,
+      "loss": 1.128,
+      "step": 805
+    },
+    {
+      "epoch": 0.43142476697736354,
+      "grad_norm": 0.07086259967904554,
+      "learning_rate": 0.00014021436309703765,
+      "loss": 1.107,
+      "step": 810
+    },
+    {
+      "epoch": 0.4340878828229028,
+      "grad_norm": 0.06994136097058291,
+      "learning_rate": 0.00013936113575188075,
+      "loss": 1.1221,
+      "step": 815
+    },
+    {
+      "epoch": 0.4367509986684421,
+      "grad_norm": 0.06961342073957084,
+      "learning_rate": 0.00013850450397251345,
+      "loss": 1.1208,
+      "step": 820
+    },
+    {
+      "epoch": 0.4394141145139814,
+      "grad_norm": 0.07040932769118938,
+      "learning_rate": 0.0001376445418509679,
+      "loss": 1.1208,
+      "step": 825
+    },
+    {
+      "epoch": 0.4420772303595206,
+      "grad_norm": 0.07187314857901307,
+      "learning_rate": 0.00013678132376732517,
+      "loss": 1.1267,
+      "step": 830
+    },
+    {
+      "epoch": 0.4447403462050599,
+      "grad_norm": 0.07002729221567015,
+      "learning_rate": 0.00013591492438328183,
+      "loss": 1.1421,
+      "step": 835
+    },
+    {
+      "epoch": 0.4474034620505992,
+      "grad_norm": 0.07235067324392022,
+      "learning_rate": 0.0001350454186356924,
+      "loss": 1.1191,
+      "step": 840
+    },
+    {
+      "epoch": 0.45006657789613846,
+      "grad_norm": 0.07568410556158327,
+      "learning_rate": 0.00013417288173008776,
+      "loss": 1.1123,
+      "step": 845
+    },
+    {
+      "epoch": 0.45272969374167776,
+      "grad_norm": 0.07613155957113646,
+      "learning_rate": 0.00013329738913417068,
+      "loss": 1.1137,
+      "step": 850
+    },
+    {
+      "epoch": 0.45539280958721706,
+      "grad_norm": 0.06854447943505289,
+      "learning_rate": 0.00013241901657128825,
+      "loss": 1.132,
+      "step": 855
+    },
+    {
+      "epoch": 0.4580559254327563,
+      "grad_norm": 0.06884442803824642,
+      "learning_rate": 0.00013153784001388247,
+      "loss": 1.1352,
+      "step": 860
+    },
+    {
+      "epoch": 0.4607190412782956,
+      "grad_norm": 0.0818107600119684,
+      "learning_rate": 0.00013065393567691913,
+      "loss": 1.101,
+      "step": 865
+    },
+    {
+      "epoch": 0.4633821571238349,
+      "grad_norm": 0.07690425325074156,
+      "learning_rate": 0.00012976738001129606,
+      "loss": 1.1052,
+      "step": 870
+    },
+    {
+      "epoch": 0.46604527296937415,
+      "grad_norm": 0.07952080928038242,
+      "learning_rate": 0.00012887824969723034,
+      "loss": 1.1172,
+      "step": 875
+    },
+    {
+      "epoch": 0.46870838881491345,
+      "grad_norm": 0.06637081804522467,
+      "learning_rate": 0.00012798662163762635,
+      "loss": 1.1236,
+      "step": 880
+    },
+    {
+      "epoch": 0.47137150466045274,
+      "grad_norm": 0.06753657536020181,
+      "learning_rate": 0.00012709257295142422,
+      "loss": 1.1304,
+      "step": 885
+    },
+    {
+      "epoch": 0.474034620505992,
+      "grad_norm": 0.07408199495580661,
+      "learning_rate": 0.00012619618096692943,
+      "loss": 1.1523,
+      "step": 890
+    },
+    {
+      "epoch": 0.4766977363515313,
+      "grad_norm": 0.07508862680813526,
+      "learning_rate": 0.0001252975232151248,
+      "loss": 1.1158,
+      "step": 895
+    },
+    {
+      "epoch": 0.4793608521970706,
+      "grad_norm": 0.07064406721668945,
+      "learning_rate": 0.0001243966774229645,
+      "loss": 1.1334,
+      "step": 900
+    },
+    {
+      "epoch": 0.48202396804260983,
+      "grad_norm": 0.0709335857697155,
+      "learning_rate": 0.00012349372150665118,
+      "loss": 1.1104,
+      "step": 905
+    },
+    {
+      "epoch": 0.48468708388814913,
+      "grad_norm": 0.06779392856513489,
+      "learning_rate": 0.00012258873356489714,
+      "loss": 1.1299,
+      "step": 910
+    },
+    {
+      "epoch": 0.4873501997336884,
+      "grad_norm": 0.07922531663031743,
+      "learning_rate": 0.00012168179187216893,
+      "loss": 1.13,
+      "step": 915
+    },
+    {
+      "epoch": 0.49001331557922767,
+      "grad_norm": 0.07185035702343927,
+      "learning_rate": 0.0001207729748719177,
+      "loss": 1.1402,
+      "step": 920
+    },
+    {
+      "epoch": 0.49267643142476697,
+      "grad_norm": 0.07162588557593508,
+      "learning_rate": 0.00011986236116979406,
+      "loss": 1.1308,
+      "step": 925
+    },
+    {
+      "epoch": 0.49533954727030627,
+      "grad_norm": 0.07242326329471309,
+      "learning_rate": 0.0001189500295268495,
+      "loss": 1.106,
+      "step": 930
+    },
+    {
+      "epoch": 0.4980026631158455,
+      "grad_norm": 0.07434286419305244,
+      "learning_rate": 0.0001180360588527242,
+      "loss": 1.119,
+      "step": 935
+    },
+    {
+      "epoch": 0.5006657789613849,
+      "grad_norm": 0.07304306593688702,
+      "learning_rate": 0.00011712052819882171,
+      "loss": 1.1503,
+      "step": 940
+    },
+    {
+      "epoch": 0.5033288948069241,
+      "grad_norm": 0.07214685709611712,
+      "learning_rate": 0.00011620351675147195,
+      "loss": 1.1095,
+      "step": 945
+    },
+    {
+      "epoch": 0.5059920106524634,
+      "grad_norm": 0.07192838129765652,
+      "learning_rate": 0.0001152851038250819,
+      "loss": 1.1451,
+      "step": 950
+    },
+    {
+      "epoch": 0.5086551264980027,
+      "grad_norm": 0.06935787206272043,
+      "learning_rate": 0.00011436536885527576,
+      "loss": 1.1251,
+      "step": 955
+    },
+    {
+      "epoch": 0.511318242343542,
+      "grad_norm": 0.06801727149157547,
+      "learning_rate": 0.00011344439139202421,
+      "loss": 1.1084,
+      "step": 960
+    },
+    {
+      "epoch": 0.5139813581890812,
+      "grad_norm": 0.07024555806497951,
+      "learning_rate": 0.00011252225109276404,
+      "loss": 1.1278,
+      "step": 965
+    },
+    {
+      "epoch": 0.5166444740346205,
+      "grad_norm": 0.06796418511383114,
+      "learning_rate": 0.00011159902771550837,
+      "loss": 1.1092,
+      "step": 970
+    },
+    {
+      "epoch": 0.5193075898801598,
+      "grad_norm": 0.07219790467805971,
+      "learning_rate": 0.00011067480111194817,
+      "loss": 1.1286,
+      "step": 975
+    },
+    {
+      "epoch": 0.521970705725699,
+      "grad_norm": 0.06944360419194191,
+      "learning_rate": 0.00010974965122054579,
+      "loss": 1.1184,
+      "step": 980
+    },
+    {
+      "epoch": 0.5246338215712384,
+      "grad_norm": 0.07229326850745169,
+      "learning_rate": 0.00010882365805962083,
+      "loss": 1.1212,
+      "step": 985
+    },
+    {
+      "epoch": 0.5272969374167776,
+      "grad_norm": 0.07181125732929394,
+      "learning_rate": 0.00010789690172042912,
+      "loss": 1.1137,
+      "step": 990
+    },
+    {
+      "epoch": 0.5299600532623169,
+      "grad_norm": 0.07145294348037948,
+      "learning_rate": 0.00010696946236023567,
+      "loss": 1.1365,
+      "step": 995
+    },
+    {
+      "epoch": 0.5326231691078562,
+      "grad_norm": 0.07057067837966788,
+      "learning_rate": 0.00010604142019538135,
+      "loss": 1.1176,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5352862849533955,
+      "grad_norm": 0.07461665274600122,
+      "learning_rate": 0.00010511285549434509,
+      "loss": 1.1152,
+      "step": 1005
+    },
+    {
+      "epoch": 0.5379494007989347,
+      "grad_norm": 0.07009722058482384,
+      "learning_rate": 0.00010418384857080117,
+      "loss": 1.1117,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5406125166444741,
+      "grad_norm": 0.07081680661261375,
+      "learning_rate": 0.00010325447977667263,
+      "loss": 1.1328,
+      "step": 1015
+    },
+    {
+      "epoch": 0.5432756324900133,
+      "grad_norm": 0.06980388274242631,
+      "learning_rate": 0.00010232482949518156,
+      "loss": 1.1404,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5459387483355526,
+      "grad_norm": 0.06946364728493262,
+      "learning_rate": 0.00010139497813389654,
+      "loss": 1.1127,
+      "step": 1025
+    },
+    {
+      "epoch": 0.5486018641810919,
+      "grad_norm": 0.06947172503952885,
+      "learning_rate": 0.00010046500611777798,
+      "loss": 1.0937,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5512649800266312,
+      "grad_norm": 0.07414647895551518,
+      "learning_rate": 9.953499388222202e-05,
+      "loss": 1.132,
+      "step": 1035
+    },
+    {
+      "epoch": 0.5539280958721704,
+      "grad_norm": 0.07085672498663681,
+      "learning_rate": 9.860502186610349e-05,
+      "loss": 1.0998,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5565912117177098,
+      "grad_norm": 0.07198312270884867,
+      "learning_rate": 9.767517050481846e-05,
+      "loss": 1.1263,
+      "step": 1045
+    },
+    {
+      "epoch": 0.559254327563249,
+      "grad_norm": 0.07070349541708286,
+      "learning_rate": 9.67455202233274e-05,
+      "loss": 1.1143,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5619174434087882,
+      "grad_norm": 0.06990981328605791,
+      "learning_rate": 9.581615142919887e-05,
+      "loss": 1.1168,
+      "step": 1055
+    },
+    {
+      "epoch": 0.5645805592543276,
+      "grad_norm": 0.07221018297233557,
+      "learning_rate": 9.488714450565491e-05,
+      "loss": 1.1123,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5672436750998668,
+      "grad_norm": 0.06895775963564511,
+      "learning_rate": 9.395857980461867e-05,
+      "loss": 1.1294,
+      "step": 1065
+    },
+    {
+      "epoch": 0.5699067909454061,
+      "grad_norm": 0.06904508970279108,
+      "learning_rate": 9.303053763976434e-05,
+      "loss": 1.1179,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5725699067909454,
+      "grad_norm": 0.07131791944898686,
+      "learning_rate": 9.210309827957089e-05,
+      "loss": 1.1297,
+      "step": 1075
+    },
+    {
+      "epoch": 0.5752330226364847,
+      "grad_norm": 0.07117429268373339,
+      "learning_rate": 9.117634194037922e-05,
+      "loss": 1.1285,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5778961384820239,
+      "grad_norm": 0.0720403827517469,
+      "learning_rate": 9.025034877945422e-05,
+      "loss": 1.1418,
+      "step": 1085
+    },
+    {
+      "epoch": 0.5805592543275633,
+      "grad_norm": 0.07399424819774852,
+      "learning_rate": 8.932519888805185e-05,
+      "loss": 1.1521,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5832223701731025,
+      "grad_norm": 0.06757782738616369,
+      "learning_rate": 8.840097228449165e-05,
+      "loss": 1.1468,
+      "step": 1095
+    },
+    {
+      "epoch": 0.5858854860186418,
+      "grad_norm": 0.07513462470349377,
+      "learning_rate": 8.747774890723599e-05,
+      "loss": 1.1008,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5885486018641811,
+      "grad_norm": 0.07193663724723076,
+      "learning_rate": 8.655560860797582e-05,
+      "loss": 1.1364,
+      "step": 1105
+    },
+    {
+      "epoch": 0.5912117177097204,
+      "grad_norm": 0.07451485853238139,
+      "learning_rate": 8.563463114472425e-05,
+      "loss": 1.1077,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5938748335552596,
+      "grad_norm": 0.07208026414944517,
+      "learning_rate": 8.471489617491812e-05,
+      "loss": 1.0828,
+      "step": 1115
+    },
+    {
+      "epoch": 0.596537949400799,
+      "grad_norm": 0.06968442188475359,
+      "learning_rate": 8.379648324852808e-05,
+      "loss": 1.0975,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5992010652463382,
+      "grad_norm": 0.07690321076998301,
+      "learning_rate": 8.287947180117832e-05,
+      "loss": 1.1149,
+      "step": 1125
+    },
+    {
+      "epoch": 0.6018641810918774,
+      "grad_norm": 0.07088556456471248,
+      "learning_rate": 8.196394114727585e-05,
+      "loss": 1.1193,
+      "step": 1130
+    },
+    {
+      "epoch": 0.6045272969374168,
+      "grad_norm": 0.07306182012233282,
+      "learning_rate": 8.104997047315048e-05,
+      "loss": 1.1222,
+      "step": 1135
+    },
+    {
+      "epoch": 0.607190412782956,
+      "grad_norm": 0.0735136578466246,
+      "learning_rate": 8.013763883020596e-05,
+      "loss": 1.1326,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6098535286284953,
+      "grad_norm": 0.0792004084312011,
+      "learning_rate": 7.92270251280823e-05,
+      "loss": 1.1125,
+      "step": 1145
+    },
+    {
+      "epoch": 0.6125166444740346,
+      "grad_norm": 0.06995636578434544,
+      "learning_rate": 7.831820812783108e-05,
+      "loss": 1.1397,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6151797603195739,
+      "grad_norm": 0.07156672219633958,
+      "learning_rate": 7.741126643510292e-05,
+      "loss": 1.1047,
+      "step": 1155
+    },
+    {
+      "epoch": 0.6178428761651131,
+      "grad_norm": 0.06990042451203095,
+      "learning_rate": 7.650627849334881e-05,
+      "loss": 1.0991,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6205059920106525,
+      "grad_norm": 0.07175460373797926,
+      "learning_rate": 7.560332257703555e-05,
+      "loss": 1.1179,
+      "step": 1165
+    },
+    {
+      "epoch": 0.6231691078561917,
+      "grad_norm": 0.07079860289283964,
+      "learning_rate": 7.470247678487522e-05,
+      "loss": 1.1179,
+      "step": 1170
+    },
+    {
+      "epoch": 0.625832223701731,
+      "grad_norm": 0.06955306779443195,
+      "learning_rate": 7.380381903307061e-05,
+      "loss": 1.1261,
+      "step": 1175
+    },
+    {
+      "epoch": 0.6284953395472703,
+      "grad_norm": 0.07040121930899221,
+      "learning_rate": 7.290742704857585e-05,
+      "loss": 1.128,
+      "step": 1180
+    },
+    {
+      "epoch": 0.6311584553928096,
+      "grad_norm": 0.0696134794381735,
+      "learning_rate": 7.201337836237365e-05,
+      "loss": 1.1006,
+      "step": 1185
+    },
+    {
+      "epoch": 0.6338215712383488,
+      "grad_norm": 0.0731972119965864,
+      "learning_rate": 7.112175030276969e-05,
+      "loss": 1.122,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6364846870838882,
+      "grad_norm": 0.07600883906312528,
+      "learning_rate": 7.023261998870395e-05,
+      "loss": 1.1054,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6391478029294274,
+      "grad_norm": 0.06927208348922254,
+      "learning_rate": 6.934606432308086e-05,
+      "loss": 1.1128,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6418109187749668,
+      "grad_norm": 0.06936985978073082,
+      "learning_rate": 6.846215998611757e-05,
+      "loss": 1.118,
+      "step": 1205
+    },
+    {
+      "epoch": 0.644474034620506,
+      "grad_norm": 0.07077273319280314,
+      "learning_rate": 6.758098342871174e-05,
+      "loss": 1.1093,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6471371504660453,
+      "grad_norm": 0.07079727575348424,
+      "learning_rate": 6.670261086582933e-05,
+      "loss": 1.1231,
+      "step": 1215
+    },
+    {
+      "epoch": 0.6498002663115846,
+      "grad_norm": 0.07181688909396321,
+      "learning_rate": 6.582711826991226e-05,
+      "loss": 1.1042,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6524633821571239,
+      "grad_norm": 0.07337324643783341,
+      "learning_rate": 6.495458136430765e-05,
+      "loss": 1.1042,
+      "step": 1225
+    },
+    {
+      "epoch": 0.6551264980026631,
+      "grad_norm": 0.07212862237245772,
+      "learning_rate": 6.408507561671819e-05,
+      "loss": 1.1509,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6577896138482024,
+      "grad_norm": 0.07047120035609092,
+      "learning_rate": 6.321867623267481e-05,
+      "loss": 1.1355,
+      "step": 1235
+    },
+    {
+      "epoch": 0.6604527296937417,
+      "grad_norm": 0.07459478385718604,
+      "learning_rate": 6.23554581490321e-05,
+      "loss": 1.1178,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6631158455392809,
+      "grad_norm": 0.06825726344400981,
+      "learning_rate": 6.149549602748656e-05,
+      "loss": 1.0862,
+      "step": 1245
+    },
+    {
+      "epoch": 0.6657789613848203,
+      "grad_norm": 0.07240193352920372,
+      "learning_rate": 6.063886424811929e-05,
+      "loss": 1.1292,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6684420772303595,
+      "grad_norm": 0.07197359550672061,
+      "learning_rate": 5.9785636902962374e-05,
+      "loss": 1.1306,
+      "step": 1255
+    },
+    {
+      "epoch": 0.6711051930758988,
+      "grad_norm": 0.07343322507680913,
+      "learning_rate": 5.893588778959067e-05,
+      "loss": 1.1365,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6737683089214381,
+      "grad_norm": 0.07842185463764602,
+      "learning_rate": 5.8089690404738925e-05,
+      "loss": 1.1395,
+      "step": 1265
+    },
+    {
+      "epoch": 0.6764314247669774,
+      "grad_norm": 0.07270053681642126,
+      "learning_rate": 5.7247117937944786e-05,
+      "loss": 1.1035,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6790945406125166,
+      "grad_norm": 0.07124055104083139,
+      "learning_rate": 5.640824326521841e-05,
+      "loss": 1.1121,
+      "step": 1275
+    },
+    {
+      "epoch": 0.681757656458056,
+      "grad_norm": 0.07031788560347749,
+      "learning_rate": 5.5573138942739365e-05,
+      "loss": 1.1192,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6844207723035952,
+      "grad_norm": 0.07253058219251593,
+      "learning_rate": 5.4741877200581057e-05,
+      "loss": 1.1324,
+      "step": 1285
+    },
+    {
+      "epoch": 0.6870838881491345,
+      "grad_norm": 0.0725305584439251,
+      "learning_rate": 5.391452993646342e-05,
+      "loss": 1.1387,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6897470039946738,
+      "grad_norm": 0.07021354602161774,
+      "learning_rate": 5.30911687095342e-05,
+      "loss": 1.126,
+      "step": 1295
+    },
+    {
+      "epoch": 0.6924101198402131,
+      "grad_norm": 0.07012940672098344,
+      "learning_rate": 5.227186473417971e-05,
+      "loss": 1.1486,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6950732356857523,
+      "grad_norm": 0.07232533360594767,
+      "learning_rate": 5.145668887386543e-05,
+      "loss": 1.1111,
+      "step": 1305
+    },
+    {
+      "epoch": 0.6977363515312917,
+      "grad_norm": 0.07242821758103567,
+      "learning_rate": 5.064571163500667e-05,
+      "loss": 1.1181,
+      "step": 1310
+    },
+    {
+      "epoch": 0.7003994673768309,
+      "grad_norm": 0.07148030530795384,
+      "learning_rate": 4.983900316087051e-05,
+      "loss": 1.0922,
+      "step": 1315
+    },
+    {
+      "epoch": 0.7030625832223701,
+      "grad_norm": 0.07277623674879777,
+      "learning_rate": 4.90366332255088e-05,
+      "loss": 1.0985,
+      "step": 1320
+    },
+    {
+      "epoch": 0.7057256990679095,
+      "grad_norm": 0.07450043090731064,
+      "learning_rate": 4.823867122772329e-05,
+      "loss": 1.1177,
+      "step": 1325
+    },
+    {
+      "epoch": 0.7083888149134487,
+      "grad_norm": 0.07438166449706331,
+      "learning_rate": 4.744518618506319e-05,
+      "loss": 1.1225,
+      "step": 1330
+    },
+    {
+      "epoch": 0.711051930758988,
+      "grad_norm": 0.07157294185481793,
+      "learning_rate": 4.665624672785566e-05,
+      "loss": 1.1291,
+      "step": 1335
+    },
+    {
+      "epoch": 0.7137150466045273,
+      "grad_norm": 0.07520781395221099,
+      "learning_rate": 4.5871921093269875e-05,
+      "loss": 1.1082,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7163781624500666,
+      "grad_norm": 0.07203181974145231,
+      "learning_rate": 4.5092277119414975e-05,
+      "loss": 1.1333,
+      "step": 1345
+    },
+    {
+      "epoch": 0.7190412782956058,
+      "grad_norm": 0.07130465564504203,
+      "learning_rate": 4.431738223947252e-05,
+      "loss": 1.0951,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7217043941411452,
+      "grad_norm": 0.075489827909183,
+      "learning_rate": 4.35473034758643e-05,
+      "loss": 1.1223,
+      "step": 1355
+    },
+    {
+      "epoch": 0.7243675099866844,
+      "grad_norm": 0.07030155738463333,
+      "learning_rate": 4.2782107434455054e-05,
+      "loss": 1.1222,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7270306258322237,
+      "grad_norm": 0.07218531177873333,
+      "learning_rate": 4.202186029879195e-05,
+      "loss": 1.1135,
+      "step": 1365
+    },
+    {
+      "epoch": 0.729693741677763,
+      "grad_norm": 0.07513760712282253,
+      "learning_rate": 4.12666278243799e-05,
+      "loss": 1.1181,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7323568575233023,
+      "grad_norm": 0.07435576640285345,
+      "learning_rate": 4.0516475332994383e-05,
+      "loss": 1.119,
+      "step": 1375
+    },
+    {
+      "epoch": 0.7350199733688415,
+      "grad_norm": 0.07125188708419379,
+      "learning_rate": 3.9771467707031615e-05,
+      "loss": 1.1201,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7376830892143809,
+      "grad_norm": 0.07352772721979937,
+      "learning_rate": 3.903166938389664e-05,
+      "loss": 1.112,
+      "step": 1385
+    },
+    {
+      "epoch": 0.7403462050599201,
+      "grad_norm": 0.07055845215545062,
+      "learning_rate": 3.8297144350430144e-05,
+      "loss": 1.1046,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7430093209054593,
+      "grad_norm": 0.07019837722638089,
+      "learning_rate": 3.756795613737388e-05,
+      "loss": 1.1306,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7456724367509987,
+      "grad_norm": 0.07239297940522006,
+      "learning_rate": 3.684416781387589e-05,
+      "loss": 1.1184,
+      "step": 1400
+    },
+    {
+      "epoch": 0.748335552596538,
+      "grad_norm": 0.07225126596889433,
+      "learning_rate": 3.6125841982035536e-05,
+      "loss": 1.0843,
+      "step": 1405
+    },
+    {
+      "epoch": 0.7509986684420772,
+      "grad_norm": 0.07431737331558284,
+      "learning_rate": 3.5413040771488746e-05,
+      "loss": 1.1145,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7536617842876165,
+      "grad_norm": 0.07502128782750854,
+      "learning_rate": 3.47058258340345e-05,
+      "loss": 1.1114,
+      "step": 1415
+    },
+    {
+      "epoch": 0.7563249001331558,
+      "grad_norm": 0.07207529195587527,
+      "learning_rate": 3.4004258338302195e-05,
+      "loss": 1.116,
+      "step": 1420
+    },
+    {
+      "epoch": 0.758988015978695,
+      "grad_norm": 0.07002467859956689,
+      "learning_rate": 3.3308398964461206e-05,
+      "loss": 1.1198,
+      "step": 1425
+    },
+    {
+      "epoch": 0.7616511318242344,
+      "grad_norm": 0.07224058742693344,
+      "learning_rate": 3.261830789897241e-05,
+      "loss": 1.1367,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7643142476697736,
+      "grad_norm": 0.07150872607987452,
+      "learning_rate": 3.193404482938256e-05,
+      "loss": 1.0982,
+      "step": 1435
+    },
+    {
+      "epoch": 0.7669773635153129,
+      "grad_norm": 0.07178998194161153,
+      "learning_rate": 3.1255668939161894e-05,
+      "loss": 1.1301,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7696404793608522,
+      "grad_norm": 0.07537933178179766,
+      "learning_rate": 3.058323890258498e-05,
+      "loss": 1.0962,
+      "step": 1445
+    },
+    {
+      "epoch": 0.7723035952063915,
+      "grad_norm": 0.07189617812023931,
+      "learning_rate": 2.9916812879655975e-05,
+      "loss": 1.1299,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7749667110519307,
+      "grad_norm": 0.07135524342299995,
+      "learning_rate": 2.925644851107835e-05,
+      "loss": 1.1189,
+      "step": 1455
+    },
+    {
+      "epoch": 0.7776298268974701,
+      "grad_norm": 0.06926273022163672,
+      "learning_rate": 2.860220291326915e-05,
+      "loss": 1.1068,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7802929427430093,
+      "grad_norm": 0.07236268331467403,
+      "learning_rate": 2.7954132673419143e-05,
+      "loss": 1.0981,
+      "step": 1465
+    },
+    {
+      "epoch": 0.7829560585885486,
+      "grad_norm": 0.07137745473045948,
+      "learning_rate": 2.7312293844598246e-05,
+      "loss": 1.1045,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7856191744340879,
+      "grad_norm": 0.07319916214034358,
+      "learning_rate": 2.6676741940907478e-05,
+      "loss": 1.1281,
+      "step": 1475
+    },
+    {
+      "epoch": 0.7882822902796272,
+      "grad_norm": 0.07414585611908868,
+      "learning_rate": 2.6047531932677383e-05,
+      "loss": 1.1225,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7909454061251664,
+      "grad_norm": 0.07066978423343756,
+      "learning_rate": 2.542471824171353e-05,
+      "loss": 1.1356,
+      "step": 1485
+    },
+    {
+      "epoch": 0.7936085219707057,
+      "grad_norm": 0.07124448208473912,
+      "learning_rate": 2.4808354736589523e-05,
+      "loss": 1.1323,
+      "step": 1490
+    },
+    {
+      "epoch": 0.796271637816245,
+      "grad_norm": 0.07192509266254882,
+      "learning_rate": 2.419849472798761e-05,
+      "loss": 1.1386,
+      "step": 1495
+    },
+    {
+      "epoch": 0.7989347536617842,
+      "grad_norm": 0.07470536521159465,
+      "learning_rate": 2.359519096408791e-05,
+      "loss": 1.1103,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8015978695073236,
+      "grad_norm": 0.07210949951260932,
+      "learning_rate": 2.2998495626005957e-05,
+      "loss": 1.1108,
+      "step": 1505
+    },
+    {
+      "epoch": 0.8042609853528628,
+      "grad_norm": 0.07516415250373631,
+      "learning_rate": 2.240846032327949e-05,
+      "loss": 1.1404,
+      "step": 1510
+    },
+    {
+      "epoch": 0.8069241011984021,
+      "grad_norm": 0.07560562529629619,
+      "learning_rate": 2.1825136089404718e-05,
+      "loss": 1.0935,
+      "step": 1515
+    },
+    {
+      "epoch": 0.8095872170439414,
+      "grad_norm": 0.07195974938474745,
+      "learning_rate": 2.1248573377422155e-05,
+      "loss": 1.1182,
+      "step": 1520
+    },
+    {
+      "epoch": 0.8122503328894807,
+      "grad_norm": 0.07250882969384367,
+      "learning_rate": 2.0678822055552906e-05,
+      "loss": 1.1189,
+      "step": 1525
+    },
+    {
+      "epoch": 0.8149134487350199,
+      "grad_norm": 0.0721751215640965,
+      "learning_rate": 2.0115931402885458e-05,
+      "loss": 1.1115,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8175765645805593,
+      "grad_norm": 0.0753848259347461,
+      "learning_rate": 1.955995010511338e-05,
+      "loss": 1.1348,
+      "step": 1535
+    },
+    {
+      "epoch": 0.8202396804260985,
+      "grad_norm": 0.0719207373284397,
+      "learning_rate": 1.901092625032448e-05,
+      "loss": 1.1042,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8229027962716379,
+      "grad_norm": 0.07032664869488064,
+      "learning_rate": 1.84689073248414e-05,
+      "loss": 1.1009,
+      "step": 1545
+    },
+    {
+      "epoch": 0.8255659121171771,
+      "grad_norm": 0.0700654057925292,
+      "learning_rate": 1.7933940209114597e-05,
+      "loss": 1.1269,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8282290279627164,
+      "grad_norm": 0.07325193867745135,
+      "learning_rate": 1.7406071173667372e-05,
+      "loss": 1.1138,
+      "step": 1555
+    },
+    {
+      "epoch": 0.8308921438082557,
+      "grad_norm": 0.07059680065497263,
+      "learning_rate": 1.6885345875093918e-05,
+      "loss": 1.1202,
+      "step": 1560
+    },
+    {
+      "epoch": 0.833555259653795,
+      "grad_norm": 0.06973843219886788,
+      "learning_rate": 1.6371809352110447e-05,
+      "loss": 1.109,
+      "step": 1565
+    },
+    {
+      "epoch": 0.8362183754993342,
+      "grad_norm": 0.07028429615451927,
+      "learning_rate": 1.5865506021659516e-05,
+      "loss": 1.1422,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8388814913448736,
+      "grad_norm": 0.07070004444035057,
+      "learning_rate": 1.5366479675068435e-05,
+      "loss": 1.1139,
+      "step": 1575
+    },
+    {
+      "epoch": 0.8415446071904128,
+      "grad_norm": 0.06902941716806588,
+      "learning_rate": 1.4874773474261638e-05,
+      "loss": 1.1179,
+      "step": 1580
+    },
+    {
+      "epoch": 0.844207723035952,
+      "grad_norm": 0.07227429481260465,
+      "learning_rate": 1.4390429948027428e-05,
+      "loss": 1.1156,
+      "step": 1585
+    },
+    {
+      "epoch": 0.8468708388814914,
+      "grad_norm": 0.07067313252269349,
+      "learning_rate": 1.3913490988339718e-05,
+      "loss": 1.1209,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8495339547270306,
+      "grad_norm": 0.07249077076436629,
+      "learning_rate": 1.3443997846734535e-05,
+      "loss": 1.1303,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8521970705725699,
+      "grad_norm": 0.07121930519374212,
+      "learning_rate": 1.2981991130742211e-05,
+      "loss": 1.1069,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8548601864181092,
+      "grad_norm": 0.06967637890151408,
+      "learning_rate": 1.2527510800375043e-05,
+      "loss": 1.1007,
+      "step": 1605
+    },
+    {
+      "epoch": 0.8575233022636485,
+      "grad_norm": 0.07205061200000021,
+      "learning_rate": 1.20805961646711e-05,
+      "loss": 1.1199,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8601864181091877,
+      "grad_norm": 0.07129006954483187,
+      "learning_rate": 1.1641285878294372e-05,
+      "loss": 1.1054,
+      "step": 1615
+    },
+    {
+      "epoch": 0.8628495339547271,
+      "grad_norm": 0.07540152645446788,
+      "learning_rate": 1.1209617938191307e-05,
+      "loss": 1.1032,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8655126498002663,
+      "grad_norm": 0.07238275344561401,
+      "learning_rate": 1.0785629680304432e-05,
+      "loss": 1.1246,
+      "step": 1625
+    },
+    {
+      "epoch": 0.8681757656458056,
+      "grad_norm": 0.07120538294411066,
+      "learning_rate": 1.0369357776343103e-05,
+      "loss": 1.0932,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8708388814913449,
+      "grad_norm": 0.07125849884630578,
+      "learning_rate": 9.960838230611635e-06,
+      "loss": 1.0728,
+      "step": 1635
+    },
+    {
+      "epoch": 0.8735019973368842,
+      "grad_norm": 0.07387109327159767,
+      "learning_rate": 9.560106376895306e-06,
+      "loss": 1.1275,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8761651131824234,
+      "grad_norm": 0.08867831950654811,
+      "learning_rate": 9.167196875404094e-06,
+      "loss": 1.1134,
+      "step": 1645
+    },
+    {
+      "epoch": 0.8788282290279628,
+      "grad_norm": 0.07206488784580595,
+      "learning_rate": 8.782143709775015e-06,
+      "loss": 1.109,
+      "step": 1650
+    },
+    {
+      "epoch": 0.881491344873502,
+      "grad_norm": 0.07037650282884113,
+      "learning_rate": 8.40498018413266e-06,
+      "loss": 1.0862,
+      "step": 1655
+    },
+    {
+      "epoch": 0.8841544607190412,
+      "grad_norm": 0.07329640323219759,
+      "learning_rate": 8.035738920208714e-06,
+      "loss": 1.1539,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8868175765645806,
+      "grad_norm": 0.07238224996992595,
+      "learning_rate": 7.67445185452046e-06,
+      "loss": 1.14,
+      "step": 1665
+    },
+    {
+      "epoch": 0.8894806924101198,
+      "grad_norm": 0.0709614207657161,
+      "learning_rate": 7.321150235608399e-06,
+      "loss": 1.1084,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8921438082556591,
+      "grad_norm": 0.0726275702018448,
+      "learning_rate": 6.9758646213336165e-06,
+      "loss": 1.1227,
+      "step": 1675
+    },
+    {
+      "epoch": 0.8948069241011984,
+      "grad_norm": 0.07467788565919785,
+      "learning_rate": 6.6386248762347004e-06,
+      "loss": 1.1135,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8974700399467377,
+      "grad_norm": 0.07203922681266979,
+      "learning_rate": 6.309460168944692e-06,
+      "loss": 1.1071,
+      "step": 1685
+    },
+    {
+      "epoch": 0.9001331557922769,
+      "grad_norm": 0.07114745434226347,
+      "learning_rate": 5.988398969668285e-06,
+      "loss": 1.1248,
+      "step": 1690
+    },
+    {
+      "epoch": 0.9027962716378163,
+      "grad_norm": 0.07111413913413335,
+      "learning_rate": 5.6754690477192396e-06,
+      "loss": 1.0872,
+      "step": 1695
+    },
+    {
+      "epoch": 0.9054593874833555,
+      "grad_norm": 0.07072306177993802,
+      "learning_rate": 5.370697469118713e-06,
+      "loss": 1.0824,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9081225033288948,
+      "grad_norm": 0.07185114603062198,
+      "learning_rate": 5.074110594254133e-06,
+      "loss": 1.107,
+      "step": 1705
+    },
+    {
+      "epoch": 0.9107856191744341,
+      "grad_norm": 0.07285137048040193,
+      "learning_rate": 4.78573407559928e-06,
+      "loss": 1.1173,
+      "step": 1710
+    },
+    {
+      "epoch": 0.9134487350199734,
+      "grad_norm": 0.07013700832744958,
+      "learning_rate": 4.5055928554955665e-06,
+      "loss": 1.116,
+      "step": 1715
+    },
+    {
+      "epoch": 0.9161118508655126,
+      "grad_norm": 0.06933918410316768,
+      "learning_rate": 4.233711163994669e-06,
+      "loss": 1.1038,
+      "step": 1720
+    },
+    {
+      "epoch": 0.918774966711052,
+      "grad_norm": 0.072978505928099,
+      "learning_rate": 3.970112516762825e-06,
+      "loss": 1.104,
+      "step": 1725
+    },
+    {
+      "epoch": 0.9214380825565912,
+      "grad_norm": 0.07096666064833597,
+      "learning_rate": 3.7148197130469576e-06,
+      "loss": 1.1169,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9241011984021305,
+      "grad_norm": 0.07024364472390462,
+      "learning_rate": 3.467854833702644e-06,
+      "loss": 1.1051,
+      "step": 1735
+    },
+    {
+      "epoch": 0.9267643142476698,
+      "grad_norm": 0.0725948329149083,
+      "learning_rate": 3.229239239284354e-06,
+      "loss": 1.1257,
+      "step": 1740
+    },
+    {
+      "epoch": 0.929427430093209,
+      "grad_norm": 0.07215970561357568,
+      "learning_rate": 2.9989935681979164e-06,
+      "loss": 1.1262,
+      "step": 1745
+    },
+    {
+      "epoch": 0.9320905459387483,
+      "grad_norm": 0.0737761172543898,
+      "learning_rate": 2.777137734915403e-06,
+      "loss": 1.1091,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9347536617842876,
+      "grad_norm": 0.0714550851543958,
+      "learning_rate": 2.563690928252749e-06,
+      "loss": 1.1283,
+      "step": 1755
+    },
+    {
+      "epoch": 0.9374167776298269,
+      "grad_norm": 0.07157410935337963,
+      "learning_rate": 2.358671609710017e-06,
+      "loss": 1.1239,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9400798934753661,
+      "grad_norm": 0.0715459262435765,
+      "learning_rate": 2.1620975118746835e-06,
+      "loss": 1.1283,
+      "step": 1765
+    },
+    {
+      "epoch": 0.9427430093209055,
+      "grad_norm": 0.07286793622379441,
+      "learning_rate": 1.9739856368878096e-06,
+      "loss": 1.1443,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9454061251664447,
+      "grad_norm": 0.07254981755374063,
+      "learning_rate": 1.794352254973597e-06,
+      "loss": 1.0752,
+      "step": 1775
+    },
+    {
+      "epoch": 0.948069241011984,
+      "grad_norm": 0.07583062173231125,
+      "learning_rate": 1.6232129030320453e-06,
+      "loss": 1.1011,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9507323568575233,
+      "grad_norm": 0.07237886602221254,
+      "learning_rate": 1.4605823832951948e-06,
+      "loss": 1.1063,
+      "step": 1785
+    },
+    {
+      "epoch": 0.9533954727030626,
+      "grad_norm": 0.07194863306691765,
+      "learning_rate": 1.3064747620468054e-06,
+      "loss": 1.0914,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9560585885486018,
+      "grad_norm": 0.07189535177125876,
+      "learning_rate": 1.1609033684057857e-06,
+      "loss": 1.1048,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9587217043941412,
+      "grad_norm": 0.07110020697306084,
+      "learning_rate": 1.0238807931732487e-06,
+      "loss": 1.1219,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9613848202396804,
+      "grad_norm": 0.07370460890021728,
+      "learning_rate": 8.95418887743571e-07,
+      "loss": 1.1317,
+      "step": 1805
+    },
+    {
+      "epoch": 0.9640479360852197,
+      "grad_norm": 0.07207691237434337,
+      "learning_rate": 7.75528763079314e-07,
+      "loss": 1.1126,
+      "step": 1810
+    },
+    {
+      "epoch": 0.966711051930759,
+      "grad_norm": 0.07440378502380679,
+      "learning_rate": 6.642207887502027e-07,
+      "loss": 1.1262,
+      "step": 1815
+    },
+    {
+      "epoch": 0.9693741677762983,
+      "grad_norm": 0.06937782526053558,
+      "learning_rate": 5.615045920362549e-07,
+      "loss": 1.1191,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9720372836218375,
+      "grad_norm": 0.07098237504174736,
+      "learning_rate": 4.673890570951023e-07,
+      "loss": 1.1218,
+      "step": 1825
+    },
+    {
+      "epoch": 0.9747003994673769,
+      "grad_norm": 0.07181367245928705,
+      "learning_rate": 3.8188232419352764e-07,
+      "loss": 1.1514,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9773635153129161,
+      "grad_norm": 0.07128549274991237,
+      "learning_rate": 3.049917890034837e-07,
+      "loss": 1.0945,
+      "step": 1835
+    },
+    {
+      "epoch": 0.9800266311584553,
+      "grad_norm": 0.07161530878264005,
+      "learning_rate": 2.3672410196232675e-07,
+      "loss": 1.1056,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9826897470039947,
+      "grad_norm": 0.07090950092701657,
+      "learning_rate": 1.7708516769769924e-07,
+      "loss": 1.1109,
+      "step": 1845
+    },
+    {
+      "epoch": 0.9853528628495339,
+      "grad_norm": 0.07140303003446424,
+      "learning_rate": 1.2608014451672702e-07,
+      "loss": 1.1252,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9880159786950732,
+      "grad_norm": 0.07103068810909154,
+      "learning_rate": 8.371344395996516e-08,
+      "loss": 1.1255,
+      "step": 1855
+    },
+    {
+      "epoch": 0.9906790945406125,
+      "grad_norm": 0.07153307782175972,
+      "learning_rate": 4.998873041975882e-08,
+      "loss": 1.1365,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9933422103861518,
+      "grad_norm": 0.06996817906726589,
+      "learning_rate": 2.490892082331886e-08,
+      "loss": 1.1142,
+      "step": 1865
+    },
+    {
+      "epoch": 0.996005326231691,
+      "grad_norm": 0.07126895143317796,
+      "learning_rate": 8.476184380468155e-09,
+      "loss": 1.1091,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9986684420772304,
+      "grad_norm": 0.0706408297239244,
+      "learning_rate": 6.919423959805826e-10,
+      "loss": 1.1206,
+      "step": 1875
+    },
+    {
+      "epoch": 0.9997336884154461,
+      "eval_loss": 1.118857979774475,
+      "eval_runtime": 1652.5253,
+      "eval_samples_per_second": 8.045,
+      "eval_steps_per_second": 0.503,
+      "step": 1877
+    },
+    {
+      "epoch": 0.9997336884154461,
+      "step": 1877,
+      "total_flos": 2.979798729936077e+16,
+      "train_loss": 1.1429596533467938,
+      "train_runtime": 55739.5179,
+      "train_samples_per_second": 2.156,
+      "train_steps_per_second": 0.034
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1877,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.979798729936077e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}