emonnsl committed on
Commit ae0aef2
1 Parent(s): aba4cd7

Upload 7 files
README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ library_name: peft
+ base_model: hishab/titulm-1b-bn-v1
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.11.1
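The card's quick-start section is still a placeholder. A minimal loading sketch for a PEFT LoRA adapter like this one might look as follows; the adapter repo id passed in is hypothetical, and this assumes `transformers` and `peft` (0.11.x) are installed. Imports are kept inside the function so the sketch reads standalone.

```python
def load_adapter(adapter_repo_id: str):
    """Sketch: load the base model plus a LoRA adapter.

    `adapter_repo_id` is a hypothetical repo id for this adapter;
    AutoPeftModelForCausalLM resolves the base model from the
    adapter_config.json ("base_model_name_or_path") automatically.
    """
    from transformers import AutoTokenizer
    from peft import AutoPeftModelForCausalLM

    model = AutoPeftModelForCausalLM.from_pretrained(adapter_repo_id)
    # The tokenizer comes from the base model recorded in the adapter config.
    tokenizer = AutoTokenizer.from_pretrained("hishab/titulm-1b-bn-v1")
    return model, tokenizer
```

The function is not called here, since it would download the base checkpoint.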
adapter_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "hishab/titulm-1b-bn-v1",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "down_proj",
+ "out_proj",
+ "up_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
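Two quantities follow directly from this config: the LoRA scaling factor is `lora_alpha / r` (32 / 16 = 2.0), and each targeted projection of shape `(d_out, d_in)` adds `r * (d_in + d_out)` trainable parameters (an `(r, d_in)` matrix A and a `(d_out, r)` matrix B). A small sketch; the projection shapes used below are illustrative, not read from the base model:

```python
config = {"r": 16, "lora_alpha": 32,
          "target_modules": ["down_proj", "out_proj", "up_proj"]}

# Multiplier applied to the low-rank update B @ A.
scaling = config["lora_alpha"] / config["r"]

def lora_params(d_out: int, d_in: int, r: int) -> int:
    """Trainable parameters one LoRA pair adds to a (d_out, d_in) projection."""
    return r * (d_in + d_out)  # A: (r, d_in), B: (d_out, r)

# Hypothetical shapes for one decoder-layer projection, for illustration only.
example = lora_params(d_out=5504, d_in=2048, r=config["r"])
print(scaling)  # 2.0
print(example)
```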
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a3b4361eea93a0bda321ad452248bbccc466147cd1696128e8396206dc26b99
+ size 37768408
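The three lines above are a Git LFS pointer, not the tensor data itself: the ~38 MB safetensors file lives in LFS storage, keyed by the SHA-256 oid, and the pointer is what the git diff actually records. A small parser sketch for this pointer format:

```python
POINTER = """\
version https://git-lfs.github.com/spec/v1
oid sha256:6a3b4361eea93a0bda321ad452248bbccc466147cd1696128e8396206dc26b99
size 37768408
"""

def parse_lfs_pointer(text: str) -> dict:
    """Parse the space-separated key/value lines of a git-lfs pointer file."""
    fields = dict(line.split(" ", 1) for line in text.splitlines() if line)
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "hash_algo": algo,
            "digest": digest, "size_bytes": int(fields["size"])}

info = parse_lfs_pointer(POINTER)
print(info["size_bytes"])  # 37768408
```

The same format applies to the rng_state.pth and scheduler.pt pointers below.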
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff09c451a10d4088bb54a29920ab4c7f6bfe1e15f4fa7f70303d92cee153f304
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1da0efa6bd078f65dc649d4e90d30aba6d462a8362459d030146b3c40a5e6c58
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2253 @@
+ {
+ "best_metric": 1.9664931297302246,
+ "best_model_checkpoint": "./lora_bn_resume/checkpoint-3000",
+ "epoch": 1.9292604501607717,
+ "eval_steps": 200,
+ "global_step": 3000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.006430868167202572,
+ "grad_norm": 0.7529953718185425,
+ "learning_rate": 2.9999999999999997e-05,
+ "loss": 2.01,
+ "step": 10
+ },
+ {
+ "epoch": 0.012861736334405145,
+ "grad_norm": 0.8143910765647888,
+ "learning_rate": 5.9999999999999995e-05,
+ "loss": 1.9794,
+ "step": 20
+ },
+ {
+ "epoch": 0.01929260450160772,
+ "grad_norm": 0.7554563283920288,
+ "learning_rate": 8.999999999999999e-05,
+ "loss": 1.9687,
+ "step": 30
+ },
+ {
+ "epoch": 0.02572347266881029,
+ "grad_norm": 0.701172411441803,
+ "learning_rate": 0.00011999999999999999,
+ "loss": 2.0374,
+ "step": 40
+ },
+ {
+ "epoch": 0.03215434083601286,
+ "grad_norm": 0.7426002621650696,
+ "learning_rate": 0.00015,
+ "loss": 1.8484,
+ "step": 50
+ },
+ {
+ "epoch": 0.03858520900321544,
+ "grad_norm": 0.7900332808494568,
+ "learning_rate": 0.00017999999999999998,
+ "loss": 1.91,
+ "step": 60
+ },
+ {
+ "epoch": 0.04501607717041801,
+ "grad_norm": 0.7825136184692383,
+ "learning_rate": 0.00020999999999999998,
+ "loss": 1.9625,
+ "step": 70
+ },
+ {
+ "epoch": 0.05144694533762058,
+ "grad_norm": 0.9338003993034363,
+ "learning_rate": 0.00023999999999999998,
+ "loss": 1.9668,
+ "step": 80
+ },
+ {
+ "epoch": 0.05787781350482315,
+ "grad_norm": 0.8660485148429871,
+ "learning_rate": 0.00027,
+ "loss": 2.0447,
+ "step": 90
+ },
+ {
+ "epoch": 0.06430868167202572,
+ "grad_norm": 0.8631746768951416,
+ "learning_rate": 0.0003,
+ "loss": 2.0347,
+ "step": 100
+ },
+ {
+ "epoch": 0.0707395498392283,
+ "grad_norm": 0.9202760457992554,
+ "learning_rate": 0.00029934282584884994,
+ "loss": 2.0218,
+ "step": 110
+ },
+ {
+ "epoch": 0.07717041800643087,
+ "grad_norm": 0.8508992791175842,
+ "learning_rate": 0.00029868565169769985,
+ "loss": 1.9808,
+ "step": 120
+ },
+ {
+ "epoch": 0.08360128617363344,
+ "grad_norm": 0.9962050914764404,
+ "learning_rate": 0.0002980284775465498,
+ "loss": 1.9586,
+ "step": 130
+ },
+ {
+ "epoch": 0.09003215434083602,
+ "grad_norm": 0.9159810543060303,
+ "learning_rate": 0.00029737130339539973,
+ "loss": 2.0257,
+ "step": 140
+ },
+ {
+ "epoch": 0.09646302250803858,
+ "grad_norm": 0.8135138750076294,
+ "learning_rate": 0.0002967141292442497,
+ "loss": 2.0103,
+ "step": 150
+ },
+ {
+ "epoch": 0.10289389067524116,
+ "grad_norm": 0.7933633327484131,
+ "learning_rate": 0.00029605695509309966,
+ "loss": 2.028,
+ "step": 160
+ },
+ {
+ "epoch": 0.10932475884244373,
+ "grad_norm": 0.9258368611335754,
+ "learning_rate": 0.00029539978094194957,
+ "loss": 2.0654,
+ "step": 170
+ },
+ {
+ "epoch": 0.1157556270096463,
+ "grad_norm": 0.8758969902992249,
+ "learning_rate": 0.00029474260679079954,
+ "loss": 1.9928,
+ "step": 180
+ },
+ {
+ "epoch": 0.12218649517684887,
+ "grad_norm": 0.8316165804862976,
+ "learning_rate": 0.00029408543263964945,
+ "loss": 1.9748,
+ "step": 190
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "grad_norm": 0.8353763222694397,
+ "learning_rate": 0.0002934282584884994,
+ "loss": 2.0167,
+ "step": 200
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "eval_loss": 2.0699551105499268,
+ "eval_runtime": 131.8406,
+ "eval_samples_per_second": 15.17,
+ "eval_steps_per_second": 1.896,
+ "step": 200
+ },
+ {
+ "epoch": 0.13504823151125403,
+ "grad_norm": 0.8024882078170776,
+ "learning_rate": 0.0002927710843373494,
+ "loss": 2.1039,
+ "step": 210
+ },
+ {
+ "epoch": 0.1414790996784566,
+ "grad_norm": 0.861377477645874,
+ "learning_rate": 0.0002921139101861993,
+ "loss": 2.023,
+ "step": 220
+ },
+ {
+ "epoch": 0.14790996784565916,
+ "grad_norm": 0.8247071504592896,
+ "learning_rate": 0.00029145673603504926,
+ "loss": 1.9341,
+ "step": 230
+ },
+ {
+ "epoch": 0.15434083601286175,
+ "grad_norm": 0.8182681202888489,
+ "learning_rate": 0.0002907995618838992,
+ "loss": 2.0137,
+ "step": 240
+ },
+ {
+ "epoch": 0.1607717041800643,
+ "grad_norm": 0.8556217551231384,
+ "learning_rate": 0.00029014238773274913,
+ "loss": 2.0638,
+ "step": 250
+ },
+ {
+ "epoch": 0.16720257234726688,
+ "grad_norm": 0.7721512913703918,
+ "learning_rate": 0.0002894852135815991,
+ "loss": 2.0061,
+ "step": 260
+ },
+ {
+ "epoch": 0.17363344051446947,
+ "grad_norm": 0.7948784828186035,
+ "learning_rate": 0.000288828039430449,
+ "loss": 1.9751,
+ "step": 270
+ },
+ {
+ "epoch": 0.18006430868167203,
+ "grad_norm": 0.7582404613494873,
+ "learning_rate": 0.000288170865279299,
+ "loss": 2.0254,
+ "step": 280
+ },
+ {
+ "epoch": 0.1864951768488746,
+ "grad_norm": 0.9620535969734192,
+ "learning_rate": 0.00028751369112814894,
+ "loss": 1.9978,
+ "step": 290
+ },
+ {
+ "epoch": 0.19292604501607716,
+ "grad_norm": 0.7374221682548523,
+ "learning_rate": 0.00028685651697699885,
+ "loss": 2.0631,
+ "step": 300
+ },
+ {
+ "epoch": 0.19935691318327975,
+ "grad_norm": 0.794651210308075,
+ "learning_rate": 0.0002861993428258488,
+ "loss": 1.9507,
+ "step": 310
+ },
+ {
+ "epoch": 0.2057877813504823,
+ "grad_norm": 0.7450920939445496,
+ "learning_rate": 0.00028554216867469873,
+ "loss": 2.0363,
+ "step": 320
+ },
+ {
+ "epoch": 0.21221864951768488,
+ "grad_norm": 0.7574348449707031,
+ "learning_rate": 0.0002848849945235487,
+ "loss": 2.0508,
+ "step": 330
+ },
+ {
+ "epoch": 0.21864951768488747,
+ "grad_norm": 0.9118533134460449,
+ "learning_rate": 0.00028422782037239866,
+ "loss": 2.0118,
+ "step": 340
+ },
+ {
+ "epoch": 0.22508038585209003,
+ "grad_norm": 0.8136394023895264,
+ "learning_rate": 0.0002835706462212486,
+ "loss": 2.1211,
+ "step": 350
+ },
+ {
+ "epoch": 0.2315112540192926,
+ "grad_norm": 0.9099079966545105,
+ "learning_rate": 0.00028291347207009854,
+ "loss": 2.0346,
+ "step": 360
+ },
+ {
+ "epoch": 0.2379421221864952,
+ "grad_norm": 0.830896258354187,
+ "learning_rate": 0.0002822562979189485,
+ "loss": 2.0494,
+ "step": 370
+ },
+ {
+ "epoch": 0.24437299035369775,
+ "grad_norm": 0.789002001285553,
+ "learning_rate": 0.0002815991237677984,
+ "loss": 1.9791,
+ "step": 380
+ },
+ {
+ "epoch": 0.2508038585209003,
+ "grad_norm": 0.8194644451141357,
+ "learning_rate": 0.0002809419496166484,
+ "loss": 2.0106,
+ "step": 390
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "grad_norm": 0.8226191401481628,
+ "learning_rate": 0.00028028477546549835,
+ "loss": 2.0268,
+ "step": 400
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "eval_loss": 2.057727575302124,
+ "eval_runtime": 127.2637,
+ "eval_samples_per_second": 15.715,
+ "eval_steps_per_second": 1.964,
+ "step": 400
+ },
+ {
+ "epoch": 0.26366559485530544,
+ "grad_norm": 0.796454668045044,
+ "learning_rate": 0.00027962760131434826,
+ "loss": 2.0376,
+ "step": 410
+ },
+ {
+ "epoch": 0.27009646302250806,
+ "grad_norm": 0.8327352404594421,
+ "learning_rate": 0.0002789704271631982,
+ "loss": 2.0481,
+ "step": 420
+ },
+ {
+ "epoch": 0.2765273311897106,
+ "grad_norm": 0.8051420450210571,
+ "learning_rate": 0.0002783132530120482,
+ "loss": 1.99,
+ "step": 430
+ },
+ {
+ "epoch": 0.2829581993569132,
+ "grad_norm": 0.7519128322601318,
+ "learning_rate": 0.0002776560788608981,
+ "loss": 2.0339,
+ "step": 440
+ },
+ {
+ "epoch": 0.28938906752411575,
+ "grad_norm": 0.8251495957374573,
+ "learning_rate": 0.00027699890470974807,
+ "loss": 2.0289,
+ "step": 450
+ },
+ {
+ "epoch": 0.2958199356913183,
+ "grad_norm": 0.7058277130126953,
+ "learning_rate": 0.000276341730558598,
+ "loss": 2.0669,
+ "step": 460
+ },
+ {
+ "epoch": 0.3022508038585209,
+ "grad_norm": 0.8475114107131958,
+ "learning_rate": 0.00027568455640744795,
+ "loss": 2.0506,
+ "step": 470
+ },
+ {
+ "epoch": 0.3086816720257235,
+ "grad_norm": 0.7855744957923889,
+ "learning_rate": 0.0002750273822562979,
+ "loss": 1.97,
+ "step": 480
+ },
+ {
+ "epoch": 0.31511254019292606,
+ "grad_norm": 0.727988064289093,
+ "learning_rate": 0.0002743702081051478,
+ "loss": 2.0705,
+ "step": 490
+ },
+ {
+ "epoch": 0.3215434083601286,
+ "grad_norm": 0.7662935853004456,
+ "learning_rate": 0.0002737130339539978,
+ "loss": 1.9678,
+ "step": 500
+ },
+ {
+ "epoch": 0.3279742765273312,
+ "grad_norm": 0.9171555638313293,
+ "learning_rate": 0.00027305585980284776,
+ "loss": 1.9818,
+ "step": 510
+ },
+ {
+ "epoch": 0.33440514469453375,
+ "grad_norm": 0.7959179282188416,
+ "learning_rate": 0.00027239868565169767,
+ "loss": 2.0014,
+ "step": 520
+ },
+ {
+ "epoch": 0.3408360128617363,
+ "grad_norm": 0.9359775185585022,
+ "learning_rate": 0.00027174151150054763,
+ "loss": 2.0244,
+ "step": 530
+ },
+ {
+ "epoch": 0.34726688102893893,
+ "grad_norm": 0.7740966081619263,
+ "learning_rate": 0.0002710843373493976,
+ "loss": 2.0883,
+ "step": 540
+ },
+ {
+ "epoch": 0.3536977491961415,
+ "grad_norm": 0.868601381778717,
+ "learning_rate": 0.0002704271631982475,
+ "loss": 2.0226,
+ "step": 550
+ },
+ {
+ "epoch": 0.36012861736334406,
+ "grad_norm": 0.8721134662628174,
+ "learning_rate": 0.0002697699890470975,
+ "loss": 2.0965,
+ "step": 560
+ },
+ {
+ "epoch": 0.3665594855305466,
+ "grad_norm": 0.8080394268035889,
+ "learning_rate": 0.00026911281489594744,
+ "loss": 2.0082,
+ "step": 570
+ },
+ {
+ "epoch": 0.3729903536977492,
+ "grad_norm": 1.7169413566589355,
+ "learning_rate": 0.00026845564074479735,
+ "loss": 2.039,
+ "step": 580
+ },
+ {
+ "epoch": 0.37942122186495175,
+ "grad_norm": 0.8220880031585693,
+ "learning_rate": 0.0002677984665936473,
+ "loss": 2.0696,
+ "step": 590
+ },
+ {
+ "epoch": 0.3858520900321543,
+ "grad_norm": 0.7639694213867188,
+ "learning_rate": 0.00026714129244249723,
+ "loss": 2.0014,
+ "step": 600
+ },
+ {
+ "epoch": 0.3858520900321543,
+ "eval_loss": 2.0443177223205566,
+ "eval_runtime": 133.8726,
+ "eval_samples_per_second": 14.94,
+ "eval_steps_per_second": 1.867,
+ "step": 600
+ },
+ {
+ "epoch": 0.39228295819935693,
+ "grad_norm": 0.817965567111969,
+ "learning_rate": 0.0002664841182913472,
+ "loss": 2.0553,
+ "step": 610
+ },
+ {
+ "epoch": 0.3987138263665595,
+ "grad_norm": 0.871166467666626,
+ "learning_rate": 0.00026582694414019716,
+ "loss": 2.0027,
+ "step": 620
+ },
+ {
+ "epoch": 0.40514469453376206,
+ "grad_norm": 0.7483948469161987,
+ "learning_rate": 0.00026516976998904707,
+ "loss": 2.0355,
+ "step": 630
+ },
+ {
+ "epoch": 0.4115755627009646,
+ "grad_norm": 0.8223303556442261,
+ "learning_rate": 0.00026451259583789704,
+ "loss": 2.0076,
+ "step": 640
+ },
+ {
+ "epoch": 0.4180064308681672,
+ "grad_norm": 0.80986088514328,
+ "learning_rate": 0.00026385542168674695,
+ "loss": 2.0781,
+ "step": 650
+ },
+ {
+ "epoch": 0.42443729903536975,
+ "grad_norm": 0.7527362704277039,
+ "learning_rate": 0.0002631982475355969,
+ "loss": 1.9727,
+ "step": 660
+ },
+ {
+ "epoch": 0.43086816720257237,
+ "grad_norm": 0.7571489810943604,
+ "learning_rate": 0.0002625410733844469,
+ "loss": 2.0205,
+ "step": 670
+ },
+ {
+ "epoch": 0.43729903536977494,
+ "grad_norm": 0.7976600527763367,
+ "learning_rate": 0.0002618838992332968,
+ "loss": 2.0505,
+ "step": 680
+ },
+ {
+ "epoch": 0.4437299035369775,
+ "grad_norm": 0.8057394623756409,
+ "learning_rate": 0.00026122672508214676,
+ "loss": 2.0351,
+ "step": 690
+ },
+ {
+ "epoch": 0.45016077170418006,
+ "grad_norm": 0.8420009016990662,
+ "learning_rate": 0.0002605695509309967,
+ "loss": 1.9655,
+ "step": 700
+ },
+ {
+ "epoch": 0.4565916398713826,
+ "grad_norm": 0.853597104549408,
+ "learning_rate": 0.00025991237677984664,
+ "loss": 1.9939,
+ "step": 710
+ },
+ {
+ "epoch": 0.4630225080385852,
+ "grad_norm": 0.7588443160057068,
+ "learning_rate": 0.0002592552026286966,
+ "loss": 2.032,
+ "step": 720
+ },
+ {
+ "epoch": 0.4694533762057878,
+ "grad_norm": 0.8099080920219421,
+ "learning_rate": 0.0002585980284775465,
+ "loss": 1.9817,
+ "step": 730
+ },
+ {
+ "epoch": 0.4758842443729904,
+ "grad_norm": 0.7894070148468018,
+ "learning_rate": 0.0002579408543263965,
+ "loss": 2.0001,
+ "step": 740
+ },
+ {
+ "epoch": 0.48231511254019294,
+ "grad_norm": 0.7474116683006287,
+ "learning_rate": 0.00025728368017524644,
+ "loss": 2.0077,
+ "step": 750
+ },
+ {
+ "epoch": 0.4887459807073955,
+ "grad_norm": 0.8076878786087036,
+ "learning_rate": 0.00025662650602409636,
+ "loss": 2.0394,
+ "step": 760
+ },
+ {
+ "epoch": 0.49517684887459806,
+ "grad_norm": 0.7559667825698853,
+ "learning_rate": 0.0002559693318729463,
+ "loss": 1.9753,
+ "step": 770
+ },
+ {
+ "epoch": 0.5016077170418006,
+ "grad_norm": 0.7402215600013733,
+ "learning_rate": 0.00025531215772179623,
+ "loss": 2.0353,
+ "step": 780
+ },
+ {
+ "epoch": 0.5080385852090032,
+ "grad_norm": 0.7112523317337036,
+ "learning_rate": 0.0002546549835706462,
+ "loss": 1.989,
+ "step": 790
+ },
+ {
+ "epoch": 0.5144694533762058,
+ "grad_norm": 0.7255666255950928,
+ "learning_rate": 0.00025399780941949616,
+ "loss": 1.9912,
+ "step": 800
+ },
+ {
+ "epoch": 0.5144694533762058,
+ "eval_loss": 2.0358893871307373,
+ "eval_runtime": 131.9747,
+ "eval_samples_per_second": 15.154,
+ "eval_steps_per_second": 1.894,
+ "step": 800
+ },
+ {
+ "epoch": 0.5209003215434084,
+ "grad_norm": 0.7614848613739014,
+ "learning_rate": 0.0002533406352683461,
+ "loss": 1.9507,
+ "step": 810
+ },
+ {
+ "epoch": 0.5273311897106109,
+ "grad_norm": 0.7834282517433167,
+ "learning_rate": 0.00025268346111719604,
+ "loss": 2.0572,
+ "step": 820
+ },
+ {
+ "epoch": 0.5337620578778135,
+ "grad_norm": 0.8642615079879761,
+ "learning_rate": 0.00025202628696604595,
+ "loss": 1.9766,
+ "step": 830
+ },
+ {
+ "epoch": 0.5401929260450161,
+ "grad_norm": 0.7937222123146057,
+ "learning_rate": 0.0002513691128148959,
+ "loss": 1.9718,
+ "step": 840
+ },
+ {
+ "epoch": 0.5466237942122186,
+ "grad_norm": 0.7922580242156982,
+ "learning_rate": 0.0002507119386637459,
+ "loss": 2.0098,
+ "step": 850
+ },
+ {
+ "epoch": 0.5530546623794212,
+ "grad_norm": 0.7464605569839478,
+ "learning_rate": 0.0002500547645125958,
+ "loss": 1.9529,
+ "step": 860
+ },
+ {
+ "epoch": 0.5594855305466238,
+ "grad_norm": 0.7568275332450867,
+ "learning_rate": 0.00024939759036144576,
+ "loss": 1.989,
+ "step": 870
+ },
+ {
+ "epoch": 0.5659163987138264,
+ "grad_norm": 0.7011362910270691,
+ "learning_rate": 0.00024874041621029573,
+ "loss": 2.031,
+ "step": 880
+ },
+ {
+ "epoch": 0.572347266881029,
+ "grad_norm": 0.7106270790100098,
+ "learning_rate": 0.00024808324205914564,
+ "loss": 2.022,
+ "step": 890
+ },
+ {
+ "epoch": 0.5787781350482315,
+ "grad_norm": 0.7415210604667664,
+ "learning_rate": 0.0002474260679079956,
+ "loss": 2.0595,
+ "step": 900
+ },
+ {
+ "epoch": 0.5852090032154341,
+ "grad_norm": 0.7313567399978638,
+ "learning_rate": 0.0002467688937568455,
+ "loss": 2.0293,
+ "step": 910
+ },
+ {
+ "epoch": 0.5916398713826366,
+ "grad_norm": 0.692523181438446,
+ "learning_rate": 0.0002461117196056955,
+ "loss": 2.0746,
+ "step": 920
+ },
+ {
+ "epoch": 0.5980707395498392,
+ "grad_norm": 0.6929277181625366,
+ "learning_rate": 0.00024545454545454545,
+ "loss": 1.955,
+ "step": 930
+ },
+ {
+ "epoch": 0.6045016077170418,
+ "grad_norm": 0.7199161648750305,
+ "learning_rate": 0.00024479737130339536,
+ "loss": 2.0454,
+ "step": 940
+ },
+ {
+ "epoch": 0.6109324758842444,
+ "grad_norm": 0.767314076423645,
+ "learning_rate": 0.00024414019715224533,
+ "loss": 2.0428,
+ "step": 950
+ },
+ {
+ "epoch": 0.617363344051447,
+ "grad_norm": 0.8044443130493164,
+ "learning_rate": 0.00024348302300109526,
+ "loss": 1.9423,
+ "step": 960
+ },
+ {
+ "epoch": 0.6237942122186495,
+ "grad_norm": 0.702936589717865,
+ "learning_rate": 0.0002428258488499452,
+ "loss": 1.9271,
+ "step": 970
+ },
+ {
+ "epoch": 0.6302250803858521,
+ "grad_norm": 0.7394160032272339,
+ "learning_rate": 0.00024216867469879517,
+ "loss": 1.9674,
+ "step": 980
+ },
+ {
+ "epoch": 0.6366559485530546,
+ "grad_norm": 0.7981842160224915,
+ "learning_rate": 0.0002415115005476451,
+ "loss": 1.9932,
+ "step": 990
+ },
+ {
+ "epoch": 0.6430868167202572,
+ "grad_norm": 0.871896505355835,
+ "learning_rate": 0.00024085432639649505,
+ "loss": 2.0182,
+ "step": 1000
+ },
+ {
+ "epoch": 0.6430868167202572,
+ "eval_loss": 2.024224281311035,
+ "eval_runtime": 130.1041,
+ "eval_samples_per_second": 15.372,
+ "eval_steps_per_second": 1.922,
+ "step": 1000
+ },
+ {
+ "epoch": 0.6495176848874598,
+ "grad_norm": 0.7123499512672424,
+ "learning_rate": 0.00024019715224534498,
+ "loss": 2.0923,
+ "step": 1010
+ },
+ {
+ "epoch": 0.6559485530546624,
+ "grad_norm": 0.7226546406745911,
+ "learning_rate": 0.00023953997809419495,
+ "loss": 2.0035,
+ "step": 1020
+ },
+ {
+ "epoch": 0.662379421221865,
+ "grad_norm": 0.7627468109130859,
+ "learning_rate": 0.0002388828039430449,
+ "loss": 1.9667,
+ "step": 1030
+ },
+ {
+ "epoch": 0.6688102893890675,
+ "grad_norm": 0.8175467252731323,
+ "learning_rate": 0.00023822562979189483,
+ "loss": 1.948,
+ "step": 1040
+ },
+ {
+ "epoch": 0.6752411575562701,
+ "grad_norm": 0.690073549747467,
+ "learning_rate": 0.0002375684556407448,
+ "loss": 2.0498,
+ "step": 1050
+ },
+ {
+ "epoch": 0.6816720257234726,
+ "grad_norm": 0.9848446249961853,
+ "learning_rate": 0.0002369112814895947,
+ "loss": 1.9874,
+ "step": 1060
+ },
+ {
+ "epoch": 0.6881028938906752,
+ "grad_norm": 0.7157571315765381,
+ "learning_rate": 0.00023625410733844467,
+ "loss": 2.0488,
+ "step": 1070
+ },
+ {
+ "epoch": 0.6945337620578779,
+ "grad_norm": 0.8503302931785583,
+ "learning_rate": 0.00023559693318729464,
+ "loss": 1.9958,
+ "step": 1080
+ },
+ {
+ "epoch": 0.7009646302250804,
+ "grad_norm": 0.7864677906036377,
+ "learning_rate": 0.00023493975903614455,
+ "loss": 2.0212,
+ "step": 1090
+ },
+ {
+ "epoch": 0.707395498392283,
+ "grad_norm": 1.7837698459625244,
+ "learning_rate": 0.0002342825848849945,
+ "loss": 1.9828,
+ "step": 1100
+ },
+ {
+ "epoch": 0.7138263665594855,
+ "grad_norm": 0.7183972001075745,
+ "learning_rate": 0.00023362541073384445,
+ "loss": 2.0652,
+ "step": 1110
+ },
+ {
+ "epoch": 0.7202572347266881,
+ "grad_norm": 0.7377676963806152,
+ "learning_rate": 0.0002329682365826944,
+ "loss": 2.0123,
+ "step": 1120
+ },
+ {
+ "epoch": 0.7266881028938906,
+ "grad_norm": 0.7170071601867676,
+ "learning_rate": 0.00023231106243154436,
+ "loss": 1.9759,
+ "step": 1130
+ },
+ {
+ "epoch": 0.7331189710610932,
844
+ "grad_norm": 0.6442170143127441,
845
+ "learning_rate": 0.00023165388828039427,
846
+ "loss": 2.047,
847
+ "step": 1140
848
+ },
849
+ {
850
+ "epoch": 0.7395498392282959,
851
+ "grad_norm": 0.7356306910514832,
852
+ "learning_rate": 0.00023099671412924423,
853
+ "loss": 2.0438,
854
+ "step": 1150
855
+ },
856
+ {
857
+ "epoch": 0.7459807073954984,
858
+ "grad_norm": 0.7483031153678894,
859
+ "learning_rate": 0.0002303395399780942,
860
+ "loss": 2.0274,
861
+ "step": 1160
862
+ },
863
+ {
864
+ "epoch": 0.752411575562701,
865
+ "grad_norm": 0.7624642848968506,
866
+ "learning_rate": 0.0002296823658269441,
867
+ "loss": 1.9938,
868
+ "step": 1170
869
+ },
870
+ {
871
+ "epoch": 0.7588424437299035,
872
+ "grad_norm": 0.7435073256492615,
873
+ "learning_rate": 0.00022902519167579408,
874
+ "loss": 1.9848,
875
+ "step": 1180
876
+ },
877
+ {
878
+ "epoch": 0.7652733118971061,
879
+ "grad_norm": 0.7327163219451904,
880
+ "learning_rate": 0.000228368017524644,
881
+ "loss": 2.0286,
882
+ "step": 1190
883
+ },
884
+ {
885
+ "epoch": 0.7717041800643086,
886
+ "grad_norm": 0.8398700952529907,
887
+ "learning_rate": 0.00022771084337349395,
888
+ "loss": 1.999,
889
+ "step": 1200
890
+ },
891
+ {
892
+ "epoch": 0.7717041800643086,
893
+ "eval_loss": 2.0166773796081543,
894
+ "eval_runtime": 129.989,
895
+ "eval_samples_per_second": 15.386,
896
+ "eval_steps_per_second": 1.923,
897
+ "step": 1200
898
+ },
899
+ {
+ "epoch": 0.7781350482315113,
+ "grad_norm": 0.6727181673049927,
+ "learning_rate": 0.00022705366922234392,
+ "loss": 2.0044,
+ "step": 1210
+ },
+ {
+ "epoch": 0.7845659163987139,
+ "grad_norm": 0.8738404512405396,
+ "learning_rate": 0.00022639649507119383,
+ "loss": 2.0246,
+ "step": 1220
+ },
+ {
+ "epoch": 0.7909967845659164,
+ "grad_norm": 0.760010302066803,
+ "learning_rate": 0.0002257393209200438,
+ "loss": 2.0058,
+ "step": 1230
+ },
+ {
+ "epoch": 0.797427652733119,
+ "grad_norm": 0.701081395149231,
+ "learning_rate": 0.00022508214676889373,
+ "loss": 1.9974,
+ "step": 1240
+ },
+ {
+ "epoch": 0.8038585209003215,
+ "grad_norm": 0.7346913814544678,
+ "learning_rate": 0.00022442497261774367,
+ "loss": 2.0884,
+ "step": 1250
+ },
+ {
+ "epoch": 0.8102893890675241,
+ "grad_norm": 0.7433114647865295,
+ "learning_rate": 0.00022376779846659364,
+ "loss": 1.9927,
+ "step": 1260
+ },
+ {
+ "epoch": 0.8167202572347267,
+ "grad_norm": 0.7781444787979126,
+ "learning_rate": 0.00022311062431544358,
+ "loss": 2.001,
+ "step": 1270
+ },
+ {
+ "epoch": 0.8231511254019293,
+ "grad_norm": 0.7538995742797852,
+ "learning_rate": 0.00022245345016429352,
+ "loss": 1.9947,
+ "step": 1280
+ },
+ {
+ "epoch": 0.8295819935691319,
+ "grad_norm": 0.7132537961006165,
+ "learning_rate": 0.00022179627601314345,
+ "loss": 1.9781,
+ "step": 1290
+ },
+ {
+ "epoch": 0.8360128617363344,
+ "grad_norm": 0.7174340486526489,
+ "learning_rate": 0.0002211391018619934,
+ "loss": 1.9848,
+ "step": 1300
+ },
+ {
+ "epoch": 0.842443729903537,
+ "grad_norm": 0.7245258092880249,
+ "learning_rate": 0.00022048192771084336,
+ "loss": 2.005,
+ "step": 1310
+ },
+ {
+ "epoch": 0.8488745980707395,
+ "grad_norm": 0.667892336845398,
+ "learning_rate": 0.0002198247535596933,
+ "loss": 1.9939,
+ "step": 1320
+ },
+ {
+ "epoch": 0.8553054662379421,
+ "grad_norm": 0.7173146605491638,
+ "learning_rate": 0.00021916757940854324,
+ "loss": 2.0636,
+ "step": 1330
+ },
+ {
+ "epoch": 0.8617363344051447,
+ "grad_norm": 0.7765901684761047,
+ "learning_rate": 0.0002185104052573932,
+ "loss": 1.9966,
+ "step": 1340
+ },
+ {
+ "epoch": 0.8681672025723473,
+ "grad_norm": 0.7077351808547974,
+ "learning_rate": 0.00021785323110624314,
+ "loss": 2.0078,
+ "step": 1350
+ },
+ {
+ "epoch": 0.8745980707395499,
+ "grad_norm": 0.736723780632019,
+ "learning_rate": 0.00021719605695509308,
+ "loss": 2.0292,
+ "step": 1360
+ },
+ {
+ "epoch": 0.8810289389067524,
+ "grad_norm": 0.732185959815979,
+ "learning_rate": 0.00021653888280394302,
+ "loss": 2.0223,
+ "step": 1370
+ },
+ {
+ "epoch": 0.887459807073955,
+ "grad_norm": 0.7002454400062561,
+ "learning_rate": 0.00021588170865279298,
+ "loss": 2.0068,
+ "step": 1380
+ },
+ {
+ "epoch": 0.8938906752411575,
+ "grad_norm": 0.75859534740448,
+ "learning_rate": 0.00021522453450164292,
+ "loss": 1.9556,
+ "step": 1390
+ },
+ {
+ "epoch": 0.9003215434083601,
+ "grad_norm": 0.7475289106369019,
+ "learning_rate": 0.00021456736035049286,
+ "loss": 1.9792,
+ "step": 1400
+ },
+ {
+ "epoch": 0.9003215434083601,
+ "eval_loss": 2.0089023113250732,
+ "eval_runtime": 130.0325,
+ "eval_samples_per_second": 15.381,
+ "eval_steps_per_second": 1.923,
+ "step": 1400
+ },
+ {
+ "epoch": 0.9067524115755627,
+ "grad_norm": 0.7917546629905701,
+ "learning_rate": 0.00021391018619934283,
+ "loss": 1.9999,
+ "step": 1410
+ },
+ {
+ "epoch": 0.9131832797427653,
+ "grad_norm": 0.7062447667121887,
+ "learning_rate": 0.00021325301204819274,
+ "loss": 1.9779,
+ "step": 1420
+ },
+ {
+ "epoch": 0.9196141479099679,
+ "grad_norm": 0.6973288655281067,
+ "learning_rate": 0.0002125958378970427,
+ "loss": 2.0511,
+ "step": 1430
+ },
+ {
+ "epoch": 0.9260450160771704,
+ "grad_norm": 0.7297340035438538,
+ "learning_rate": 0.00021193866374589267,
+ "loss": 1.9764,
+ "step": 1440
+ },
+ {
+ "epoch": 0.932475884244373,
+ "grad_norm": 0.9256350994110107,
+ "learning_rate": 0.00021128148959474258,
+ "loss": 1.9559,
+ "step": 1450
+ },
+ {
+ "epoch": 0.9389067524115756,
+ "grad_norm": 0.6994000673294067,
+ "learning_rate": 0.00021062431544359255,
+ "loss": 2.0152,
+ "step": 1460
+ },
+ {
+ "epoch": 0.9453376205787781,
+ "grad_norm": 0.7412806749343872,
+ "learning_rate": 0.00020996714129244246,
+ "loss": 1.9494,
+ "step": 1470
+ },
+ {
+ "epoch": 0.9517684887459807,
+ "grad_norm": 0.729680061340332,
+ "learning_rate": 0.00020930996714129242,
+ "loss": 2.0272,
+ "step": 1480
+ },
+ {
+ "epoch": 0.9581993569131833,
+ "grad_norm": 0.7601342797279358,
+ "learning_rate": 0.0002086527929901424,
+ "loss": 1.9714,
+ "step": 1490
+ },
+ {
+ "epoch": 0.9646302250803859,
+ "grad_norm": 0.6875161528587341,
+ "learning_rate": 0.0002079956188389923,
+ "loss": 1.993,
+ "step": 1500
+ },
+ {
+ "epoch": 0.9710610932475884,
+ "grad_norm": 0.7520968317985535,
+ "learning_rate": 0.00020733844468784227,
+ "loss": 2.0471,
+ "step": 1510
+ },
+ {
+ "epoch": 0.977491961414791,
+ "grad_norm": 0.8061411380767822,
+ "learning_rate": 0.00020668127053669218,
+ "loss": 2.0145,
+ "step": 1520
+ },
+ {
+ "epoch": 0.9839228295819936,
+ "grad_norm": 0.7837228775024414,
+ "learning_rate": 0.00020602409638554214,
+ "loss": 1.9889,
+ "step": 1530
+ },
+ {
+ "epoch": 0.9903536977491961,
+ "grad_norm": 0.744296133518219,
+ "learning_rate": 0.0002053669222343921,
+ "loss": 1.9834,
+ "step": 1540
+ },
+ {
+ "epoch": 0.9967845659163987,
+ "grad_norm": 0.7137749791145325,
+ "learning_rate": 0.00020470974808324202,
+ "loss": 2.0582,
+ "step": 1550
+ },
+ {
+ "epoch": 1.0032154340836013,
+ "grad_norm": 0.718320906162262,
+ "learning_rate": 0.000204052573932092,
+ "loss": 1.9576,
+ "step": 1560
+ },
+ {
+ "epoch": 1.0096463022508038,
+ "grad_norm": 0.719998836517334,
+ "learning_rate": 0.00020339539978094195,
+ "loss": 1.9138,
+ "step": 1570
+ },
+ {
+ "epoch": 1.0160771704180065,
+ "grad_norm": 0.7154316306114197,
+ "learning_rate": 0.00020273822562979186,
+ "loss": 1.875,
+ "step": 1580
+ },
+ {
+ "epoch": 1.022508038585209,
+ "grad_norm": 0.6565534472465515,
+ "learning_rate": 0.00020208105147864183,
+ "loss": 1.9994,
+ "step": 1590
+ },
+ {
+ "epoch": 1.0289389067524115,
+ "grad_norm": 0.7222368121147156,
+ "learning_rate": 0.00020142387732749177,
+ "loss": 1.9591,
+ "step": 1600
+ },
+ {
+ "epoch": 1.0289389067524115,
+ "eval_loss": 2.002497673034668,
+ "eval_runtime": 131.2869,
+ "eval_samples_per_second": 15.234,
+ "eval_steps_per_second": 1.904,
+ "step": 1600
+ },
+ {
+ "epoch": 1.0353697749196142,
+ "grad_norm": 0.7213057279586792,
+ "learning_rate": 0.0002007667031763417,
+ "loss": 1.9464,
+ "step": 1610
+ },
+ {
+ "epoch": 1.0418006430868167,
+ "grad_norm": 0.6436830163002014,
+ "learning_rate": 0.00020010952902519167,
+ "loss": 1.8951,
+ "step": 1620
+ },
+ {
+ "epoch": 1.0482315112540193,
+ "grad_norm": 0.7160071134567261,
+ "learning_rate": 0.00019945235487404158,
+ "loss": 1.9062,
+ "step": 1630
+ },
+ {
+ "epoch": 1.0546623794212218,
+ "grad_norm": 0.6585739850997925,
+ "learning_rate": 0.00019879518072289155,
+ "loss": 1.9514,
+ "step": 1640
+ },
+ {
+ "epoch": 1.0610932475884245,
+ "grad_norm": 0.7445241808891296,
+ "learning_rate": 0.0001981380065717415,
+ "loss": 1.8301,
+ "step": 1650
+ },
+ {
+ "epoch": 1.067524115755627,
+ "grad_norm": 0.6654142141342163,
+ "learning_rate": 0.00019748083242059143,
+ "loss": 1.9048,
+ "step": 1660
+ },
+ {
+ "epoch": 1.0739549839228295,
+ "grad_norm": 0.7550114393234253,
+ "learning_rate": 0.0001968236582694414,
+ "loss": 1.9266,
+ "step": 1670
+ },
+ {
+ "epoch": 1.0803858520900322,
+ "grad_norm": 0.7276896834373474,
+ "learning_rate": 0.00019616648411829133,
+ "loss": 1.8942,
+ "step": 1680
+ },
+ {
+ "epoch": 1.0868167202572347,
+ "grad_norm": 0.7431575059890747,
+ "learning_rate": 0.00019550930996714127,
+ "loss": 1.9148,
+ "step": 1690
+ },
+ {
+ "epoch": 1.0932475884244373,
+ "grad_norm": 0.74256831407547,
+ "learning_rate": 0.0001948521358159912,
+ "loss": 1.942,
+ "step": 1700
+ },
+ {
+ "epoch": 1.09967845659164,
+ "grad_norm": 0.7295734286308289,
+ "learning_rate": 0.00019419496166484117,
+ "loss": 1.9331,
+ "step": 1710
+ },
+ {
+ "epoch": 1.1061093247588425,
+ "grad_norm": 0.7749672532081604,
+ "learning_rate": 0.0001935377875136911,
+ "loss": 1.9373,
+ "step": 1720
+ },
+ {
+ "epoch": 1.112540192926045,
+ "grad_norm": 0.6896611452102661,
+ "learning_rate": 0.00019288061336254105,
+ "loss": 1.8813,
+ "step": 1730
+ },
+ {
+ "epoch": 1.1189710610932475,
+ "grad_norm": 0.7282217741012573,
+ "learning_rate": 0.00019222343921139102,
+ "loss": 1.9634,
+ "step": 1740
+ },
+ {
+ "epoch": 1.1254019292604502,
+ "grad_norm": 0.7761743068695068,
+ "learning_rate": 0.00019156626506024093,
+ "loss": 1.8708,
+ "step": 1750
+ },
+ {
+ "epoch": 1.1318327974276527,
+ "grad_norm": 0.7596757411956787,
+ "learning_rate": 0.0001909090909090909,
+ "loss": 1.9446,
+ "step": 1760
+ },
+ {
+ "epoch": 1.1382636655948553,
+ "grad_norm": 0.7023797631263733,
+ "learning_rate": 0.00019025191675794086,
+ "loss": 1.8837,
+ "step": 1770
+ },
+ {
+ "epoch": 1.144694533762058,
+ "grad_norm": 0.7191573977470398,
+ "learning_rate": 0.00018959474260679077,
+ "loss": 1.9141,
+ "step": 1780
+ },
+ {
+ "epoch": 1.1511254019292605,
+ "grad_norm": 0.784885048866272,
+ "learning_rate": 0.00018893756845564074,
+ "loss": 1.9506,
+ "step": 1790
+ },
+ {
+ "epoch": 1.157556270096463,
+ "grad_norm": 0.710903525352478,
+ "learning_rate": 0.00018828039430449068,
+ "loss": 1.9157,
+ "step": 1800
+ },
+ {
+ "epoch": 1.157556270096463,
+ "eval_loss": 1.998835563659668,
+ "eval_runtime": 121.0458,
+ "eval_samples_per_second": 16.523,
+ "eval_steps_per_second": 2.065,
+ "step": 1800
+ },
+ {
+ "epoch": 1.1639871382636655,
+ "grad_norm": 0.7552351355552673,
+ "learning_rate": 0.00018762322015334062,
+ "loss": 1.9139,
+ "step": 1810
+ },
+ {
+ "epoch": 1.1704180064308682,
+ "grad_norm": 0.7722271084785461,
+ "learning_rate": 0.00018696604600219058,
+ "loss": 1.863,
+ "step": 1820
+ },
+ {
+ "epoch": 1.1768488745980707,
+ "grad_norm": 0.7195548415184021,
+ "learning_rate": 0.0001863088718510405,
+ "loss": 1.8697,
+ "step": 1830
+ },
+ {
+ "epoch": 1.1832797427652733,
+ "grad_norm": 0.7423893809318542,
+ "learning_rate": 0.00018565169769989046,
+ "loss": 1.9772,
+ "step": 1840
+ },
+ {
+ "epoch": 1.189710610932476,
+ "grad_norm": 0.7222315073013306,
+ "learning_rate": 0.00018499452354874042,
+ "loss": 1.9308,
+ "step": 1850
+ },
+ {
+ "epoch": 1.1961414790996785,
+ "grad_norm": 0.6815035939216614,
+ "learning_rate": 0.00018433734939759034,
+ "loss": 1.9675,
+ "step": 1860
+ },
+ {
+ "epoch": 1.202572347266881,
+ "grad_norm": 0.7621594071388245,
+ "learning_rate": 0.0001836801752464403,
+ "loss": 1.9295,
+ "step": 1870
+ },
+ {
+ "epoch": 1.2090032154340835,
+ "grad_norm": 0.7405025959014893,
+ "learning_rate": 0.0001830230010952902,
+ "loss": 1.9088,
+ "step": 1880
+ },
+ {
+ "epoch": 1.2154340836012862,
+ "grad_norm": 0.6729809641838074,
+ "learning_rate": 0.00018236582694414018,
+ "loss": 1.9446,
+ "step": 1890
+ },
+ {
+ "epoch": 1.2218649517684887,
+ "grad_norm": 0.7389471530914307,
+ "learning_rate": 0.00018170865279299014,
+ "loss": 1.8841,
+ "step": 1900
+ },
+ {
+ "epoch": 1.2282958199356913,
+ "grad_norm": 0.6453628540039062,
+ "learning_rate": 0.00018105147864184006,
+ "loss": 1.8661,
+ "step": 1910
+ },
+ {
+ "epoch": 1.234726688102894,
+ "grad_norm": 0.6971079111099243,
+ "learning_rate": 0.00018039430449069002,
+ "loss": 1.9807,
+ "step": 1920
+ },
+ {
+ "epoch": 1.2411575562700965,
+ "grad_norm": 0.7807840704917908,
+ "learning_rate": 0.00017973713033953996,
+ "loss": 1.9475,
+ "step": 1930
+ },
+ {
+ "epoch": 1.247588424437299,
+ "grad_norm": 0.78909832239151,
+ "learning_rate": 0.0001790799561883899,
+ "loss": 1.8439,
+ "step": 1940
+ },
+ {
+ "epoch": 1.2540192926045015,
+ "grad_norm": 0.7715321183204651,
+ "learning_rate": 0.00017842278203723986,
+ "loss": 1.9478,
+ "step": 1950
+ },
+ {
+ "epoch": 1.2604501607717042,
+ "grad_norm": 0.7786479592323303,
+ "learning_rate": 0.0001777656078860898,
+ "loss": 1.8773,
+ "step": 1960
+ },
+ {
+ "epoch": 1.2668810289389068,
+ "grad_norm": 0.6935726404190063,
+ "learning_rate": 0.00017710843373493974,
+ "loss": 1.94,
+ "step": 1970
+ },
+ {
+ "epoch": 1.2733118971061093,
+ "grad_norm": 0.7824066877365112,
+ "learning_rate": 0.00017645125958378968,
+ "loss": 1.8996,
+ "step": 1980
+ },
+ {
+ "epoch": 1.279742765273312,
+ "grad_norm": 0.7019379138946533,
+ "learning_rate": 0.00017579408543263962,
+ "loss": 1.9114,
+ "step": 1990
+ },
+ {
+ "epoch": 1.2861736334405145,
+ "grad_norm": 0.8215466737747192,
+ "learning_rate": 0.00017513691128148958,
+ "loss": 1.8294,
+ "step": 2000
+ },
+ {
+ "epoch": 1.2861736334405145,
+ "eval_loss": 1.9947528839111328,
+ "eval_runtime": 132.3397,
+ "eval_samples_per_second": 15.113,
+ "eval_steps_per_second": 1.889,
+ "step": 2000
+ },
+ {
+ "epoch": 1.292604501607717,
+ "grad_norm": 0.7088531851768494,
+ "learning_rate": 0.00017447973713033952,
+ "loss": 1.9497,
+ "step": 2010
+ },
+ {
+ "epoch": 1.2990353697749195,
+ "grad_norm": 0.7754150032997131,
+ "learning_rate": 0.00017382256297918946,
+ "loss": 1.9047,
+ "step": 2020
+ },
+ {
+ "epoch": 1.3054662379421222,
+ "grad_norm": 0.7185202836990356,
+ "learning_rate": 0.00017316538882803943,
+ "loss": 1.8529,
+ "step": 2030
+ },
+ {
+ "epoch": 1.3118971061093248,
+ "grad_norm": 0.7496573328971863,
+ "learning_rate": 0.00017250821467688937,
+ "loss": 1.8618,
+ "step": 2040
+ },
+ {
+ "epoch": 1.3183279742765273,
+ "grad_norm": 0.6794284582138062,
+ "learning_rate": 0.0001718510405257393,
+ "loss": 1.898,
+ "step": 2050
+ },
+ {
+ "epoch": 1.32475884244373,
+ "grad_norm": 0.7059448957443237,
+ "learning_rate": 0.00017119386637458924,
+ "loss": 1.9594,
+ "step": 2060
+ },
+ {
+ "epoch": 1.3311897106109325,
+ "grad_norm": 0.7007871866226196,
+ "learning_rate": 0.0001705366922234392,
+ "loss": 1.9476,
+ "step": 2070
+ },
+ {
+ "epoch": 1.337620578778135,
+ "grad_norm": 0.6973986029624939,
+ "learning_rate": 0.00016987951807228915,
+ "loss": 1.9567,
+ "step": 2080
+ },
+ {
+ "epoch": 1.3440514469453375,
+ "grad_norm": 0.7169969081878662,
+ "learning_rate": 0.00016922234392113909,
+ "loss": 1.9685,
+ "step": 2090
+ },
+ {
+ "epoch": 1.3504823151125402,
+ "grad_norm": 0.7009272575378418,
+ "learning_rate": 0.00016856516976998905,
+ "loss": 1.9714,
+ "step": 2100
+ },
+ {
+ "epoch": 1.3569131832797428,
+ "grad_norm": 0.7070193290710449,
+ "learning_rate": 0.00016790799561883896,
+ "loss": 1.9695,
+ "step": 2110
+ },
+ {
+ "epoch": 1.3633440514469453,
+ "grad_norm": 0.7268947958946228,
+ "learning_rate": 0.00016725082146768893,
+ "loss": 1.9107,
+ "step": 2120
+ },
+ {
+ "epoch": 1.369774919614148,
+ "grad_norm": 0.7544928789138794,
+ "learning_rate": 0.00016659364731653887,
+ "loss": 1.8658,
+ "step": 2130
+ },
+ {
+ "epoch": 1.3762057877813505,
+ "grad_norm": 0.6320627927780151,
+ "learning_rate": 0.0001659364731653888,
+ "loss": 1.8917,
+ "step": 2140
+ },
+ {
+ "epoch": 1.382636655948553,
+ "grad_norm": 0.6863923668861389,
+ "learning_rate": 0.00016527929901423877,
+ "loss": 1.9237,
+ "step": 2150
+ },
+ {
+ "epoch": 1.3890675241157555,
+ "grad_norm": 0.7775669097900391,
+ "learning_rate": 0.00016462212486308868,
+ "loss": 1.8548,
+ "step": 2160
+ },
+ {
+ "epoch": 1.3954983922829582,
+ "grad_norm": 0.7198719382286072,
+ "learning_rate": 0.00016396495071193865,
+ "loss": 1.9145,
+ "step": 2170
+ },
+ {
+ "epoch": 1.4019292604501608,
+ "grad_norm": 0.7938317656517029,
+ "learning_rate": 0.00016330777656078861,
+ "loss": 1.8939,
+ "step": 2180
+ },
+ {
+ "epoch": 1.4083601286173635,
+ "grad_norm": 0.7361711263656616,
+ "learning_rate": 0.00016265060240963853,
+ "loss": 1.9642,
+ "step": 2190
+ },
+ {
+ "epoch": 1.414790996784566,
+ "grad_norm": 0.7385576963424683,
+ "learning_rate": 0.0001619934282584885,
+ "loss": 1.9134,
+ "step": 2200
+ },
+ {
+ "epoch": 1.414790996784566,
+ "eval_loss": 1.9883830547332764,
+ "eval_runtime": 130.0767,
+ "eval_samples_per_second": 15.376,
+ "eval_steps_per_second": 1.922,
+ "step": 2200
+ },
+ {
+ "epoch": 1.4212218649517685,
+ "grad_norm": 0.7863461971282959,
+ "learning_rate": 0.0001613362541073384,
+ "loss": 2.0157,
+ "step": 2210
+ },
+ {
+ "epoch": 1.427652733118971,
+ "grad_norm": 0.7755898237228394,
+ "learning_rate": 0.00016067907995618837,
+ "loss": 1.8973,
+ "step": 2220
+ },
+ {
+ "epoch": 1.4340836012861735,
+ "grad_norm": 0.7090388536453247,
+ "learning_rate": 0.00016002190580503833,
+ "loss": 1.9034,
+ "step": 2230
+ },
+ {
+ "epoch": 1.4405144694533762,
+ "grad_norm": 0.6487644910812378,
+ "learning_rate": 0.00015936473165388825,
+ "loss": 1.906,
+ "step": 2240
+ },
+ {
+ "epoch": 1.4469453376205788,
+ "grad_norm": 0.6597898006439209,
+ "learning_rate": 0.0001587075575027382,
+ "loss": 1.843,
+ "step": 2250
+ },
+ {
+ "epoch": 1.4533762057877815,
+ "grad_norm": 0.7069796323776245,
+ "learning_rate": 0.00015805038335158818,
+ "loss": 1.9554,
+ "step": 2260
+ },
+ {
+ "epoch": 1.459807073954984,
+ "grad_norm": 0.7358680367469788,
+ "learning_rate": 0.0001573932092004381,
+ "loss": 1.9268,
+ "step": 2270
+ },
+ {
+ "epoch": 1.4662379421221865,
+ "grad_norm": 0.675457775592804,
+ "learning_rate": 0.00015673603504928806,
+ "loss": 1.8981,
+ "step": 2280
+ },
+ {
+ "epoch": 1.472668810289389,
+ "grad_norm": 0.7369397878646851,
+ "learning_rate": 0.000156078860898138,
+ "loss": 1.9535,
+ "step": 2290
+ },
+ {
+ "epoch": 1.4790996784565915,
+ "grad_norm": 0.666994035243988,
+ "learning_rate": 0.00015542168674698793,
+ "loss": 1.8657,
+ "step": 2300
+ },
+ {
+ "epoch": 1.4855305466237942,
+ "grad_norm": 0.7241340279579163,
+ "learning_rate": 0.0001547645125958379,
+ "loss": 1.8097,
+ "step": 2310
+ },
+ {
+ "epoch": 1.4919614147909968,
+ "grad_norm": 0.7224936485290527,
+ "learning_rate": 0.0001541073384446878,
+ "loss": 1.8397,
+ "step": 2320
+ },
+ {
+ "epoch": 1.4983922829581995,
+ "grad_norm": 0.7167637348175049,
+ "learning_rate": 0.00015345016429353778,
+ "loss": 1.9225,
+ "step": 2330
+ },
+ {
+ "epoch": 1.504823151125402,
+ "grad_norm": 0.7176666259765625,
+ "learning_rate": 0.00015279299014238771,
+ "loss": 1.8764,
+ "step": 2340
+ },
+ {
+ "epoch": 1.5112540192926045,
+ "grad_norm": 0.735252857208252,
+ "learning_rate": 0.00015213581599123765,
+ "loss": 1.8935,
+ "step": 2350
+ },
+ {
+ "epoch": 1.517684887459807,
+ "grad_norm": 0.6805827021598816,
+ "learning_rate": 0.00015147864184008762,
+ "loss": 1.9212,
+ "step": 2360
+ },
+ {
+ "epoch": 1.5241157556270095,
+ "grad_norm": 0.7019375562667847,
+ "learning_rate": 0.00015082146768893756,
+ "loss": 1.9318,
+ "step": 2370
+ },
+ {
+ "epoch": 1.5305466237942122,
+ "grad_norm": 0.6795372366905212,
+ "learning_rate": 0.0001501642935377875,
+ "loss": 1.9023,
+ "step": 2380
+ },
+ {
+ "epoch": 1.5369774919614148,
+ "grad_norm": 0.6497982144355774,
+ "learning_rate": 0.00014950711938663743,
+ "loss": 1.9721,
+ "step": 2390
+ },
+ {
+ "epoch": 1.5434083601286175,
+ "grad_norm": 0.7713346481323242,
+ "learning_rate": 0.0001488499452354874,
+ "loss": 1.9906,
+ "step": 2400
+ },
+ {
+ "epoch": 1.5434083601286175,
+ "eval_loss": 1.9822700023651123,
+ "eval_runtime": 130.376,
+ "eval_samples_per_second": 15.34,
+ "eval_steps_per_second": 1.918,
+ "step": 2400
+ },
+ {
+ "epoch": 1.54983922829582,
+ "grad_norm": 0.7202898263931274,
+ "learning_rate": 0.00014819277108433734,
+ "loss": 1.8816,
+ "step": 2410
+ },
+ {
+ "epoch": 1.5562700964630225,
+ "grad_norm": 0.7167313694953918,
+ "learning_rate": 0.00014753559693318728,
+ "loss": 1.9316,
+ "step": 2420
+ },
+ {
+ "epoch": 1.562700964630225,
+ "grad_norm": 0.7133712768554688,
+ "learning_rate": 0.00014687842278203724,
+ "loss": 2.0053,
+ "step": 2430
+ },
+ {
+ "epoch": 1.5691318327974275,
+ "grad_norm": 0.76304692029953,
+ "learning_rate": 0.00014622124863088718,
+ "loss": 1.8718,
+ "step": 2440
+ },
+ {
+ "epoch": 1.5755627009646302,
+ "grad_norm": 0.667654812335968,
+ "learning_rate": 0.00014556407447973712,
+ "loss": 1.8727,
+ "step": 2450
+ },
+ {
+ "epoch": 1.5819935691318328,
+ "grad_norm": 0.7308873534202576,
+ "learning_rate": 0.00014490690032858706,
+ "loss": 1.8918,
+ "step": 2460
+ },
+ {
+ "epoch": 1.5884244372990355,
+ "grad_norm": 0.9376251697540283,
+ "learning_rate": 0.00014424972617743702,
+ "loss": 1.96,
+ "step": 2470
+ },
+ {
+ "epoch": 1.594855305466238,
+ "grad_norm": 0.6924982666969299,
+ "learning_rate": 0.00014359255202628696,
+ "loss": 1.8744,
+ "step": 2480
+ },
+ {
+ "epoch": 1.6012861736334405,
+ "grad_norm": 0.7420899868011475,
+ "learning_rate": 0.0001429353778751369,
+ "loss": 1.9112,
+ "step": 2490
+ },
+ {
+ "epoch": 1.607717041800643,
+ "grad_norm": 0.7384818196296692,
+ "learning_rate": 0.00014227820372398684,
+ "loss": 1.9562,
+ "step": 2500
+ },
+ {
+ "epoch": 1.6141479099678455,
+ "grad_norm": 0.7550799250602722,
+ "learning_rate": 0.0001416210295728368,
+ "loss": 1.891,
+ "step": 2510
+ },
+ {
+ "epoch": 1.6205787781350482,
+ "grad_norm": 0.7184371948242188,
+ "learning_rate": 0.00014096385542168674,
+ "loss": 1.9361,
+ "step": 2520
+ },
+ {
+ "epoch": 1.6270096463022508,
+ "grad_norm": 0.770914614200592,
+ "learning_rate": 0.00014030668127053668,
+ "loss": 1.9132,
+ "step": 2530
+ },
+ {
+ "epoch": 1.6334405144694535,
+ "grad_norm": 0.7566716074943542,
+ "learning_rate": 0.00013964950711938662,
+ "loss": 1.8982,
+ "step": 2540
+ },
+ {
+ "epoch": 1.639871382636656,
+ "grad_norm": 0.6670147776603699,
+ "learning_rate": 0.00013899233296823656,
+ "loss": 1.9211,
+ "step": 2550
+ },
+ {
+ "epoch": 1.6463022508038585,
+ "grad_norm": 0.7093060612678528,
+ "learning_rate": 0.00013833515881708653,
+ "loss": 1.8881,
+ "step": 2560
+ },
+ {
+ "epoch": 1.652733118971061,
+ "grad_norm": 0.6549977660179138,
+ "learning_rate": 0.00013767798466593646,
+ "loss": 1.9187,
+ "step": 2570
+ },
+ {
+ "epoch": 1.6591639871382635,
+ "grad_norm": 0.7039531469345093,
+ "learning_rate": 0.0001370208105147864,
+ "loss": 1.9165,
+ "step": 2580
+ },
+ {
+ "epoch": 1.6655948553054662,
+ "grad_norm": 0.7216307520866394,
+ "learning_rate": 0.00013636363636363634,
+ "loss": 1.9228,
+ "step": 2590
+ },
+ {
+ "epoch": 1.6720257234726688,
+ "grad_norm": 0.6866537928581238,
+ "learning_rate": 0.00013570646221248628,
+ "loss": 1.9003,
+ "step": 2600
+ },
+ {
+ "epoch": 1.6720257234726688,
+ "eval_loss": 1.977206826210022,
+ "eval_runtime": 131.9243,
+ "eval_samples_per_second": 15.16,
+ "eval_steps_per_second": 1.895,
+ "step": 2600
+ },
+ {
+ "epoch": 1.6784565916398715,
+ "grad_norm": 0.7328875660896301,
+ "learning_rate": 0.00013504928806133625,
+ "loss": 1.9,
+ "step": 2610
+ },
+ {
+ "epoch": 1.684887459807074,
+ "grad_norm": 0.7623500227928162,
+ "learning_rate": 0.00013439211391018618,
+ "loss": 1.9117,
+ "step": 2620
+ },
+ {
+ "epoch": 1.6913183279742765,
+ "grad_norm": 0.6996557712554932,
+ "learning_rate": 0.00013373493975903612,
+ "loss": 1.8342,
+ "step": 2630
+ },
+ {
+ "epoch": 1.697749196141479,
+ "grad_norm": 0.6597011685371399,
+ "learning_rate": 0.00013307776560788606,
+ "loss": 1.911,
+ "step": 2640
+ },
+ {
+ "epoch": 1.7041800643086815,
+ "grad_norm": 0.7154627442359924,
+ "learning_rate": 0.00013242059145673603,
+ "loss": 1.8955,
+ "step": 2650
+ },
+ {
+ "epoch": 1.7106109324758842,
+ "grad_norm": 0.6822642087936401,
+ "learning_rate": 0.00013176341730558597,
+ "loss": 1.928,
+ "step": 2660
+ },
+ {
+ "epoch": 1.717041800643087,
+ "grad_norm": 0.6770340204238892,
+ "learning_rate": 0.0001311062431544359,
+ "loss": 1.934,
+ "step": 2670
+ },
+ {
+ "epoch": 1.7234726688102895,
+ "grad_norm": 0.7235671877861023,
+ "learning_rate": 0.00013044906900328584,
+ "loss": 1.9248,
+ "step": 2680
+ },
+ {
+ "epoch": 1.729903536977492,
+ "grad_norm": 0.6428620219230652,
+ "learning_rate": 0.0001297918948521358,
+ "loss": 1.8998,
+ "step": 2690
+ },
+ {
+ "epoch": 1.7363344051446945,
+ "grad_norm": 0.7132564783096313,
+ "learning_rate": 0.00012913472070098575,
+ "loss": 1.9353,
+ "step": 2700
+ },
+ {
+ "epoch": 1.742765273311897,
+ "grad_norm": 0.7110019326210022,
+ "learning_rate": 0.0001284775465498357,
+ "loss": 1.8877,
+ "step": 2710
+ },
+ {
+ "epoch": 1.7491961414790995,
+ "grad_norm": 0.7546197772026062,
+ "learning_rate": 0.00012782037239868565,
+ "loss": 1.9219,
+ "step": 2720
+ },
+ {
+ "epoch": 1.7556270096463023,
+ "grad_norm": 0.8485615253448486,
+ "learning_rate": 0.0001271631982475356,
+ "loss": 1.9238,
+ "step": 2730
+ },
+ {
+ "epoch": 1.762057877813505,
+ "grad_norm": 0.7058401703834534,
+ "learning_rate": 0.00012650602409638553,
+ "loss": 1.9012,
+ "step": 2740
+ },
+ {
+ "epoch": 1.7684887459807075,
+ "grad_norm": 0.7222112417221069,
+ "learning_rate": 0.00012584884994523547,
+ "loss": 1.8442,
+ "step": 2750
+ },
+ {
+ "epoch": 1.77491961414791,
+ "grad_norm": 0.7010639905929565,
+ "learning_rate": 0.00012519167579408543,
+ "loss": 1.9322,
+ "step": 2760
+ },
+ {
+ "epoch": 1.7813504823151125,
+ "grad_norm": 0.6908234357833862,
+ "learning_rate": 0.00012453450164293537,
+ "loss": 1.9456,
+ "step": 2770
+ },
+ {
2055
+ "epoch": 1.787781350482315,
2056
+ "grad_norm": 0.6615903973579407,
2057
+ "learning_rate": 0.0001238773274917853,
2058
+ "loss": 1.9052,
2059
+ "step": 2780
2060
+ },
2061
+ {
2062
+ "epoch": 1.7942122186495175,
2063
+ "grad_norm": 0.6688089370727539,
2064
+ "learning_rate": 0.00012322015334063528,
2065
+ "loss": 1.87,
2066
+ "step": 2790
2067
+ },
2068
+ {
2069
+ "epoch": 1.8006430868167203,
2070
+ "grad_norm": 0.7396994233131409,
2071
+ "learning_rate": 0.00012256297918948522,
2072
+ "loss": 1.9243,
2073
+ "step": 2800
2074
+ },
2075
+ {
2076
+ "epoch": 1.8006430868167203,
2077
+ "eval_loss": 1.974278450012207,
2078
+ "eval_runtime": 144.2243,
2079
+ "eval_samples_per_second": 13.867,
2080
+ "eval_steps_per_second": 1.733,
2081
+ "step": 2800
2082
+ },
2083
+ {
2084
+ "epoch": 1.807073954983923,
2085
+ "grad_norm": 0.6520466208457947,
2086
+ "learning_rate": 0.00012190580503833514,
2087
+ "loss": 1.902,
2088
+ "step": 2810
2089
+ },
2090
+ {
2091
+ "epoch": 1.8135048231511255,
2092
+ "grad_norm": 0.7591603398323059,
2093
+ "learning_rate": 0.00012124863088718509,
2094
+ "loss": 1.9079,
2095
+ "step": 2820
2096
+ },
2097
+ {
2098
+ "epoch": 1.819935691318328,
2099
+ "grad_norm": 0.6622514128684998,
2100
+ "learning_rate": 0.00012059145673603504,
2101
+ "loss": 1.9288,
2102
+ "step": 2830
2103
+ },
2104
+ {
2105
+ "epoch": 1.8263665594855305,
2106
+ "grad_norm": 0.7578607797622681,
2107
+ "learning_rate": 0.00011993428258488498,
2108
+ "loss": 1.8936,
2109
+ "step": 2840
2110
+ },
2111
+ {
2112
+ "epoch": 1.832797427652733,
2113
+ "grad_norm": 0.730093240737915,
2114
+ "learning_rate": 0.00011927710843373494,
2115
+ "loss": 1.8809,
2116
+ "step": 2850
2117
+ },
2118
+ {
2119
+ "epoch": 1.8392282958199357,
2120
+ "grad_norm": 0.6403250098228455,
2121
+ "learning_rate": 0.00011861993428258487,
2122
+ "loss": 1.8866,
2123
+ "step": 2860
2124
+ },
2125
+ {
2126
+ "epoch": 1.8456591639871383,
2127
+ "grad_norm": 0.7032350897789001,
2128
+ "learning_rate": 0.00011796276013143481,
2129
+ "loss": 1.938,
2130
+ "step": 2870
2131
+ },
2132
+ {
2133
+ "epoch": 1.852090032154341,
2134
+ "grad_norm": 0.7376342415809631,
2135
+ "learning_rate": 0.00011730558598028478,
2136
+ "loss": 1.8925,
2137
+ "step": 2880
2138
+ },
2139
+ {
2140
+ "epoch": 1.8585209003215435,
2141
+ "grad_norm": 0.7093110680580139,
2142
+ "learning_rate": 0.00011664841182913472,
2143
+ "loss": 1.9029,
2144
+ "step": 2890
2145
+ },
2146
+ {
2147
+ "epoch": 1.864951768488746,
2148
+ "grad_norm": 0.6826250553131104,
2149
+ "learning_rate": 0.00011599123767798466,
2150
+ "loss": 1.8956,
2151
+ "step": 2900
2152
+ },
2153
+ {
2154
+ "epoch": 1.8713826366559485,
2155
+ "grad_norm": 0.7709969282150269,
2156
+ "learning_rate": 0.0001153340635268346,
2157
+ "loss": 1.92,
2158
+ "step": 2910
2159
+ },
2160
+ {
2161
+ "epoch": 1.877813504823151,
2162
+ "grad_norm": 0.6641222238540649,
2163
+ "learning_rate": 0.00011467688937568453,
2164
+ "loss": 1.8998,
2165
+ "step": 2920
2166
+ },
2167
+ {
2168
+ "epoch": 1.8842443729903537,
2169
+ "grad_norm": 0.7321887612342834,
2170
+ "learning_rate": 0.0001140197152245345,
2171
+ "loss": 1.9257,
2172
+ "step": 2930
2173
+ },
2174
+ {
2175
+ "epoch": 1.8906752411575563,
2176
+ "grad_norm": 0.7000001668930054,
2177
+ "learning_rate": 0.00011336254107338444,
2178
+ "loss": 1.8944,
2179
+ "step": 2940
2180
+ },
2181
+ {
2182
+ "epoch": 1.897106109324759,
2183
+ "grad_norm": 0.7347818613052368,
2184
+ "learning_rate": 0.00011270536692223438,
2185
+ "loss": 1.9256,
2186
+ "step": 2950
2187
+ },
2188
+ {
2189
+ "epoch": 1.9035369774919615,
2190
+ "grad_norm": 0.708888590335846,
2191
+ "learning_rate": 0.00011204819277108433,
2192
+ "loss": 1.9307,
2193
+ "step": 2960
2194
+ },
2195
+ {
2196
+ "epoch": 1.909967845659164,
2197
+ "grad_norm": 0.6980915665626526,
2198
+ "learning_rate": 0.00011139101861993428,
2199
+ "loss": 1.883,
2200
+ "step": 2970
2201
+ },
2202
+ {
2203
+ "epoch": 1.9163987138263665,
2204
+ "grad_norm": 0.8052535653114319,
2205
+ "learning_rate": 0.00011073384446878422,
2206
+ "loss": 1.899,
2207
+ "step": 2980
2208
+ },
2209
+ {
2210
+ "epoch": 1.922829581993569,
2211
+ "grad_norm": 0.707011878490448,
2212
+ "learning_rate": 0.00011007667031763416,
2213
+ "loss": 1.9263,
2214
+ "step": 2990
2215
+ },
2216
+ {
2217
+ "epoch": 1.9292604501607717,
2218
+ "grad_norm": 0.7086938619613647,
2219
+ "learning_rate": 0.00010941949616648411,
2220
+ "loss": 1.883,
2221
+ "step": 3000
2222
+ },
2223
+ {
2224
+ "epoch": 1.9292604501607717,
2225
+ "eval_loss": 1.9664931297302246,
2226
+ "eval_runtime": 133.023,
2227
+ "eval_samples_per_second": 15.035,
2228
+ "eval_steps_per_second": 1.879,
2229
+ "step": 3000
2230
+ }
2231
+ ],
2232
+ "logging_steps": 10,
2233
+ "max_steps": 4665,
2234
+ "num_input_tokens_seen": 0,
2235
+ "num_train_epochs": 3,
2236
+ "save_steps": 200,
2237
+ "stateful_callbacks": {
2238
+ "TrainerControl": {
2239
+ "args": {
2240
+ "should_epoch_stop": false,
2241
+ "should_evaluate": false,
2242
+ "should_log": false,
2243
+ "should_save": true,
2244
+ "should_training_stop": false
2245
+ },
2246
+ "attributes": {}
2247
+ }
2248
+ },
2249
+ "total_flos": 3.0137669676957696e+17,
2250
+ "train_batch_size": 16,
2251
+ "trial_name": null,
2252
+ "trial_params": null
2253
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cab22ba79cda15f54ce907097c40aecb6ebd4c038c6657764e0d5bf9d78a133c
+ size 5048