diff --git a/README.md b/README.md index 0edbf9989fe4b03ebf829417eb9639d57da59162..b38ff0d17ed7c67ec3c087f49410776de9b20be4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ --- -base_model: NousResearch/Llama-2-7b-hf +license: apache-2.0 +base_model: cyberagent/calm2-7b-chat tags: - generated_from_trainer model-index: @@ -13,9 +14,9 @@ should probably proofread and complete it, then remove this comment. --> [Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) # out -This model is a fine-tuned version of [NousResearch/Llama-2-7b-hf](https://huggingface.co/NousResearch/Llama-2-7b-hf) on the None dataset. +This model is a fine-tuned version of [cyberagent/calm2-7b-chat](https://huggingface.co/cyberagent/calm2-7b-chat) on the None dataset. It achieves the following results on the evaluation set: -- Loss: 0.9443 +- Loss: 3.2941 ## Model description @@ -40,7 +41,8 @@ The following hyperparameters were used during training: - seed: 42 - distributed_type: multi-GPU - num_devices: 3 -- total_train_batch_size: 3 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 6 - total_eval_batch_size: 3 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 - lr_scheduler_type: cosine @@ -51,24 +53,26 @@ The following hyperparameters were used during training: | Training Loss | Epoch | Step | Validation Loss | |:-------------:|:-----:|:----:|:---------------:| -| 1.0254 | 0.03 | 1 | 3.0959 | -| 3.2648 | 0.06 | 2 | 3.0959 | -| 3.0345 | 0.12 | 4 | 1.6018 | -| 1.4912 | 0.18 | 6 | 1.4104 | -| 1.4298 | 0.24 | 8 | 1.2483 | -| 1.2217 | 0.29 | 10 | 1.1785 | -| 1.1975 | 0.35 | 12 | 1.1200 | -| 1.1377 | 0.41 | 14 | 1.0922 | -| 1.0991 | 0.47 | 16 | 1.0625 | -| 0.9783 | 0.53 | 18 | 1.0422 | -| 1.0558 | 0.59 | 20 | 1.0100 | -| 0.9894 | 0.65 | 22 | 0.9902 | -| 0.9677 | 0.71 | 24 | 0.9780 | -| 0.9782 | 0.76 | 26 | 0.9679 | -| 0.9944 | 0.82 | 28 | 0.9595 | -| 0.9245 | 0.88 | 30 | 0.9509 | -| 0.9676 | 0.94 | 32 | 0.9468 | -| 1.0653 | 1.0 | 34 | 0.9443 | +| 3.5899 | 0.0 | 1 | 4.1781 | +| 3.4689 | 0.05 | 352 | 3.5988 | +| 3.519 | 0.1 | 704 | 3.6209 | +| 3.4168 | 0.15 | 1056 | 3.6161 | +| 3.4906 | 0.2 | 1408 | 3.6054 | +| 3.4853 | 0.25 | 1760 | 3.5950 | +| 3.475 | 0.3 | 2112 | 3.5806 | +| 3.2476 | 0.35 | 2464 | 3.5675 | +| 3.2339 | 0.4 | 2816 | 3.5398 | +| 3.3711 | 0.45 | 3168 | 3.5233 | +| 3.4195 | 0.5 | 3520 | 3.5006 | +| 3.1105 | 0.55 | 3872 | 3.4804 | +| 3.2477 | 0.6 | 4224 | 3.4546 | +| 3.2753 | 0.65 | 4576 | 3.4311 | +| 3.1867 | 0.7 | 4928 | 3.4054 | +| 3.2009 | 0.75 | 5280 | 3.3783 | +| 3.1037 | 0.8 | 5632 | 3.3509 | +| 3.0394 | 0.85 | 5984 | 3.3314 | +| 3.0759 | 0.9 | 6336 | 3.3095 | +| 3.1989 | 0.95 | 6688 | 3.2941 | ### Framework versions diff --git a/checkpoint-3511/config.json b/checkpoint-3511/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2599737ed66369800e7a6efe6cdf23d0cfe85382 --- /dev/null +++ b/checkpoint-3511/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "cyberagent/calm2-7b-chat", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 1, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 500000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.34.1", + "use_cache": false, + "vocab_size": 65024 +} diff --git a/checkpoint-3511/generation_config.json b/checkpoint-3511/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28d913eb61a7fd74338a6f1ff8d2efb149f99dbc --- /dev/null +++ b/checkpoint-3511/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "pad_token_id": 1, + "transformers_version": "4.34.1" +} diff --git a/checkpoint-3511/global_step3511/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-3511/global_step3511/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8309b42f4fe132d2c93fba3851e8026aa321c41a --- /dev/null +++ b/checkpoint-3511/global_step3511/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b69de0e4edec45f9ea053faaf401877cca861efb0f5a59f3599b96afbe1484ec +size 28036079859 diff --git a/checkpoint-3511/global_step3511/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-3511/global_step3511/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32125be11c06b83d27152ecc970bde1a38f238fe --- /dev/null +++ b/checkpoint-3511/global_step3511/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b950a98a4da6d605d6653a6cdcc38deaae6f2d2effe2221e58e5b8edddb3329 +size 28036079859 diff --git a/checkpoint-3511/global_step3511/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-3511/global_step3511/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a95dfb1d516c07bb8bc976d0e51e2f6c40b3cc63 --- /dev/null +++ b/checkpoint-3511/global_step3511/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c541a4b0ee0d2571c401c99523b386779df911a7f571de3716ffde4a56732e8b +size 28036079859 diff --git a/checkpoint-3511/global_step3511/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-3511/global_step3511/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afb218d9f925e4fdd1a0ca7402724b958806b96d --- /dev/null +++ b/checkpoint-3511/global_step3511/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421d8cdb934acd5188eefdf1f86c6a0419e050065fadb559967fe7ff4601e27f +size 138326 diff --git a/checkpoint-3511/global_step3511/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-3511/global_step3511/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a10dba63b10eee3cfda30648facff5b7d84c8049 --- /dev/null +++ b/checkpoint-3511/global_step3511/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:831e0a14bae6fa7923713108db97cb612ec1c9af94a062b0aac8eba8188ac539 +size 138326 diff --git a/checkpoint-3511/global_step3511/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-3511/global_step3511/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1ab0d4d99022dc1fc43c1c59b53c10ec98c0cf9 --- /dev/null +++ b/checkpoint-3511/global_step3511/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e66536e9129d4b84d87244c57b8af363ec5bc8888f06ce09c590d336f4eab78 +size 138326 diff --git a/checkpoint-3511/latest b/checkpoint-3511/latest new file mode 100644 index 0000000000000000000000000000000000000000..d799ca968a39ac4c3acfe88d2f8db1b10ab60cfb --- /dev/null +++ b/checkpoint-3511/latest @@ -0,0 +1 @@ +global_step3511 \ No newline at end of file diff --git a/checkpoint-3511/pytorch_model-00001-of-00002.bin b/checkpoint-3511/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..a9500679606fd3422eec1386c1143f0f8be6c2eb --- /dev/null +++ b/checkpoint-3511/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a9c5d808dc49d676626676d0cc841fc34ba34eceefa62880d4ca718b6f58b7 +size 9976594142 diff --git a/checkpoint-3511/pytorch_model-00002-of-00002.bin b/checkpoint-3511/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..bdc56d1eab384cba1ebc18e1e8b4c516fc75f704 --- /dev/null +++ b/checkpoint-3511/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f8747a97cae8128ac5873953f78e3a77c30f2fe5a657278339685e112efbe7 +size 4041391035 diff --git a/checkpoint-3511/pytorch_model.bin.index.json b/checkpoint-3511/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..85c4314f00522f62839f8495ec3f0d9adec0fcfa --- /dev/null +++ b/checkpoint-3511/pytorch_model.bin.index.json @@ -0,0 +1,266 @@ +{ + "metadata": { + "total_size": 14017896448 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/checkpoint-3511/rng_state_0.pth b/checkpoint-3511/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..417d52d6958ce1d6e6c0e711d6eb0a68a1f1ae42 --- /dev/null +++ b/checkpoint-3511/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1653d5b0e09c2d93759ad31b0bca034b949c5beacbcec854b9c133c18ff0f1 +size 16631 diff --git a/checkpoint-3511/rng_state_1.pth b/checkpoint-3511/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6389ab11d2699189dff857d5cf6911645ac491a7 --- /dev/null +++ b/checkpoint-3511/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:718a356e2faee3d07e0416c137f3bcdc0c70d127268ae7202882018ffa03e320 +size 16631 diff --git a/checkpoint-3511/rng_state_2.pth b/checkpoint-3511/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..553871298ac3918ceef48b7e90ee7784f4afe077 --- /dev/null +++ b/checkpoint-3511/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ab1200db9bd16e014659660734c61fe08517897cef6b3efe97c366790250f5 +size 16631 diff --git a/checkpoint-3511/trainer_state.json b/checkpoint-3511/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..715b0286f8d4f378226e9860ef8834e9e1202fd2 --- /dev/null +++ b/checkpoint-3511/trainer_state.json @@ -0,0 +1,21245 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997864312664626, + "eval_steps": 176, + "global_step": 3511, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.0, + "loss": 3.5718, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 5.830782413482666, + "eval_runtime": 2940.9278, + "eval_samples_per_second": 6.966, + "eval_steps_per_second": 2.322, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 6.0175, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 4.000000000000001e-06, + "loss": 5.8622, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 6e-06, + "loss": 5.8119, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 8.000000000000001e-06, + "loss": 4.4185, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 4.4103, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 1.2e-05, + "loss": 4.1269, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 1.4000000000000001e-05, + "loss": 3.848, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.7346, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 1.8e-05, + "loss": 3.5968, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 3.5486, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 2.2000000000000003e-05, + "loss": 3.492, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 2.4e-05, + "loss": 3.4527, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 2.6000000000000002e-05, + "loss": 3.4616, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 2.8000000000000003e-05, + "loss": 3.4919, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 3e-05, + "loss": 3.3711, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.3979, + "step": 17 + }, + { + "epoch": 0.01, + "learning_rate": 3.4000000000000007e-05, + "loss": 3.2596, + "step": 18 + }, + { + "epoch": 0.01, + "learning_rate": 3.6e-05, + "loss": 3.3727, + "step": 19 + }, + { + "epoch": 0.01, + "learning_rate": 3.8e-05, + "loss": 3.2903, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4e-05, + "loss": 3.3427, + "step": 21 + }, + { + "epoch": 0.01, + "learning_rate": 4.2e-05, + "loss": 3.3386, + "step": 22 + }, + { + "epoch": 0.01, + "learning_rate": 4.4000000000000006e-05, + "loss": 3.2772, + "step": 23 + }, + { + "epoch": 0.01, + "learning_rate": 4.600000000000001e-05, + "loss": 3.2423, + "step": 24 + }, + { + "epoch": 0.01, + "learning_rate": 4.8e-05, + "loss": 3.2333, + "step": 25 + }, + { + "epoch": 0.01, + "learning_rate": 5e-05, + "loss": 3.2475, + "step": 26 + }, + { + "epoch": 0.01, + "learning_rate": 5.2000000000000004e-05, + "loss": 3.3114, + "step": 27 + }, + { + "epoch": 0.01, + "learning_rate": 5.4000000000000005e-05, + "loss": 3.2723, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 5.6000000000000006e-05, + "loss": 3.1906, + "step": 29 + }, + { + "epoch": 0.01, + "learning_rate": 5.8e-05, + "loss": 3.2488, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 6e-05, + "loss": 3.2203, + "step": 31 + }, + { + "epoch": 0.01, + "learning_rate": 6.2e-05, + "loss": 3.2247, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 6.400000000000001e-05, + "loss": 3.1383, + "step": 33 + }, + { + "epoch": 0.01, + "learning_rate": 6.6e-05, + "loss": 3.1227, + "step": 34 + }, + { + "epoch": 0.01, + "learning_rate": 6.800000000000001e-05, + "loss": 3.2286, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 7e-05, + "loss": 3.2415, + "step": 36 + }, + { + "epoch": 0.01, + "learning_rate": 7.2e-05, + "loss": 3.0958, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 7.4e-05, + "loss": 3.1529, + "step": 38 + }, + { + "epoch": 0.01, + "learning_rate": 7.6e-05, + "loss": 3.1929, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 7.800000000000001e-05, + "loss": 3.1599, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 8e-05, + "loss": 3.1247, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 8.2e-05, + "loss": 3.1767, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 8.4e-05, + "loss": 3.213, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 8.6e-05, + "loss": 3.2521, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 8.800000000000001e-05, + "loss": 3.1038, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 9e-05, + "loss": 3.2639, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 9.200000000000001e-05, + "loss": 3.1598, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 9.4e-05, + "loss": 3.202, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 9.6e-05, + "loss": 3.2283, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 9.8e-05, + "loss": 3.1866, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 3.1442, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 0.00010200000000000001, + "loss": 3.0943, + "step": 52 + }, + { + "epoch": 0.02, + "learning_rate": 0.00010400000000000001, + "loss": 3.1444, + "step": 53 + }, + { + "epoch": 0.02, + "learning_rate": 0.00010600000000000002, + "loss": 3.1593, + "step": 54 + }, + { + "epoch": 0.02, + "learning_rate": 0.00010800000000000001, + "loss": 3.1638, + "step": 55 + }, + { + "epoch": 0.02, + "learning_rate": 0.00011000000000000002, + "loss": 3.2374, + "step": 56 + }, + { + "epoch": 0.02, + "learning_rate": 0.00011200000000000001, + "loss": 3.2035, + "step": 57 + }, + { + "epoch": 0.02, + "learning_rate": 0.00011399999999999999, + "loss": 3.2014, + "step": 58 + }, + { + "epoch": 0.02, + "learning_rate": 0.000116, + "loss": 3.2609, + "step": 59 + }, + { + "epoch": 0.02, + "learning_rate": 0.000118, + "loss": 3.1562, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 0.00012, + "loss": 3.2534, + "step": 61 + }, + { + "epoch": 0.02, + "learning_rate": 0.000122, + "loss": 3.1461, + "step": 62 + }, + { + "epoch": 0.02, + "learning_rate": 0.000124, + "loss": 3.2507, + "step": 63 + }, + { + "epoch": 0.02, + "learning_rate": 0.000126, + "loss": 3.2205, + "step": 64 + }, + { + "epoch": 0.02, + "learning_rate": 0.00012800000000000002, + "loss": 3.2393, + "step": 65 + }, + { + "epoch": 0.02, + "learning_rate": 0.00013000000000000002, + "loss": 3.1658, + "step": 66 + }, + { + "epoch": 0.02, + "learning_rate": 0.000132, + "loss": 3.1736, + "step": 67 + }, + { + "epoch": 0.02, + "learning_rate": 0.000134, + "loss": 3.2191, + "step": 68 + }, + { + "epoch": 0.02, + "learning_rate": 0.00013600000000000003, + "loss": 3.2439, + "step": 69 + }, + { + "epoch": 0.02, + "learning_rate": 0.000138, + "loss": 3.2145, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 0.00014, + "loss": 3.317, + "step": 71 + }, + { + "epoch": 0.02, + "learning_rate": 0.000142, + "loss": 3.1762, + "step": 72 + }, + { + "epoch": 0.02, + "learning_rate": 0.000144, + "loss": 3.2242, + "step": 73 + }, + { + "epoch": 0.02, + "learning_rate": 0.000146, + "loss": 3.2284, + "step": 74 + }, + { + "epoch": 0.02, + "learning_rate": 0.000148, + "loss": 3.2573, + "step": 75 + }, + { + "epoch": 0.02, + "learning_rate": 0.00015000000000000001, + "loss": 3.1517, + "step": 76 + }, + { + "epoch": 0.02, + "learning_rate": 0.000152, + "loss": 3.2148, + "step": 77 + }, + { + "epoch": 0.02, + "learning_rate": 0.000154, + "loss": 3.3142, + "step": 78 + }, + { + "epoch": 0.02, + "learning_rate": 0.00015600000000000002, + "loss": 3.2461, + "step": 79 + }, + { + "epoch": 0.02, + "learning_rate": 0.00015800000000000002, + "loss": 3.2156, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.00016, + "loss": 3.2101, + "step": 81 + }, + { + "epoch": 0.02, + "learning_rate": 0.000162, + "loss": 3.2675, + "step": 82 + }, + { + "epoch": 0.02, + "learning_rate": 0.000164, + "loss": 3.3065, + "step": 83 + }, + { + "epoch": 0.02, + "learning_rate": 0.000166, + "loss": 3.3035, + "step": 84 + }, + { + "epoch": 0.02, + "learning_rate": 0.000168, + "loss": 3.2975, + "step": 85 + }, + { + "epoch": 0.02, + "learning_rate": 0.00017, + "loss": 3.3041, + "step": 86 + }, + { + "epoch": 0.02, + "learning_rate": 0.000172, + "loss": 3.2666, + "step": 87 + }, + { + "epoch": 0.03, + "learning_rate": 0.000174, + "loss": 3.2871, + "step": 88 + }, + { + "epoch": 0.03, + "learning_rate": 0.00017600000000000002, + "loss": 3.264, + "step": 89 + }, + { + "epoch": 0.03, + "learning_rate": 0.00017800000000000002, + "loss": 3.2833, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 0.00018, + "loss": 3.3067, + "step": 91 + }, + { + "epoch": 0.03, + "learning_rate": 0.000182, + "loss": 3.2589, + "step": 92 + }, + { + "epoch": 0.03, + "learning_rate": 0.00018400000000000003, + "loss": 3.3806, + "step": 93 + }, + { + "epoch": 0.03, + "learning_rate": 0.00018600000000000002, + "loss": 3.219, + "step": 94 + }, + { + "epoch": 0.03, + "learning_rate": 0.000188, + "loss": 3.3217, + "step": 95 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019, + "loss": 3.2479, + "step": 96 + }, + { + "epoch": 0.03, + "learning_rate": 0.000192, + "loss": 3.288, + "step": 97 + }, + { + "epoch": 0.03, + "learning_rate": 0.000194, + "loss": 3.2907, + "step": 98 + }, + { + "epoch": 0.03, + "learning_rate": 0.000196, + "loss": 3.1544, + "step": 99 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019800000000000002, + "loss": 3.3607, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002, + "loss": 3.2381, + "step": 101 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001999413661682791, + "loss": 3.3826, + "step": 102 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001998827323365582, + "loss": 3.3419, + "step": 103 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001998240985048373, + "loss": 3.2726, + "step": 104 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001997654646731164, + "loss": 3.2404, + "step": 105 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001997068308413955, + "loss": 3.267, + "step": 106 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019964819700967458, + "loss": 3.3027, + "step": 107 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001995895631779537, + "loss": 3.3157, + "step": 108 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001995309293462328, + "loss": 3.2919, + "step": 109 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019947229551451189, + "loss": 3.3454, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019941366168279098, + "loss": 3.3259, + "step": 111 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019935502785107007, + "loss": 3.3948, + "step": 112 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019929639401934917, + "loss": 3.2172, + "step": 113 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019923776018762826, + "loss": 3.3734, + "step": 114 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019917912635590736, + "loss": 3.3004, + "step": 115 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019912049252418648, + "loss": 3.326, + "step": 116 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019906185869246557, + "loss": 3.3342, + "step": 117 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019900322486074464, + "loss": 3.3513, + "step": 118 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019894459102902376, + "loss": 3.302, + "step": 119 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019888595719730286, + "loss": 3.2558, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019882732336558195, + "loss": 3.2531, + "step": 121 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019876868953386104, + "loss": 3.2707, + "step": 122 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019871005570214014, + "loss": 3.3266, + "step": 123 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019865142187041926, + "loss": 3.3349, + "step": 124 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019859278803869833, + "loss": 3.3154, + "step": 125 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019853415420697742, + "loss": 3.2647, + "step": 126 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019847552037525654, + "loss": 3.3634, + "step": 127 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019841688654353564, + "loss": 3.3126, + "step": 128 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001983582527118147, + "loss": 3.3529, + "step": 129 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019829961888009383, + "loss": 3.3262, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019824098504837292, + "loss": 3.3167, + "step": 131 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019818235121665201, + "loss": 3.4109, + "step": 132 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001981237173849311, + "loss": 3.1823, + "step": 133 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001980650835532102, + "loss": 3.3014, + "step": 134 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019800644972148932, + "loss": 3.3807, + "step": 135 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019794781588976842, + "loss": 3.3407, + "step": 136 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019788918205804749, + "loss": 3.4206, + "step": 137 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001978305482263266, + "loss": 3.4182, + "step": 138 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001977719143946057, + "loss": 3.3931, + "step": 139 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001977132805628848, + "loss": 3.3758, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001976546467311639, + "loss": 3.3941, + "step": 141 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019759601289944299, + "loss": 3.3912, + "step": 142 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001975373790677221, + "loss": 3.2704, + "step": 143 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019747874523600117, + "loss": 3.4061, + "step": 144 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019742011140428027, + "loss": 3.4493, + "step": 145 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001973614775725594, + "loss": 3.346, + "step": 146 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019730284374083848, + "loss": 3.3776, + "step": 147 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019724420990911755, + "loss": 3.3023, + "step": 148 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019718557607739667, + "loss": 3.3579, + "step": 149 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019712694224567577, + "loss": 3.2763, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019706830841395486, + "loss": 3.3203, + "step": 151 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019700967458223396, + "loss": 3.3814, + "step": 152 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019695104075051305, + "loss": 3.3263, + "step": 153 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019689240691879217, + "loss": 3.2977, + "step": 154 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019683377308707124, + "loss": 3.3911, + "step": 155 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019677513925535033, + "loss": 3.3715, + "step": 156 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019671650542362945, + "loss": 3.3647, + "step": 157 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019665787159190855, + "loss": 3.3792, + "step": 158 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019659923776018762, + "loss": 3.4429, + "step": 159 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019654060392846674, + "loss": 3.4239, + "step": 160 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019648197009674583, + "loss": 3.4442, + "step": 161 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019642333626502493, + "loss": 3.4245, + "step": 162 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019636470243330402, + "loss": 3.2396, + "step": 163 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019630606860158311, + "loss": 3.375, + "step": 164 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019624743476986224, + "loss": 3.356, + "step": 165 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001961888009381413, + "loss": 3.2876, + "step": 166 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001961301671064204, + "loss": 3.3384, + "step": 167 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019607153327469952, + "loss": 3.4072, + "step": 168 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001960128994429786, + "loss": 3.4314, + "step": 169 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001959542656112577, + "loss": 3.3795, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001958956317795368, + "loss": 3.2868, + "step": 171 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001958369979478159, + "loss": 3.3781, + "step": 172 + }, + { + "epoch": 0.05, + "learning_rate": 0.000195778364116095, + "loss": 3.4215, + "step": 173 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019571973028437408, + "loss": 3.4015, + "step": 174 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019566109645265318, + "loss": 3.3857, + "step": 175 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001956024626209323, + "loss": 3.3115, + "step": 176 + }, + { + "epoch": 0.05, + "eval_loss": 3.4127612113952637, + "eval_runtime": 2941.7575, + "eval_samples_per_second": 6.964, + "eval_steps_per_second": 2.321, + "step": 176 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019554382878921137, + "loss": 3.3804, + "step": 177 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001954851949574905, + "loss": 3.3767, + "step": 178 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019542656112576958, + "loss": 3.1557, + "step": 179 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019536792729404868, + "loss": 3.4065, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019530929346232777, + "loss": 3.4646, + "step": 181 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019525065963060687, + "loss": 3.2766, + "step": 182 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019519202579888596, + "loss": 3.3506, + "step": 183 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019513339196716508, + "loss": 3.3775, + "step": 184 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019507475813544415, + "loss": 3.3716, + "step": 185 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019501612430372324, + "loss": 3.4035, + "step": 186 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019495749047200236, + "loss": 3.3617, + "step": 187 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019489885664028146, + "loss": 3.3539, + "step": 188 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019484022280856055, + "loss": 3.3618, + "step": 189 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019478158897683965, + "loss": 3.275, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019472295514511874, + "loss": 3.3481, + "step": 191 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019466432131339784, + "loss": 3.4303, + "step": 192 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019460568748167693, + "loss": 3.4039, + "step": 193 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019454705364995603, + "loss": 3.4618, + "step": 194 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019448841981823515, + "loss": 3.2109, + "step": 195 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019442978598651421, + "loss": 3.3467, + "step": 196 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019437115215479334, + "loss": 3.3039, + "step": 197 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019431251832307243, + "loss": 3.4182, + "step": 198 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019425388449135152, + "loss": 3.3874, + "step": 199 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019419525065963062, + "loss": 3.3259, + "step": 200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001941366168279097, + "loss": 3.4389, + "step": 201 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001940779829961888, + "loss": 3.311, + "step": 202 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001940193491644679, + "loss": 3.2309, + "step": 203 + }, + { + "epoch": 0.06, + "learning_rate": 0.000193960715332747, + "loss": 3.445, + "step": 204 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019390208150102612, + "loss": 3.4121, + "step": 205 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001938434476693052, + "loss": 3.4134, + "step": 206 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019378481383758428, + "loss": 3.3747, + "step": 207 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001937261800058634, + "loss": 3.2936, + "step": 208 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001936675461741425, + "loss": 3.2643, + "step": 209 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001936089123424216, + "loss": 3.4549, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019355027851070068, + "loss": 3.3649, + "step": 211 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019349164467897978, + "loss": 3.3066, + "step": 212 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001934330108472589, + "loss": 3.2665, + "step": 213 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019337437701553797, + "loss": 3.427, + "step": 214 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019331574318381706, + "loss": 3.4163, + "step": 215 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019325710935209618, + "loss": 3.4074, + "step": 216 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019319847552037528, + "loss": 3.3811, + "step": 217 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019313984168865434, + "loss": 3.4369, + "step": 218 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019308120785693346, + "loss": 3.3511, + "step": 219 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019302257402521256, + "loss": 3.3779, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019296394019349165, + "loss": 3.272, + "step": 221 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019290530636177075, + "loss": 3.3569, + "step": 222 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019284667253004984, + "loss": 3.4014, + "step": 223 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019278803869832896, + "loss": 3.3813, + "step": 224 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019272940486660803, + "loss": 3.3642, + "step": 225 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019267077103488712, + "loss": 3.384, + "step": 226 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019261213720316625, + "loss": 3.3342, + "step": 227 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019255350337144534, + "loss": 3.3221, + "step": 228 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001924948695397244, + "loss": 3.3293, + "step": 229 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019243623570800353, + "loss": 3.4013, + "step": 230 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019237760187628262, + "loss": 3.3182, + "step": 231 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019231896804456174, + "loss": 3.3558, + "step": 232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001922603342128408, + "loss": 3.2817, + "step": 233 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001922017003811199, + "loss": 3.4694, + "step": 234 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019214306654939903, + "loss": 3.4166, + "step": 235 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019208443271767812, + "loss": 3.2853, + "step": 236 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001920257988859572, + "loss": 3.334, + "step": 237 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001919671650542363, + "loss": 3.2872, + "step": 238 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001919085312225154, + "loss": 3.2861, + "step": 239 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001918498973907945, + "loss": 3.3128, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001917912635590736, + "loss": 3.3114, + "step": 241 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001917326297273527, + "loss": 3.3369, + "step": 242 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001916739958956318, + "loss": 3.3872, + "step": 243 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019161536206391088, + "loss": 3.347, + "step": 244 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019155672823218997, + "loss": 3.4326, + "step": 245 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001914980944004691, + "loss": 3.3617, + "step": 246 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001914394605687482, + "loss": 3.3735, + "step": 247 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019138082673702725, + "loss": 3.3881, + "step": 248 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019132219290530638, + "loss": 3.3635, + "step": 249 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019126355907358547, + "loss": 3.4983, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019120492524186456, + "loss": 3.2744, + "step": 251 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019114629141014366, + "loss": 3.3445, + "step": 252 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019108765757842275, + "loss": 3.3678, + "step": 253 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019102902374670187, + "loss": 3.2684, + "step": 254 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019097038991498094, + "loss": 3.2846, + "step": 255 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019091175608326004, + "loss": 3.3552, + "step": 256 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019085312225153916, + "loss": 3.2922, + "step": 257 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019079448841981825, + "loss": 3.3105, + "step": 258 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019073585458809735, + "loss": 3.3673, + "step": 259 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019067722075637644, + "loss": 3.3258, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019061858692465553, + "loss": 3.232, + "step": 261 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019055995309293463, + "loss": 3.3009, + "step": 262 + }, + { + "epoch": 0.07, + "learning_rate": 0.00019050131926121372, + "loss": 3.3539, + "step": 263 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019044268542949282, + "loss": 3.4128, + "step": 264 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019038405159777194, + "loss": 3.3834, + "step": 265 + }, + { + "epoch": 0.08, + "learning_rate": 0.000190325417766051, + "loss": 3.418, + "step": 266 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019026678393433013, + "loss": 3.2632, + "step": 267 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019020815010260922, + "loss": 3.398, + "step": 268 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019014951627088832, + "loss": 3.2799, + "step": 269 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001900908824391674, + "loss": 3.4037, + "step": 270 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001900322486074465, + "loss": 3.3112, + "step": 271 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001899736147757256, + "loss": 3.3791, + "step": 272 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001899149809440047, + "loss": 3.4785, + "step": 273 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001898563471122838, + "loss": 3.2971, + "step": 274 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018979771328056288, + "loss": 3.345, + "step": 275 + }, + { + "epoch": 0.08, + "learning_rate": 0.000189739079448842, + "loss": 3.3416, + "step": 276 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018968044561712107, + "loss": 3.4207, + "step": 277 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001896218117854002, + "loss": 3.3759, + "step": 278 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018956317795367929, + "loss": 3.4658, + "step": 279 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018950454412195838, + "loss": 3.3258, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018944591029023747, + "loss": 3.2984, + "step": 281 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018938727645851657, + "loss": 3.3746, + "step": 282 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018932864262679566, + "loss": 3.3248, + "step": 283 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018927000879507478, + "loss": 3.368, + "step": 284 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018921137496335385, + "loss": 3.3685, + "step": 285 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018915274113163297, + "loss": 3.2488, + "step": 286 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018909410729991207, + "loss": 3.4339, + "step": 287 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018903547346819116, + "loss": 3.2997, + "step": 288 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018897683963647026, + "loss": 3.3752, + "step": 289 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018891820580474935, + "loss": 3.3072, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018885957197302845, + "loss": 3.3456, + "step": 291 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018880093814130754, + "loss": 3.3443, + "step": 292 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018874230430958663, + "loss": 3.3228, + "step": 293 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018868367047786576, + "loss": 3.4276, + "step": 294 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018862503664614485, + "loss": 3.313, + "step": 295 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018856640281442392, + "loss": 3.3224, + "step": 296 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018850776898270304, + "loss": 3.3374, + "step": 297 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018844913515098213, + "loss": 3.4112, + "step": 298 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018839050131926123, + "loss": 3.3292, + "step": 299 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018833186748754032, + "loss": 3.2262, + "step": 300 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018827323365581942, + "loss": 3.3758, + "step": 301 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001882145998240985, + "loss": 3.3297, + "step": 302 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001881559659923776, + "loss": 3.2446, + "step": 303 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001880973321606567, + "loss": 3.4397, + "step": 304 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018803869832893582, + "loss": 3.3154, + "step": 305 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018798006449721491, + "loss": 3.3331, + "step": 306 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018792143066549398, + "loss": 3.2873, + "step": 307 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001878627968337731, + "loss": 3.3713, + "step": 308 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001878041630020522, + "loss": 3.3679, + "step": 309 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001877455291703313, + "loss": 3.3151, + "step": 310 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018768689533861039, + "loss": 3.3328, + "step": 311 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018762826150688948, + "loss": 3.4282, + "step": 312 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001875696276751686, + "loss": 3.2908, + "step": 313 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018751099384344767, + "loss": 3.3646, + "step": 314 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018745236001172676, + "loss": 3.3616, + "step": 315 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018739372618000588, + "loss": 3.3221, + "step": 316 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018733509234828498, + "loss": 3.1692, + "step": 317 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018727645851656405, + "loss": 3.327, + "step": 318 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018721782468484317, + "loss": 3.3521, + "step": 319 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018715919085312226, + "loss": 3.2864, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018710055702140136, + "loss": 3.4104, + "step": 321 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018704192318968045, + "loss": 3.3519, + "step": 322 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018698328935795954, + "loss": 3.3847, + "step": 323 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018692465552623867, + "loss": 3.3199, + "step": 324 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018686602169451773, + "loss": 3.3427, + "step": 325 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018680738786279683, + "loss": 3.3561, + "step": 326 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018674875403107595, + "loss": 3.2695, + "step": 327 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018669012019935504, + "loss": 3.3613, + "step": 328 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001866314863676341, + "loss": 3.3666, + "step": 329 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018657285253591323, + "loss": 3.356, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018651421870419233, + "loss": 3.312, + "step": 331 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018645558487247142, + "loss": 3.3382, + "step": 332 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018639695104075051, + "loss": 3.3648, + "step": 333 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001863383172090296, + "loss": 3.3673, + "step": 334 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018627968337730873, + "loss": 3.2083, + "step": 335 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001862210495455878, + "loss": 3.3504, + "step": 336 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001861624157138669, + "loss": 3.3457, + "step": 337 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018610378188214601, + "loss": 3.3261, + "step": 338 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001860451480504251, + "loss": 3.2873, + "step": 339 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001859865142187042, + "loss": 3.3141, + "step": 340 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001859278803869833, + "loss": 3.3467, + "step": 341 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001858692465552624, + "loss": 3.3692, + "step": 342 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001858106127235415, + "loss": 3.3813, + "step": 343 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018575197889182058, + "loss": 3.3228, + "step": 344 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018569334506009967, + "loss": 3.341, + "step": 345 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001856347112283788, + "loss": 3.3532, + "step": 346 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001855760773966579, + "loss": 3.3119, + "step": 347 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018551744356493698, + "loss": 3.3122, + "step": 348 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018545880973321608, + "loss": 3.3532, + "step": 349 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018540017590149517, + "loss": 3.3514, + "step": 350 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018534154206977427, + "loss": 3.3955, + "step": 351 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018528290823805336, + "loss": 3.3363, + "step": 352 + }, + { + "epoch": 0.1, + "eval_loss": 3.4325549602508545, + "eval_runtime": 2940.8692, + "eval_samples_per_second": 6.966, + "eval_steps_per_second": 2.322, + "step": 352 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018522427440633246, + "loss": 3.2021, + "step": 353 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018516564057461158, + "loss": 3.2987, + "step": 354 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018510700674289064, + "loss": 3.3542, + "step": 355 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018504837291116974, + "loss": 3.3654, + "step": 356 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018498973907944886, + "loss": 3.4304, + "step": 357 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018493110524772795, + "loss": 3.4141, + "step": 358 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018487247141600705, + "loss": 3.3684, + "step": 359 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018481383758428614, + "loss": 3.2512, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018475520375256524, + "loss": 3.2606, + "step": 361 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018469656992084433, + "loss": 3.2782, + "step": 362 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018463793608912343, + "loss": 3.3394, + "step": 363 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018457930225740252, + "loss": 3.4092, + "step": 364 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018452066842568164, + "loss": 3.2721, + "step": 365 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001844620345939607, + "loss": 3.3498, + "step": 366 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018440340076223983, + "loss": 3.304, + "step": 367 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018434476693051892, + "loss": 3.2888, + "step": 368 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018428613309879802, + "loss": 3.3503, + "step": 369 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001842274992670771, + "loss": 3.3369, + "step": 370 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001841688654353562, + "loss": 3.2832, + "step": 371 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001841102316036353, + "loss": 3.343, + "step": 372 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001840515977719144, + "loss": 3.3148, + "step": 373 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001839929639401935, + "loss": 3.3384, + "step": 374 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001839343301084726, + "loss": 3.2707, + "step": 375 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001838756962767517, + "loss": 3.38, + "step": 376 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018381706244503077, + "loss": 3.3358, + "step": 377 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001837584286133099, + "loss": 3.3641, + "step": 378 + }, + { + "epoch": 0.11, + "learning_rate": 0.000183699794781589, + "loss": 3.3345, + "step": 379 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018364116094986808, + "loss": 3.3788, + "step": 380 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018358252711814718, + "loss": 3.3313, + "step": 381 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018352389328642627, + "loss": 3.3056, + "step": 382 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018346525945470537, + "loss": 3.3711, + "step": 383 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018340662562298446, + "loss": 3.3281, + "step": 384 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018334799179126355, + "loss": 3.2252, + "step": 385 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018328935795954268, + "loss": 3.3881, + "step": 386 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018323072412782177, + "loss": 3.3027, + "step": 387 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018317209029610084, + "loss": 3.4018, + "step": 388 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018311345646437996, + "loss": 3.2718, + "step": 389 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018305482263265905, + "loss": 3.3853, + "step": 390 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018299618880093815, + "loss": 3.3667, + "step": 391 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018293755496921724, + "loss": 3.387, + "step": 392 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018287892113749634, + "loss": 3.294, + "step": 393 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018282028730577546, + "loss": 3.4073, + "step": 394 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018276165347405455, + "loss": 3.3658, + "step": 395 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018270301964233362, + "loss": 3.2588, + "step": 396 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018264438581061274, + "loss": 3.3282, + "step": 397 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018258575197889184, + "loss": 3.2229, + "step": 398 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018252711814717093, + "loss": 3.2904, + "step": 399 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018246848431545002, + "loss": 3.3482, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018240985048372912, + "loss": 3.3721, + "step": 401 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018235121665200824, + "loss": 3.3032, + "step": 402 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001822925828202873, + "loss": 3.3469, + "step": 403 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001822339489885664, + "loss": 3.2639, + "step": 404 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018217531515684552, + "loss": 3.421, + "step": 405 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018211668132512462, + "loss": 3.4373, + "step": 406 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018205804749340368, + "loss": 3.3135, + "step": 407 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001819994136616828, + "loss": 3.2879, + "step": 408 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001819407798299619, + "loss": 3.3979, + "step": 409 + }, + { + "epoch": 0.12, + "learning_rate": 0.000181882145998241, + "loss": 3.2865, + "step": 410 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001818235121665201, + "loss": 3.3575, + "step": 411 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018176487833479918, + "loss": 3.3499, + "step": 412 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001817062445030783, + "loss": 3.2888, + "step": 413 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018164761067135737, + "loss": 3.3062, + "step": 414 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018158897683963647, + "loss": 3.4237, + "step": 415 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001815303430079156, + "loss": 3.2535, + "step": 416 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018147170917619468, + "loss": 3.3479, + "step": 417 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018141307534447375, + "loss": 3.324, + "step": 418 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018135444151275287, + "loss": 3.3864, + "step": 419 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018129580768103196, + "loss": 3.2319, + "step": 420 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018123717384931106, + "loss": 3.3216, + "step": 421 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018117854001759015, + "loss": 3.2657, + "step": 422 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018111990618586925, + "loss": 3.3169, + "step": 423 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018106127235414837, + "loss": 3.3759, + "step": 424 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018100263852242744, + "loss": 3.4322, + "step": 425 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018094400469070653, + "loss": 3.3721, + "step": 426 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018088537085898565, + "loss": 3.3239, + "step": 427 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018082673702726475, + "loss": 3.3027, + "step": 428 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018076810319554384, + "loss": 3.3012, + "step": 429 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018070946936382293, + "loss": 3.2466, + "step": 430 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018065083553210203, + "loss": 3.3689, + "step": 431 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018059220170038112, + "loss": 3.2777, + "step": 432 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018053356786866022, + "loss": 3.3685, + "step": 433 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001804749340369393, + "loss": 3.4021, + "step": 434 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018041630020521843, + "loss": 3.3168, + "step": 435 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001803576663734975, + "loss": 3.2871, + "step": 436 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018029903254177662, + "loss": 3.2451, + "step": 437 + }, + { + "epoch": 0.12, + "learning_rate": 0.00018024039871005572, + "loss": 3.3165, + "step": 438 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001801817648783348, + "loss": 3.3736, + "step": 439 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001801231310466139, + "loss": 3.2207, + "step": 440 + }, + { + "epoch": 0.13, + "learning_rate": 0.000180064497214893, + "loss": 3.3741, + "step": 441 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001800058633831721, + "loss": 3.34, + "step": 442 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017994722955145122, + "loss": 3.3197, + "step": 443 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017988859571973028, + "loss": 3.2689, + "step": 444 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017982996188800938, + "loss": 3.3944, + "step": 445 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001797713280562885, + "loss": 3.3577, + "step": 446 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001797126942245676, + "loss": 3.3384, + "step": 447 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001796540603928467, + "loss": 3.3514, + "step": 448 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017959542656112578, + "loss": 3.2715, + "step": 449 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017953679272940488, + "loss": 3.4112, + "step": 450 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017947815889768397, + "loss": 3.2883, + "step": 451 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017941952506596306, + "loss": 3.3312, + "step": 452 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017936089123424216, + "loss": 3.399, + "step": 453 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017930225740252128, + "loss": 3.2589, + "step": 454 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017924362357080035, + "loss": 3.3033, + "step": 455 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017918498973907947, + "loss": 3.2986, + "step": 456 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017912635590735856, + "loss": 3.1767, + "step": 457 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017906772207563766, + "loss": 3.3192, + "step": 458 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017900908824391675, + "loss": 3.3241, + "step": 459 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017895045441219585, + "loss": 3.201, + "step": 460 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017889182058047494, + "loss": 3.2929, + "step": 461 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017883318674875403, + "loss": 3.3327, + "step": 462 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017877455291703313, + "loss": 3.418, + "step": 463 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017871591908531225, + "loss": 3.3366, + "step": 464 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017865728525359134, + "loss": 3.2592, + "step": 465 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001785986514218704, + "loss": 3.3467, + "step": 466 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017854001759014953, + "loss": 3.3619, + "step": 467 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017848138375842863, + "loss": 3.3509, + "step": 468 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017842274992670772, + "loss": 3.1956, + "step": 469 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017836411609498682, + "loss": 3.3698, + "step": 470 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001783054822632659, + "loss": 3.321, + "step": 471 + }, + { + "epoch": 0.13, + "learning_rate": 0.000178246848431545, + "loss": 3.2352, + "step": 472 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001781882145998241, + "loss": 3.224, + "step": 473 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001781295807681032, + "loss": 3.3287, + "step": 474 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017807094693638231, + "loss": 3.2841, + "step": 475 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001780123131046614, + "loss": 3.3262, + "step": 476 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017795367927294048, + "loss": 3.3455, + "step": 477 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001778950454412196, + "loss": 3.2282, + "step": 478 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001778364116094987, + "loss": 3.3557, + "step": 479 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017777777777777779, + "loss": 3.2092, + "step": 480 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017771914394605688, + "loss": 3.3508, + "step": 481 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017766051011433597, + "loss": 3.3306, + "step": 482 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001776018762826151, + "loss": 3.1666, + "step": 483 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017754324245089416, + "loss": 3.3202, + "step": 484 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017748460861917326, + "loss": 3.3197, + "step": 485 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017742597478745238, + "loss": 3.2957, + "step": 486 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017736734095573147, + "loss": 3.3891, + "step": 487 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017730870712401054, + "loss": 3.4085, + "step": 488 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017725007329228966, + "loss": 3.3088, + "step": 489 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017719143946056876, + "loss": 3.3182, + "step": 490 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017713280562884785, + "loss": 3.2542, + "step": 491 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017707417179712695, + "loss": 3.2773, + "step": 492 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017701553796540604, + "loss": 3.2836, + "step": 493 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017695690413368516, + "loss": 3.2482, + "step": 494 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017689827030196426, + "loss": 3.3793, + "step": 495 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017683963647024332, + "loss": 3.405, + "step": 496 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017678100263852244, + "loss": 3.2463, + "step": 497 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017672236880680154, + "loss": 3.2425, + "step": 498 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017666373497508063, + "loss": 3.3272, + "step": 499 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017660510114335973, + "loss": 3.3286, + "step": 500 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017654646731163882, + "loss": 3.2452, + "step": 501 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017648783347991794, + "loss": 3.2671, + "step": 502 + }, + { + "epoch": 0.14, + "learning_rate": 0.000176429199648197, + "loss": 3.3648, + "step": 503 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001763705658164761, + "loss": 3.3606, + "step": 504 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017631193198475523, + "loss": 3.2764, + "step": 505 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017625329815303432, + "loss": 3.2594, + "step": 506 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001761946643213134, + "loss": 3.317, + "step": 507 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001761360304895925, + "loss": 3.2618, + "step": 508 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001760773966578716, + "loss": 3.4423, + "step": 509 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001760187628261507, + "loss": 3.3306, + "step": 510 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001759601289944298, + "loss": 3.3913, + "step": 511 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017590149516270889, + "loss": 3.4276, + "step": 512 + }, + { + "epoch": 0.15, + "learning_rate": 0.000175842861330988, + "loss": 3.2858, + "step": 513 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017578422749926707, + "loss": 3.4223, + "step": 514 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017572559366754617, + "loss": 3.383, + "step": 515 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001756669598358253, + "loss": 3.1365, + "step": 516 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017560832600410438, + "loss": 3.2742, + "step": 517 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017554969217238348, + "loss": 3.2793, + "step": 518 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017549105834066257, + "loss": 3.2637, + "step": 519 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017543242450894167, + "loss": 3.3172, + "step": 520 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017537379067722076, + "loss": 3.2949, + "step": 521 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017531515684549986, + "loss": 3.3339, + "step": 522 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017525652301377895, + "loss": 3.1933, + "step": 523 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017519788918205807, + "loss": 3.3356, + "step": 524 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017513925535033714, + "loss": 3.2832, + "step": 525 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017508062151861623, + "loss": 3.2465, + "step": 526 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017502198768689535, + "loss": 3.2227, + "step": 527 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017496335385517445, + "loss": 3.2431, + "step": 528 + }, + { + "epoch": 0.15, + "eval_loss": 3.4339988231658936, + "eval_runtime": 2941.2933, + "eval_samples_per_second": 6.965, + "eval_steps_per_second": 2.322, + "step": 528 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017490472002345354, + "loss": 3.2959, + "step": 529 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017484608619173264, + "loss": 3.3558, + "step": 530 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017478745236001173, + "loss": 3.3577, + "step": 531 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017472881852829083, + "loss": 3.3003, + "step": 532 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017467018469656992, + "loss": 3.2796, + "step": 533 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017461155086484902, + "loss": 3.3259, + "step": 534 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017455291703312814, + "loss": 3.3561, + "step": 535 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001744942832014072, + "loss": 3.3565, + "step": 536 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017443564936968633, + "loss": 3.3997, + "step": 537 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017437701553796542, + "loss": 3.276, + "step": 538 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017431838170624451, + "loss": 3.285, + "step": 539 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001742597478745236, + "loss": 3.2681, + "step": 540 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001742011140428027, + "loss": 3.3277, + "step": 541 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001741424802110818, + "loss": 3.3858, + "step": 542 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001740838463793609, + "loss": 3.337, + "step": 543 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017402521254763999, + "loss": 3.2288, + "step": 544 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001739665787159191, + "loss": 3.3113, + "step": 545 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001739079448841982, + "loss": 3.36, + "step": 546 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017384931105247727, + "loss": 3.285, + "step": 547 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001737906772207564, + "loss": 3.2721, + "step": 548 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017373204338903548, + "loss": 3.2411, + "step": 549 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017367340955731458, + "loss": 3.2043, + "step": 550 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017361477572559367, + "loss": 3.2924, + "step": 551 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017355614189387277, + "loss": 3.2879, + "step": 552 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017349750806215186, + "loss": 3.292, + "step": 553 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017343887423043098, + "loss": 3.3447, + "step": 554 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017338024039871005, + "loss": 3.304, + "step": 555 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017332160656698917, + "loss": 3.298, + "step": 556 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017326297273526827, + "loss": 3.2038, + "step": 557 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017320433890354736, + "loss": 3.3378, + "step": 558 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017314570507182645, + "loss": 3.3532, + "step": 559 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017308707124010555, + "loss": 3.2387, + "step": 560 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017302843740838464, + "loss": 3.2829, + "step": 561 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017296980357666374, + "loss": 3.2958, + "step": 562 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017291116974494283, + "loss": 3.4214, + "step": 563 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017285253591322195, + "loss": 3.2844, + "step": 564 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017279390208150105, + "loss": 3.4155, + "step": 565 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017273526824978011, + "loss": 3.3632, + "step": 566 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017267663441805924, + "loss": 3.3174, + "step": 567 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017261800058633833, + "loss": 3.263, + "step": 568 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017255936675461742, + "loss": 3.3077, + "step": 569 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017250073292289652, + "loss": 3.3081, + "step": 570 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001724420990911756, + "loss": 3.2987, + "step": 571 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017238346525945473, + "loss": 3.3379, + "step": 572 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001723248314277338, + "loss": 3.324, + "step": 573 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001722661975960129, + "loss": 3.136, + "step": 574 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017220756376429202, + "loss": 3.261, + "step": 575 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001721489299325711, + "loss": 3.3249, + "step": 576 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017209029610085018, + "loss": 3.2153, + "step": 577 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001720316622691293, + "loss": 3.2558, + "step": 578 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001719730284374084, + "loss": 3.2936, + "step": 579 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001719143946056875, + "loss": 3.3077, + "step": 580 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017185576077396658, + "loss": 3.3064, + "step": 581 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017179712694224568, + "loss": 3.3206, + "step": 582 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001717384931105248, + "loss": 3.3589, + "step": 583 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017167985927880387, + "loss": 3.2409, + "step": 584 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017162122544708296, + "loss": 3.3868, + "step": 585 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017156259161536208, + "loss": 3.3109, + "step": 586 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017150395778364118, + "loss": 3.3553, + "step": 587 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017144532395192024, + "loss": 3.4257, + "step": 588 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017138669012019937, + "loss": 3.3823, + "step": 589 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017132805628847846, + "loss": 3.3419, + "step": 590 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017126942245675755, + "loss": 3.3304, + "step": 591 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017121078862503665, + "loss": 3.3838, + "step": 592 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017115215479331574, + "loss": 3.1078, + "step": 593 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017109352096159486, + "loss": 3.351, + "step": 594 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017103488712987393, + "loss": 3.2375, + "step": 595 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017097625329815303, + "loss": 3.2834, + "step": 596 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017091761946643215, + "loss": 3.373, + "step": 597 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017085898563471124, + "loss": 3.3146, + "step": 598 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017080035180299034, + "loss": 3.3141, + "step": 599 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017074171797126943, + "loss": 3.2932, + "step": 600 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017068308413954852, + "loss": 3.3591, + "step": 601 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017062445030782765, + "loss": 3.2531, + "step": 602 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001705658164761067, + "loss": 3.2469, + "step": 603 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001705071826443858, + "loss": 3.2581, + "step": 604 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017044854881266493, + "loss": 3.421, + "step": 605 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017038991498094402, + "loss": 3.2698, + "step": 606 + }, + { + "epoch": 0.17, + "learning_rate": 0.00017033128114922312, + "loss": 3.301, + "step": 607 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001702726473175022, + "loss": 3.2526, + "step": 608 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001702140134857813, + "loss": 3.318, + "step": 609 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001701553796540604, + "loss": 3.4293, + "step": 610 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001700967458223395, + "loss": 3.3258, + "step": 611 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001700381119906186, + "loss": 3.2636, + "step": 612 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001699794781588977, + "loss": 3.277, + "step": 613 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016992084432717678, + "loss": 3.2717, + "step": 614 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016986221049545587, + "loss": 3.2445, + "step": 615 + }, + { + "epoch": 0.18, + "learning_rate": 0.000169803576663735, + "loss": 3.3164, + "step": 616 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001697449428320141, + "loss": 3.3404, + "step": 617 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016968630900029318, + "loss": 3.3986, + "step": 618 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016962767516857228, + "loss": 3.3836, + "step": 619 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016956904133685137, + "loss": 3.3015, + "step": 620 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016951040750513046, + "loss": 3.2088, + "step": 621 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016945177367340956, + "loss": 3.2471, + "step": 622 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016939313984168865, + "loss": 3.2617, + "step": 623 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016933450600996777, + "loss": 3.3185, + "step": 624 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016927587217824684, + "loss": 3.342, + "step": 625 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016921723834652596, + "loss": 3.3705, + "step": 626 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016915860451480506, + "loss": 3.2627, + "step": 627 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016909997068308415, + "loss": 3.2532, + "step": 628 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016904133685136325, + "loss": 3.2419, + "step": 629 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016898270301964234, + "loss": 3.2455, + "step": 630 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016892406918792143, + "loss": 3.1866, + "step": 631 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016886543535620053, + "loss": 3.3726, + "step": 632 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016880680152447962, + "loss": 3.2777, + "step": 633 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016874816769275875, + "loss": 3.3045, + "step": 634 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016868953386103784, + "loss": 3.2767, + "step": 635 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001686309000293169, + "loss": 3.2479, + "step": 636 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016857226619759603, + "loss": 3.2611, + "step": 637 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016851363236587512, + "loss": 3.1176, + "step": 638 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016845499853415422, + "loss": 3.1545, + "step": 639 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001683963647024333, + "loss": 3.3133, + "step": 640 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001683377308707124, + "loss": 3.4034, + "step": 641 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001682790970389915, + "loss": 3.2923, + "step": 642 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001682204632072706, + "loss": 3.2192, + "step": 643 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001681618293755497, + "loss": 3.3476, + "step": 644 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001681031955438288, + "loss": 3.2615, + "step": 645 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001680445617121079, + "loss": 3.2505, + "step": 646 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016798592788038697, + "loss": 3.1758, + "step": 647 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001679272940486661, + "loss": 3.2939, + "step": 648 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001678686602169452, + "loss": 3.2557, + "step": 649 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016781002638522428, + "loss": 3.3448, + "step": 650 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016775139255350338, + "loss": 3.3577, + "step": 651 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016769275872178247, + "loss": 3.1448, + "step": 652 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001676341248900616, + "loss": 3.3233, + "step": 653 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016757549105834069, + "loss": 3.2937, + "step": 654 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016751685722661975, + "loss": 3.1965, + "step": 655 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016745822339489887, + "loss": 3.284, + "step": 656 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016739958956317797, + "loss": 3.2228, + "step": 657 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016734095573145706, + "loss": 3.2362, + "step": 658 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016728232189973616, + "loss": 3.2238, + "step": 659 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016722368806801525, + "loss": 3.3228, + "step": 660 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016716505423629437, + "loss": 3.2826, + "step": 661 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016710642040457344, + "loss": 3.3176, + "step": 662 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016704778657285253, + "loss": 3.2703, + "step": 663 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016698915274113166, + "loss": 3.2819, + "step": 664 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016693051890941075, + "loss": 3.3043, + "step": 665 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016687188507768982, + "loss": 3.344, + "step": 666 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016681325124596894, + "loss": 3.2764, + "step": 667 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016675461741424803, + "loss": 3.2562, + "step": 668 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016669598358252713, + "loss": 3.2733, + "step": 669 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016663734975080622, + "loss": 3.2605, + "step": 670 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016657871591908532, + "loss": 3.2495, + "step": 671 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016652008208736444, + "loss": 3.2638, + "step": 672 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001664614482556435, + "loss": 3.3926, + "step": 673 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001664028144239226, + "loss": 3.1857, + "step": 674 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016634418059220172, + "loss": 3.2965, + "step": 675 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016628554676048081, + "loss": 3.3535, + "step": 676 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016622691292875988, + "loss": 3.3562, + "step": 677 + }, + { + "epoch": 0.19, + "learning_rate": 0.000166168279097039, + "loss": 3.2641, + "step": 678 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001661096452653181, + "loss": 3.2676, + "step": 679 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001660510114335972, + "loss": 3.2632, + "step": 680 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016599237760187629, + "loss": 3.2399, + "step": 681 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016593374377015538, + "loss": 3.3722, + "step": 682 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001658751099384345, + "loss": 3.1751, + "step": 683 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016581647610671357, + "loss": 3.304, + "step": 684 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016575784227499266, + "loss": 3.3627, + "step": 685 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016569920844327179, + "loss": 3.2769, + "step": 686 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016564057461155088, + "loss": 3.333, + "step": 687 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016558194077982997, + "loss": 3.2288, + "step": 688 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016552330694810907, + "loss": 3.2726, + "step": 689 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016546467311638816, + "loss": 3.2776, + "step": 690 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016540603928466726, + "loss": 3.2747, + "step": 691 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016534740545294635, + "loss": 3.2204, + "step": 692 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016528877162122545, + "loss": 3.2548, + "step": 693 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016523013778950457, + "loss": 3.2743, + "step": 694 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016517150395778363, + "loss": 3.33, + "step": 695 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016511287012606273, + "loss": 3.3194, + "step": 696 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016505423629434185, + "loss": 3.2658, + "step": 697 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016499560246262094, + "loss": 3.1893, + "step": 698 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016493696863090004, + "loss": 3.3421, + "step": 699 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016487833479917913, + "loss": 3.2042, + "step": 700 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016481970096745823, + "loss": 3.3802, + "step": 701 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016476106713573732, + "loss": 3.272, + "step": 702 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016470243330401642, + "loss": 3.3066, + "step": 703 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001646437994722955, + "loss": 3.3326, + "step": 704 + }, + { + "epoch": 0.2, + "eval_loss": 3.4172792434692383, + "eval_runtime": 2941.3473, + "eval_samples_per_second": 6.965, + "eval_steps_per_second": 2.322, + "step": 704 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016458516564057463, + "loss": 3.2473, + "step": 705 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016452653180885373, + "loss": 3.2433, + "step": 706 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016446789797713282, + "loss": 3.2078, + "step": 707 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016440926414541191, + "loss": 3.3541, + "step": 708 + }, + { + "epoch": 0.2, + "learning_rate": 0.000164350630313691, + "loss": 3.394, + "step": 709 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001642919964819701, + "loss": 3.3283, + "step": 710 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001642333626502492, + "loss": 3.2292, + "step": 711 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001641747288185283, + "loss": 3.3035, + "step": 712 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001641160949868074, + "loss": 3.3012, + "step": 713 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016405746115508648, + "loss": 3.182, + "step": 714 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001639988273233656, + "loss": 3.349, + "step": 715 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001639401934916447, + "loss": 3.3429, + "step": 716 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001638815596599238, + "loss": 3.3554, + "step": 717 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016382292582820288, + "loss": 3.2935, + "step": 718 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016376429199648198, + "loss": 3.3004, + "step": 719 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016370565816476107, + "loss": 3.2854, + "step": 720 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016364702433304017, + "loss": 3.3397, + "step": 721 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016358839050131926, + "loss": 3.2613, + "step": 722 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016352975666959836, + "loss": 3.3017, + "step": 723 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016347112283787748, + "loss": 3.1922, + "step": 724 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016341248900615654, + "loss": 3.3134, + "step": 725 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016335385517443567, + "loss": 3.2636, + "step": 726 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016329522134271476, + "loss": 3.2879, + "step": 727 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016323658751099385, + "loss": 3.2707, + "step": 728 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016317795367927295, + "loss": 3.2435, + "step": 729 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016311931984755204, + "loss": 3.2848, + "step": 730 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016306068601583114, + "loss": 3.4065, + "step": 731 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016300205218411023, + "loss": 3.194, + "step": 732 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016294341835238933, + "loss": 3.2039, + "step": 733 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016288478452066845, + "loss": 3.2882, + "step": 734 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016282615068894754, + "loss": 3.3171, + "step": 735 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001627675168572266, + "loss": 3.3337, + "step": 736 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016270888302550573, + "loss": 3.2181, + "step": 737 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016265024919378483, + "loss": 3.318, + "step": 738 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016259161536206392, + "loss": 3.2269, + "step": 739 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016253298153034301, + "loss": 3.3264, + "step": 740 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001624743476986221, + "loss": 3.2743, + "step": 741 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016241571386690123, + "loss": 3.2306, + "step": 742 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001623570800351803, + "loss": 3.3459, + "step": 743 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001622984462034594, + "loss": 3.3822, + "step": 744 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001622398123717385, + "loss": 3.3037, + "step": 745 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001621811785400176, + "loss": 3.2747, + "step": 746 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016212254470829667, + "loss": 3.2404, + "step": 747 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001620639108765758, + "loss": 3.1767, + "step": 748 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001620052770448549, + "loss": 3.3484, + "step": 749 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016194664321313398, + "loss": 3.2865, + "step": 750 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016188800938141308, + "loss": 3.224, + "step": 751 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016182937554969217, + "loss": 3.1685, + "step": 752 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001617707417179713, + "loss": 3.1998, + "step": 753 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016171210788625036, + "loss": 3.2195, + "step": 754 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016165347405452946, + "loss": 3.3184, + "step": 755 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016159484022280858, + "loss": 3.2806, + "step": 756 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016153620639108767, + "loss": 3.2766, + "step": 757 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016147757255936674, + "loss": 3.2813, + "step": 758 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016141893872764586, + "loss": 3.247, + "step": 759 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016136030489592495, + "loss": 3.3013, + "step": 760 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016130167106420408, + "loss": 3.2898, + "step": 761 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016124303723248314, + "loss": 3.193, + "step": 762 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016118440340076224, + "loss": 3.2976, + "step": 763 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016112576956904136, + "loss": 3.3017, + "step": 764 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016106713573732045, + "loss": 3.3193, + "step": 765 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016100850190559952, + "loss": 3.2682, + "step": 766 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016094986807387864, + "loss": 3.2774, + "step": 767 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016089123424215774, + "loss": 3.3002, + "step": 768 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016083260041043683, + "loss": 3.1781, + "step": 769 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016077396657871592, + "loss": 3.2766, + "step": 770 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016071533274699502, + "loss": 3.2483, + "step": 771 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016065669891527414, + "loss": 3.2515, + "step": 772 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001605980650835532, + "loss": 3.2771, + "step": 773 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001605394312518323, + "loss": 3.2535, + "step": 774 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016048079742011142, + "loss": 3.3926, + "step": 775 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016042216358839052, + "loss": 3.2368, + "step": 776 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001603635297566696, + "loss": 3.1698, + "step": 777 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001603048959249487, + "loss": 3.2374, + "step": 778 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001602462620932278, + "loss": 3.3045, + "step": 779 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001601876282615069, + "loss": 3.229, + "step": 780 + }, + { + "epoch": 0.22, + "learning_rate": 0.000160128994429786, + "loss": 3.278, + "step": 781 + }, + { + "epoch": 0.22, + "learning_rate": 0.00016007036059806508, + "loss": 3.2746, + "step": 782 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001600117267663442, + "loss": 3.2816, + "step": 783 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015995309293462327, + "loss": 3.2566, + "step": 784 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015989445910290237, + "loss": 3.3259, + "step": 785 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001598358252711815, + "loss": 3.2119, + "step": 786 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015977719143946058, + "loss": 3.0796, + "step": 787 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015971855760773968, + "loss": 3.2004, + "step": 788 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015965992377601877, + "loss": 3.3308, + "step": 789 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015960128994429787, + "loss": 3.3194, + "step": 790 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015954265611257696, + "loss": 3.3085, + "step": 791 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015948402228085605, + "loss": 3.2244, + "step": 792 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015942538844913515, + "loss": 3.0929, + "step": 793 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015936675461741427, + "loss": 3.236, + "step": 794 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015930812078569334, + "loss": 3.2935, + "step": 795 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015924948695397246, + "loss": 3.2709, + "step": 796 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015919085312225155, + "loss": 3.2316, + "step": 797 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015913221929053065, + "loss": 3.2388, + "step": 798 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015907358545880974, + "loss": 3.2513, + "step": 799 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015901495162708884, + "loss": 3.1954, + "step": 800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015895631779536793, + "loss": 3.2512, + "step": 801 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015889768396364702, + "loss": 3.2425, + "step": 802 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015883905013192612, + "loss": 3.2533, + "step": 803 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015878041630020524, + "loss": 3.3006, + "step": 804 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015872178246848433, + "loss": 3.3069, + "step": 805 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001586631486367634, + "loss": 3.2092, + "step": 806 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015860451480504252, + "loss": 3.2734, + "step": 807 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015854588097332162, + "loss": 3.2091, + "step": 808 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001584872471416007, + "loss": 3.2802, + "step": 809 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001584286133098798, + "loss": 3.2291, + "step": 810 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001583699794781589, + "loss": 3.3642, + "step": 811 + }, + { + "epoch": 0.23, + "learning_rate": 0.000158311345646438, + "loss": 3.2719, + "step": 812 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015825271181471712, + "loss": 3.3143, + "step": 813 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015819407798299618, + "loss": 3.1932, + "step": 814 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001581354441512753, + "loss": 3.353, + "step": 815 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001580768103195544, + "loss": 3.1533, + "step": 816 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001580181764878335, + "loss": 3.2604, + "step": 817 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001579595426561126, + "loss": 3.1985, + "step": 818 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015790090882439168, + "loss": 3.2331, + "step": 819 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015784227499267078, + "loss": 3.1971, + "step": 820 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015778364116094987, + "loss": 3.276, + "step": 821 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015772500732922896, + "loss": 3.2724, + "step": 822 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015766637349750809, + "loss": 3.2373, + "step": 823 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015760773966578718, + "loss": 3.23, + "step": 824 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015754910583406625, + "loss": 3.2432, + "step": 825 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015749047200234537, + "loss": 3.2401, + "step": 826 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015743183817062446, + "loss": 3.2634, + "step": 827 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015737320433890356, + "loss": 3.3252, + "step": 828 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015731457050718265, + "loss": 3.2793, + "step": 829 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015725593667546175, + "loss": 3.2755, + "step": 830 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015719730284374087, + "loss": 3.3025, + "step": 831 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015713866901201994, + "loss": 3.1987, + "step": 832 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015708003518029903, + "loss": 3.3599, + "step": 833 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015702140134857815, + "loss": 3.3191, + "step": 834 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015696276751685725, + "loss": 3.2095, + "step": 835 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001569041336851363, + "loss": 3.2534, + "step": 836 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015684549985341543, + "loss": 3.3055, + "step": 837 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015678686602169453, + "loss": 3.2446, + "step": 838 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015672823218997362, + "loss": 3.3143, + "step": 839 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015666959835825272, + "loss": 3.2018, + "step": 840 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001566109645265318, + "loss": 3.2666, + "step": 841 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015655233069481093, + "loss": 3.249, + "step": 842 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015649369686309, + "loss": 3.3042, + "step": 843 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001564350630313691, + "loss": 3.2625, + "step": 844 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015637642919964822, + "loss": 3.2064, + "step": 845 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001563177953679273, + "loss": 3.1942, + "step": 846 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015625916153620638, + "loss": 3.2523, + "step": 847 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001562005277044855, + "loss": 3.1982, + "step": 848 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001561418938727646, + "loss": 3.244, + "step": 849 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001560832600410437, + "loss": 3.2332, + "step": 850 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015602462620932278, + "loss": 3.2345, + "step": 851 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015596599237760188, + "loss": 3.1666, + "step": 852 + }, + { + "epoch": 0.24, + "learning_rate": 0.000155907358545881, + "loss": 3.2655, + "step": 853 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015584872471416006, + "loss": 3.211, + "step": 854 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015579009088243916, + "loss": 3.0659, + "step": 855 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015573145705071828, + "loss": 3.2308, + "step": 856 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015567282321899737, + "loss": 3.1882, + "step": 857 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015561418938727647, + "loss": 3.2903, + "step": 858 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015555555555555556, + "loss": 3.2234, + "step": 859 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015549692172383466, + "loss": 3.268, + "step": 860 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015543828789211378, + "loss": 3.245, + "step": 861 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015537965406039285, + "loss": 3.2705, + "step": 862 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015532102022867194, + "loss": 3.2247, + "step": 863 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015526238639695106, + "loss": 3.2725, + "step": 864 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015520375256523016, + "loss": 3.2226, + "step": 865 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015514511873350922, + "loss": 3.325, + "step": 866 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015508648490178834, + "loss": 3.1678, + "step": 867 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015502785107006744, + "loss": 3.1907, + "step": 868 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015496921723834653, + "loss": 3.2839, + "step": 869 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015491058340662563, + "loss": 3.2195, + "step": 870 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015485194957490472, + "loss": 3.2793, + "step": 871 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015479331574318384, + "loss": 3.2944, + "step": 872 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001547346819114629, + "loss": 3.2232, + "step": 873 + }, + { + "epoch": 0.25, + "learning_rate": 0.000154676048079742, + "loss": 3.2663, + "step": 874 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015461741424802113, + "loss": 3.229, + "step": 875 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015455878041630022, + "loss": 3.2287, + "step": 876 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015450014658457931, + "loss": 3.184, + "step": 877 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001544415127528584, + "loss": 3.2551, + "step": 878 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001543828789211375, + "loss": 3.2443, + "step": 879 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001543242450894166, + "loss": 3.1881, + "step": 880 + }, + { + "epoch": 0.25, + "eval_loss": 3.4091506004333496, + "eval_runtime": 2941.0944, + "eval_samples_per_second": 6.966, + "eval_steps_per_second": 2.322, + "step": 880 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001542656112576957, + "loss": 3.1824, + "step": 881 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001542069774259748, + "loss": 3.2434, + "step": 882 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001541483435942539, + "loss": 3.177, + "step": 883 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015408970976253298, + "loss": 3.2763, + "step": 884 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001540310759308121, + "loss": 3.2472, + "step": 885 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001539724420990912, + "loss": 3.1991, + "step": 886 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015391380826737029, + "loss": 3.2498, + "step": 887 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015385517443564938, + "loss": 3.2251, + "step": 888 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015379654060392847, + "loss": 3.2537, + "step": 889 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015373790677220757, + "loss": 3.244, + "step": 890 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015367927294048666, + "loss": 3.1972, + "step": 891 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015362063910876576, + "loss": 3.2767, + "step": 892 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015356200527704485, + "loss": 3.2709, + "step": 893 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015350337144532397, + "loss": 3.2772, + "step": 894 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015344473761360304, + "loss": 3.3388, + "step": 895 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015338610378188216, + "loss": 3.21, + "step": 896 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015332746995016126, + "loss": 3.2313, + "step": 897 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015326883611844035, + "loss": 3.4058, + "step": 898 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015321020228671944, + "loss": 3.2922, + "step": 899 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015315156845499854, + "loss": 3.2429, + "step": 900 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015309293462327763, + "loss": 3.3019, + "step": 901 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015303430079155673, + "loss": 3.2689, + "step": 902 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015297566695983582, + "loss": 3.2101, + "step": 903 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015291703312811494, + "loss": 3.2623, + "step": 904 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015285839929639404, + "loss": 3.2196, + "step": 905 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001527997654646731, + "loss": 3.2559, + "step": 906 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015274113163295223, + "loss": 3.2888, + "step": 907 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015268249780123132, + "loss": 3.1584, + "step": 908 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015262386396951041, + "loss": 3.1906, + "step": 909 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001525652301377895, + "loss": 3.2, + "step": 910 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001525065963060686, + "loss": 3.3503, + "step": 911 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015244796247434772, + "loss": 3.3084, + "step": 912 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001523893286426268, + "loss": 3.2494, + "step": 913 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015233069481090589, + "loss": 3.2489, + "step": 914 + }, + { + "epoch": 0.26, + "learning_rate": 0.000152272060979185, + "loss": 3.2155, + "step": 915 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001522134271474641, + "loss": 3.2881, + "step": 916 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001521547933157432, + "loss": 3.2205, + "step": 917 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001520961594840223, + "loss": 3.1929, + "step": 918 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015203752565230138, + "loss": 3.307, + "step": 919 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015197889182058048, + "loss": 3.2806, + "step": 920 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015192025798885957, + "loss": 3.2074, + "step": 921 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015186162415713867, + "loss": 3.176, + "step": 922 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001518029903254178, + "loss": 3.2676, + "step": 923 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015174435649369688, + "loss": 3.2985, + "step": 924 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015168572266197595, + "loss": 3.2657, + "step": 925 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015162708883025507, + "loss": 3.2334, + "step": 926 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015156845499853417, + "loss": 3.2857, + "step": 927 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015150982116681326, + "loss": 3.1691, + "step": 928 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015145118733509236, + "loss": 3.2631, + "step": 929 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015139255350337145, + "loss": 3.2074, + "step": 930 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015133391967165057, + "loss": 3.1926, + "step": 931 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015127528583992964, + "loss": 3.2915, + "step": 932 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015121665200820873, + "loss": 3.3273, + "step": 933 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015115801817648785, + "loss": 3.2319, + "step": 934 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015109938434476695, + "loss": 3.2416, + "step": 935 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015104075051304602, + "loss": 3.2577, + "step": 936 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015098211668132514, + "loss": 3.1961, + "step": 937 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015092348284960423, + "loss": 3.2255, + "step": 938 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015086484901788333, + "loss": 3.2932, + "step": 939 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015080621518616242, + "loss": 3.2688, + "step": 940 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015074758135444151, + "loss": 3.1805, + "step": 941 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015068894752272064, + "loss": 3.2549, + "step": 942 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001506303136909997, + "loss": 3.2801, + "step": 943 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001505716798592788, + "loss": 3.2847, + "step": 944 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015051304602755792, + "loss": 3.2777, + "step": 945 + }, + { + "epoch": 0.27, + "learning_rate": 0.000150454412195837, + "loss": 3.2552, + "step": 946 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015039577836411608, + "loss": 3.3059, + "step": 947 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001503371445323952, + "loss": 3.2302, + "step": 948 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001502785107006743, + "loss": 3.3052, + "step": 949 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001502198768689534, + "loss": 3.2225, + "step": 950 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015016124303723248, + "loss": 3.3093, + "step": 951 + }, + { + "epoch": 0.27, + "learning_rate": 0.00015010260920551158, + "loss": 3.3046, + "step": 952 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001500439753737907, + "loss": 3.2808, + "step": 953 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014998534154206977, + "loss": 3.2288, + "step": 954 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014992670771034886, + "loss": 3.302, + "step": 955 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014986807387862798, + "loss": 3.2052, + "step": 956 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014980944004690708, + "loss": 3.2758, + "step": 957 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014975080621518617, + "loss": 3.2215, + "step": 958 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014969217238346527, + "loss": 3.2778, + "step": 959 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014963353855174436, + "loss": 3.2338, + "step": 960 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014957490472002345, + "loss": 3.2369, + "step": 961 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014951627088830255, + "loss": 3.1956, + "step": 962 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014945763705658164, + "loss": 3.188, + "step": 963 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014939900322486076, + "loss": 3.1952, + "step": 964 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014934036939313983, + "loss": 3.2734, + "step": 965 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014928173556141895, + "loss": 3.1608, + "step": 966 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014922310172969805, + "loss": 3.3127, + "step": 967 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014916446789797714, + "loss": 3.283, + "step": 968 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014910583406625624, + "loss": 3.2635, + "step": 969 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014904720023453533, + "loss": 3.2095, + "step": 970 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014898856640281442, + "loss": 3.2504, + "step": 971 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014892993257109355, + "loss": 3.2941, + "step": 972 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001488712987393726, + "loss": 3.3075, + "step": 973 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014881266490765173, + "loss": 3.2687, + "step": 974 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014875403107593083, + "loss": 3.1588, + "step": 975 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014869539724420992, + "loss": 3.2512, + "step": 976 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014863676341248902, + "loss": 3.3122, + "step": 977 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001485781295807681, + "loss": 3.1459, + "step": 978 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001485194957490472, + "loss": 3.2629, + "step": 979 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001484608619173263, + "loss": 3.1711, + "step": 980 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001484022280856054, + "loss": 3.1911, + "step": 981 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001483435942538845, + "loss": 3.2479, + "step": 982 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001482849604221636, + "loss": 3.2854, + "step": 983 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014822632659044268, + "loss": 3.2204, + "step": 984 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001481676927587218, + "loss": 3.33, + "step": 985 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001481090589270009, + "loss": 3.1618, + "step": 986 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014805042509528, + "loss": 3.2279, + "step": 987 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014799179126355908, + "loss": 3.1532, + "step": 988 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014793315743183818, + "loss": 3.3308, + "step": 989 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014787452360011727, + "loss": 3.1687, + "step": 990 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014781588976839637, + "loss": 3.2199, + "step": 991 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014775725593667546, + "loss": 3.3156, + "step": 992 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014769862210495458, + "loss": 3.3221, + "step": 993 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014763998827323368, + "loss": 3.2839, + "step": 994 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014758135444151274, + "loss": 3.2538, + "step": 995 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014752272060979186, + "loss": 3.1649, + "step": 996 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014746408677807096, + "loss": 3.2941, + "step": 997 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014740545294635005, + "loss": 3.1584, + "step": 998 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014734681911462915, + "loss": 3.1711, + "step": 999 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014728818528290824, + "loss": 3.0869, + "step": 1000 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014722955145118736, + "loss": 3.1919, + "step": 1001 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014717091761946643, + "loss": 3.2672, + "step": 1002 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014711228378774552, + "loss": 3.1987, + "step": 1003 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014705364995602465, + "loss": 3.341, + "step": 1004 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014699501612430374, + "loss": 3.1695, + "step": 1005 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001469363822925828, + "loss": 3.1899, + "step": 1006 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014687774846086193, + "loss": 3.2716, + "step": 1007 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014681911462914102, + "loss": 3.2247, + "step": 1008 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014676048079742012, + "loss": 3.2712, + "step": 1009 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001467018469656992, + "loss": 3.2421, + "step": 1010 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001466432131339783, + "loss": 3.2861, + "step": 1011 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014658457930225743, + "loss": 3.2033, + "step": 1012 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001465259454705365, + "loss": 3.195, + "step": 1013 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001464673116388156, + "loss": 3.1744, + "step": 1014 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001464086778070947, + "loss": 3.2552, + "step": 1015 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001463500439753738, + "loss": 3.2764, + "step": 1016 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014629141014365287, + "loss": 3.2084, + "step": 1017 + }, + { + "epoch": 0.29, + "learning_rate": 0.000146232776311932, + "loss": 3.2608, + "step": 1018 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001461741424802111, + "loss": 3.2078, + "step": 1019 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001461155086484902, + "loss": 3.2861, + "step": 1020 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014605687481676928, + "loss": 3.1761, + "step": 1021 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014599824098504837, + "loss": 3.258, + "step": 1022 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001459396071533275, + "loss": 3.2378, + "step": 1023 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014588097332160659, + "loss": 3.2288, + "step": 1024 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014582233948988565, + "loss": 3.2112, + "step": 1025 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014576370565816477, + "loss": 3.1836, + "step": 1026 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014570507182644387, + "loss": 3.2554, + "step": 1027 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014564643799472296, + "loss": 3.1601, + "step": 1028 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014558780416300206, + "loss": 3.1421, + "step": 1029 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014552917033128115, + "loss": 3.2407, + "step": 1030 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014547053649956027, + "loss": 3.1458, + "step": 1031 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014541190266783934, + "loss": 3.115, + "step": 1032 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014535326883611844, + "loss": 3.2058, + "step": 1033 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014529463500439756, + "loss": 3.1415, + "step": 1034 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014523600117267665, + "loss": 3.231, + "step": 1035 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014517736734095572, + "loss": 3.2668, + "step": 1036 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014511873350923484, + "loss": 3.2877, + "step": 1037 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014506009967751393, + "loss": 3.2669, + "step": 1038 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014500146584579303, + "loss": 3.2344, + "step": 1039 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014494283201407212, + "loss": 3.2069, + "step": 1040 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014488419818235122, + "loss": 3.2648, + "step": 1041 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014482556435063034, + "loss": 3.1754, + "step": 1042 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001447669305189094, + "loss": 3.3066, + "step": 1043 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001447082966871885, + "loss": 3.2235, + "step": 1044 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014464966285546762, + "loss": 3.3046, + "step": 1045 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014459102902374672, + "loss": 3.1704, + "step": 1046 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001445323951920258, + "loss": 3.1684, + "step": 1047 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001444737613603049, + "loss": 3.2783, + "step": 1048 + }, + { + "epoch": 0.3, + "learning_rate": 0.000144415127528584, + "loss": 3.2234, + "step": 1049 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001443564936968631, + "loss": 3.2467, + "step": 1050 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001442978598651422, + "loss": 3.1604, + "step": 1051 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014423922603342128, + "loss": 3.1556, + "step": 1052 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001441805922017004, + "loss": 3.324, + "step": 1053 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014412195836997947, + "loss": 3.2424, + "step": 1054 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001440633245382586, + "loss": 3.2606, + "step": 1055 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014400469070653769, + "loss": 3.2596, + "step": 1056 + }, + { + "epoch": 0.3, + "eval_loss": 3.3927457332611084, + "eval_runtime": 2940.1937, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 2.323, + "step": 1056 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014394605687481678, + "loss": 3.1789, + "step": 1057 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014388742304309587, + "loss": 3.1009, + "step": 1058 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014382878921137497, + "loss": 3.2032, + "step": 1059 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014377015537965406, + "loss": 3.1781, + "step": 1060 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014371152154793316, + "loss": 3.1923, + "step": 1061 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014365288771621225, + "loss": 3.2216, + "step": 1062 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014359425388449135, + "loss": 3.2706, + "step": 1063 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014353562005277047, + "loss": 3.1661, + "step": 1064 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014347698622104953, + "loss": 3.2143, + "step": 1065 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014341835238932866, + "loss": 3.1501, + "step": 1066 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014335971855760775, + "loss": 3.1273, + "step": 1067 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014330108472588684, + "loss": 3.1188, + "step": 1068 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014324245089416594, + "loss": 3.2799, + "step": 1069 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014318381706244503, + "loss": 3.2518, + "step": 1070 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014312518323072413, + "loss": 3.2237, + "step": 1071 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014306654939900325, + "loss": 3.2532, + "step": 1072 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014300791556728232, + "loss": 3.2757, + "step": 1073 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014294928173556144, + "loss": 3.3044, + "step": 1074 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014289064790384053, + "loss": 3.3563, + "step": 1075 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014283201407211963, + "loss": 3.2188, + "step": 1076 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014277338024039872, + "loss": 3.2744, + "step": 1077 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014271474640867782, + "loss": 3.2477, + "step": 1078 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001426561125769569, + "loss": 3.1948, + "step": 1079 + }, + { + "epoch": 0.31, + "learning_rate": 0.000142597478745236, + "loss": 3.2597, + "step": 1080 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001425388449135151, + "loss": 3.2332, + "step": 1081 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014248021108179422, + "loss": 3.2089, + "step": 1082 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014242157725007331, + "loss": 3.0902, + "step": 1083 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014236294341835238, + "loss": 3.2496, + "step": 1084 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001423043095866315, + "loss": 3.2572, + "step": 1085 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001422456757549106, + "loss": 3.1796, + "step": 1086 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001421870419231897, + "loss": 3.1108, + "step": 1087 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014212840809146879, + "loss": 3.2689, + "step": 1088 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014206977425974788, + "loss": 3.2664, + "step": 1089 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014201114042802697, + "loss": 3.2866, + "step": 1090 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014195250659630607, + "loss": 3.1632, + "step": 1091 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014189387276458516, + "loss": 3.1654, + "step": 1092 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014183523893286428, + "loss": 3.2031, + "step": 1093 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014177660510114338, + "loss": 3.1857, + "step": 1094 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014171797126942245, + "loss": 3.2165, + "step": 1095 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014165933743770157, + "loss": 3.1198, + "step": 1096 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014160070360598066, + "loss": 3.1494, + "step": 1097 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014154206977425976, + "loss": 3.1959, + "step": 1098 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014148343594253885, + "loss": 3.2452, + "step": 1099 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014142480211081794, + "loss": 3.1721, + "step": 1100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014136616827909707, + "loss": 3.1778, + "step": 1101 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014130753444737613, + "loss": 3.2613, + "step": 1102 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014124890061565523, + "loss": 3.2279, + "step": 1103 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014119026678393435, + "loss": 3.2708, + "step": 1104 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014113163295221344, + "loss": 3.2222, + "step": 1105 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001410729991204925, + "loss": 3.1533, + "step": 1106 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014101436528877163, + "loss": 3.1679, + "step": 1107 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014095573145705073, + "loss": 3.211, + "step": 1108 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014089709762532982, + "loss": 3.2857, + "step": 1109 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014083846379360891, + "loss": 3.2805, + "step": 1110 + }, + { + "epoch": 0.32, + "learning_rate": 0.000140779829961888, + "loss": 3.243, + "step": 1111 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014072119613016713, + "loss": 3.3112, + "step": 1112 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001406625622984462, + "loss": 3.215, + "step": 1113 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001406039284667253, + "loss": 3.2921, + "step": 1114 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001405452946350044, + "loss": 3.3166, + "step": 1115 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001404866608032835, + "loss": 3.1663, + "step": 1116 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014042802697156257, + "loss": 3.1809, + "step": 1117 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001403693931398417, + "loss": 3.2643, + "step": 1118 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001403107593081208, + "loss": 3.1126, + "step": 1119 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014025212547639988, + "loss": 3.2337, + "step": 1120 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014019349164467898, + "loss": 3.2727, + "step": 1121 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014013485781295807, + "loss": 3.2654, + "step": 1122 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001400762239812372, + "loss": 3.2244, + "step": 1123 + }, + { + "epoch": 0.32, + "learning_rate": 0.00014001759014951626, + "loss": 3.1932, + "step": 1124 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013995895631779536, + "loss": 3.1855, + "step": 1125 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013990032248607448, + "loss": 3.1859, + "step": 1126 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013984168865435357, + "loss": 3.2393, + "step": 1127 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013978305482263267, + "loss": 3.2284, + "step": 1128 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013972442099091176, + "loss": 3.2553, + "step": 1129 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013966578715919086, + "loss": 3.1857, + "step": 1130 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013960715332746998, + "loss": 3.2055, + "step": 1131 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013954851949574904, + "loss": 3.2172, + "step": 1132 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013948988566402814, + "loss": 3.1507, + "step": 1133 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013943125183230726, + "loss": 3.2008, + "step": 1134 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013937261800058635, + "loss": 3.109, + "step": 1135 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013931398416886545, + "loss": 3.1814, + "step": 1136 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013925535033714454, + "loss": 3.1604, + "step": 1137 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013919671650542364, + "loss": 3.1832, + "step": 1138 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013913808267370273, + "loss": 3.1034, + "step": 1139 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013907944884198183, + "loss": 3.2224, + "step": 1140 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013902081501026092, + "loss": 3.119, + "step": 1141 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013896218117854004, + "loss": 3.2043, + "step": 1142 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001389035473468191, + "loss": 3.2507, + "step": 1143 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013884491351509823, + "loss": 3.2858, + "step": 1144 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013878627968337732, + "loss": 3.1763, + "step": 1145 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013872764585165642, + "loss": 3.2726, + "step": 1146 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001386690120199355, + "loss": 3.2172, + "step": 1147 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001386103781882146, + "loss": 3.1881, + "step": 1148 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001385517443564937, + "loss": 3.1157, + "step": 1149 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001384931105247728, + "loss": 3.2874, + "step": 1150 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001384344766930519, + "loss": 3.2671, + "step": 1151 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013837584286133098, + "loss": 3.1038, + "step": 1152 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001383172090296101, + "loss": 3.2201, + "step": 1153 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013825857519788917, + "loss": 3.2126, + "step": 1154 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001381999413661683, + "loss": 3.2184, + "step": 1155 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001381413075344474, + "loss": 3.1785, + "step": 1156 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013808267370272648, + "loss": 3.1887, + "step": 1157 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013802403987100558, + "loss": 3.2579, + "step": 1158 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013796540603928467, + "loss": 3.0964, + "step": 1159 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013790677220756377, + "loss": 3.1758, + "step": 1160 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013784813837584286, + "loss": 3.1086, + "step": 1161 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013778950454412195, + "loss": 3.2121, + "step": 1162 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013773087071240108, + "loss": 3.0181, + "step": 1163 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013767223688068017, + "loss": 3.0773, + "step": 1164 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013761360304895924, + "loss": 3.1632, + "step": 1165 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013755496921723836, + "loss": 3.1465, + "step": 1166 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013749633538551745, + "loss": 3.1723, + "step": 1167 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013743770155379655, + "loss": 3.2446, + "step": 1168 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013737906772207564, + "loss": 3.2361, + "step": 1169 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013732043389035474, + "loss": 3.2172, + "step": 1170 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013726180005863386, + "loss": 3.2092, + "step": 1171 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013720316622691292, + "loss": 3.2105, + "step": 1172 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013714453239519202, + "loss": 3.2252, + "step": 1173 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013708589856347114, + "loss": 3.2547, + "step": 1174 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013702726473175024, + "loss": 3.1852, + "step": 1175 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001369686309000293, + "loss": 3.1305, + "step": 1176 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013690999706830842, + "loss": 3.0818, + "step": 1177 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013685136323658752, + "loss": 3.1579, + "step": 1178 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001367927294048666, + "loss": 3.2461, + "step": 1179 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001367340955731457, + "loss": 3.1656, + "step": 1180 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001366754617414248, + "loss": 3.1799, + "step": 1181 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013661682790970392, + "loss": 3.2685, + "step": 1182 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013655819407798302, + "loss": 3.1792, + "step": 1183 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013649956024626208, + "loss": 3.2519, + "step": 1184 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001364409264145412, + "loss": 3.1943, + "step": 1185 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001363822925828203, + "loss": 3.1631, + "step": 1186 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001363236587510994, + "loss": 3.1836, + "step": 1187 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001362650249193785, + "loss": 3.215, + "step": 1188 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013620639108765758, + "loss": 3.1792, + "step": 1189 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001361477572559367, + "loss": 3.1073, + "step": 1190 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013608912342421577, + "loss": 3.1156, + "step": 1191 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013603048959249487, + "loss": 3.2696, + "step": 1192 + }, + { + "epoch": 0.34, + "learning_rate": 0.000135971855760774, + "loss": 3.1732, + "step": 1193 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013591322192905308, + "loss": 3.2555, + "step": 1194 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013585458809733215, + "loss": 3.2004, + "step": 1195 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013579595426561127, + "loss": 3.1993, + "step": 1196 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013573732043389036, + "loss": 3.2065, + "step": 1197 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013567868660216946, + "loss": 3.1537, + "step": 1198 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013562005277044855, + "loss": 3.2104, + "step": 1199 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013556141893872765, + "loss": 3.2029, + "step": 1200 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013550278510700677, + "loss": 3.2434, + "step": 1201 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013544415127528584, + "loss": 3.2309, + "step": 1202 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013538551744356493, + "loss": 3.2562, + "step": 1203 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013532688361184405, + "loss": 3.1681, + "step": 1204 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013526824978012315, + "loss": 3.1001, + "step": 1205 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001352096159484022, + "loss": 3.1597, + "step": 1206 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013515098211668133, + "loss": 3.1631, + "step": 1207 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013509234828496043, + "loss": 3.2069, + "step": 1208 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013503371445323952, + "loss": 3.0383, + "step": 1209 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013497508062151862, + "loss": 3.1166, + "step": 1210 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001349164467897977, + "loss": 3.1958, + "step": 1211 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013485781295807683, + "loss": 3.1733, + "step": 1212 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001347991791263559, + "loss": 3.2266, + "step": 1213 + }, + { + "epoch": 0.35, + "learning_rate": 0.000134740545294635, + "loss": 3.0868, + "step": 1214 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013468191146291412, + "loss": 3.261, + "step": 1215 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001346232776311932, + "loss": 3.2114, + "step": 1216 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001345646437994723, + "loss": 3.1535, + "step": 1217 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001345060099677514, + "loss": 3.1547, + "step": 1218 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001344473761360305, + "loss": 3.0801, + "step": 1219 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001343887423043096, + "loss": 3.174, + "step": 1220 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013433010847258868, + "loss": 3.1559, + "step": 1221 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013427147464086778, + "loss": 3.1063, + "step": 1222 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001342128408091469, + "loss": 3.1266, + "step": 1223 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013415420697742597, + "loss": 3.2614, + "step": 1224 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001340955731457051, + "loss": 3.2669, + "step": 1225 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013403693931398418, + "loss": 3.1827, + "step": 1226 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013397830548226328, + "loss": 3.1157, + "step": 1227 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013391967165054237, + "loss": 3.139, + "step": 1228 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013386103781882146, + "loss": 3.1076, + "step": 1229 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013380240398710056, + "loss": 3.1338, + "step": 1230 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013374377015537968, + "loss": 3.1602, + "step": 1231 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013368513632365875, + "loss": 3.1357, + "step": 1232 + }, + { + "epoch": 0.35, + "eval_loss": 3.382495641708374, + "eval_runtime": 2940.5137, + "eval_samples_per_second": 6.967, + "eval_steps_per_second": 2.322, + "step": 1232 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013362650249193784, + "loss": 3.2242, + "step": 1233 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013356786866021696, + "loss": 3.0442, + "step": 1234 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013350923482849606, + "loss": 3.1842, + "step": 1235 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013345060099677515, + "loss": 3.1555, + "step": 1236 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013339196716505425, + "loss": 3.2172, + "step": 1237 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013333333333333334, + "loss": 3.1764, + "step": 1238 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013327469950161243, + "loss": 3.1507, + "step": 1239 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013321606566989153, + "loss": 3.1659, + "step": 1240 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013315743183817062, + "loss": 3.1516, + "step": 1241 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013309879800644974, + "loss": 3.1561, + "step": 1242 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001330401641747288, + "loss": 3.2656, + "step": 1243 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013298153034300793, + "loss": 3.1399, + "step": 1244 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013292289651128703, + "loss": 3.1239, + "step": 1245 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013286426267956612, + "loss": 3.1478, + "step": 1246 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013280562884784522, + "loss": 3.2921, + "step": 1247 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001327469950161243, + "loss": 3.2601, + "step": 1248 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001326883611844034, + "loss": 3.2197, + "step": 1249 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001326297273526825, + "loss": 3.1752, + "step": 1250 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001325710935209616, + "loss": 3.2316, + "step": 1251 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013251245968924071, + "loss": 3.2014, + "step": 1252 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001324538258575198, + "loss": 3.2569, + "step": 1253 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013239519202579888, + "loss": 3.0543, + "step": 1254 + }, + { + "epoch": 0.36, + "learning_rate": 0.000132336558194078, + "loss": 3.2453, + "step": 1255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001322779243623571, + "loss": 3.0933, + "step": 1256 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013221929053063619, + "loss": 3.3076, + "step": 1257 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013216065669891528, + "loss": 3.1836, + "step": 1258 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013210202286719437, + "loss": 3.1859, + "step": 1259 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013204338903547347, + "loss": 3.0626, + "step": 1260 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013198475520375256, + "loss": 3.2315, + "step": 1261 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013192612137203166, + "loss": 3.1726, + "step": 1262 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013186748754031078, + "loss": 3.1264, + "step": 1263 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013180885370858987, + "loss": 3.111, + "step": 1264 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013175021987686894, + "loss": 3.2399, + "step": 1265 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013169158604514806, + "loss": 3.2369, + "step": 1266 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013163295221342716, + "loss": 3.186, + "step": 1267 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013157431838170625, + "loss": 3.2087, + "step": 1268 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013151568454998534, + "loss": 3.1764, + "step": 1269 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013145705071826444, + "loss": 3.2164, + "step": 1270 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013139841688654356, + "loss": 3.1762, + "step": 1271 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013133978305482263, + "loss": 3.1607, + "step": 1272 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013128114922310172, + "loss": 3.1759, + "step": 1273 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013122251539138084, + "loss": 3.2293, + "step": 1274 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013116388155965994, + "loss": 3.1406, + "step": 1275 + }, + { + "epoch": 0.36, + "learning_rate": 0.000131105247727939, + "loss": 3.2147, + "step": 1276 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013104661389621813, + "loss": 3.1534, + "step": 1277 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013098798006449722, + "loss": 3.2401, + "step": 1278 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013092934623277632, + "loss": 3.1721, + "step": 1279 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001308707124010554, + "loss": 3.1573, + "step": 1280 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001308120785693345, + "loss": 3.1665, + "step": 1281 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013075344473761363, + "loss": 3.0761, + "step": 1282 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013069481090589272, + "loss": 3.0883, + "step": 1283 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001306361770741718, + "loss": 3.1714, + "step": 1284 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001305775432424509, + "loss": 3.1752, + "step": 1285 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013051890941073, + "loss": 3.2126, + "step": 1286 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001304602755790091, + "loss": 3.0492, + "step": 1287 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001304016417472882, + "loss": 3.0933, + "step": 1288 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013034300791556729, + "loss": 3.2208, + "step": 1289 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001302843740838464, + "loss": 3.1825, + "step": 1290 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013022574025212547, + "loss": 3.0963, + "step": 1291 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013016710642040457, + "loss": 3.1437, + "step": 1292 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001301084725886837, + "loss": 3.1687, + "step": 1293 + }, + { + "epoch": 0.37, + "learning_rate": 0.00013004983875696278, + "loss": 3.1651, + "step": 1294 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012999120492524185, + "loss": 3.2251, + "step": 1295 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012993257109352097, + "loss": 2.9805, + "step": 1296 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012987393726180007, + "loss": 3.1236, + "step": 1297 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012981530343007916, + "loss": 3.2004, + "step": 1298 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012975666959835826, + "loss": 3.2191, + "step": 1299 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012969803576663735, + "loss": 3.1347, + "step": 1300 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012963940193491647, + "loss": 3.1517, + "step": 1301 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012958076810319554, + "loss": 3.1624, + "step": 1302 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012952213427147463, + "loss": 3.2144, + "step": 1303 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012946350043975375, + "loss": 3.2031, + "step": 1304 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012940486660803285, + "loss": 3.1027, + "step": 1305 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012934623277631194, + "loss": 3.1753, + "step": 1306 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012928759894459104, + "loss": 3.1492, + "step": 1307 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012922896511287013, + "loss": 3.1104, + "step": 1308 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012917033128114923, + "loss": 3.0851, + "step": 1309 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012911169744942832, + "loss": 3.2225, + "step": 1310 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012905306361770741, + "loss": 3.1292, + "step": 1311 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012899442978598654, + "loss": 3.1407, + "step": 1312 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001289357959542656, + "loss": 3.1578, + "step": 1313 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001288771621225447, + "loss": 3.1143, + "step": 1314 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012881852829082382, + "loss": 3.2208, + "step": 1315 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001287598944591029, + "loss": 3.1903, + "step": 1316 + }, + { + "epoch": 0.38, + "learning_rate": 0.000128701260627382, + "loss": 3.1242, + "step": 1317 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001286426267956611, + "loss": 3.1529, + "step": 1318 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001285839929639402, + "loss": 3.0988, + "step": 1319 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001285253591322193, + "loss": 3.2027, + "step": 1320 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012846672530049839, + "loss": 3.1763, + "step": 1321 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012840809146877748, + "loss": 3.1793, + "step": 1322 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001283494576370566, + "loss": 3.2243, + "step": 1323 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012829082380533567, + "loss": 3.1714, + "step": 1324 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001282321899736148, + "loss": 3.2676, + "step": 1325 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012817355614189388, + "loss": 3.2629, + "step": 1326 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012811492231017298, + "loss": 3.1628, + "step": 1327 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012805628847845207, + "loss": 3.1174, + "step": 1328 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012799765464673117, + "loss": 3.2498, + "step": 1329 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012793902081501026, + "loss": 3.2414, + "step": 1330 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012788038698328936, + "loss": 3.1061, + "step": 1331 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012782175315156845, + "loss": 3.1805, + "step": 1332 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012776311931984757, + "loss": 3.1126, + "step": 1333 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012770448548812667, + "loss": 3.2429, + "step": 1334 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012764585165640573, + "loss": 3.1792, + "step": 1335 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012758721782468485, + "loss": 3.088, + "step": 1336 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012752858399296395, + "loss": 3.1542, + "step": 1337 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012746995016124304, + "loss": 2.9935, + "step": 1338 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012741131632952214, + "loss": 3.2113, + "step": 1339 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012735268249780123, + "loss": 3.2206, + "step": 1340 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012729404866608035, + "loss": 3.2388, + "step": 1341 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012723541483435945, + "loss": 3.2296, + "step": 1342 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012717678100263851, + "loss": 3.2265, + "step": 1343 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012711814717091764, + "loss": 3.2736, + "step": 1344 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012705951333919673, + "loss": 3.2912, + "step": 1345 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012700087950747582, + "loss": 3.046, + "step": 1346 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012694224567575492, + "loss": 3.1895, + "step": 1347 + }, + { + "epoch": 0.38, + "learning_rate": 0.000126883611844034, + "loss": 3.2153, + "step": 1348 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001268249780123131, + "loss": 3.1984, + "step": 1349 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001267663441805922, + "loss": 3.2679, + "step": 1350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001267077103488713, + "loss": 3.1259, + "step": 1351 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012664907651715042, + "loss": 3.2098, + "step": 1352 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001265904426854295, + "loss": 3.285, + "step": 1353 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012653180885370858, + "loss": 3.213, + "step": 1354 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001264731750219877, + "loss": 3.188, + "step": 1355 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001264145411902668, + "loss": 3.2731, + "step": 1356 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001263559073585459, + "loss": 3.1551, + "step": 1357 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012629727352682498, + "loss": 3.2515, + "step": 1358 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012623863969510408, + "loss": 3.1407, + "step": 1359 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001261800058633832, + "loss": 3.1234, + "step": 1360 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012612137203166227, + "loss": 3.1077, + "step": 1361 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012606273819994136, + "loss": 3.1269, + "step": 1362 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012600410436822048, + "loss": 3.1977, + "step": 1363 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012594547053649958, + "loss": 3.0656, + "step": 1364 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012588683670477864, + "loss": 3.1758, + "step": 1365 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012582820287305776, + "loss": 3.0228, + "step": 1366 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012576956904133686, + "loss": 3.2206, + "step": 1367 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012571093520961595, + "loss": 3.1765, + "step": 1368 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012565230137789505, + "loss": 3.2076, + "step": 1369 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012559366754617414, + "loss": 3.2441, + "step": 1370 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012553503371445326, + "loss": 3.1674, + "step": 1371 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012547639988273233, + "loss": 3.1856, + "step": 1372 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012541776605101143, + "loss": 3.1736, + "step": 1373 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012535913221929055, + "loss": 3.2034, + "step": 1374 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012530049838756964, + "loss": 3.084, + "step": 1375 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001252418645558487, + "loss": 3.0943, + "step": 1376 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012518323072412783, + "loss": 3.2176, + "step": 1377 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012512459689240692, + "loss": 3.0948, + "step": 1378 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012506596306068602, + "loss": 3.0982, + "step": 1379 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001250073292289651, + "loss": 3.1181, + "step": 1380 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001249486953972442, + "loss": 3.1607, + "step": 1381 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012489006156552333, + "loss": 3.1799, + "step": 1382 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001248314277338024, + "loss": 3.0335, + "step": 1383 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001247727939020815, + "loss": 3.2615, + "step": 1384 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001247141600703606, + "loss": 3.1527, + "step": 1385 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001246555262386397, + "loss": 3.2526, + "step": 1386 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001245968924069188, + "loss": 3.1236, + "step": 1387 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001245382585751979, + "loss": 3.1207, + "step": 1388 + }, + { + "epoch": 0.4, + "learning_rate": 0.000124479624743477, + "loss": 3.0895, + "step": 1389 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001244209909117561, + "loss": 3.0801, + "step": 1390 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012436235708003518, + "loss": 3.1893, + "step": 1391 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012430372324831427, + "loss": 3.1752, + "step": 1392 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001242450894165934, + "loss": 3.1808, + "step": 1393 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001241864555848725, + "loss": 3.1455, + "step": 1394 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012412782175315158, + "loss": 3.081, + "step": 1395 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012406918792143068, + "loss": 3.3061, + "step": 1396 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012401055408970977, + "loss": 3.1577, + "step": 1397 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012395192025798886, + "loss": 3.1186, + "step": 1398 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012389328642626796, + "loss": 3.1425, + "step": 1399 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012383465259454705, + "loss": 3.0414, + "step": 1400 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012377601876282617, + "loss": 3.0458, + "step": 1401 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012371738493110524, + "loss": 3.1533, + "step": 1402 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012365875109938434, + "loss": 3.1256, + "step": 1403 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012360011726766346, + "loss": 3.2095, + "step": 1404 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012354148343594255, + "loss": 3.0902, + "step": 1405 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012348284960422165, + "loss": 3.2279, + "step": 1406 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012342421577250074, + "loss": 3.2378, + "step": 1407 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012336558194077983, + "loss": 3.1117, + "step": 1408 + }, + { + "epoch": 0.4, + "eval_loss": 3.358116388320923, + "eval_runtime": 2940.1995, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 2.323, + "step": 1408 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012330694810905893, + "loss": 3.0724, + "step": 1409 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012324831427733802, + "loss": 3.0971, + "step": 1410 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012318968044561712, + "loss": 3.1188, + "step": 1411 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012313104661389624, + "loss": 3.1674, + "step": 1412 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001230724127821753, + "loss": 3.1835, + "step": 1413 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012301377895045443, + "loss": 3.1091, + "step": 1414 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012295514511873352, + "loss": 3.1179, + "step": 1415 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012289651128701262, + "loss": 3.0855, + "step": 1416 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001228378774552917, + "loss": 3.1328, + "step": 1417 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001227792436235708, + "loss": 3.1542, + "step": 1418 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001227206097918499, + "loss": 3.0795, + "step": 1419 + }, + { + "epoch": 0.4, + "learning_rate": 0.000122661975960129, + "loss": 3.2493, + "step": 1420 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001226033421284081, + "loss": 2.9858, + "step": 1421 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001225447082966872, + "loss": 3.1612, + "step": 1422 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001224860744649663, + "loss": 3.2002, + "step": 1423 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012242744063324537, + "loss": 3.092, + "step": 1424 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001223688068015245, + "loss": 3.1716, + "step": 1425 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001223101729698036, + "loss": 3.2572, + "step": 1426 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012225153913808268, + "loss": 3.1645, + "step": 1427 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012219290530636178, + "loss": 3.1283, + "step": 1428 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012213427147464087, + "loss": 3.1762, + "step": 1429 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012207563764291996, + "loss": 3.2059, + "step": 1430 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012201700381119907, + "loss": 3.1986, + "step": 1431 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012195836997947817, + "loss": 3.1574, + "step": 1432 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012189973614775727, + "loss": 3.1534, + "step": 1433 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012184110231603635, + "loss": 2.9625, + "step": 1434 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012178246848431545, + "loss": 3.153, + "step": 1435 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012172383465259456, + "loss": 3.2113, + "step": 1436 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012166520082087365, + "loss": 3.1937, + "step": 1437 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012160656698915273, + "loss": 3.2227, + "step": 1438 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012154793315743185, + "loss": 3.0936, + "step": 1439 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012148929932571093, + "loss": 3.1431, + "step": 1440 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012143066549399004, + "loss": 3.1674, + "step": 1441 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012137203166226914, + "loss": 3.1355, + "step": 1442 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012131339783054823, + "loss": 3.2486, + "step": 1443 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012125476399882734, + "loss": 3.2519, + "step": 1444 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012119613016710642, + "loss": 3.1181, + "step": 1445 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012113749633538551, + "loss": 3.1866, + "step": 1446 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012107886250366462, + "loss": 3.0154, + "step": 1447 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012102022867194372, + "loss": 3.1062, + "step": 1448 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012096159484022282, + "loss": 3.2628, + "step": 1449 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012090296100850192, + "loss": 3.207, + "step": 1450 + }, + { + "epoch": 0.41, + "learning_rate": 0.000120844327176781, + "loss": 3.1502, + "step": 1451 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012078569334506012, + "loss": 3.201, + "step": 1452 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001207270595133392, + "loss": 3.0811, + "step": 1453 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001206684256816183, + "loss": 3.1412, + "step": 1454 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001206097918498974, + "loss": 3.215, + "step": 1455 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001205511580181765, + "loss": 3.1313, + "step": 1456 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012049252418645558, + "loss": 3.0466, + "step": 1457 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012043389035473469, + "loss": 2.945, + "step": 1458 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012037525652301378, + "loss": 3.191, + "step": 1459 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012031662269129289, + "loss": 3.1762, + "step": 1460 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012025798885957198, + "loss": 3.1753, + "step": 1461 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012019935502785106, + "loss": 3.118, + "step": 1462 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012014072119613018, + "loss": 3.1987, + "step": 1463 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012008208736440927, + "loss": 3.1258, + "step": 1464 + }, + { + "epoch": 0.42, + "learning_rate": 0.00012002345353268836, + "loss": 3.0951, + "step": 1465 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011996481970096747, + "loss": 3.2152, + "step": 1466 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011990618586924656, + "loss": 3.0928, + "step": 1467 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011984755203752567, + "loss": 3.0311, + "step": 1468 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011978891820580475, + "loss": 3.0732, + "step": 1469 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011973028437408385, + "loss": 3.1958, + "step": 1470 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011967165054236295, + "loss": 3.1185, + "step": 1471 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011961301671064205, + "loss": 3.1083, + "step": 1472 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011955438287892113, + "loss": 3.1742, + "step": 1473 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011949574904720025, + "loss": 3.1251, + "step": 1474 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011943711521547933, + "loss": 3.2046, + "step": 1475 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011937848138375844, + "loss": 3.1094, + "step": 1476 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011931984755203753, + "loss": 3.0576, + "step": 1477 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011926121372031663, + "loss": 3.1764, + "step": 1478 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011920257988859573, + "loss": 3.1838, + "step": 1479 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011914394605687483, + "loss": 3.1026, + "step": 1480 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011908531222515391, + "loss": 3.2143, + "step": 1481 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011902667839343302, + "loss": 3.1547, + "step": 1482 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011896804456171211, + "loss": 3.1739, + "step": 1483 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001189094107299912, + "loss": 3.1513, + "step": 1484 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011885077689827031, + "loss": 3.1518, + "step": 1485 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001187921430665494, + "loss": 3.1859, + "step": 1486 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011873350923482852, + "loss": 3.1476, + "step": 1487 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001186748754031076, + "loss": 3.0416, + "step": 1488 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011861624157138669, + "loss": 3.1129, + "step": 1489 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001185576077396658, + "loss": 3.1618, + "step": 1490 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001184989739079449, + "loss": 3.2003, + "step": 1491 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011844034007622397, + "loss": 3.0911, + "step": 1492 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011838170624450308, + "loss": 3.2046, + "step": 1493 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011832307241278218, + "loss": 3.1409, + "step": 1494 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011826443858106128, + "loss": 3.0772, + "step": 1495 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011820580474934038, + "loss": 3.1814, + "step": 1496 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011814717091761946, + "loss": 3.1657, + "step": 1497 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011808853708589858, + "loss": 3.1691, + "step": 1498 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011802990325417766, + "loss": 3.2188, + "step": 1499 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011797126942245676, + "loss": 3.0851, + "step": 1500 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011791263559073586, + "loss": 3.206, + "step": 1501 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011785400175901496, + "loss": 3.0703, + "step": 1502 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011779536792729407, + "loss": 3.1974, + "step": 1503 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011773673409557315, + "loss": 3.109, + "step": 1504 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011767810026385224, + "loss": 3.1281, + "step": 1505 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011761946643213135, + "loss": 3.2016, + "step": 1506 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011756083260041044, + "loss": 3.1947, + "step": 1507 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011750219876868954, + "loss": 3.1425, + "step": 1508 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011744356493696865, + "loss": 3.2012, + "step": 1509 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011738493110524773, + "loss": 3.1649, + "step": 1510 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011732629727352682, + "loss": 3.113, + "step": 1511 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011726766344180593, + "loss": 3.1457, + "step": 1512 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011720902961008502, + "loss": 3.1941, + "step": 1513 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011715039577836413, + "loss": 3.1677, + "step": 1514 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011709176194664322, + "loss": 3.1692, + "step": 1515 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001170331281149223, + "loss": 3.2623, + "step": 1516 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011697449428320141, + "loss": 3.0789, + "step": 1517 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011691586045148051, + "loss": 3.0899, + "step": 1518 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001168572266197596, + "loss": 3.2451, + "step": 1519 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011679859278803871, + "loss": 3.1909, + "step": 1520 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011673995895631779, + "loss": 3.1379, + "step": 1521 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011668132512459691, + "loss": 3.1656, + "step": 1522 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011662269129287599, + "loss": 3.0479, + "step": 1523 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011656405746115509, + "loss": 3.2149, + "step": 1524 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001165054236294342, + "loss": 3.1593, + "step": 1525 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011644678979771329, + "loss": 3.0278, + "step": 1526 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011638815596599237, + "loss": 3.1505, + "step": 1527 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011632952213427148, + "loss": 3.0834, + "step": 1528 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011627088830255057, + "loss": 3.1135, + "step": 1529 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011621225447082968, + "loss": 3.2249, + "step": 1530 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011615362063910877, + "loss": 3.1105, + "step": 1531 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011609498680738786, + "loss": 3.1778, + "step": 1532 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011603635297566698, + "loss": 3.0562, + "step": 1533 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011597771914394606, + "loss": 3.1572, + "step": 1534 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011591908531222515, + "loss": 3.2102, + "step": 1535 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011586045148050426, + "loss": 3.0796, + "step": 1536 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011580181764878335, + "loss": 3.2072, + "step": 1537 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011574318381706246, + "loss": 3.1993, + "step": 1538 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011568454998534156, + "loss": 3.1107, + "step": 1539 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011562591615362064, + "loss": 3.2597, + "step": 1540 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011556728232189974, + "loss": 3.0999, + "step": 1541 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011550864849017884, + "loss": 3.2182, + "step": 1542 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011545001465845793, + "loss": 3.1421, + "step": 1543 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011539138082673704, + "loss": 3.0629, + "step": 1544 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011533274699501612, + "loss": 3.1179, + "step": 1545 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011527411316329522, + "loss": 3.1367, + "step": 1546 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011521547933157432, + "loss": 3.1791, + "step": 1547 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011515684549985342, + "loss": 3.1518, + "step": 1548 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011509821166813253, + "loss": 3.1704, + "step": 1549 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011503957783641162, + "loss": 3.1042, + "step": 1550 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001149809440046907, + "loss": 3.2331, + "step": 1551 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011492231017296981, + "loss": 3.1015, + "step": 1552 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001148636763412489, + "loss": 3.0461, + "step": 1553 + }, + { + "epoch": 0.44, + "learning_rate": 0.000114805042509528, + "loss": 3.2285, + "step": 1554 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001147464086778071, + "loss": 3.1485, + "step": 1555 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011468777484608619, + "loss": 3.1799, + "step": 1556 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011462914101436531, + "loss": 3.1473, + "step": 1557 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011457050718264439, + "loss": 3.1539, + "step": 1558 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011451187335092348, + "loss": 3.1422, + "step": 1559 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011445323951920259, + "loss": 3.1082, + "step": 1560 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011439460568748169, + "loss": 3.1161, + "step": 1561 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011433597185576077, + "loss": 3.2615, + "step": 1562 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011427733802403989, + "loss": 3.1079, + "step": 1563 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011421870419231897, + "loss": 3.1397, + "step": 1564 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011416007036059808, + "loss": 3.1363, + "step": 1565 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011410143652887717, + "loss": 3.1261, + "step": 1566 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011404280269715626, + "loss": 3.0705, + "step": 1567 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011398416886543537, + "loss": 3.1307, + "step": 1568 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011392553503371445, + "loss": 3.0949, + "step": 1569 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011386690120199355, + "loss": 3.2387, + "step": 1570 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011380826737027266, + "loss": 3.2034, + "step": 1571 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011374963353855175, + "loss": 3.1882, + "step": 1572 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011369099970683083, + "loss": 3.0792, + "step": 1573 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011363236587510995, + "loss": 3.2237, + "step": 1574 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011357373204338903, + "loss": 3.2187, + "step": 1575 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011351509821166814, + "loss": 3.11, + "step": 1576 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011345646437994724, + "loss": 3.1262, + "step": 1577 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011339783054822633, + "loss": 3.1073, + "step": 1578 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011333919671650544, + "loss": 3.0989, + "step": 1579 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011328056288478452, + "loss": 3.1451, + "step": 1580 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011322192905306361, + "loss": 3.1339, + "step": 1581 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011316329522134272, + "loss": 3.2096, + "step": 1582 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011310466138962181, + "loss": 3.2126, + "step": 1583 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011304602755790092, + "loss": 3.1355, + "step": 1584 + }, + { + "epoch": 0.45, + "eval_loss": 3.33901309967041, + "eval_runtime": 2939.0345, + "eval_samples_per_second": 6.971, + "eval_steps_per_second": 2.324, + "step": 1584 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011298739372618002, + "loss": 3.2127, + "step": 1585 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001129287598944591, + "loss": 3.0874, + "step": 1586 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011287012606273822, + "loss": 3.1317, + "step": 1587 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001128114922310173, + "loss": 3.1545, + "step": 1588 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001127528583992964, + "loss": 3.0883, + "step": 1589 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001126942245675755, + "loss": 3.1379, + "step": 1590 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001126355907358546, + "loss": 3.1477, + "step": 1591 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001125769569041337, + "loss": 3.0437, + "step": 1592 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011251832307241279, + "loss": 3.092, + "step": 1593 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011245968924069188, + "loss": 3.1902, + "step": 1594 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011240105540897099, + "loss": 3.1561, + "step": 1595 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011234242157725008, + "loss": 3.1247, + "step": 1596 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011228378774552916, + "loss": 3.1098, + "step": 1597 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011222515391380828, + "loss": 3.178, + "step": 1598 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011216652008208736, + "loss": 3.0661, + "step": 1599 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011210788625036646, + "loss": 3.0487, + "step": 1600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011204925241864557, + "loss": 3.1827, + "step": 1601 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011199061858692466, + "loss": 3.1593, + "step": 1602 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011193198475520377, + "loss": 3.0422, + "step": 1603 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011187335092348285, + "loss": 3.1839, + "step": 1604 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011181471709176194, + "loss": 3.0899, + "step": 1605 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011175608326004105, + "loss": 3.2206, + "step": 1606 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011169744942832015, + "loss": 3.0958, + "step": 1607 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011163881559659923, + "loss": 3.1919, + "step": 1608 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011158018176487835, + "loss": 3.2004, + "step": 1609 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011152154793315743, + "loss": 3.1084, + "step": 1610 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011146291410143655, + "loss": 3.0346, + "step": 1611 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011140428026971563, + "loss": 3.0911, + "step": 1612 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011134564643799473, + "loss": 2.992, + "step": 1613 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011128701260627383, + "loss": 3.0912, + "step": 1614 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011122837877455293, + "loss": 3.1363, + "step": 1615 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011116974494283201, + "loss": 3.0195, + "step": 1616 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011111111111111112, + "loss": 3.0398, + "step": 1617 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011105247727939021, + "loss": 2.9856, + "step": 1618 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011099384344766932, + "loss": 3.1327, + "step": 1619 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011093520961594841, + "loss": 3.1394, + "step": 1620 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001108765757842275, + "loss": 3.1844, + "step": 1621 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011081794195250662, + "loss": 3.0771, + "step": 1622 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001107593081207857, + "loss": 3.1131, + "step": 1623 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011070067428906479, + "loss": 3.2034, + "step": 1624 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001106420404573439, + "loss": 3.0252, + "step": 1625 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011058340662562299, + "loss": 3.0887, + "step": 1626 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011052477279390207, + "loss": 3.0802, + "step": 1627 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011046613896218118, + "loss": 3.1181, + "step": 1628 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011040750513046028, + "loss": 3.1592, + "step": 1629 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011034887129873938, + "loss": 3.1655, + "step": 1630 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011029023746701848, + "loss": 3.143, + "step": 1631 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011023160363529756, + "loss": 3.1253, + "step": 1632 + }, + { + "epoch": 0.47, + "learning_rate": 0.00011017296980357668, + "loss": 3.1357, + "step": 1633 + }, + { + "epoch": 0.47, + "learning_rate": 0.00011011433597185576, + "loss": 3.0326, + "step": 1634 + }, + { + "epoch": 0.47, + "learning_rate": 0.00011005570214013485, + "loss": 3.1027, + "step": 1635 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010999706830841396, + "loss": 3.0767, + "step": 1636 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010993843447669306, + "loss": 3.0888, + "step": 1637 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010987980064497216, + "loss": 3.0943, + "step": 1638 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010982116681325126, + "loss": 3.2003, + "step": 1639 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010976253298153034, + "loss": 3.1548, + "step": 1640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010970389914980945, + "loss": 2.9824, + "step": 1641 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010964526531808854, + "loss": 3.144, + "step": 1642 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010958663148636764, + "loss": 3.0977, + "step": 1643 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010952799765464674, + "loss": 3.1912, + "step": 1644 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010946936382292583, + "loss": 3.0353, + "step": 1645 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010941072999120495, + "loss": 3.0586, + "step": 1646 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010935209615948403, + "loss": 3.0516, + "step": 1647 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010929346232776312, + "loss": 3.117, + "step": 1648 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010923482849604223, + "loss": 3.0475, + "step": 1649 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010917619466432132, + "loss": 3.0815, + "step": 1650 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001091175608326004, + "loss": 3.1132, + "step": 1651 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010905892700087951, + "loss": 3.0678, + "step": 1652 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001090002931691586, + "loss": 3.1199, + "step": 1653 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001089416593374377, + "loss": 3.1443, + "step": 1654 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010888302550571681, + "loss": 3.13, + "step": 1655 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010882439167399589, + "loss": 3.1086, + "step": 1656 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010876575784227501, + "loss": 3.1537, + "step": 1657 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010870712401055409, + "loss": 3.1161, + "step": 1658 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010864849017883319, + "loss": 3.0585, + "step": 1659 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001085898563471123, + "loss": 3.1445, + "step": 1660 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010853122251539139, + "loss": 3.0764, + "step": 1661 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010847258868367047, + "loss": 2.9982, + "step": 1662 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010841395485194959, + "loss": 3.1593, + "step": 1663 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010835532102022867, + "loss": 3.0856, + "step": 1664 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010829668718850778, + "loss": 3.1578, + "step": 1665 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010823805335678687, + "loss": 3.1565, + "step": 1666 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010817941952506597, + "loss": 2.9098, + "step": 1667 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010812078569334508, + "loss": 3.1399, + "step": 1668 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010806215186162416, + "loss": 3.0203, + "step": 1669 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010800351802990325, + "loss": 3.1578, + "step": 1670 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010794488419818236, + "loss": 3.1458, + "step": 1671 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010788625036646145, + "loss": 3.2191, + "step": 1672 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010782761653474056, + "loss": 3.0857, + "step": 1673 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010776898270301966, + "loss": 3.096, + "step": 1674 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010771034887129874, + "loss": 3.1596, + "step": 1675 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010765171503957784, + "loss": 3.0256, + "step": 1676 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010759308120785694, + "loss": 3.0621, + "step": 1677 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010753444737613603, + "loss": 3.117, + "step": 1678 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010747581354441514, + "loss": 3.1636, + "step": 1679 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010741717971269422, + "loss": 3.1441, + "step": 1680 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010735854588097332, + "loss": 3.1357, + "step": 1681 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010729991204925242, + "loss": 3.0222, + "step": 1682 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010724127821753152, + "loss": 3.1049, + "step": 1683 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010718264438581063, + "loss": 3.2128, + "step": 1684 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010712401055408972, + "loss": 3.1145, + "step": 1685 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001070653767223688, + "loss": 3.1629, + "step": 1686 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010700674289064791, + "loss": 3.0896, + "step": 1687 + }, + { + "epoch": 0.48, + "learning_rate": 0.000106948109058927, + "loss": 2.9751, + "step": 1688 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001068894752272061, + "loss": 3.0981, + "step": 1689 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001068308413954852, + "loss": 3.055, + "step": 1690 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001067722075637643, + "loss": 3.1003, + "step": 1691 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010671357373204341, + "loss": 3.2445, + "step": 1692 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010665493990032249, + "loss": 3.0466, + "step": 1693 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010659630606860158, + "loss": 3.1444, + "step": 1694 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010653767223688069, + "loss": 3.0813, + "step": 1695 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010647903840515978, + "loss": 3.1947, + "step": 1696 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010642040457343887, + "loss": 3.05, + "step": 1697 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010636177074171799, + "loss": 3.0693, + "step": 1698 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010630313690999707, + "loss": 3.172, + "step": 1699 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010624450307827618, + "loss": 3.1232, + "step": 1700 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010618586924655527, + "loss": 3.1415, + "step": 1701 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010612723541483436, + "loss": 3.1177, + "step": 1702 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010606860158311347, + "loss": 3.0969, + "step": 1703 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010600996775139255, + "loss": 3.1011, + "step": 1704 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010595133391967165, + "loss": 3.0822, + "step": 1705 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010589270008795075, + "loss": 3.1034, + "step": 1706 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010583406625622985, + "loss": 3.1182, + "step": 1707 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010577543242450893, + "loss": 3.0735, + "step": 1708 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010571679859278805, + "loss": 3.158, + "step": 1709 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010565816476106713, + "loss": 3.1124, + "step": 1710 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010559953092934624, + "loss": 3.0835, + "step": 1711 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010554089709762533, + "loss": 3.1919, + "step": 1712 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010548226326590443, + "loss": 3.248, + "step": 1713 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010542362943418354, + "loss": 3.0471, + "step": 1714 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010536499560246262, + "loss": 3.1297, + "step": 1715 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010530636177074171, + "loss": 3.1031, + "step": 1716 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010524772793902082, + "loss": 3.0993, + "step": 1717 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010518909410729991, + "loss": 3.0457, + "step": 1718 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010513046027557902, + "loss": 3.1279, + "step": 1719 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010507182644385812, + "loss": 3.1209, + "step": 1720 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001050131926121372, + "loss": 2.9745, + "step": 1721 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010495455878041632, + "loss": 3.1604, + "step": 1722 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001048959249486954, + "loss": 3.1523, + "step": 1723 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010483729111697449, + "loss": 3.1549, + "step": 1724 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001047786572852536, + "loss": 3.1036, + "step": 1725 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001047200234535327, + "loss": 3.0448, + "step": 1726 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001046613896218118, + "loss": 3.102, + "step": 1727 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010460275579009088, + "loss": 3.0906, + "step": 1728 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010454412195836998, + "loss": 3.0744, + "step": 1729 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010448548812664909, + "loss": 3.0511, + "step": 1730 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010442685429492818, + "loss": 3.1741, + "step": 1731 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010436822046320726, + "loss": 3.0744, + "step": 1732 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010430958663148638, + "loss": 3.1373, + "step": 1733 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010425095279976546, + "loss": 3.1071, + "step": 1734 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010419231896804457, + "loss": 3.0417, + "step": 1735 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010413368513632367, + "loss": 3.0372, + "step": 1736 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010407505130460276, + "loss": 3.1596, + "step": 1737 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010401641747288187, + "loss": 3.0743, + "step": 1738 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010395778364116095, + "loss": 3.142, + "step": 1739 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010389914980944004, + "loss": 3.1523, + "step": 1740 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010384051597771915, + "loss": 3.1408, + "step": 1741 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010378188214599825, + "loss": 3.0652, + "step": 1742 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010372324831427733, + "loss": 3.1059, + "step": 1743 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010366461448255645, + "loss": 3.0498, + "step": 1744 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010360598065083553, + "loss": 3.2089, + "step": 1745 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010354734681911465, + "loss": 3.0765, + "step": 1746 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010348871298739373, + "loss": 3.2044, + "step": 1747 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010343007915567282, + "loss": 3.1166, + "step": 1748 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010337144532395193, + "loss": 3.0821, + "step": 1749 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010331281149223103, + "loss": 3.0803, + "step": 1750 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010325417766051011, + "loss": 3.1026, + "step": 1751 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010319554382878922, + "loss": 3.1071, + "step": 1752 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010313690999706831, + "loss": 3.1106, + "step": 1753 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010307827616534742, + "loss": 3.0305, + "step": 1754 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010301964233362651, + "loss": 3.1255, + "step": 1755 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010296100850190559, + "loss": 3.0228, + "step": 1756 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010290237467018471, + "loss": 2.9448, + "step": 1757 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001028437408384638, + "loss": 3.1852, + "step": 1758 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010278510700674289, + "loss": 3.0626, + "step": 1759 + }, + { + "epoch": 0.5, + "learning_rate": 0.000102726473175022, + "loss": 3.1446, + "step": 1760 + }, + { + "epoch": 0.5, + "eval_loss": 3.320050001144409, + "eval_runtime": 2939.4559, + "eval_samples_per_second": 6.97, + "eval_steps_per_second": 2.323, + "step": 1760 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010266783934330109, + "loss": 3.1573, + "step": 1761 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001026092055115802, + "loss": 3.0945, + "step": 1762 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010255057167985928, + "loss": 3.028, + "step": 1763 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010249193784813837, + "loss": 3.1876, + "step": 1764 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010243330401641748, + "loss": 3.0141, + "step": 1765 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010237467018469658, + "loss": 3.0834, + "step": 1766 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010231603635297566, + "loss": 3.1375, + "step": 1767 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010225740252125478, + "loss": 3.0324, + "step": 1768 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010219876868953386, + "loss": 3.1365, + "step": 1769 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010214013485781295, + "loss": 3.0708, + "step": 1770 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010208150102609206, + "loss": 3.0796, + "step": 1771 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010202286719437116, + "loss": 3.1592, + "step": 1772 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010196423336265026, + "loss": 3.0714, + "step": 1773 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010190559953092936, + "loss": 3.1238, + "step": 1774 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010184696569920844, + "loss": 3.1243, + "step": 1775 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010178833186748755, + "loss": 3.1121, + "step": 1776 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010172969803576664, + "loss": 3.1047, + "step": 1777 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010167106420404574, + "loss": 3.066, + "step": 1778 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010161243037232484, + "loss": 3.1417, + "step": 1779 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010155379654060392, + "loss": 3.0592, + "step": 1780 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010149516270888305, + "loss": 3.1251, + "step": 1781 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010143652887716213, + "loss": 3.1002, + "step": 1782 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010137789504544122, + "loss": 3.1436, + "step": 1783 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010131926121372033, + "loss": 3.0973, + "step": 1784 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010126062738199942, + "loss": 3.1076, + "step": 1785 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001012019935502785, + "loss": 3.2234, + "step": 1786 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010114335971855761, + "loss": 3.1533, + "step": 1787 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001010847258868367, + "loss": 3.0841, + "step": 1788 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010102609205511581, + "loss": 3.0895, + "step": 1789 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010096745822339491, + "loss": 3.2488, + "step": 1790 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010090882439167399, + "loss": 3.1007, + "step": 1791 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010085019055995311, + "loss": 3.0292, + "step": 1792 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010079155672823219, + "loss": 3.0973, + "step": 1793 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010073292289651129, + "loss": 3.1285, + "step": 1794 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010067428906479039, + "loss": 3.0221, + "step": 1795 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010061565523306949, + "loss": 3.0917, + "step": 1796 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010055702140134857, + "loss": 3.0951, + "step": 1797 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010049838756962769, + "loss": 3.1532, + "step": 1798 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010043975373790677, + "loss": 3.1203, + "step": 1799 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010038111990618588, + "loss": 3.2073, + "step": 1800 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010032248607446497, + "loss": 3.1434, + "step": 1801 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010026385224274407, + "loss": 3.1396, + "step": 1802 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010020521841102317, + "loss": 3.2, + "step": 1803 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010014658457930226, + "loss": 3.1205, + "step": 1804 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010008795074758135, + "loss": 3.2149, + "step": 1805 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010002931691586046, + "loss": 3.0374, + "step": 1806 + }, + { + "epoch": 0.51, + "learning_rate": 9.997068308413955e-05, + "loss": 3.0511, + "step": 1807 + }, + { + "epoch": 0.51, + "learning_rate": 9.991204925241865e-05, + "loss": 3.1071, + "step": 1808 + }, + { + "epoch": 0.52, + "learning_rate": 9.985341542069775e-05, + "loss": 3.055, + "step": 1809 + }, + { + "epoch": 0.52, + "learning_rate": 9.979478158897685e-05, + "loss": 3.0356, + "step": 1810 + }, + { + "epoch": 0.52, + "learning_rate": 9.973614775725594e-05, + "loss": 3.1747, + "step": 1811 + }, + { + "epoch": 0.52, + "learning_rate": 9.967751392553504e-05, + "loss": 3.1509, + "step": 1812 + }, + { + "epoch": 0.52, + "learning_rate": 9.961888009381413e-05, + "loss": 3.1017, + "step": 1813 + }, + { + "epoch": 0.52, + "learning_rate": 9.956024626209324e-05, + "loss": 3.1541, + "step": 1814 + }, + { + "epoch": 0.52, + "learning_rate": 9.950161243037232e-05, + "loss": 3.0134, + "step": 1815 + }, + { + "epoch": 0.52, + "learning_rate": 9.944297859865143e-05, + "loss": 3.0249, + "step": 1816 + }, + { + "epoch": 0.52, + "learning_rate": 9.938434476693052e-05, + "loss": 3.0088, + "step": 1817 + }, + { + "epoch": 0.52, + "learning_rate": 9.932571093520963e-05, + "loss": 3.0087, + "step": 1818 + }, + { + "epoch": 0.52, + "learning_rate": 9.926707710348871e-05, + "loss": 3.1616, + "step": 1819 + }, + { + "epoch": 0.52, + "learning_rate": 9.920844327176782e-05, + "loss": 3.0938, + "step": 1820 + }, + { + "epoch": 0.52, + "learning_rate": 9.914980944004691e-05, + "loss": 3.0721, + "step": 1821 + }, + { + "epoch": 0.52, + "learning_rate": 9.909117560832601e-05, + "loss": 3.2352, + "step": 1822 + }, + { + "epoch": 0.52, + "learning_rate": 9.90325417766051e-05, + "loss": 3.0524, + "step": 1823 + }, + { + "epoch": 0.52, + "learning_rate": 9.897390794488421e-05, + "loss": 3.1045, + "step": 1824 + }, + { + "epoch": 0.52, + "learning_rate": 9.89152741131633e-05, + "loss": 3.0853, + "step": 1825 + }, + { + "epoch": 0.52, + "learning_rate": 9.88566402814424e-05, + "loss": 3.121, + "step": 1826 + }, + { + "epoch": 0.52, + "learning_rate": 9.879800644972149e-05, + "loss": 3.0032, + "step": 1827 + }, + { + "epoch": 0.52, + "learning_rate": 9.873937261800059e-05, + "loss": 3.1874, + "step": 1828 + }, + { + "epoch": 0.52, + "learning_rate": 9.86807387862797e-05, + "loss": 3.0471, + "step": 1829 + }, + { + "epoch": 0.52, + "learning_rate": 9.862210495455878e-05, + "loss": 3.1055, + "step": 1830 + }, + { + "epoch": 0.52, + "learning_rate": 9.856347112283788e-05, + "loss": 3.0707, + "step": 1831 + }, + { + "epoch": 0.52, + "learning_rate": 9.850483729111698e-05, + "loss": 3.1021, + "step": 1832 + }, + { + "epoch": 0.52, + "learning_rate": 9.844620345939609e-05, + "loss": 3.1917, + "step": 1833 + }, + { + "epoch": 0.52, + "learning_rate": 9.838756962767517e-05, + "loss": 3.0467, + "step": 1834 + }, + { + "epoch": 0.52, + "learning_rate": 9.832893579595427e-05, + "loss": 3.1434, + "step": 1835 + }, + { + "epoch": 0.52, + "learning_rate": 9.827030196423337e-05, + "loss": 3.1226, + "step": 1836 + }, + { + "epoch": 0.52, + "learning_rate": 9.821166813251246e-05, + "loss": 3.0731, + "step": 1837 + }, + { + "epoch": 0.52, + "learning_rate": 9.815303430079156e-05, + "loss": 3.1525, + "step": 1838 + }, + { + "epoch": 0.52, + "learning_rate": 9.809440046907065e-05, + "loss": 3.0906, + "step": 1839 + }, + { + "epoch": 0.52, + "learning_rate": 9.803576663734976e-05, + "loss": 3.0288, + "step": 1840 + }, + { + "epoch": 0.52, + "learning_rate": 9.797713280562885e-05, + "loss": 3.0371, + "step": 1841 + }, + { + "epoch": 0.52, + "learning_rate": 9.791849897390795e-05, + "loss": 3.0029, + "step": 1842 + }, + { + "epoch": 0.52, + "learning_rate": 9.785986514218704e-05, + "loss": 3.158, + "step": 1843 + }, + { + "epoch": 0.53, + "learning_rate": 9.780123131046615e-05, + "loss": 3.1321, + "step": 1844 + }, + { + "epoch": 0.53, + "learning_rate": 9.774259747874524e-05, + "loss": 3.1156, + "step": 1845 + }, + { + "epoch": 0.53, + "learning_rate": 9.768396364702434e-05, + "loss": 3.0089, + "step": 1846 + }, + { + "epoch": 0.53, + "learning_rate": 9.762532981530343e-05, + "loss": 3.1628, + "step": 1847 + }, + { + "epoch": 0.53, + "learning_rate": 9.756669598358254e-05, + "loss": 3.1002, + "step": 1848 + }, + { + "epoch": 0.53, + "learning_rate": 9.750806215186162e-05, + "loss": 3.0235, + "step": 1849 + }, + { + "epoch": 0.53, + "learning_rate": 9.744942832014073e-05, + "loss": 2.957, + "step": 1850 + }, + { + "epoch": 0.53, + "learning_rate": 9.739079448841982e-05, + "loss": 3.0432, + "step": 1851 + }, + { + "epoch": 0.53, + "learning_rate": 9.733216065669892e-05, + "loss": 3.1521, + "step": 1852 + }, + { + "epoch": 0.53, + "learning_rate": 9.727352682497801e-05, + "loss": 3.0779, + "step": 1853 + }, + { + "epoch": 0.53, + "learning_rate": 9.721489299325711e-05, + "loss": 3.1235, + "step": 1854 + }, + { + "epoch": 0.53, + "learning_rate": 9.715625916153621e-05, + "loss": 2.9702, + "step": 1855 + }, + { + "epoch": 0.53, + "learning_rate": 9.709762532981531e-05, + "loss": 3.136, + "step": 1856 + }, + { + "epoch": 0.53, + "learning_rate": 9.70389914980944e-05, + "loss": 3.0957, + "step": 1857 + }, + { + "epoch": 0.53, + "learning_rate": 9.69803576663735e-05, + "loss": 3.055, + "step": 1858 + }, + { + "epoch": 0.53, + "learning_rate": 9.69217238346526e-05, + "loss": 3.0824, + "step": 1859 + }, + { + "epoch": 0.53, + "learning_rate": 9.68630900029317e-05, + "loss": 3.0799, + "step": 1860 + }, + { + "epoch": 0.53, + "learning_rate": 9.68044561712108e-05, + "loss": 3.0447, + "step": 1861 + }, + { + "epoch": 0.53, + "learning_rate": 9.674582233948989e-05, + "loss": 3.1054, + "step": 1862 + }, + { + "epoch": 0.53, + "learning_rate": 9.668718850776898e-05, + "loss": 3.0275, + "step": 1863 + }, + { + "epoch": 0.53, + "learning_rate": 9.662855467604809e-05, + "loss": 3.1579, + "step": 1864 + }, + { + "epoch": 0.53, + "learning_rate": 9.656992084432717e-05, + "loss": 3.0837, + "step": 1865 + }, + { + "epoch": 0.53, + "learning_rate": 9.651128701260628e-05, + "loss": 3.0448, + "step": 1866 + }, + { + "epoch": 0.53, + "learning_rate": 9.645265318088537e-05, + "loss": 2.9995, + "step": 1867 + }, + { + "epoch": 0.53, + "learning_rate": 9.639401934916448e-05, + "loss": 3.0649, + "step": 1868 + }, + { + "epoch": 0.53, + "learning_rate": 9.633538551744356e-05, + "loss": 3.0695, + "step": 1869 + }, + { + "epoch": 0.53, + "learning_rate": 9.627675168572267e-05, + "loss": 3.0222, + "step": 1870 + }, + { + "epoch": 0.53, + "learning_rate": 9.621811785400176e-05, + "loss": 3.1588, + "step": 1871 + }, + { + "epoch": 0.53, + "learning_rate": 9.615948402228087e-05, + "loss": 3.09, + "step": 1872 + }, + { + "epoch": 0.53, + "learning_rate": 9.610085019055995e-05, + "loss": 3.1339, + "step": 1873 + }, + { + "epoch": 0.53, + "learning_rate": 9.604221635883906e-05, + "loss": 3.0358, + "step": 1874 + }, + { + "epoch": 0.53, + "learning_rate": 9.598358252711816e-05, + "loss": 3.0608, + "step": 1875 + }, + { + "epoch": 0.53, + "learning_rate": 9.592494869539725e-05, + "loss": 3.1485, + "step": 1876 + }, + { + "epoch": 0.53, + "learning_rate": 9.586631486367634e-05, + "loss": 3.0677, + "step": 1877 + }, + { + "epoch": 0.53, + "learning_rate": 9.580768103195544e-05, + "loss": 3.0062, + "step": 1878 + }, + { + "epoch": 0.54, + "learning_rate": 9.574904720023455e-05, + "loss": 3.0928, + "step": 1879 + }, + { + "epoch": 0.54, + "learning_rate": 9.569041336851363e-05, + "loss": 3.1299, + "step": 1880 + }, + { + "epoch": 0.54, + "learning_rate": 9.563177953679273e-05, + "loss": 3.1471, + "step": 1881 + }, + { + "epoch": 0.54, + "learning_rate": 9.557314570507183e-05, + "loss": 3.1515, + "step": 1882 + }, + { + "epoch": 0.54, + "learning_rate": 9.551451187335094e-05, + "loss": 3.077, + "step": 1883 + }, + { + "epoch": 0.54, + "learning_rate": 9.545587804163002e-05, + "loss": 3.1111, + "step": 1884 + }, + { + "epoch": 0.54, + "learning_rate": 9.539724420990913e-05, + "loss": 3.1064, + "step": 1885 + }, + { + "epoch": 0.54, + "learning_rate": 9.533861037818822e-05, + "loss": 3.1031, + "step": 1886 + }, + { + "epoch": 0.54, + "learning_rate": 9.527997654646731e-05, + "loss": 3.0807, + "step": 1887 + }, + { + "epoch": 0.54, + "learning_rate": 9.522134271474641e-05, + "loss": 3.1281, + "step": 1888 + }, + { + "epoch": 0.54, + "learning_rate": 9.51627088830255e-05, + "loss": 3.1272, + "step": 1889 + }, + { + "epoch": 0.54, + "learning_rate": 9.510407505130461e-05, + "loss": 3.1405, + "step": 1890 + }, + { + "epoch": 0.54, + "learning_rate": 9.50454412195837e-05, + "loss": 3.1068, + "step": 1891 + }, + { + "epoch": 0.54, + "learning_rate": 9.49868073878628e-05, + "loss": 2.9872, + "step": 1892 + }, + { + "epoch": 0.54, + "learning_rate": 9.49281735561419e-05, + "loss": 3.1021, + "step": 1893 + }, + { + "epoch": 0.54, + "learning_rate": 9.4869539724421e-05, + "loss": 3.053, + "step": 1894 + }, + { + "epoch": 0.54, + "learning_rate": 9.48109058927001e-05, + "loss": 3.1373, + "step": 1895 + }, + { + "epoch": 0.54, + "learning_rate": 9.475227206097919e-05, + "loss": 3.1356, + "step": 1896 + }, + { + "epoch": 0.54, + "learning_rate": 9.469363822925828e-05, + "loss": 3.1121, + "step": 1897 + }, + { + "epoch": 0.54, + "learning_rate": 9.463500439753739e-05, + "loss": 3.0237, + "step": 1898 + }, + { + "epoch": 0.54, + "learning_rate": 9.457637056581649e-05, + "loss": 3.0802, + "step": 1899 + }, + { + "epoch": 0.54, + "learning_rate": 9.451773673409558e-05, + "loss": 2.99, + "step": 1900 + }, + { + "epoch": 0.54, + "learning_rate": 9.445910290237468e-05, + "loss": 3.1088, + "step": 1901 + }, + { + "epoch": 0.54, + "learning_rate": 9.440046907065377e-05, + "loss": 3.0627, + "step": 1902 + }, + { + "epoch": 0.54, + "learning_rate": 9.434183523893288e-05, + "loss": 3.0744, + "step": 1903 + }, + { + "epoch": 0.54, + "learning_rate": 9.428320140721196e-05, + "loss": 3.0008, + "step": 1904 + }, + { + "epoch": 0.54, + "learning_rate": 9.422456757549107e-05, + "loss": 3.11, + "step": 1905 + }, + { + "epoch": 0.54, + "learning_rate": 9.416593374377016e-05, + "loss": 3.0389, + "step": 1906 + }, + { + "epoch": 0.54, + "learning_rate": 9.410729991204925e-05, + "loss": 3.0924, + "step": 1907 + }, + { + "epoch": 0.54, + "learning_rate": 9.404866608032835e-05, + "loss": 3.0821, + "step": 1908 + }, + { + "epoch": 0.54, + "learning_rate": 9.399003224860746e-05, + "loss": 3.0144, + "step": 1909 + }, + { + "epoch": 0.54, + "learning_rate": 9.393139841688655e-05, + "loss": 3.0703, + "step": 1910 + }, + { + "epoch": 0.54, + "learning_rate": 9.387276458516565e-05, + "loss": 3.1178, + "step": 1911 + }, + { + "epoch": 0.54, + "learning_rate": 9.381413075344474e-05, + "loss": 3.077, + "step": 1912 + }, + { + "epoch": 0.54, + "learning_rate": 9.375549692172383e-05, + "loss": 2.9368, + "step": 1913 + }, + { + "epoch": 0.55, + "learning_rate": 9.369686309000294e-05, + "loss": 3.1405, + "step": 1914 + }, + { + "epoch": 0.55, + "learning_rate": 9.363822925828202e-05, + "loss": 2.9568, + "step": 1915 + }, + { + "epoch": 0.55, + "learning_rate": 9.357959542656113e-05, + "loss": 3.1213, + "step": 1916 + }, + { + "epoch": 0.55, + "learning_rate": 9.352096159484023e-05, + "loss": 3.1674, + "step": 1917 + }, + { + "epoch": 0.55, + "learning_rate": 9.346232776311933e-05, + "loss": 3.0357, + "step": 1918 + }, + { + "epoch": 0.55, + "learning_rate": 9.340369393139841e-05, + "loss": 3.048, + "step": 1919 + }, + { + "epoch": 0.55, + "learning_rate": 9.334506009967752e-05, + "loss": 2.9384, + "step": 1920 + }, + { + "epoch": 0.55, + "learning_rate": 9.328642626795662e-05, + "loss": 3.0337, + "step": 1921 + }, + { + "epoch": 0.55, + "learning_rate": 9.322779243623571e-05, + "loss": 3.1666, + "step": 1922 + }, + { + "epoch": 0.55, + "learning_rate": 9.31691586045148e-05, + "loss": 3.0372, + "step": 1923 + }, + { + "epoch": 0.55, + "learning_rate": 9.31105247727939e-05, + "loss": 3.1386, + "step": 1924 + }, + { + "epoch": 0.55, + "learning_rate": 9.305189094107301e-05, + "loss": 3.1946, + "step": 1925 + }, + { + "epoch": 0.55, + "learning_rate": 9.29932571093521e-05, + "loss": 3.1404, + "step": 1926 + }, + { + "epoch": 0.55, + "learning_rate": 9.29346232776312e-05, + "loss": 2.9996, + "step": 1927 + }, + { + "epoch": 0.55, + "learning_rate": 9.287598944591029e-05, + "loss": 3.123, + "step": 1928 + }, + { + "epoch": 0.55, + "learning_rate": 9.28173556141894e-05, + "loss": 3.1337, + "step": 1929 + }, + { + "epoch": 0.55, + "learning_rate": 9.275872178246849e-05, + "loss": 3.0988, + "step": 1930 + }, + { + "epoch": 0.55, + "learning_rate": 9.270008795074759e-05, + "loss": 3.132, + "step": 1931 + }, + { + "epoch": 0.55, + "learning_rate": 9.264145411902668e-05, + "loss": 2.9339, + "step": 1932 + }, + { + "epoch": 0.55, + "learning_rate": 9.258282028730579e-05, + "loss": 3.0116, + "step": 1933 + }, + { + "epoch": 0.55, + "learning_rate": 9.252418645558487e-05, + "loss": 3.1258, + "step": 1934 + }, + { + "epoch": 0.55, + "learning_rate": 9.246555262386398e-05, + "loss": 3.0558, + "step": 1935 + }, + { + "epoch": 0.55, + "learning_rate": 9.240691879214307e-05, + "loss": 2.9847, + "step": 1936 + }, + { + "epoch": 0.55, + "eval_loss": 3.3032548427581787, + "eval_runtime": 2939.4432, + "eval_samples_per_second": 6.97, + "eval_steps_per_second": 2.323, + "step": 1936 + }, + { + "epoch": 0.55, + "learning_rate": 9.234828496042217e-05, + "loss": 3.1278, + "step": 1937 + }, + { + "epoch": 0.55, + "learning_rate": 9.228965112870126e-05, + "loss": 3.0167, + "step": 1938 + }, + { + "epoch": 0.55, + "learning_rate": 9.223101729698035e-05, + "loss": 3.0658, + "step": 1939 + }, + { + "epoch": 0.55, + "learning_rate": 9.217238346525946e-05, + "loss": 3.1698, + "step": 1940 + }, + { + "epoch": 0.55, + "learning_rate": 9.211374963353856e-05, + "loss": 3.0073, + "step": 1941 + }, + { + "epoch": 0.55, + "learning_rate": 9.205511580181765e-05, + "loss": 2.965, + "step": 1942 + }, + { + "epoch": 0.55, + "learning_rate": 9.199648197009675e-05, + "loss": 2.9975, + "step": 1943 + }, + { + "epoch": 0.55, + "learning_rate": 9.193784813837585e-05, + "loss": 3.1208, + "step": 1944 + }, + { + "epoch": 0.55, + "learning_rate": 9.187921430665495e-05, + "loss": 3.0442, + "step": 1945 + }, + { + "epoch": 0.55, + "learning_rate": 9.182058047493404e-05, + "loss": 3.0306, + "step": 1946 + }, + { + "epoch": 0.55, + "learning_rate": 9.176194664321314e-05, + "loss": 3.0539, + "step": 1947 + }, + { + "epoch": 0.55, + "learning_rate": 9.170331281149223e-05, + "loss": 2.979, + "step": 1948 + }, + { + "epoch": 0.55, + "learning_rate": 9.164467897977134e-05, + "loss": 3.0677, + "step": 1949 + }, + { + "epoch": 0.56, + "learning_rate": 9.158604514805042e-05, + "loss": 2.9672, + "step": 1950 + }, + { + "epoch": 0.56, + "learning_rate": 9.152741131632953e-05, + "loss": 3.1164, + "step": 1951 + }, + { + "epoch": 0.56, + "learning_rate": 9.146877748460862e-05, + "loss": 3.0753, + "step": 1952 + }, + { + "epoch": 0.56, + "learning_rate": 9.141014365288773e-05, + "loss": 3.1209, + "step": 1953 + }, + { + "epoch": 0.56, + "learning_rate": 9.135150982116681e-05, + "loss": 3.1353, + "step": 1954 + }, + { + "epoch": 0.56, + "learning_rate": 9.129287598944592e-05, + "loss": 3.1298, + "step": 1955 + }, + { + "epoch": 0.56, + "learning_rate": 9.123424215772501e-05, + "loss": 3.0803, + "step": 1956 + }, + { + "epoch": 0.56, + "learning_rate": 9.117560832600412e-05, + "loss": 3.1678, + "step": 1957 + }, + { + "epoch": 0.56, + "learning_rate": 9.11169744942832e-05, + "loss": 3.0652, + "step": 1958 + }, + { + "epoch": 0.56, + "learning_rate": 9.105834066256231e-05, + "loss": 3.1459, + "step": 1959 + }, + { + "epoch": 0.56, + "learning_rate": 9.09997068308414e-05, + "loss": 2.9366, + "step": 1960 + }, + { + "epoch": 0.56, + "learning_rate": 9.09410729991205e-05, + "loss": 3.0726, + "step": 1961 + }, + { + "epoch": 0.56, + "learning_rate": 9.088243916739959e-05, + "loss": 3.0109, + "step": 1962 + }, + { + "epoch": 0.56, + "learning_rate": 9.082380533567869e-05, + "loss": 3.0312, + "step": 1963 + }, + { + "epoch": 0.56, + "learning_rate": 9.07651715039578e-05, + "loss": 3.0686, + "step": 1964 + }, + { + "epoch": 0.56, + "learning_rate": 9.070653767223687e-05, + "loss": 3.0977, + "step": 1965 + }, + { + "epoch": 0.56, + "learning_rate": 9.064790384051598e-05, + "loss": 3.079, + "step": 1966 + }, + { + "epoch": 0.56, + "learning_rate": 9.058927000879508e-05, + "loss": 3.0986, + "step": 1967 + }, + { + "epoch": 0.56, + "learning_rate": 9.053063617707418e-05, + "loss": 3.1546, + "step": 1968 + }, + { + "epoch": 0.56, + "learning_rate": 9.047200234535327e-05, + "loss": 2.9669, + "step": 1969 + }, + { + "epoch": 0.56, + "learning_rate": 9.041336851363237e-05, + "loss": 3.1267, + "step": 1970 + }, + { + "epoch": 0.56, + "learning_rate": 9.035473468191147e-05, + "loss": 3.09, + "step": 1971 + }, + { + "epoch": 0.56, + "learning_rate": 9.029610085019056e-05, + "loss": 3.0128, + "step": 1972 + }, + { + "epoch": 0.56, + "learning_rate": 9.023746701846966e-05, + "loss": 3.0213, + "step": 1973 + }, + { + "epoch": 0.56, + "learning_rate": 9.017883318674875e-05, + "loss": 2.9842, + "step": 1974 + }, + { + "epoch": 0.56, + "learning_rate": 9.012019935502786e-05, + "loss": 3.0716, + "step": 1975 + }, + { + "epoch": 0.56, + "learning_rate": 9.006156552330695e-05, + "loss": 3.1559, + "step": 1976 + }, + { + "epoch": 0.56, + "learning_rate": 9.000293169158605e-05, + "loss": 2.9197, + "step": 1977 + }, + { + "epoch": 0.56, + "learning_rate": 8.994429785986514e-05, + "loss": 2.9311, + "step": 1978 + }, + { + "epoch": 0.56, + "learning_rate": 8.988566402814425e-05, + "loss": 3.086, + "step": 1979 + }, + { + "epoch": 0.56, + "learning_rate": 8.982703019642334e-05, + "loss": 3.0, + "step": 1980 + }, + { + "epoch": 0.56, + "learning_rate": 8.976839636470244e-05, + "loss": 3.0534, + "step": 1981 + }, + { + "epoch": 0.56, + "learning_rate": 8.970976253298153e-05, + "loss": 2.9953, + "step": 1982 + }, + { + "epoch": 0.56, + "learning_rate": 8.965112870126064e-05, + "loss": 3.1116, + "step": 1983 + }, + { + "epoch": 0.56, + "learning_rate": 8.959249486953973e-05, + "loss": 3.1197, + "step": 1984 + }, + { + "epoch": 0.57, + "learning_rate": 8.953386103781883e-05, + "loss": 3.0178, + "step": 1985 + }, + { + "epoch": 0.57, + "learning_rate": 8.947522720609792e-05, + "loss": 3.0182, + "step": 1986 + }, + { + "epoch": 0.57, + "learning_rate": 8.941659337437702e-05, + "loss": 2.9657, + "step": 1987 + }, + { + "epoch": 0.57, + "learning_rate": 8.935795954265613e-05, + "loss": 3.1071, + "step": 1988 + }, + { + "epoch": 0.57, + "learning_rate": 8.92993257109352e-05, + "loss": 3.1459, + "step": 1989 + }, + { + "epoch": 0.57, + "learning_rate": 8.924069187921431e-05, + "loss": 3.0574, + "step": 1990 + }, + { + "epoch": 0.57, + "learning_rate": 8.918205804749341e-05, + "loss": 2.9549, + "step": 1991 + }, + { + "epoch": 0.57, + "learning_rate": 8.91234242157725e-05, + "loss": 3.0443, + "step": 1992 + }, + { + "epoch": 0.57, + "learning_rate": 8.90647903840516e-05, + "loss": 2.9937, + "step": 1993 + }, + { + "epoch": 0.57, + "learning_rate": 8.90061565523307e-05, + "loss": 3.0391, + "step": 1994 + }, + { + "epoch": 0.57, + "learning_rate": 8.89475227206098e-05, + "loss": 2.9892, + "step": 1995 + }, + { + "epoch": 0.57, + "learning_rate": 8.888888888888889e-05, + "loss": 3.0484, + "step": 1996 + }, + { + "epoch": 0.57, + "learning_rate": 8.883025505716799e-05, + "loss": 3.0741, + "step": 1997 + }, + { + "epoch": 0.57, + "learning_rate": 8.877162122544708e-05, + "loss": 3.1785, + "step": 1998 + }, + { + "epoch": 0.57, + "learning_rate": 8.871298739372619e-05, + "loss": 3.0739, + "step": 1999 + }, + { + "epoch": 0.57, + "learning_rate": 8.865435356200527e-05, + "loss": 3.0987, + "step": 2000 + }, + { + "epoch": 0.57, + "learning_rate": 8.859571973028438e-05, + "loss": 3.1262, + "step": 2001 + }, + { + "epoch": 0.57, + "learning_rate": 8.853708589856347e-05, + "loss": 3.077, + "step": 2002 + }, + { + "epoch": 0.57, + "learning_rate": 8.847845206684258e-05, + "loss": 2.9417, + "step": 2003 + }, + { + "epoch": 0.57, + "learning_rate": 8.841981823512166e-05, + "loss": 2.9296, + "step": 2004 + }, + { + "epoch": 0.57, + "learning_rate": 8.836118440340077e-05, + "loss": 3.0343, + "step": 2005 + }, + { + "epoch": 0.57, + "learning_rate": 8.830255057167986e-05, + "loss": 3.1408, + "step": 2006 + }, + { + "epoch": 0.57, + "learning_rate": 8.824391673995897e-05, + "loss": 3.0642, + "step": 2007 + }, + { + "epoch": 0.57, + "learning_rate": 8.818528290823805e-05, + "loss": 3.0765, + "step": 2008 + }, + { + "epoch": 0.57, + "learning_rate": 8.812664907651716e-05, + "loss": 3.0626, + "step": 2009 + }, + { + "epoch": 0.57, + "learning_rate": 8.806801524479625e-05, + "loss": 3.0625, + "step": 2010 + }, + { + "epoch": 0.57, + "learning_rate": 8.800938141307535e-05, + "loss": 2.9488, + "step": 2011 + }, + { + "epoch": 0.57, + "learning_rate": 8.795074758135444e-05, + "loss": 3.0291, + "step": 2012 + }, + { + "epoch": 0.57, + "learning_rate": 8.789211374963354e-05, + "loss": 3.0251, + "step": 2013 + }, + { + "epoch": 0.57, + "learning_rate": 8.783347991791265e-05, + "loss": 3.0409, + "step": 2014 + }, + { + "epoch": 0.57, + "learning_rate": 8.777484608619174e-05, + "loss": 3.0749, + "step": 2015 + }, + { + "epoch": 0.57, + "learning_rate": 8.771621225447083e-05, + "loss": 3.1053, + "step": 2016 + }, + { + "epoch": 0.57, + "learning_rate": 8.765757842274993e-05, + "loss": 3.1473, + "step": 2017 + }, + { + "epoch": 0.57, + "learning_rate": 8.759894459102904e-05, + "loss": 3.0467, + "step": 2018 + }, + { + "epoch": 0.57, + "learning_rate": 8.754031075930812e-05, + "loss": 2.9854, + "step": 2019 + }, + { + "epoch": 0.58, + "learning_rate": 8.748167692758722e-05, + "loss": 2.9706, + "step": 2020 + }, + { + "epoch": 0.58, + "learning_rate": 8.742304309586632e-05, + "loss": 3.0775, + "step": 2021 + }, + { + "epoch": 0.58, + "learning_rate": 8.736440926414541e-05, + "loss": 3.0419, + "step": 2022 + }, + { + "epoch": 0.58, + "learning_rate": 8.730577543242451e-05, + "loss": 3.107, + "step": 2023 + }, + { + "epoch": 0.58, + "learning_rate": 8.72471416007036e-05, + "loss": 2.9794, + "step": 2024 + }, + { + "epoch": 0.58, + "learning_rate": 8.718850776898271e-05, + "loss": 3.0311, + "step": 2025 + }, + { + "epoch": 0.58, + "learning_rate": 8.71298739372618e-05, + "loss": 3.1402, + "step": 2026 + }, + { + "epoch": 0.58, + "learning_rate": 8.70712401055409e-05, + "loss": 2.9644, + "step": 2027 + }, + { + "epoch": 0.58, + "learning_rate": 8.701260627381999e-05, + "loss": 2.997, + "step": 2028 + }, + { + "epoch": 0.58, + "learning_rate": 8.69539724420991e-05, + "loss": 2.9759, + "step": 2029 + }, + { + "epoch": 0.58, + "learning_rate": 8.68953386103782e-05, + "loss": 3.0223, + "step": 2030 + }, + { + "epoch": 0.58, + "learning_rate": 8.683670477865729e-05, + "loss": 3.1275, + "step": 2031 + }, + { + "epoch": 0.58, + "learning_rate": 8.677807094693638e-05, + "loss": 3.071, + "step": 2032 + }, + { + "epoch": 0.58, + "learning_rate": 8.671943711521549e-05, + "loss": 3.1222, + "step": 2033 + }, + { + "epoch": 0.58, + "learning_rate": 8.666080328349459e-05, + "loss": 2.9666, + "step": 2034 + }, + { + "epoch": 0.58, + "learning_rate": 8.660216945177368e-05, + "loss": 3.034, + "step": 2035 + }, + { + "epoch": 0.58, + "learning_rate": 8.654353562005277e-05, + "loss": 3.04, + "step": 2036 + }, + { + "epoch": 0.58, + "learning_rate": 8.648490178833187e-05, + "loss": 3.0028, + "step": 2037 + }, + { + "epoch": 0.58, + "learning_rate": 8.642626795661098e-05, + "loss": 3.087, + "step": 2038 + }, + { + "epoch": 0.58, + "learning_rate": 8.636763412489006e-05, + "loss": 3.164, + "step": 2039 + }, + { + "epoch": 0.58, + "learning_rate": 8.630900029316917e-05, + "loss": 3.0221, + "step": 2040 + }, + { + "epoch": 0.58, + "learning_rate": 8.625036646144826e-05, + "loss": 3.0615, + "step": 2041 + }, + { + "epoch": 0.58, + "learning_rate": 8.619173262972737e-05, + "loss": 3.0341, + "step": 2042 + }, + { + "epoch": 0.58, + "learning_rate": 8.613309879800645e-05, + "loss": 3.0749, + "step": 2043 + }, + { + "epoch": 0.58, + "learning_rate": 8.607446496628556e-05, + "loss": 2.9954, + "step": 2044 + }, + { + "epoch": 0.58, + "learning_rate": 8.601583113456465e-05, + "loss": 3.0631, + "step": 2045 + }, + { + "epoch": 0.58, + "learning_rate": 8.595719730284374e-05, + "loss": 3.1342, + "step": 2046 + }, + { + "epoch": 0.58, + "learning_rate": 8.589856347112284e-05, + "loss": 3.076, + "step": 2047 + }, + { + "epoch": 0.58, + "learning_rate": 8.583992963940193e-05, + "loss": 3.1095, + "step": 2048 + }, + { + "epoch": 0.58, + "learning_rate": 8.578129580768104e-05, + "loss": 3.0005, + "step": 2049 + }, + { + "epoch": 0.58, + "learning_rate": 8.572266197596012e-05, + "loss": 3.033, + "step": 2050 + }, + { + "epoch": 0.58, + "learning_rate": 8.566402814423923e-05, + "loss": 3.0772, + "step": 2051 + }, + { + "epoch": 0.58, + "learning_rate": 8.560539431251832e-05, + "loss": 3.0078, + "step": 2052 + }, + { + "epoch": 0.58, + "learning_rate": 8.554676048079743e-05, + "loss": 3.1205, + "step": 2053 + }, + { + "epoch": 0.58, + "learning_rate": 8.548812664907651e-05, + "loss": 2.9833, + "step": 2054 + }, + { + "epoch": 0.59, + "learning_rate": 8.542949281735562e-05, + "loss": 2.9453, + "step": 2055 + }, + { + "epoch": 0.59, + "learning_rate": 8.537085898563471e-05, + "loss": 3.0687, + "step": 2056 + }, + { + "epoch": 0.59, + "learning_rate": 8.531222515391382e-05, + "loss": 2.9526, + "step": 2057 + }, + { + "epoch": 0.59, + "learning_rate": 8.52535913221929e-05, + "loss": 2.9666, + "step": 2058 + }, + { + "epoch": 0.59, + "learning_rate": 8.519495749047201e-05, + "loss": 3.0944, + "step": 2059 + }, + { + "epoch": 0.59, + "learning_rate": 8.51363236587511e-05, + "loss": 3.0659, + "step": 2060 + }, + { + "epoch": 0.59, + "learning_rate": 8.50776898270302e-05, + "loss": 3.1344, + "step": 2061 + }, + { + "epoch": 0.59, + "learning_rate": 8.50190559953093e-05, + "loss": 2.9884, + "step": 2062 + }, + { + "epoch": 0.59, + "learning_rate": 8.496042216358839e-05, + "loss": 3.1274, + "step": 2063 + }, + { + "epoch": 0.59, + "learning_rate": 8.49017883318675e-05, + "loss": 2.9231, + "step": 2064 + }, + { + "epoch": 0.59, + "learning_rate": 8.484315450014659e-05, + "loss": 2.9281, + "step": 2065 + }, + { + "epoch": 0.59, + "learning_rate": 8.478452066842569e-05, + "loss": 3.0655, + "step": 2066 + }, + { + "epoch": 0.59, + "learning_rate": 8.472588683670478e-05, + "loss": 2.9997, + "step": 2067 + }, + { + "epoch": 0.59, + "learning_rate": 8.466725300498389e-05, + "loss": 3.0595, + "step": 2068 + }, + { + "epoch": 0.59, + "learning_rate": 8.460861917326298e-05, + "loss": 3.0622, + "step": 2069 + }, + { + "epoch": 0.59, + "learning_rate": 8.454998534154208e-05, + "loss": 3.0297, + "step": 2070 + }, + { + "epoch": 0.59, + "learning_rate": 8.449135150982117e-05, + "loss": 3.0824, + "step": 2071 + }, + { + "epoch": 0.59, + "learning_rate": 8.443271767810026e-05, + "loss": 3.0544, + "step": 2072 + }, + { + "epoch": 0.59, + "learning_rate": 8.437408384637937e-05, + "loss": 3.0849, + "step": 2073 + }, + { + "epoch": 0.59, + "learning_rate": 8.431545001465845e-05, + "loss": 3.0525, + "step": 2074 + }, + { + "epoch": 0.59, + "learning_rate": 8.425681618293756e-05, + "loss": 3.0667, + "step": 2075 + }, + { + "epoch": 0.59, + "learning_rate": 8.419818235121666e-05, + "loss": 3.0996, + "step": 2076 + }, + { + "epoch": 0.59, + "learning_rate": 8.413954851949575e-05, + "loss": 3.0324, + "step": 2077 + }, + { + "epoch": 0.59, + "learning_rate": 8.408091468777484e-05, + "loss": 3.086, + "step": 2078 + }, + { + "epoch": 0.59, + "learning_rate": 8.402228085605395e-05, + "loss": 3.0223, + "step": 2079 + }, + { + "epoch": 0.59, + "learning_rate": 8.396364702433305e-05, + "loss": 3.0605, + "step": 2080 + }, + { + "epoch": 0.59, + "learning_rate": 8.390501319261214e-05, + "loss": 3.0081, + "step": 2081 + }, + { + "epoch": 0.59, + "learning_rate": 8.384637936089123e-05, + "loss": 2.9988, + "step": 2082 + }, + { + "epoch": 0.59, + "learning_rate": 8.378774552917034e-05, + "loss": 3.0181, + "step": 2083 + }, + { + "epoch": 0.59, + "learning_rate": 8.372911169744944e-05, + "loss": 3.0363, + "step": 2084 + }, + { + "epoch": 0.59, + "learning_rate": 8.367047786572853e-05, + "loss": 3.0154, + "step": 2085 + }, + { + "epoch": 0.59, + "learning_rate": 8.361184403400763e-05, + "loss": 3.0487, + "step": 2086 + }, + { + "epoch": 0.59, + "learning_rate": 8.355321020228672e-05, + "loss": 3.0229, + "step": 2087 + }, + { + "epoch": 0.59, + "learning_rate": 8.349457637056583e-05, + "loss": 3.0661, + "step": 2088 + }, + { + "epoch": 0.59, + "learning_rate": 8.343594253884491e-05, + "loss": 2.9821, + "step": 2089 + }, + { + "epoch": 0.6, + "learning_rate": 8.337730870712402e-05, + "loss": 3.0254, + "step": 2090 + }, + { + "epoch": 0.6, + "learning_rate": 8.331867487540311e-05, + "loss": 3.0446, + "step": 2091 + }, + { + "epoch": 0.6, + "learning_rate": 8.326004104368222e-05, + "loss": 2.99, + "step": 2092 + }, + { + "epoch": 0.6, + "learning_rate": 8.32014072119613e-05, + "loss": 2.9763, + "step": 2093 + }, + { + "epoch": 0.6, + "learning_rate": 8.314277338024041e-05, + "loss": 2.9533, + "step": 2094 + }, + { + "epoch": 0.6, + "learning_rate": 8.30841395485195e-05, + "loss": 3.0655, + "step": 2095 + }, + { + "epoch": 0.6, + "learning_rate": 8.30255057167986e-05, + "loss": 3.0108, + "step": 2096 + }, + { + "epoch": 0.6, + "learning_rate": 8.296687188507769e-05, + "loss": 2.9899, + "step": 2097 + }, + { + "epoch": 0.6, + "learning_rate": 8.290823805335678e-05, + "loss": 3.0429, + "step": 2098 + }, + { + "epoch": 0.6, + "learning_rate": 8.284960422163589e-05, + "loss": 3.0015, + "step": 2099 + }, + { + "epoch": 0.6, + "learning_rate": 8.279097038991499e-05, + "loss": 2.9944, + "step": 2100 + }, + { + "epoch": 0.6, + "learning_rate": 8.273233655819408e-05, + "loss": 3.0209, + "step": 2101 + }, + { + "epoch": 0.6, + "learning_rate": 8.267370272647318e-05, + "loss": 3.132, + "step": 2102 + }, + { + "epoch": 0.6, + "learning_rate": 8.261506889475228e-05, + "loss": 3.0258, + "step": 2103 + }, + { + "epoch": 0.6, + "learning_rate": 8.255643506303136e-05, + "loss": 3.0699, + "step": 2104 + }, + { + "epoch": 0.6, + "learning_rate": 8.249780123131047e-05, + "loss": 2.9658, + "step": 2105 + }, + { + "epoch": 0.6, + "learning_rate": 8.243916739958957e-05, + "loss": 2.9796, + "step": 2106 + }, + { + "epoch": 0.6, + "learning_rate": 8.238053356786866e-05, + "loss": 3.0962, + "step": 2107 + }, + { + "epoch": 0.6, + "learning_rate": 8.232189973614775e-05, + "loss": 3.032, + "step": 2108 + }, + { + "epoch": 0.6, + "learning_rate": 8.226326590442686e-05, + "loss": 3.0286, + "step": 2109 + }, + { + "epoch": 0.6, + "learning_rate": 8.220463207270596e-05, + "loss": 2.9863, + "step": 2110 + }, + { + "epoch": 0.6, + "learning_rate": 8.214599824098505e-05, + "loss": 3.0452, + "step": 2111 + }, + { + "epoch": 0.6, + "learning_rate": 8.208736440926415e-05, + "loss": 3.1286, + "step": 2112 + }, + { + "epoch": 0.6, + "eval_loss": 3.279003858566284, + "eval_runtime": 2938.7929, + "eval_samples_per_second": 6.971, + "eval_steps_per_second": 2.324, + "step": 2112 + }, + { + "epoch": 0.6, + "learning_rate": 8.202873057754324e-05, + "loss": 2.9913, + "step": 2113 + }, + { + "epoch": 0.6, + "learning_rate": 8.197009674582235e-05, + "loss": 3.0838, + "step": 2114 + }, + { + "epoch": 0.6, + "learning_rate": 8.191146291410144e-05, + "loss": 2.9803, + "step": 2115 + }, + { + "epoch": 0.6, + "learning_rate": 8.185282908238054e-05, + "loss": 3.0087, + "step": 2116 + }, + { + "epoch": 0.6, + "learning_rate": 8.179419525065963e-05, + "loss": 3.0088, + "step": 2117 + }, + { + "epoch": 0.6, + "learning_rate": 8.173556141893874e-05, + "loss": 3.0678, + "step": 2118 + }, + { + "epoch": 0.6, + "learning_rate": 8.167692758721783e-05, + "loss": 2.9735, + "step": 2119 + }, + { + "epoch": 0.6, + "learning_rate": 8.161829375549693e-05, + "loss": 3.0428, + "step": 2120 + }, + { + "epoch": 0.6, + "learning_rate": 8.155965992377602e-05, + "loss": 2.979, + "step": 2121 + }, + { + "epoch": 0.6, + "learning_rate": 8.150102609205512e-05, + "loss": 3.0504, + "step": 2122 + }, + { + "epoch": 0.6, + "learning_rate": 8.144239226033422e-05, + "loss": 2.8659, + "step": 2123 + }, + { + "epoch": 0.6, + "learning_rate": 8.13837584286133e-05, + "loss": 2.92, + "step": 2124 + }, + { + "epoch": 0.61, + "learning_rate": 8.132512459689241e-05, + "loss": 3.0536, + "step": 2125 + }, + { + "epoch": 0.61, + "learning_rate": 8.126649076517151e-05, + "loss": 3.0977, + "step": 2126 + }, + { + "epoch": 0.61, + "learning_rate": 8.120785693345061e-05, + "loss": 2.9551, + "step": 2127 + }, + { + "epoch": 0.61, + "learning_rate": 8.11492231017297e-05, + "loss": 3.1118, + "step": 2128 + }, + { + "epoch": 0.61, + "learning_rate": 8.10905892700088e-05, + "loss": 3.0925, + "step": 2129 + }, + { + "epoch": 0.61, + "learning_rate": 8.10319554382879e-05, + "loss": 3.0587, + "step": 2130 + }, + { + "epoch": 0.61, + "learning_rate": 8.097332160656699e-05, + "loss": 2.9647, + "step": 2131 + }, + { + "epoch": 0.61, + "learning_rate": 8.091468777484609e-05, + "loss": 2.8933, + "step": 2132 + }, + { + "epoch": 0.61, + "learning_rate": 8.085605394312518e-05, + "loss": 3.0305, + "step": 2133 + }, + { + "epoch": 0.61, + "learning_rate": 8.079742011140429e-05, + "loss": 3.0834, + "step": 2134 + }, + { + "epoch": 0.61, + "learning_rate": 8.073878627968337e-05, + "loss": 3.0384, + "step": 2135 + }, + { + "epoch": 0.61, + "learning_rate": 8.068015244796248e-05, + "loss": 3.1248, + "step": 2136 + }, + { + "epoch": 0.61, + "learning_rate": 8.062151861624157e-05, + "loss": 2.9678, + "step": 2137 + }, + { + "epoch": 0.61, + "learning_rate": 8.056288478452068e-05, + "loss": 3.0379, + "step": 2138 + }, + { + "epoch": 0.61, + "learning_rate": 8.050425095279976e-05, + "loss": 2.9091, + "step": 2139 + }, + { + "epoch": 0.61, + "learning_rate": 8.044561712107887e-05, + "loss": 3.0026, + "step": 2140 + }, + { + "epoch": 0.61, + "learning_rate": 8.038698328935796e-05, + "loss": 3.0794, + "step": 2141 + }, + { + "epoch": 0.61, + "learning_rate": 8.032834945763707e-05, + "loss": 3.1296, + "step": 2142 + }, + { + "epoch": 0.61, + "learning_rate": 8.026971562591615e-05, + "loss": 3.1056, + "step": 2143 + }, + { + "epoch": 0.61, + "learning_rate": 8.021108179419526e-05, + "loss": 3.0213, + "step": 2144 + }, + { + "epoch": 0.61, + "learning_rate": 8.015244796247435e-05, + "loss": 3.1048, + "step": 2145 + }, + { + "epoch": 0.61, + "learning_rate": 8.009381413075345e-05, + "loss": 3.0592, + "step": 2146 + }, + { + "epoch": 0.61, + "learning_rate": 8.003518029903254e-05, + "loss": 3.0547, + "step": 2147 + }, + { + "epoch": 0.61, + "learning_rate": 7.997654646731164e-05, + "loss": 3.1033, + "step": 2148 + }, + { + "epoch": 0.61, + "learning_rate": 7.991791263559074e-05, + "loss": 3.0537, + "step": 2149 + }, + { + "epoch": 0.61, + "learning_rate": 7.985927880386984e-05, + "loss": 3.1241, + "step": 2150 + }, + { + "epoch": 0.61, + "learning_rate": 7.980064497214893e-05, + "loss": 3.0887, + "step": 2151 + }, + { + "epoch": 0.61, + "learning_rate": 7.974201114042803e-05, + "loss": 3.0157, + "step": 2152 + }, + { + "epoch": 0.61, + "learning_rate": 7.968337730870713e-05, + "loss": 2.8817, + "step": 2153 + }, + { + "epoch": 0.61, + "learning_rate": 7.962474347698623e-05, + "loss": 3.1184, + "step": 2154 + }, + { + "epoch": 0.61, + "learning_rate": 7.956610964526532e-05, + "loss": 2.9394, + "step": 2155 + }, + { + "epoch": 0.61, + "learning_rate": 7.950747581354442e-05, + "loss": 3.0529, + "step": 2156 + }, + { + "epoch": 0.61, + "learning_rate": 7.944884198182351e-05, + "loss": 3.0572, + "step": 2157 + }, + { + "epoch": 0.61, + "learning_rate": 7.939020815010262e-05, + "loss": 2.9754, + "step": 2158 + }, + { + "epoch": 0.61, + "learning_rate": 7.93315743183817e-05, + "loss": 3.0794, + "step": 2159 + }, + { + "epoch": 0.62, + "learning_rate": 7.927294048666081e-05, + "loss": 3.0672, + "step": 2160 + }, + { + "epoch": 0.62, + "learning_rate": 7.92143066549399e-05, + "loss": 3.1011, + "step": 2161 + }, + { + "epoch": 0.62, + "learning_rate": 7.9155672823219e-05, + "loss": 3.0374, + "step": 2162 + }, + { + "epoch": 0.62, + "learning_rate": 7.909703899149809e-05, + "loss": 3.0082, + "step": 2163 + }, + { + "epoch": 0.62, + "learning_rate": 7.90384051597772e-05, + "loss": 3.1259, + "step": 2164 + }, + { + "epoch": 0.62, + "learning_rate": 7.89797713280563e-05, + "loss": 3.0714, + "step": 2165 + }, + { + "epoch": 0.62, + "learning_rate": 7.892113749633539e-05, + "loss": 3.0764, + "step": 2166 + }, + { + "epoch": 0.62, + "learning_rate": 7.886250366461448e-05, + "loss": 3.0549, + "step": 2167 + }, + { + "epoch": 0.62, + "learning_rate": 7.880386983289359e-05, + "loss": 3.0911, + "step": 2168 + }, + { + "epoch": 0.62, + "learning_rate": 7.874523600117268e-05, + "loss": 3.0779, + "step": 2169 + }, + { + "epoch": 0.62, + "learning_rate": 7.868660216945178e-05, + "loss": 2.9613, + "step": 2170 + }, + { + "epoch": 0.62, + "learning_rate": 7.862796833773087e-05, + "loss": 3.1114, + "step": 2171 + }, + { + "epoch": 0.62, + "learning_rate": 7.856933450600997e-05, + "loss": 3.0424, + "step": 2172 + }, + { + "epoch": 0.62, + "learning_rate": 7.851070067428908e-05, + "loss": 3.0557, + "step": 2173 + }, + { + "epoch": 0.62, + "learning_rate": 7.845206684256816e-05, + "loss": 2.9939, + "step": 2174 + }, + { + "epoch": 0.62, + "learning_rate": 7.839343301084726e-05, + "loss": 3.0031, + "step": 2175 + }, + { + "epoch": 0.62, + "learning_rate": 7.833479917912636e-05, + "loss": 2.9967, + "step": 2176 + }, + { + "epoch": 0.62, + "learning_rate": 7.827616534740547e-05, + "loss": 2.9714, + "step": 2177 + }, + { + "epoch": 0.62, + "learning_rate": 7.821753151568455e-05, + "loss": 2.9984, + "step": 2178 + }, + { + "epoch": 0.62, + "learning_rate": 7.815889768396365e-05, + "loss": 3.1013, + "step": 2179 + }, + { + "epoch": 0.62, + "learning_rate": 7.810026385224275e-05, + "loss": 3.0813, + "step": 2180 + }, + { + "epoch": 0.62, + "learning_rate": 7.804163002052184e-05, + "loss": 2.9323, + "step": 2181 + }, + { + "epoch": 0.62, + "learning_rate": 7.798299618880094e-05, + "loss": 2.9601, + "step": 2182 + }, + { + "epoch": 0.62, + "learning_rate": 7.792436235708003e-05, + "loss": 2.9847, + "step": 2183 + }, + { + "epoch": 0.62, + "learning_rate": 7.786572852535914e-05, + "loss": 3.081, + "step": 2184 + }, + { + "epoch": 0.62, + "learning_rate": 7.780709469363823e-05, + "loss": 3.0644, + "step": 2185 + }, + { + "epoch": 0.62, + "learning_rate": 7.774846086191733e-05, + "loss": 3.0474, + "step": 2186 + }, + { + "epoch": 0.62, + "learning_rate": 7.768982703019642e-05, + "loss": 3.0382, + "step": 2187 + }, + { + "epoch": 0.62, + "learning_rate": 7.763119319847553e-05, + "loss": 3.0454, + "step": 2188 + }, + { + "epoch": 0.62, + "learning_rate": 7.757255936675461e-05, + "loss": 3.0269, + "step": 2189 + }, + { + "epoch": 0.62, + "learning_rate": 7.751392553503372e-05, + "loss": 2.9757, + "step": 2190 + }, + { + "epoch": 0.62, + "learning_rate": 7.745529170331281e-05, + "loss": 2.9173, + "step": 2191 + }, + { + "epoch": 0.62, + "learning_rate": 7.739665787159192e-05, + "loss": 2.992, + "step": 2192 + }, + { + "epoch": 0.62, + "learning_rate": 7.7338024039871e-05, + "loss": 2.976, + "step": 2193 + }, + { + "epoch": 0.62, + "learning_rate": 7.727939020815011e-05, + "loss": 3.0376, + "step": 2194 + }, + { + "epoch": 0.63, + "learning_rate": 7.72207563764292e-05, + "loss": 3.0334, + "step": 2195 + }, + { + "epoch": 0.63, + "learning_rate": 7.71621225447083e-05, + "loss": 3.0348, + "step": 2196 + }, + { + "epoch": 0.63, + "learning_rate": 7.71034887129874e-05, + "loss": 3.0085, + "step": 2197 + }, + { + "epoch": 0.63, + "learning_rate": 7.704485488126649e-05, + "loss": 2.9804, + "step": 2198 + }, + { + "epoch": 0.63, + "learning_rate": 7.69862210495456e-05, + "loss": 2.9899, + "step": 2199 + }, + { + "epoch": 0.63, + "learning_rate": 7.692758721782469e-05, + "loss": 2.9572, + "step": 2200 + }, + { + "epoch": 0.63, + "learning_rate": 7.686895338610378e-05, + "loss": 3.043, + "step": 2201 + }, + { + "epoch": 0.63, + "learning_rate": 7.681031955438288e-05, + "loss": 3.0281, + "step": 2202 + }, + { + "epoch": 0.63, + "learning_rate": 7.675168572266199e-05, + "loss": 2.8836, + "step": 2203 + }, + { + "epoch": 0.63, + "learning_rate": 7.669305189094108e-05, + "loss": 3.0611, + "step": 2204 + }, + { + "epoch": 0.63, + "learning_rate": 7.663441805922017e-05, + "loss": 2.9676, + "step": 2205 + }, + { + "epoch": 0.63, + "learning_rate": 7.657578422749927e-05, + "loss": 3.1005, + "step": 2206 + }, + { + "epoch": 0.63, + "learning_rate": 7.651715039577836e-05, + "loss": 2.9966, + "step": 2207 + }, + { + "epoch": 0.63, + "learning_rate": 7.645851656405747e-05, + "loss": 3.0027, + "step": 2208 + }, + { + "epoch": 0.63, + "learning_rate": 7.639988273233655e-05, + "loss": 3.0368, + "step": 2209 + }, + { + "epoch": 0.63, + "learning_rate": 7.634124890061566e-05, + "loss": 3.0812, + "step": 2210 + }, + { + "epoch": 0.63, + "learning_rate": 7.628261506889475e-05, + "loss": 3.1381, + "step": 2211 + }, + { + "epoch": 0.63, + "learning_rate": 7.622398123717386e-05, + "loss": 3.1242, + "step": 2212 + }, + { + "epoch": 0.63, + "learning_rate": 7.616534740545294e-05, + "loss": 3.0857, + "step": 2213 + }, + { + "epoch": 0.63, + "learning_rate": 7.610671357373205e-05, + "loss": 3.0792, + "step": 2214 + }, + { + "epoch": 0.63, + "learning_rate": 7.604807974201115e-05, + "loss": 3.1115, + "step": 2215 + }, + { + "epoch": 0.63, + "learning_rate": 7.598944591029024e-05, + "loss": 3.0161, + "step": 2216 + }, + { + "epoch": 0.63, + "learning_rate": 7.593081207856933e-05, + "loss": 3.1468, + "step": 2217 + }, + { + "epoch": 0.63, + "learning_rate": 7.587217824684844e-05, + "loss": 2.8137, + "step": 2218 + }, + { + "epoch": 0.63, + "learning_rate": 7.581354441512754e-05, + "loss": 3.0643, + "step": 2219 + }, + { + "epoch": 0.63, + "learning_rate": 7.575491058340663e-05, + "loss": 3.114, + "step": 2220 + }, + { + "epoch": 0.63, + "learning_rate": 7.569627675168572e-05, + "loss": 2.95, + "step": 2221 + }, + { + "epoch": 0.63, + "learning_rate": 7.563764291996482e-05, + "loss": 2.9803, + "step": 2222 + }, + { + "epoch": 0.63, + "learning_rate": 7.557900908824393e-05, + "loss": 3.0316, + "step": 2223 + }, + { + "epoch": 0.63, + "learning_rate": 7.552037525652301e-05, + "loss": 2.9819, + "step": 2224 + }, + { + "epoch": 0.63, + "learning_rate": 7.546174142480212e-05, + "loss": 2.9961, + "step": 2225 + }, + { + "epoch": 0.63, + "learning_rate": 7.540310759308121e-05, + "loss": 3.0465, + "step": 2226 + }, + { + "epoch": 0.63, + "learning_rate": 7.534447376136032e-05, + "loss": 2.9261, + "step": 2227 + }, + { + "epoch": 0.63, + "learning_rate": 7.52858399296394e-05, + "loss": 2.9324, + "step": 2228 + }, + { + "epoch": 0.63, + "learning_rate": 7.52272060979185e-05, + "loss": 2.9709, + "step": 2229 + }, + { + "epoch": 0.64, + "learning_rate": 7.51685722661976e-05, + "loss": 3.0211, + "step": 2230 + }, + { + "epoch": 0.64, + "learning_rate": 7.51099384344767e-05, + "loss": 2.9417, + "step": 2231 + }, + { + "epoch": 0.64, + "learning_rate": 7.505130460275579e-05, + "loss": 3.1206, + "step": 2232 + }, + { + "epoch": 0.64, + "learning_rate": 7.499267077103488e-05, + "loss": 3.0087, + "step": 2233 + }, + { + "epoch": 0.64, + "learning_rate": 7.493403693931399e-05, + "loss": 3.0394, + "step": 2234 + }, + { + "epoch": 0.64, + "learning_rate": 7.487540310759309e-05, + "loss": 2.9766, + "step": 2235 + }, + { + "epoch": 0.64, + "learning_rate": 7.481676927587218e-05, + "loss": 3.0468, + "step": 2236 + }, + { + "epoch": 0.64, + "learning_rate": 7.475813544415127e-05, + "loss": 3.0013, + "step": 2237 + }, + { + "epoch": 0.64, + "learning_rate": 7.469950161243038e-05, + "loss": 3.077, + "step": 2238 + }, + { + "epoch": 0.64, + "learning_rate": 7.464086778070948e-05, + "loss": 3.0647, + "step": 2239 + }, + { + "epoch": 0.64, + "learning_rate": 7.458223394898857e-05, + "loss": 2.9289, + "step": 2240 + }, + { + "epoch": 0.64, + "learning_rate": 7.452360011726767e-05, + "loss": 3.1249, + "step": 2241 + }, + { + "epoch": 0.64, + "learning_rate": 7.446496628554677e-05, + "loss": 3.096, + "step": 2242 + }, + { + "epoch": 0.64, + "learning_rate": 7.440633245382587e-05, + "loss": 2.9995, + "step": 2243 + }, + { + "epoch": 0.64, + "learning_rate": 7.434769862210496e-05, + "loss": 3.1667, + "step": 2244 + }, + { + "epoch": 0.64, + "learning_rate": 7.428906479038406e-05, + "loss": 3.0341, + "step": 2245 + }, + { + "epoch": 0.64, + "learning_rate": 7.423043095866315e-05, + "loss": 3.0475, + "step": 2246 + }, + { + "epoch": 0.64, + "learning_rate": 7.417179712694224e-05, + "loss": 3.0424, + "step": 2247 + }, + { + "epoch": 0.64, + "learning_rate": 7.411316329522134e-05, + "loss": 3.0059, + "step": 2248 + }, + { + "epoch": 0.64, + "learning_rate": 7.405452946350045e-05, + "loss": 3.0318, + "step": 2249 + }, + { + "epoch": 0.64, + "learning_rate": 7.399589563177954e-05, + "loss": 3.0965, + "step": 2250 + }, + { + "epoch": 0.64, + "learning_rate": 7.393726180005864e-05, + "loss": 3.0951, + "step": 2251 + }, + { + "epoch": 0.64, + "learning_rate": 7.387862796833773e-05, + "loss": 3.024, + "step": 2252 + }, + { + "epoch": 0.64, + "learning_rate": 7.381999413661684e-05, + "loss": 3.028, + "step": 2253 + }, + { + "epoch": 0.64, + "learning_rate": 7.376136030489593e-05, + "loss": 3.0371, + "step": 2254 + }, + { + "epoch": 0.64, + "learning_rate": 7.370272647317503e-05, + "loss": 2.8833, + "step": 2255 + }, + { + "epoch": 0.64, + "learning_rate": 7.364409264145412e-05, + "loss": 2.9803, + "step": 2256 + }, + { + "epoch": 0.64, + "learning_rate": 7.358545880973322e-05, + "loss": 3.1169, + "step": 2257 + }, + { + "epoch": 0.64, + "learning_rate": 7.352682497801232e-05, + "loss": 3.0181, + "step": 2258 + }, + { + "epoch": 0.64, + "learning_rate": 7.34681911462914e-05, + "loss": 2.974, + "step": 2259 + }, + { + "epoch": 0.64, + "learning_rate": 7.340955731457051e-05, + "loss": 3.0522, + "step": 2260 + }, + { + "epoch": 0.64, + "learning_rate": 7.33509234828496e-05, + "loss": 2.9721, + "step": 2261 + }, + { + "epoch": 0.64, + "learning_rate": 7.329228965112871e-05, + "loss": 3.0173, + "step": 2262 + }, + { + "epoch": 0.64, + "learning_rate": 7.32336558194078e-05, + "loss": 3.1451, + "step": 2263 + }, + { + "epoch": 0.64, + "learning_rate": 7.31750219876869e-05, + "loss": 3.0079, + "step": 2264 + }, + { + "epoch": 0.64, + "learning_rate": 7.3116388155966e-05, + "loss": 3.0314, + "step": 2265 + }, + { + "epoch": 0.65, + "learning_rate": 7.30577543242451e-05, + "loss": 2.9993, + "step": 2266 + }, + { + "epoch": 0.65, + "learning_rate": 7.299912049252419e-05, + "loss": 2.9569, + "step": 2267 + }, + { + "epoch": 0.65, + "learning_rate": 7.294048666080329e-05, + "loss": 2.911, + "step": 2268 + }, + { + "epoch": 0.65, + "learning_rate": 7.288185282908239e-05, + "loss": 3.0419, + "step": 2269 + }, + { + "epoch": 0.65, + "learning_rate": 7.282321899736148e-05, + "loss": 3.0791, + "step": 2270 + }, + { + "epoch": 0.65, + "learning_rate": 7.276458516564058e-05, + "loss": 3.0664, + "step": 2271 + }, + { + "epoch": 0.65, + "learning_rate": 7.270595133391967e-05, + "loss": 3.0604, + "step": 2272 + }, + { + "epoch": 0.65, + "learning_rate": 7.264731750219878e-05, + "loss": 3.0102, + "step": 2273 + }, + { + "epoch": 0.65, + "learning_rate": 7.258868367047786e-05, + "loss": 2.9504, + "step": 2274 + }, + { + "epoch": 0.65, + "learning_rate": 7.253004983875697e-05, + "loss": 3.0398, + "step": 2275 + }, + { + "epoch": 0.65, + "learning_rate": 7.247141600703606e-05, + "loss": 3.0571, + "step": 2276 + }, + { + "epoch": 0.65, + "learning_rate": 7.241278217531517e-05, + "loss": 3.0104, + "step": 2277 + }, + { + "epoch": 0.65, + "learning_rate": 7.235414834359425e-05, + "loss": 3.1231, + "step": 2278 + }, + { + "epoch": 0.65, + "learning_rate": 7.229551451187336e-05, + "loss": 2.9585, + "step": 2279 + }, + { + "epoch": 0.65, + "learning_rate": 7.223688068015245e-05, + "loss": 3.1243, + "step": 2280 + }, + { + "epoch": 0.65, + "learning_rate": 7.217824684843155e-05, + "loss": 2.92, + "step": 2281 + }, + { + "epoch": 0.65, + "learning_rate": 7.211961301671064e-05, + "loss": 2.9059, + "step": 2282 + }, + { + "epoch": 0.65, + "learning_rate": 7.206097918498974e-05, + "loss": 3.1515, + "step": 2283 + }, + { + "epoch": 0.65, + "learning_rate": 7.200234535326884e-05, + "loss": 3.0166, + "step": 2284 + }, + { + "epoch": 0.65, + "learning_rate": 7.194371152154794e-05, + "loss": 3.1146, + "step": 2285 + }, + { + "epoch": 0.65, + "learning_rate": 7.188507768982703e-05, + "loss": 2.9681, + "step": 2286 + }, + { + "epoch": 0.65, + "learning_rate": 7.182644385810613e-05, + "loss": 2.9228, + "step": 2287 + }, + { + "epoch": 0.65, + "learning_rate": 7.176781002638523e-05, + "loss": 3.0504, + "step": 2288 + }, + { + "epoch": 0.65, + "eval_loss": 3.2625436782836914, + "eval_runtime": 2940.0782, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 2.323, + "step": 2288 + }, + { + "epoch": 0.65, + "learning_rate": 7.170917619466433e-05, + "loss": 2.9851, + "step": 2289 + }, + { + "epoch": 0.65, + "learning_rate": 7.165054236294342e-05, + "loss": 2.9923, + "step": 2290 + }, + { + "epoch": 0.65, + "learning_rate": 7.159190853122252e-05, + "loss": 2.9812, + "step": 2291 + }, + { + "epoch": 0.65, + "learning_rate": 7.153327469950162e-05, + "loss": 2.928, + "step": 2292 + }, + { + "epoch": 0.65, + "learning_rate": 7.147464086778072e-05, + "loss": 3.0387, + "step": 2293 + }, + { + "epoch": 0.65, + "learning_rate": 7.141600703605981e-05, + "loss": 3.0022, + "step": 2294 + }, + { + "epoch": 0.65, + "learning_rate": 7.135737320433891e-05, + "loss": 3.096, + "step": 2295 + }, + { + "epoch": 0.65, + "learning_rate": 7.1298739372618e-05, + "loss": 3.0206, + "step": 2296 + }, + { + "epoch": 0.65, + "learning_rate": 7.124010554089711e-05, + "loss": 3.0001, + "step": 2297 + }, + { + "epoch": 0.65, + "learning_rate": 7.118147170917619e-05, + "loss": 2.9541, + "step": 2298 + }, + { + "epoch": 0.65, + "learning_rate": 7.11228378774553e-05, + "loss": 3.0121, + "step": 2299 + }, + { + "epoch": 0.65, + "learning_rate": 7.106420404573439e-05, + "loss": 2.9971, + "step": 2300 + }, + { + "epoch": 0.66, + "learning_rate": 7.100557021401349e-05, + "loss": 3.0368, + "step": 2301 + }, + { + "epoch": 0.66, + "learning_rate": 7.094693638229258e-05, + "loss": 2.9971, + "step": 2302 + }, + { + "epoch": 0.66, + "learning_rate": 7.088830255057169e-05, + "loss": 2.9714, + "step": 2303 + }, + { + "epoch": 0.66, + "learning_rate": 7.082966871885078e-05, + "loss": 2.9774, + "step": 2304 + }, + { + "epoch": 0.66, + "learning_rate": 7.077103488712988e-05, + "loss": 2.9699, + "step": 2305 + }, + { + "epoch": 0.66, + "learning_rate": 7.071240105540897e-05, + "loss": 3.0251, + "step": 2306 + }, + { + "epoch": 0.66, + "learning_rate": 7.065376722368807e-05, + "loss": 3.0374, + "step": 2307 + }, + { + "epoch": 0.66, + "learning_rate": 7.059513339196717e-05, + "loss": 3.0141, + "step": 2308 + }, + { + "epoch": 0.66, + "learning_rate": 7.053649956024626e-05, + "loss": 2.9218, + "step": 2309 + }, + { + "epoch": 0.66, + "learning_rate": 7.047786572852536e-05, + "loss": 2.9611, + "step": 2310 + }, + { + "epoch": 0.66, + "learning_rate": 7.041923189680446e-05, + "loss": 3.0331, + "step": 2311 + }, + { + "epoch": 0.66, + "learning_rate": 7.036059806508357e-05, + "loss": 3.0187, + "step": 2312 + }, + { + "epoch": 0.66, + "learning_rate": 7.030196423336265e-05, + "loss": 2.9778, + "step": 2313 + }, + { + "epoch": 0.66, + "learning_rate": 7.024333040164175e-05, + "loss": 3.031, + "step": 2314 + }, + { + "epoch": 0.66, + "learning_rate": 7.018469656992085e-05, + "loss": 3.0012, + "step": 2315 + }, + { + "epoch": 0.66, + "learning_rate": 7.012606273819994e-05, + "loss": 2.9672, + "step": 2316 + }, + { + "epoch": 0.66, + "learning_rate": 7.006742890647904e-05, + "loss": 2.8362, + "step": 2317 + }, + { + "epoch": 0.66, + "learning_rate": 7.000879507475813e-05, + "loss": 3.0038, + "step": 2318 + }, + { + "epoch": 0.66, + "learning_rate": 6.995016124303724e-05, + "loss": 2.9375, + "step": 2319 + }, + { + "epoch": 0.66, + "learning_rate": 6.989152741131633e-05, + "loss": 3.0321, + "step": 2320 + }, + { + "epoch": 0.66, + "learning_rate": 6.983289357959543e-05, + "loss": 3.0003, + "step": 2321 + }, + { + "epoch": 0.66, + "learning_rate": 6.977425974787452e-05, + "loss": 2.9768, + "step": 2322 + }, + { + "epoch": 0.66, + "learning_rate": 6.971562591615363e-05, + "loss": 3.0315, + "step": 2323 + }, + { + "epoch": 0.66, + "learning_rate": 6.965699208443272e-05, + "loss": 3.0134, + "step": 2324 + }, + { + "epoch": 0.66, + "learning_rate": 6.959835825271182e-05, + "loss": 3.0509, + "step": 2325 + }, + { + "epoch": 0.66, + "learning_rate": 6.953972442099091e-05, + "loss": 3.008, + "step": 2326 + }, + { + "epoch": 0.66, + "learning_rate": 6.948109058927002e-05, + "loss": 2.9663, + "step": 2327 + }, + { + "epoch": 0.66, + "learning_rate": 6.942245675754911e-05, + "loss": 2.8545, + "step": 2328 + }, + { + "epoch": 0.66, + "learning_rate": 6.936382292582821e-05, + "loss": 2.982, + "step": 2329 + }, + { + "epoch": 0.66, + "learning_rate": 6.93051890941073e-05, + "loss": 3.0049, + "step": 2330 + }, + { + "epoch": 0.66, + "learning_rate": 6.92465552623864e-05, + "loss": 3.0179, + "step": 2331 + }, + { + "epoch": 0.66, + "learning_rate": 6.918792143066549e-05, + "loss": 2.9824, + "step": 2332 + }, + { + "epoch": 0.66, + "learning_rate": 6.912928759894459e-05, + "loss": 2.9842, + "step": 2333 + }, + { + "epoch": 0.66, + "learning_rate": 6.90706537672237e-05, + "loss": 3.0041, + "step": 2334 + }, + { + "epoch": 0.66, + "learning_rate": 6.901201993550279e-05, + "loss": 2.9466, + "step": 2335 + }, + { + "epoch": 0.67, + "learning_rate": 6.895338610378188e-05, + "loss": 3.0105, + "step": 2336 + }, + { + "epoch": 0.67, + "learning_rate": 6.889475227206098e-05, + "loss": 2.9508, + "step": 2337 + }, + { + "epoch": 0.67, + "learning_rate": 6.883611844034009e-05, + "loss": 2.911, + "step": 2338 + }, + { + "epoch": 0.67, + "learning_rate": 6.877748460861918e-05, + "loss": 3.0504, + "step": 2339 + }, + { + "epoch": 0.67, + "learning_rate": 6.871885077689827e-05, + "loss": 3.1059, + "step": 2340 + }, + { + "epoch": 0.67, + "learning_rate": 6.866021694517737e-05, + "loss": 3.1035, + "step": 2341 + }, + { + "epoch": 0.67, + "learning_rate": 6.860158311345646e-05, + "loss": 3.0156, + "step": 2342 + }, + { + "epoch": 0.67, + "learning_rate": 6.854294928173557e-05, + "loss": 3.033, + "step": 2343 + }, + { + "epoch": 0.67, + "learning_rate": 6.848431545001465e-05, + "loss": 3.0288, + "step": 2344 + }, + { + "epoch": 0.67, + "learning_rate": 6.842568161829376e-05, + "loss": 2.9321, + "step": 2345 + }, + { + "epoch": 0.67, + "learning_rate": 6.836704778657285e-05, + "loss": 3.012, + "step": 2346 + }, + { + "epoch": 0.67, + "learning_rate": 6.830841395485196e-05, + "loss": 2.9936, + "step": 2347 + }, + { + "epoch": 0.67, + "learning_rate": 6.824978012313104e-05, + "loss": 2.868, + "step": 2348 + }, + { + "epoch": 0.67, + "learning_rate": 6.819114629141015e-05, + "loss": 3.0964, + "step": 2349 + }, + { + "epoch": 0.67, + "learning_rate": 6.813251245968924e-05, + "loss": 3.035, + "step": 2350 + }, + { + "epoch": 0.67, + "learning_rate": 6.807387862796835e-05, + "loss": 3.0535, + "step": 2351 + }, + { + "epoch": 0.67, + "learning_rate": 6.801524479624743e-05, + "loss": 3.0401, + "step": 2352 + }, + { + "epoch": 0.67, + "learning_rate": 6.795661096452654e-05, + "loss": 3.0324, + "step": 2353 + }, + { + "epoch": 0.67, + "learning_rate": 6.789797713280563e-05, + "loss": 3.0138, + "step": 2354 + }, + { + "epoch": 0.67, + "learning_rate": 6.783934330108473e-05, + "loss": 3.0568, + "step": 2355 + }, + { + "epoch": 0.67, + "learning_rate": 6.778070946936382e-05, + "loss": 3.1124, + "step": 2356 + }, + { + "epoch": 0.67, + "learning_rate": 6.772207563764292e-05, + "loss": 2.9387, + "step": 2357 + }, + { + "epoch": 0.67, + "learning_rate": 6.766344180592203e-05, + "loss": 3.0234, + "step": 2358 + }, + { + "epoch": 0.67, + "learning_rate": 6.76048079742011e-05, + "loss": 2.8785, + "step": 2359 + }, + { + "epoch": 0.67, + "learning_rate": 6.754617414248021e-05, + "loss": 3.0862, + "step": 2360 + }, + { + "epoch": 0.67, + "learning_rate": 6.748754031075931e-05, + "loss": 2.997, + "step": 2361 + }, + { + "epoch": 0.67, + "learning_rate": 6.742890647903842e-05, + "loss": 2.9434, + "step": 2362 + }, + { + "epoch": 0.67, + "learning_rate": 6.73702726473175e-05, + "loss": 3.0576, + "step": 2363 + }, + { + "epoch": 0.67, + "learning_rate": 6.73116388155966e-05, + "loss": 2.9594, + "step": 2364 + }, + { + "epoch": 0.67, + "learning_rate": 6.72530049838757e-05, + "loss": 3.0256, + "step": 2365 + }, + { + "epoch": 0.67, + "learning_rate": 6.71943711521548e-05, + "loss": 3.0172, + "step": 2366 + }, + { + "epoch": 0.67, + "learning_rate": 6.713573732043389e-05, + "loss": 3.0303, + "step": 2367 + }, + { + "epoch": 0.67, + "learning_rate": 6.707710348871298e-05, + "loss": 3.0042, + "step": 2368 + }, + { + "epoch": 0.67, + "learning_rate": 6.701846965699209e-05, + "loss": 3.0087, + "step": 2369 + }, + { + "epoch": 0.67, + "learning_rate": 6.695983582527118e-05, + "loss": 3.0963, + "step": 2370 + }, + { + "epoch": 0.68, + "learning_rate": 6.690120199355028e-05, + "loss": 2.9385, + "step": 2371 + }, + { + "epoch": 0.68, + "learning_rate": 6.684256816182937e-05, + "loss": 3.026, + "step": 2372 + }, + { + "epoch": 0.68, + "learning_rate": 6.678393433010848e-05, + "loss": 2.9379, + "step": 2373 + }, + { + "epoch": 0.68, + "learning_rate": 6.672530049838758e-05, + "loss": 3.0521, + "step": 2374 + }, + { + "epoch": 0.68, + "learning_rate": 6.666666666666667e-05, + "loss": 3.0137, + "step": 2375 + }, + { + "epoch": 0.68, + "learning_rate": 6.660803283494576e-05, + "loss": 3.0204, + "step": 2376 + }, + { + "epoch": 0.68, + "learning_rate": 6.654939900322487e-05, + "loss": 3.004, + "step": 2377 + }, + { + "epoch": 0.68, + "learning_rate": 6.649076517150397e-05, + "loss": 2.9018, + "step": 2378 + }, + { + "epoch": 0.68, + "learning_rate": 6.643213133978306e-05, + "loss": 2.8955, + "step": 2379 + }, + { + "epoch": 0.68, + "learning_rate": 6.637349750806216e-05, + "loss": 3.0233, + "step": 2380 + }, + { + "epoch": 0.68, + "learning_rate": 6.631486367634125e-05, + "loss": 3.008, + "step": 2381 + }, + { + "epoch": 0.68, + "learning_rate": 6.625622984462036e-05, + "loss": 2.9618, + "step": 2382 + }, + { + "epoch": 0.68, + "learning_rate": 6.619759601289944e-05, + "loss": 2.8988, + "step": 2383 + }, + { + "epoch": 0.68, + "learning_rate": 6.613896218117855e-05, + "loss": 3.0136, + "step": 2384 + }, + { + "epoch": 0.68, + "learning_rate": 6.608032834945764e-05, + "loss": 2.952, + "step": 2385 + }, + { + "epoch": 0.68, + "learning_rate": 6.602169451773673e-05, + "loss": 2.9119, + "step": 2386 + }, + { + "epoch": 0.68, + "learning_rate": 6.596306068601583e-05, + "loss": 2.9193, + "step": 2387 + }, + { + "epoch": 0.68, + "learning_rate": 6.590442685429494e-05, + "loss": 3.0116, + "step": 2388 + }, + { + "epoch": 0.68, + "learning_rate": 6.584579302257403e-05, + "loss": 2.9762, + "step": 2389 + }, + { + "epoch": 0.68, + "learning_rate": 6.578715919085313e-05, + "loss": 3.0246, + "step": 2390 + }, + { + "epoch": 0.68, + "learning_rate": 6.572852535913222e-05, + "loss": 3.1, + "step": 2391 + }, + { + "epoch": 0.68, + "learning_rate": 6.566989152741131e-05, + "loss": 2.9515, + "step": 2392 + }, + { + "epoch": 0.68, + "learning_rate": 6.561125769569042e-05, + "loss": 3.0162, + "step": 2393 + }, + { + "epoch": 0.68, + "learning_rate": 6.55526238639695e-05, + "loss": 2.9888, + "step": 2394 + }, + { + "epoch": 0.68, + "learning_rate": 6.549399003224861e-05, + "loss": 2.9035, + "step": 2395 + }, + { + "epoch": 0.68, + "learning_rate": 6.54353562005277e-05, + "loss": 2.9091, + "step": 2396 + }, + { + "epoch": 0.68, + "learning_rate": 6.537672236880681e-05, + "loss": 2.9632, + "step": 2397 + }, + { + "epoch": 0.68, + "learning_rate": 6.53180885370859e-05, + "loss": 2.9739, + "step": 2398 + }, + { + "epoch": 0.68, + "learning_rate": 6.5259454705365e-05, + "loss": 3.0176, + "step": 2399 + }, + { + "epoch": 0.68, + "learning_rate": 6.52008208736441e-05, + "loss": 3.131, + "step": 2400 + }, + { + "epoch": 0.68, + "learning_rate": 6.51421870419232e-05, + "loss": 2.9436, + "step": 2401 + }, + { + "epoch": 0.68, + "learning_rate": 6.508355321020228e-05, + "loss": 2.9454, + "step": 2402 + }, + { + "epoch": 0.68, + "learning_rate": 6.502491937848139e-05, + "loss": 2.8289, + "step": 2403 + }, + { + "epoch": 0.68, + "learning_rate": 6.496628554676049e-05, + "loss": 3.1042, + "step": 2404 + }, + { + "epoch": 0.68, + "learning_rate": 6.490765171503958e-05, + "loss": 2.8591, + "step": 2405 + }, + { + "epoch": 0.69, + "learning_rate": 6.484901788331868e-05, + "loss": 3.0201, + "step": 2406 + }, + { + "epoch": 0.69, + "learning_rate": 6.479038405159777e-05, + "loss": 2.9499, + "step": 2407 + }, + { + "epoch": 0.69, + "learning_rate": 6.473175021987688e-05, + "loss": 2.9972, + "step": 2408 + }, + { + "epoch": 0.69, + "learning_rate": 6.467311638815597e-05, + "loss": 3.1087, + "step": 2409 + }, + { + "epoch": 0.69, + "learning_rate": 6.461448255643507e-05, + "loss": 3.0046, + "step": 2410 + }, + { + "epoch": 0.69, + "learning_rate": 6.455584872471416e-05, + "loss": 3.0949, + "step": 2411 + }, + { + "epoch": 0.69, + "learning_rate": 6.449721489299327e-05, + "loss": 3.0089, + "step": 2412 + }, + { + "epoch": 0.69, + "learning_rate": 6.443858106127235e-05, + "loss": 3.0895, + "step": 2413 + }, + { + "epoch": 0.69, + "learning_rate": 6.437994722955146e-05, + "loss": 3.0326, + "step": 2414 + }, + { + "epoch": 0.69, + "learning_rate": 6.432131339783055e-05, + "loss": 2.9201, + "step": 2415 + }, + { + "epoch": 0.69, + "learning_rate": 6.426267956610965e-05, + "loss": 2.9691, + "step": 2416 + }, + { + "epoch": 0.69, + "learning_rate": 6.420404573438874e-05, + "loss": 2.9328, + "step": 2417 + }, + { + "epoch": 0.69, + "learning_rate": 6.414541190266783e-05, + "loss": 2.9624, + "step": 2418 + }, + { + "epoch": 0.69, + "learning_rate": 6.408677807094694e-05, + "loss": 2.9547, + "step": 2419 + }, + { + "epoch": 0.69, + "learning_rate": 6.402814423922604e-05, + "loss": 2.9956, + "step": 2420 + }, + { + "epoch": 0.69, + "learning_rate": 6.396951040750513e-05, + "loss": 2.9702, + "step": 2421 + }, + { + "epoch": 0.69, + "learning_rate": 6.391087657578422e-05, + "loss": 3.004, + "step": 2422 + }, + { + "epoch": 0.69, + "learning_rate": 6.385224274406333e-05, + "loss": 2.9308, + "step": 2423 + }, + { + "epoch": 0.69, + "learning_rate": 6.379360891234243e-05, + "loss": 2.9878, + "step": 2424 + }, + { + "epoch": 0.69, + "learning_rate": 6.373497508062152e-05, + "loss": 3.0244, + "step": 2425 + }, + { + "epoch": 0.69, + "learning_rate": 6.367634124890062e-05, + "loss": 2.9162, + "step": 2426 + }, + { + "epoch": 0.69, + "learning_rate": 6.361770741717972e-05, + "loss": 2.9459, + "step": 2427 + }, + { + "epoch": 0.69, + "learning_rate": 6.355907358545882e-05, + "loss": 2.9241, + "step": 2428 + }, + { + "epoch": 0.69, + "learning_rate": 6.350043975373791e-05, + "loss": 2.9893, + "step": 2429 + }, + { + "epoch": 0.69, + "learning_rate": 6.3441805922017e-05, + "loss": 3.0918, + "step": 2430 + }, + { + "epoch": 0.69, + "learning_rate": 6.33831720902961e-05, + "loss": 2.9185, + "step": 2431 + }, + { + "epoch": 0.69, + "learning_rate": 6.332453825857521e-05, + "loss": 3.0776, + "step": 2432 + }, + { + "epoch": 0.69, + "learning_rate": 6.326590442685429e-05, + "loss": 3.0014, + "step": 2433 + }, + { + "epoch": 0.69, + "learning_rate": 6.32072705951334e-05, + "loss": 2.9948, + "step": 2434 + }, + { + "epoch": 0.69, + "learning_rate": 6.314863676341249e-05, + "loss": 2.8847, + "step": 2435 + }, + { + "epoch": 0.69, + "learning_rate": 6.30900029316916e-05, + "loss": 2.9817, + "step": 2436 + }, + { + "epoch": 0.69, + "learning_rate": 6.303136909997068e-05, + "loss": 2.8848, + "step": 2437 + }, + { + "epoch": 0.69, + "learning_rate": 6.297273526824979e-05, + "loss": 3.1049, + "step": 2438 + }, + { + "epoch": 0.69, + "learning_rate": 6.291410143652888e-05, + "loss": 3.028, + "step": 2439 + }, + { + "epoch": 0.69, + "learning_rate": 6.285546760480798e-05, + "loss": 3.0241, + "step": 2440 + }, + { + "epoch": 0.7, + "learning_rate": 6.279683377308707e-05, + "loss": 2.9669, + "step": 2441 + }, + { + "epoch": 0.7, + "learning_rate": 6.273819994136617e-05, + "loss": 2.9753, + "step": 2442 + }, + { + "epoch": 0.7, + "learning_rate": 6.267956610964527e-05, + "loss": 3.0769, + "step": 2443 + }, + { + "epoch": 0.7, + "learning_rate": 6.262093227792435e-05, + "loss": 3.095, + "step": 2444 + }, + { + "epoch": 0.7, + "learning_rate": 6.256229844620346e-05, + "loss": 3.0358, + "step": 2445 + }, + { + "epoch": 0.7, + "learning_rate": 6.250366461448256e-05, + "loss": 3.009, + "step": 2446 + }, + { + "epoch": 0.7, + "learning_rate": 6.244503078276166e-05, + "loss": 2.9533, + "step": 2447 + }, + { + "epoch": 0.7, + "learning_rate": 6.238639695104074e-05, + "loss": 2.9614, + "step": 2448 + }, + { + "epoch": 0.7, + "learning_rate": 6.232776311931985e-05, + "loss": 2.9125, + "step": 2449 + }, + { + "epoch": 0.7, + "learning_rate": 6.226912928759895e-05, + "loss": 3.0652, + "step": 2450 + }, + { + "epoch": 0.7, + "learning_rate": 6.221049545587805e-05, + "loss": 3.0921, + "step": 2451 + }, + { + "epoch": 0.7, + "learning_rate": 6.215186162415714e-05, + "loss": 2.9856, + "step": 2452 + }, + { + "epoch": 0.7, + "learning_rate": 6.209322779243624e-05, + "loss": 2.9609, + "step": 2453 + }, + { + "epoch": 0.7, + "learning_rate": 6.203459396071534e-05, + "loss": 3.035, + "step": 2454 + }, + { + "epoch": 0.7, + "learning_rate": 6.197596012899443e-05, + "loss": 2.9847, + "step": 2455 + }, + { + "epoch": 0.7, + "learning_rate": 6.191732629727353e-05, + "loss": 3.0316, + "step": 2456 + }, + { + "epoch": 0.7, + "learning_rate": 6.185869246555262e-05, + "loss": 2.9794, + "step": 2457 + }, + { + "epoch": 0.7, + "learning_rate": 6.180005863383173e-05, + "loss": 2.9552, + "step": 2458 + }, + { + "epoch": 0.7, + "learning_rate": 6.174142480211082e-05, + "loss": 2.9213, + "step": 2459 + }, + { + "epoch": 0.7, + "learning_rate": 6.168279097038992e-05, + "loss": 3.0125, + "step": 2460 + }, + { + "epoch": 0.7, + "learning_rate": 6.162415713866901e-05, + "loss": 2.9652, + "step": 2461 + }, + { + "epoch": 0.7, + "learning_rate": 6.156552330694812e-05, + "loss": 3.0152, + "step": 2462 + }, + { + "epoch": 0.7, + "learning_rate": 6.150688947522721e-05, + "loss": 3.0937, + "step": 2463 + }, + { + "epoch": 0.7, + "learning_rate": 6.144825564350631e-05, + "loss": 2.9314, + "step": 2464 + }, + { + "epoch": 0.7, + "eval_loss": 3.2398521900177, + "eval_runtime": 2939.9576, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 2.323, + "step": 2464 + }, + { + "epoch": 0.7, + "learning_rate": 6.13896218117854e-05, + "loss": 2.972, + "step": 2465 + }, + { + "epoch": 0.7, + "learning_rate": 6.13309879800645e-05, + "loss": 3.0006, + "step": 2466 + }, + { + "epoch": 0.7, + "learning_rate": 6.12723541483436e-05, + "loss": 2.9709, + "step": 2467 + }, + { + "epoch": 0.7, + "learning_rate": 6.121372031662269e-05, + "loss": 3.0649, + "step": 2468 + }, + { + "epoch": 0.7, + "learning_rate": 6.11550864849018e-05, + "loss": 2.8472, + "step": 2469 + }, + { + "epoch": 0.7, + "learning_rate": 6.109645265318089e-05, + "loss": 2.9919, + "step": 2470 + }, + { + "epoch": 0.7, + "learning_rate": 6.103781882145998e-05, + "loss": 2.9293, + "step": 2471 + }, + { + "epoch": 0.7, + "learning_rate": 6.097918498973908e-05, + "loss": 2.9208, + "step": 2472 + }, + { + "epoch": 0.7, + "learning_rate": 6.092055115801818e-05, + "loss": 2.9485, + "step": 2473 + }, + { + "epoch": 0.7, + "learning_rate": 6.086191732629728e-05, + "loss": 2.8891, + "step": 2474 + }, + { + "epoch": 0.7, + "learning_rate": 6.0803283494576366e-05, + "loss": 2.9678, + "step": 2475 + }, + { + "epoch": 0.71, + "learning_rate": 6.074464966285547e-05, + "loss": 2.9454, + "step": 2476 + }, + { + "epoch": 0.71, + "learning_rate": 6.068601583113457e-05, + "loss": 2.9956, + "step": 2477 + }, + { + "epoch": 0.71, + "learning_rate": 6.062738199941367e-05, + "loss": 2.9863, + "step": 2478 + }, + { + "epoch": 0.71, + "learning_rate": 6.056874816769276e-05, + "loss": 2.9921, + "step": 2479 + }, + { + "epoch": 0.71, + "learning_rate": 6.051011433597186e-05, + "loss": 3.0104, + "step": 2480 + }, + { + "epoch": 0.71, + "learning_rate": 6.045148050425096e-05, + "loss": 2.8295, + "step": 2481 + }, + { + "epoch": 0.71, + "learning_rate": 6.039284667253006e-05, + "loss": 2.97, + "step": 2482 + }, + { + "epoch": 0.71, + "learning_rate": 6.033421284080915e-05, + "loss": 3.0534, + "step": 2483 + }, + { + "epoch": 0.71, + "learning_rate": 6.027557900908825e-05, + "loss": 2.9503, + "step": 2484 + }, + { + "epoch": 0.71, + "learning_rate": 6.021694517736734e-05, + "loss": 2.9844, + "step": 2485 + }, + { + "epoch": 0.71, + "learning_rate": 6.0158311345646444e-05, + "loss": 2.8801, + "step": 2486 + }, + { + "epoch": 0.71, + "learning_rate": 6.009967751392553e-05, + "loss": 2.9881, + "step": 2487 + }, + { + "epoch": 0.71, + "learning_rate": 6.004104368220463e-05, + "loss": 2.8921, + "step": 2488 + }, + { + "epoch": 0.71, + "learning_rate": 5.9982409850483734e-05, + "loss": 2.9311, + "step": 2489 + }, + { + "epoch": 0.71, + "learning_rate": 5.9923776018762835e-05, + "loss": 2.9632, + "step": 2490 + }, + { + "epoch": 0.71, + "learning_rate": 5.986514218704192e-05, + "loss": 3.0163, + "step": 2491 + }, + { + "epoch": 0.71, + "learning_rate": 5.9806508355321024e-05, + "loss": 3.016, + "step": 2492 + }, + { + "epoch": 0.71, + "learning_rate": 5.9747874523600125e-05, + "loss": 2.9402, + "step": 2493 + }, + { + "epoch": 0.71, + "learning_rate": 5.968924069187922e-05, + "loss": 2.962, + "step": 2494 + }, + { + "epoch": 0.71, + "learning_rate": 5.963060686015831e-05, + "loss": 2.9942, + "step": 2495 + }, + { + "epoch": 0.71, + "learning_rate": 5.9571973028437414e-05, + "loss": 2.9702, + "step": 2496 + }, + { + "epoch": 0.71, + "learning_rate": 5.951333919671651e-05, + "loss": 3.0159, + "step": 2497 + }, + { + "epoch": 0.71, + "learning_rate": 5.94547053649956e-05, + "loss": 2.9585, + "step": 2498 + }, + { + "epoch": 0.71, + "learning_rate": 5.93960715332747e-05, + "loss": 2.919, + "step": 2499 + }, + { + "epoch": 0.71, + "learning_rate": 5.93374377015538e-05, + "loss": 2.9407, + "step": 2500 + }, + { + "epoch": 0.71, + "learning_rate": 5.92788038698329e-05, + "loss": 2.973, + "step": 2501 + }, + { + "epoch": 0.71, + "learning_rate": 5.922017003811199e-05, + "loss": 2.9339, + "step": 2502 + }, + { + "epoch": 0.71, + "learning_rate": 5.916153620639109e-05, + "loss": 3.0156, + "step": 2503 + }, + { + "epoch": 0.71, + "learning_rate": 5.910290237467019e-05, + "loss": 2.9989, + "step": 2504 + }, + { + "epoch": 0.71, + "learning_rate": 5.904426854294929e-05, + "loss": 2.9483, + "step": 2505 + }, + { + "epoch": 0.71, + "learning_rate": 5.898563471122838e-05, + "loss": 2.9201, + "step": 2506 + }, + { + "epoch": 0.71, + "learning_rate": 5.892700087950748e-05, + "loss": 3.0595, + "step": 2507 + }, + { + "epoch": 0.71, + "learning_rate": 5.886836704778657e-05, + "loss": 3.0191, + "step": 2508 + }, + { + "epoch": 0.71, + "learning_rate": 5.8809733216065674e-05, + "loss": 2.9978, + "step": 2509 + }, + { + "epoch": 0.71, + "learning_rate": 5.875109938434477e-05, + "loss": 3.0281, + "step": 2510 + }, + { + "epoch": 0.72, + "learning_rate": 5.869246555262386e-05, + "loss": 3.0379, + "step": 2511 + }, + { + "epoch": 0.72, + "learning_rate": 5.8633831720902964e-05, + "loss": 2.922, + "step": 2512 + }, + { + "epoch": 0.72, + "learning_rate": 5.8575197889182065e-05, + "loss": 2.9158, + "step": 2513 + }, + { + "epoch": 0.72, + "learning_rate": 5.851656405746115e-05, + "loss": 3.0778, + "step": 2514 + }, + { + "epoch": 0.72, + "learning_rate": 5.8457930225740254e-05, + "loss": 3.0146, + "step": 2515 + }, + { + "epoch": 0.72, + "learning_rate": 5.8399296394019355e-05, + "loss": 2.9351, + "step": 2516 + }, + { + "epoch": 0.72, + "learning_rate": 5.8340662562298456e-05, + "loss": 2.8747, + "step": 2517 + }, + { + "epoch": 0.72, + "learning_rate": 5.8282028730577544e-05, + "loss": 2.9747, + "step": 2518 + }, + { + "epoch": 0.72, + "learning_rate": 5.8223394898856645e-05, + "loss": 3.0039, + "step": 2519 + }, + { + "epoch": 0.72, + "learning_rate": 5.816476106713574e-05, + "loss": 2.9437, + "step": 2520 + }, + { + "epoch": 0.72, + "learning_rate": 5.810612723541484e-05, + "loss": 2.8976, + "step": 2521 + }, + { + "epoch": 0.72, + "learning_rate": 5.804749340369393e-05, + "loss": 2.9928, + "step": 2522 + }, + { + "epoch": 0.72, + "learning_rate": 5.798885957197303e-05, + "loss": 2.9574, + "step": 2523 + }, + { + "epoch": 0.72, + "learning_rate": 5.793022574025213e-05, + "loss": 2.9724, + "step": 2524 + }, + { + "epoch": 0.72, + "learning_rate": 5.787159190853123e-05, + "loss": 2.9679, + "step": 2525 + }, + { + "epoch": 0.72, + "learning_rate": 5.781295807681032e-05, + "loss": 3.0835, + "step": 2526 + }, + { + "epoch": 0.72, + "learning_rate": 5.775432424508942e-05, + "loss": 3.0269, + "step": 2527 + }, + { + "epoch": 0.72, + "learning_rate": 5.769569041336852e-05, + "loss": 2.991, + "step": 2528 + }, + { + "epoch": 0.72, + "learning_rate": 5.763705658164761e-05, + "loss": 3.0316, + "step": 2529 + }, + { + "epoch": 0.72, + "learning_rate": 5.757842274992671e-05, + "loss": 3.0771, + "step": 2530 + }, + { + "epoch": 0.72, + "learning_rate": 5.751978891820581e-05, + "loss": 2.996, + "step": 2531 + }, + { + "epoch": 0.72, + "learning_rate": 5.7461155086484905e-05, + "loss": 3.0538, + "step": 2532 + }, + { + "epoch": 0.72, + "learning_rate": 5.7402521254764e-05, + "loss": 2.9899, + "step": 2533 + }, + { + "epoch": 0.72, + "learning_rate": 5.7343887423043093e-05, + "loss": 2.9144, + "step": 2534 + }, + { + "epoch": 0.72, + "learning_rate": 5.7285253591322195e-05, + "loss": 2.973, + "step": 2535 + }, + { + "epoch": 0.72, + "learning_rate": 5.7226619759601296e-05, + "loss": 2.9375, + "step": 2536 + }, + { + "epoch": 0.72, + "learning_rate": 5.716798592788038e-05, + "loss": 3.0128, + "step": 2537 + }, + { + "epoch": 0.72, + "learning_rate": 5.7109352096159484e-05, + "loss": 3.0526, + "step": 2538 + }, + { + "epoch": 0.72, + "learning_rate": 5.7050718264438585e-05, + "loss": 2.9862, + "step": 2539 + }, + { + "epoch": 0.72, + "learning_rate": 5.6992084432717686e-05, + "loss": 2.9249, + "step": 2540 + }, + { + "epoch": 0.72, + "learning_rate": 5.6933450600996774e-05, + "loss": 2.9432, + "step": 2541 + }, + { + "epoch": 0.72, + "learning_rate": 5.6874816769275875e-05, + "loss": 2.945, + "step": 2542 + }, + { + "epoch": 0.72, + "learning_rate": 5.6816182937554976e-05, + "loss": 2.9054, + "step": 2543 + }, + { + "epoch": 0.72, + "learning_rate": 5.675754910583407e-05, + "loss": 3.0417, + "step": 2544 + }, + { + "epoch": 0.72, + "learning_rate": 5.6698915274113165e-05, + "loss": 2.93, + "step": 2545 + }, + { + "epoch": 0.72, + "learning_rate": 5.664028144239226e-05, + "loss": 2.9802, + "step": 2546 + }, + { + "epoch": 0.73, + "learning_rate": 5.658164761067136e-05, + "loss": 2.9032, + "step": 2547 + }, + { + "epoch": 0.73, + "learning_rate": 5.652301377895046e-05, + "loss": 3.063, + "step": 2548 + }, + { + "epoch": 0.73, + "learning_rate": 5.646437994722955e-05, + "loss": 2.9094, + "step": 2549 + }, + { + "epoch": 0.73, + "learning_rate": 5.640574611550865e-05, + "loss": 2.8761, + "step": 2550 + }, + { + "epoch": 0.73, + "learning_rate": 5.634711228378775e-05, + "loss": 2.8374, + "step": 2551 + }, + { + "epoch": 0.73, + "learning_rate": 5.628847845206685e-05, + "loss": 2.9685, + "step": 2552 + }, + { + "epoch": 0.73, + "learning_rate": 5.622984462034594e-05, + "loss": 2.9527, + "step": 2553 + }, + { + "epoch": 0.73, + "learning_rate": 5.617121078862504e-05, + "loss": 2.9442, + "step": 2554 + }, + { + "epoch": 0.73, + "learning_rate": 5.611257695690414e-05, + "loss": 3.0894, + "step": 2555 + }, + { + "epoch": 0.73, + "learning_rate": 5.605394312518323e-05, + "loss": 3.0, + "step": 2556 + }, + { + "epoch": 0.73, + "learning_rate": 5.599530929346233e-05, + "loss": 2.9207, + "step": 2557 + }, + { + "epoch": 0.73, + "learning_rate": 5.5936675461741425e-05, + "loss": 2.918, + "step": 2558 + }, + { + "epoch": 0.73, + "learning_rate": 5.5878041630020526e-05, + "loss": 3.0187, + "step": 2559 + }, + { + "epoch": 0.73, + "learning_rate": 5.5819407798299613e-05, + "loss": 2.9735, + "step": 2560 + }, + { + "epoch": 0.73, + "learning_rate": 5.5760773966578715e-05, + "loss": 3.0096, + "step": 2561 + }, + { + "epoch": 0.73, + "learning_rate": 5.5702140134857816e-05, + "loss": 2.9933, + "step": 2562 + }, + { + "epoch": 0.73, + "learning_rate": 5.564350630313692e-05, + "loss": 2.9946, + "step": 2563 + }, + { + "epoch": 0.73, + "learning_rate": 5.5584872471416004e-05, + "loss": 3.0202, + "step": 2564 + }, + { + "epoch": 0.73, + "learning_rate": 5.5526238639695105e-05, + "loss": 2.8782, + "step": 2565 + }, + { + "epoch": 0.73, + "learning_rate": 5.5467604807974206e-05, + "loss": 3.0155, + "step": 2566 + }, + { + "epoch": 0.73, + "learning_rate": 5.540897097625331e-05, + "loss": 3.0636, + "step": 2567 + }, + { + "epoch": 0.73, + "learning_rate": 5.5350337144532395e-05, + "loss": 3.0327, + "step": 2568 + }, + { + "epoch": 0.73, + "learning_rate": 5.5291703312811496e-05, + "loss": 2.9718, + "step": 2569 + }, + { + "epoch": 0.73, + "learning_rate": 5.523306948109059e-05, + "loss": 2.9151, + "step": 2570 + }, + { + "epoch": 0.73, + "learning_rate": 5.517443564936969e-05, + "loss": 3.0394, + "step": 2571 + }, + { + "epoch": 0.73, + "learning_rate": 5.511580181764878e-05, + "loss": 2.9542, + "step": 2572 + }, + { + "epoch": 0.73, + "learning_rate": 5.505716798592788e-05, + "loss": 2.9608, + "step": 2573 + }, + { + "epoch": 0.73, + "learning_rate": 5.499853415420698e-05, + "loss": 2.901, + "step": 2574 + }, + { + "epoch": 0.73, + "learning_rate": 5.493990032248608e-05, + "loss": 2.9036, + "step": 2575 + }, + { + "epoch": 0.73, + "learning_rate": 5.488126649076517e-05, + "loss": 2.9425, + "step": 2576 + }, + { + "epoch": 0.73, + "learning_rate": 5.482263265904427e-05, + "loss": 2.9666, + "step": 2577 + }, + { + "epoch": 0.73, + "learning_rate": 5.476399882732337e-05, + "loss": 2.9727, + "step": 2578 + }, + { + "epoch": 0.73, + "learning_rate": 5.470536499560247e-05, + "loss": 2.9093, + "step": 2579 + }, + { + "epoch": 0.73, + "learning_rate": 5.464673116388156e-05, + "loss": 2.9456, + "step": 2580 + }, + { + "epoch": 0.73, + "learning_rate": 5.458809733216066e-05, + "loss": 2.9856, + "step": 2581 + }, + { + "epoch": 0.74, + "learning_rate": 5.4529463500439756e-05, + "loss": 3.0115, + "step": 2582 + }, + { + "epoch": 0.74, + "learning_rate": 5.447082966871885e-05, + "loss": 2.9719, + "step": 2583 + }, + { + "epoch": 0.74, + "learning_rate": 5.4412195836997945e-05, + "loss": 2.996, + "step": 2584 + }, + { + "epoch": 0.74, + "learning_rate": 5.4353562005277046e-05, + "loss": 3.067, + "step": 2585 + }, + { + "epoch": 0.74, + "learning_rate": 5.429492817355615e-05, + "loss": 3.0018, + "step": 2586 + }, + { + "epoch": 0.74, + "learning_rate": 5.4236294341835235e-05, + "loss": 2.9288, + "step": 2587 + }, + { + "epoch": 0.74, + "learning_rate": 5.4177660510114336e-05, + "loss": 2.9967, + "step": 2588 + }, + { + "epoch": 0.74, + "learning_rate": 5.411902667839344e-05, + "loss": 2.9905, + "step": 2589 + }, + { + "epoch": 0.74, + "learning_rate": 5.406039284667254e-05, + "loss": 3.0039, + "step": 2590 + }, + { + "epoch": 0.74, + "learning_rate": 5.4001759014951625e-05, + "loss": 2.9316, + "step": 2591 + }, + { + "epoch": 0.74, + "learning_rate": 5.3943125183230727e-05, + "loss": 2.9999, + "step": 2592 + }, + { + "epoch": 0.74, + "learning_rate": 5.388449135150983e-05, + "loss": 2.9992, + "step": 2593 + }, + { + "epoch": 0.74, + "learning_rate": 5.382585751978892e-05, + "loss": 2.9757, + "step": 2594 + }, + { + "epoch": 0.74, + "learning_rate": 5.3767223688068016e-05, + "loss": 2.8192, + "step": 2595 + }, + { + "epoch": 0.74, + "learning_rate": 5.370858985634711e-05, + "loss": 2.8452, + "step": 2596 + }, + { + "epoch": 0.74, + "learning_rate": 5.364995602462621e-05, + "loss": 2.8474, + "step": 2597 + }, + { + "epoch": 0.74, + "learning_rate": 5.359132219290531e-05, + "loss": 2.8424, + "step": 2598 + }, + { + "epoch": 0.74, + "learning_rate": 5.35326883611844e-05, + "loss": 3.046, + "step": 2599 + }, + { + "epoch": 0.74, + "learning_rate": 5.34740545294635e-05, + "loss": 2.914, + "step": 2600 + }, + { + "epoch": 0.74, + "learning_rate": 5.34154206977426e-05, + "loss": 2.9644, + "step": 2601 + }, + { + "epoch": 0.74, + "learning_rate": 5.3356786866021704e-05, + "loss": 2.9215, + "step": 2602 + }, + { + "epoch": 0.74, + "learning_rate": 5.329815303430079e-05, + "loss": 2.872, + "step": 2603 + }, + { + "epoch": 0.74, + "learning_rate": 5.323951920257989e-05, + "loss": 2.9656, + "step": 2604 + }, + { + "epoch": 0.74, + "learning_rate": 5.318088537085899e-05, + "loss": 2.9664, + "step": 2605 + }, + { + "epoch": 0.74, + "learning_rate": 5.312225153913809e-05, + "loss": 2.8479, + "step": 2606 + }, + { + "epoch": 0.74, + "learning_rate": 5.306361770741718e-05, + "loss": 2.9984, + "step": 2607 + }, + { + "epoch": 0.74, + "learning_rate": 5.3004983875696276e-05, + "loss": 3.0044, + "step": 2608 + }, + { + "epoch": 0.74, + "learning_rate": 5.294635004397538e-05, + "loss": 3.0522, + "step": 2609 + }, + { + "epoch": 0.74, + "learning_rate": 5.2887716212254465e-05, + "loss": 2.9028, + "step": 2610 + }, + { + "epoch": 0.74, + "learning_rate": 5.2829082380533566e-05, + "loss": 2.9666, + "step": 2611 + }, + { + "epoch": 0.74, + "learning_rate": 5.277044854881267e-05, + "loss": 2.948, + "step": 2612 + }, + { + "epoch": 0.74, + "learning_rate": 5.271181471709177e-05, + "loss": 2.9854, + "step": 2613 + }, + { + "epoch": 0.74, + "learning_rate": 5.2653180885370856e-05, + "loss": 2.9549, + "step": 2614 + }, + { + "epoch": 0.74, + "learning_rate": 5.259454705364996e-05, + "loss": 2.8702, + "step": 2615 + }, + { + "epoch": 0.74, + "learning_rate": 5.253591322192906e-05, + "loss": 2.9028, + "step": 2616 + }, + { + "epoch": 0.75, + "learning_rate": 5.247727939020816e-05, + "loss": 2.9593, + "step": 2617 + }, + { + "epoch": 0.75, + "learning_rate": 5.2418645558487247e-05, + "loss": 3.0233, + "step": 2618 + }, + { + "epoch": 0.75, + "learning_rate": 5.236001172676635e-05, + "loss": 2.9256, + "step": 2619 + }, + { + "epoch": 0.75, + "learning_rate": 5.230137789504544e-05, + "loss": 2.9698, + "step": 2620 + }, + { + "epoch": 0.75, + "learning_rate": 5.224274406332454e-05, + "loss": 2.8858, + "step": 2621 + }, + { + "epoch": 0.75, + "learning_rate": 5.218411023160363e-05, + "loss": 2.9088, + "step": 2622 + }, + { + "epoch": 0.75, + "learning_rate": 5.212547639988273e-05, + "loss": 2.9003, + "step": 2623 + }, + { + "epoch": 0.75, + "learning_rate": 5.206684256816183e-05, + "loss": 2.9809, + "step": 2624 + }, + { + "epoch": 0.75, + "learning_rate": 5.2008208736440934e-05, + "loss": 2.9083, + "step": 2625 + }, + { + "epoch": 0.75, + "learning_rate": 5.194957490472002e-05, + "loss": 2.9803, + "step": 2626 + }, + { + "epoch": 0.75, + "learning_rate": 5.189094107299912e-05, + "loss": 2.9718, + "step": 2627 + }, + { + "epoch": 0.75, + "learning_rate": 5.1832307241278224e-05, + "loss": 2.9033, + "step": 2628 + }, + { + "epoch": 0.75, + "learning_rate": 5.1773673409557325e-05, + "loss": 3.0781, + "step": 2629 + }, + { + "epoch": 0.75, + "learning_rate": 5.171503957783641e-05, + "loss": 2.9367, + "step": 2630 + }, + { + "epoch": 0.75, + "learning_rate": 5.165640574611551e-05, + "loss": 2.9469, + "step": 2631 + }, + { + "epoch": 0.75, + "learning_rate": 5.159777191439461e-05, + "loss": 2.8456, + "step": 2632 + }, + { + "epoch": 0.75, + "learning_rate": 5.153913808267371e-05, + "loss": 2.9042, + "step": 2633 + }, + { + "epoch": 0.75, + "learning_rate": 5.1480504250952796e-05, + "loss": 2.9669, + "step": 2634 + }, + { + "epoch": 0.75, + "learning_rate": 5.14218704192319e-05, + "loss": 2.9404, + "step": 2635 + }, + { + "epoch": 0.75, + "learning_rate": 5.1363236587511e-05, + "loss": 2.9794, + "step": 2636 + }, + { + "epoch": 0.75, + "learning_rate": 5.13046027557901e-05, + "loss": 3.0569, + "step": 2637 + }, + { + "epoch": 0.75, + "learning_rate": 5.124596892406919e-05, + "loss": 2.9156, + "step": 2638 + }, + { + "epoch": 0.75, + "learning_rate": 5.118733509234829e-05, + "loss": 3.0051, + "step": 2639 + }, + { + "epoch": 0.75, + "learning_rate": 5.112870126062739e-05, + "loss": 2.991, + "step": 2640 + }, + { + "epoch": 0.75, + "eval_loss": 3.2172892093658447, + "eval_runtime": 2940.2613, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 2.323, + "step": 2640 + }, + { + "epoch": 0.75, + "learning_rate": 5.107006742890648e-05, + "loss": 2.8543, + "step": 2641 + }, + { + "epoch": 0.75, + "learning_rate": 5.101143359718558e-05, + "loss": 3.0516, + "step": 2642 + }, + { + "epoch": 0.75, + "learning_rate": 5.095279976546468e-05, + "loss": 2.9691, + "step": 2643 + }, + { + "epoch": 0.75, + "learning_rate": 5.089416593374377e-05, + "loss": 3.0091, + "step": 2644 + }, + { + "epoch": 0.75, + "learning_rate": 5.083553210202287e-05, + "loss": 2.9718, + "step": 2645 + }, + { + "epoch": 0.75, + "learning_rate": 5.077689827030196e-05, + "loss": 2.933, + "step": 2646 + }, + { + "epoch": 0.75, + "learning_rate": 5.071826443858106e-05, + "loss": 2.9114, + "step": 2647 + }, + { + "epoch": 0.75, + "learning_rate": 5.0659630606860164e-05, + "loss": 2.9301, + "step": 2648 + }, + { + "epoch": 0.75, + "learning_rate": 5.060099677513925e-05, + "loss": 2.8686, + "step": 2649 + }, + { + "epoch": 0.75, + "learning_rate": 5.054236294341835e-05, + "loss": 2.9059, + "step": 2650 + }, + { + "epoch": 0.75, + "learning_rate": 5.0483729111697454e-05, + "loss": 3.0058, + "step": 2651 + }, + { + "epoch": 0.76, + "learning_rate": 5.0425095279976555e-05, + "loss": 2.922, + "step": 2652 + }, + { + "epoch": 0.76, + "learning_rate": 5.036646144825564e-05, + "loss": 2.9205, + "step": 2653 + }, + { + "epoch": 0.76, + "learning_rate": 5.0307827616534744e-05, + "loss": 2.9853, + "step": 2654 + }, + { + "epoch": 0.76, + "learning_rate": 5.0249193784813845e-05, + "loss": 2.9316, + "step": 2655 + }, + { + "epoch": 0.76, + "learning_rate": 5.019055995309294e-05, + "loss": 3.0522, + "step": 2656 + }, + { + "epoch": 0.76, + "learning_rate": 5.0131926121372033e-05, + "loss": 2.9679, + "step": 2657 + }, + { + "epoch": 0.76, + "learning_rate": 5.007329228965113e-05, + "loss": 2.9912, + "step": 2658 + }, + { + "epoch": 0.76, + "learning_rate": 5.001465845793023e-05, + "loss": 2.9458, + "step": 2659 + }, + { + "epoch": 0.76, + "learning_rate": 4.995602462620932e-05, + "loss": 2.9799, + "step": 2660 + }, + { + "epoch": 0.76, + "learning_rate": 4.9897390794488424e-05, + "loss": 2.9888, + "step": 2661 + }, + { + "epoch": 0.76, + "learning_rate": 4.983875696276752e-05, + "loss": 2.8084, + "step": 2662 + }, + { + "epoch": 0.76, + "learning_rate": 4.978012313104662e-05, + "loss": 2.8867, + "step": 2663 + }, + { + "epoch": 0.76, + "learning_rate": 4.9721489299325714e-05, + "loss": 2.9993, + "step": 2664 + }, + { + "epoch": 0.76, + "learning_rate": 4.9662855467604815e-05, + "loss": 2.9978, + "step": 2665 + }, + { + "epoch": 0.76, + "learning_rate": 4.960422163588391e-05, + "loss": 2.9658, + "step": 2666 + }, + { + "epoch": 0.76, + "learning_rate": 4.9545587804163004e-05, + "loss": 2.9173, + "step": 2667 + }, + { + "epoch": 0.76, + "learning_rate": 4.9486953972442105e-05, + "loss": 2.925, + "step": 2668 + }, + { + "epoch": 0.76, + "learning_rate": 4.94283201407212e-05, + "loss": 2.919, + "step": 2669 + }, + { + "epoch": 0.76, + "learning_rate": 4.9369686309000293e-05, + "loss": 2.8328, + "step": 2670 + }, + { + "epoch": 0.76, + "learning_rate": 4.931105247727939e-05, + "loss": 3.0428, + "step": 2671 + }, + { + "epoch": 0.76, + "learning_rate": 4.925241864555849e-05, + "loss": 3.0001, + "step": 2672 + }, + { + "epoch": 0.76, + "learning_rate": 4.919378481383758e-05, + "loss": 2.9359, + "step": 2673 + }, + { + "epoch": 0.76, + "learning_rate": 4.9135150982116684e-05, + "loss": 2.9077, + "step": 2674 + }, + { + "epoch": 0.76, + "learning_rate": 4.907651715039578e-05, + "loss": 2.992, + "step": 2675 + }, + { + "epoch": 0.76, + "learning_rate": 4.901788331867488e-05, + "loss": 2.9674, + "step": 2676 + }, + { + "epoch": 0.76, + "learning_rate": 4.8959249486953974e-05, + "loss": 2.8465, + "step": 2677 + }, + { + "epoch": 0.76, + "learning_rate": 4.8900615655233075e-05, + "loss": 2.8251, + "step": 2678 + }, + { + "epoch": 0.76, + "learning_rate": 4.884198182351217e-05, + "loss": 2.9652, + "step": 2679 + }, + { + "epoch": 0.76, + "learning_rate": 4.878334799179127e-05, + "loss": 2.9478, + "step": 2680 + }, + { + "epoch": 0.76, + "learning_rate": 4.8724714160070365e-05, + "loss": 3.0002, + "step": 2681 + }, + { + "epoch": 0.76, + "learning_rate": 4.866608032834946e-05, + "loss": 3.0055, + "step": 2682 + }, + { + "epoch": 0.76, + "learning_rate": 4.8607446496628553e-05, + "loss": 2.9612, + "step": 2683 + }, + { + "epoch": 0.76, + "learning_rate": 4.8548812664907655e-05, + "loss": 3.0569, + "step": 2684 + }, + { + "epoch": 0.76, + "learning_rate": 4.849017883318675e-05, + "loss": 2.9817, + "step": 2685 + }, + { + "epoch": 0.76, + "learning_rate": 4.843154500146585e-05, + "loss": 2.8774, + "step": 2686 + }, + { + "epoch": 0.77, + "learning_rate": 4.8372911169744944e-05, + "loss": 2.9759, + "step": 2687 + }, + { + "epoch": 0.77, + "learning_rate": 4.8314277338024045e-05, + "loss": 3.0761, + "step": 2688 + }, + { + "epoch": 0.77, + "learning_rate": 4.825564350630314e-05, + "loss": 2.9908, + "step": 2689 + }, + { + "epoch": 0.77, + "learning_rate": 4.819700967458224e-05, + "loss": 2.9525, + "step": 2690 + }, + { + "epoch": 0.77, + "learning_rate": 4.8138375842861335e-05, + "loss": 2.8244, + "step": 2691 + }, + { + "epoch": 0.77, + "learning_rate": 4.8079742011140436e-05, + "loss": 2.8924, + "step": 2692 + }, + { + "epoch": 0.77, + "learning_rate": 4.802110817941953e-05, + "loss": 2.9047, + "step": 2693 + }, + { + "epoch": 0.77, + "learning_rate": 4.7962474347698625e-05, + "loss": 3.007, + "step": 2694 + }, + { + "epoch": 0.77, + "learning_rate": 4.790384051597772e-05, + "loss": 3.0086, + "step": 2695 + }, + { + "epoch": 0.77, + "learning_rate": 4.7845206684256813e-05, + "loss": 2.9496, + "step": 2696 + }, + { + "epoch": 0.77, + "learning_rate": 4.7786572852535915e-05, + "loss": 2.9391, + "step": 2697 + }, + { + "epoch": 0.77, + "learning_rate": 4.772793902081501e-05, + "loss": 2.9309, + "step": 2698 + }, + { + "epoch": 0.77, + "learning_rate": 4.766930518909411e-05, + "loss": 3.0011, + "step": 2699 + }, + { + "epoch": 0.77, + "learning_rate": 4.7610671357373204e-05, + "loss": 2.9296, + "step": 2700 + }, + { + "epoch": 0.77, + "learning_rate": 4.7552037525652305e-05, + "loss": 2.913, + "step": 2701 + }, + { + "epoch": 0.77, + "learning_rate": 4.74934036939314e-05, + "loss": 2.827, + "step": 2702 + }, + { + "epoch": 0.77, + "learning_rate": 4.74347698622105e-05, + "loss": 2.978, + "step": 2703 + }, + { + "epoch": 0.77, + "learning_rate": 4.7376136030489595e-05, + "loss": 3.0295, + "step": 2704 + }, + { + "epoch": 0.77, + "learning_rate": 4.7317502198768696e-05, + "loss": 3.0656, + "step": 2705 + }, + { + "epoch": 0.77, + "learning_rate": 4.725886836704779e-05, + "loss": 2.8391, + "step": 2706 + }, + { + "epoch": 0.77, + "learning_rate": 4.7200234535326885e-05, + "loss": 2.9854, + "step": 2707 + }, + { + "epoch": 0.77, + "learning_rate": 4.714160070360598e-05, + "loss": 2.8733, + "step": 2708 + }, + { + "epoch": 0.77, + "learning_rate": 4.708296687188508e-05, + "loss": 2.8921, + "step": 2709 + }, + { + "epoch": 0.77, + "learning_rate": 4.7024333040164175e-05, + "loss": 2.8333, + "step": 2710 + }, + { + "epoch": 0.77, + "learning_rate": 4.6965699208443276e-05, + "loss": 2.9524, + "step": 2711 + }, + { + "epoch": 0.77, + "learning_rate": 4.690706537672237e-05, + "loss": 2.9885, + "step": 2712 + }, + { + "epoch": 0.77, + "learning_rate": 4.684843154500147e-05, + "loss": 3.07, + "step": 2713 + }, + { + "epoch": 0.77, + "learning_rate": 4.6789797713280565e-05, + "loss": 2.9899, + "step": 2714 + }, + { + "epoch": 0.77, + "learning_rate": 4.6731163881559667e-05, + "loss": 2.9843, + "step": 2715 + }, + { + "epoch": 0.77, + "learning_rate": 4.667253004983876e-05, + "loss": 3.0218, + "step": 2716 + }, + { + "epoch": 0.77, + "learning_rate": 4.6613896218117855e-05, + "loss": 2.8883, + "step": 2717 + }, + { + "epoch": 0.77, + "learning_rate": 4.655526238639695e-05, + "loss": 2.9983, + "step": 2718 + }, + { + "epoch": 0.77, + "learning_rate": 4.649662855467605e-05, + "loss": 2.91, + "step": 2719 + }, + { + "epoch": 0.77, + "learning_rate": 4.6437994722955145e-05, + "loss": 2.9078, + "step": 2720 + }, + { + "epoch": 0.77, + "learning_rate": 4.6379360891234246e-05, + "loss": 2.8622, + "step": 2721 + }, + { + "epoch": 0.78, + "learning_rate": 4.632072705951334e-05, + "loss": 3.057, + "step": 2722 + }, + { + "epoch": 0.78, + "learning_rate": 4.6262093227792435e-05, + "loss": 2.8854, + "step": 2723 + }, + { + "epoch": 0.78, + "learning_rate": 4.6203459396071536e-05, + "loss": 2.9078, + "step": 2724 + }, + { + "epoch": 0.78, + "learning_rate": 4.614482556435063e-05, + "loss": 2.9365, + "step": 2725 + }, + { + "epoch": 0.78, + "learning_rate": 4.608619173262973e-05, + "loss": 2.9439, + "step": 2726 + }, + { + "epoch": 0.78, + "learning_rate": 4.6027557900908825e-05, + "loss": 2.9007, + "step": 2727 + }, + { + "epoch": 0.78, + "learning_rate": 4.5968924069187927e-05, + "loss": 2.8987, + "step": 2728 + }, + { + "epoch": 0.78, + "learning_rate": 4.591029023746702e-05, + "loss": 2.8122, + "step": 2729 + }, + { + "epoch": 0.78, + "learning_rate": 4.5851656405746115e-05, + "loss": 2.9194, + "step": 2730 + }, + { + "epoch": 0.78, + "learning_rate": 4.579302257402521e-05, + "loss": 2.9381, + "step": 2731 + }, + { + "epoch": 0.78, + "learning_rate": 4.573438874230431e-05, + "loss": 2.9534, + "step": 2732 + }, + { + "epoch": 0.78, + "learning_rate": 4.5675754910583405e-05, + "loss": 2.7799, + "step": 2733 + }, + { + "epoch": 0.78, + "learning_rate": 4.5617121078862506e-05, + "loss": 2.9023, + "step": 2734 + }, + { + "epoch": 0.78, + "learning_rate": 4.55584872471416e-05, + "loss": 2.9027, + "step": 2735 + }, + { + "epoch": 0.78, + "learning_rate": 4.54998534154207e-05, + "loss": 2.9513, + "step": 2736 + }, + { + "epoch": 0.78, + "learning_rate": 4.5441219583699796e-05, + "loss": 2.8047, + "step": 2737 + }, + { + "epoch": 0.78, + "learning_rate": 4.53825857519789e-05, + "loss": 3.0478, + "step": 2738 + }, + { + "epoch": 0.78, + "learning_rate": 4.532395192025799e-05, + "loss": 2.9872, + "step": 2739 + }, + { + "epoch": 0.78, + "learning_rate": 4.526531808853709e-05, + "loss": 2.9807, + "step": 2740 + }, + { + "epoch": 0.78, + "learning_rate": 4.5206684256816187e-05, + "loss": 2.9503, + "step": 2741 + }, + { + "epoch": 0.78, + "learning_rate": 4.514805042509528e-05, + "loss": 2.9607, + "step": 2742 + }, + { + "epoch": 0.78, + "learning_rate": 4.5089416593374375e-05, + "loss": 3.0141, + "step": 2743 + }, + { + "epoch": 0.78, + "learning_rate": 4.5030782761653476e-05, + "loss": 2.8885, + "step": 2744 + }, + { + "epoch": 0.78, + "learning_rate": 4.497214892993257e-05, + "loss": 3.0052, + "step": 2745 + }, + { + "epoch": 0.78, + "learning_rate": 4.491351509821167e-05, + "loss": 2.9363, + "step": 2746 + }, + { + "epoch": 0.78, + "learning_rate": 4.4854881266490766e-05, + "loss": 3.0528, + "step": 2747 + }, + { + "epoch": 0.78, + "learning_rate": 4.479624743476987e-05, + "loss": 2.9836, + "step": 2748 + }, + { + "epoch": 0.78, + "learning_rate": 4.473761360304896e-05, + "loss": 2.9265, + "step": 2749 + }, + { + "epoch": 0.78, + "learning_rate": 4.467897977132806e-05, + "loss": 2.9269, + "step": 2750 + }, + { + "epoch": 0.78, + "learning_rate": 4.462034593960716e-05, + "loss": 2.933, + "step": 2751 + }, + { + "epoch": 0.78, + "learning_rate": 4.456171210788625e-05, + "loss": 2.9605, + "step": 2752 + }, + { + "epoch": 0.78, + "learning_rate": 4.450307827616535e-05, + "loss": 2.9411, + "step": 2753 + }, + { + "epoch": 0.78, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.9598, + "step": 2754 + }, + { + "epoch": 0.78, + "learning_rate": 4.438581061272354e-05, + "loss": 2.9167, + "step": 2755 + }, + { + "epoch": 0.78, + "learning_rate": 4.4327176781002635e-05, + "loss": 2.9279, + "step": 2756 + }, + { + "epoch": 0.79, + "learning_rate": 4.4268542949281736e-05, + "loss": 2.9237, + "step": 2757 + }, + { + "epoch": 0.79, + "learning_rate": 4.420990911756083e-05, + "loss": 2.903, + "step": 2758 + }, + { + "epoch": 0.79, + "learning_rate": 4.415127528583993e-05, + "loss": 2.841, + "step": 2759 + }, + { + "epoch": 0.79, + "learning_rate": 4.4092641454119026e-05, + "loss": 2.9219, + "step": 2760 + }, + { + "epoch": 0.79, + "learning_rate": 4.403400762239813e-05, + "loss": 2.9157, + "step": 2761 + }, + { + "epoch": 0.79, + "learning_rate": 4.397537379067722e-05, + "loss": 2.8828, + "step": 2762 + }, + { + "epoch": 0.79, + "learning_rate": 4.391673995895632e-05, + "loss": 2.9777, + "step": 2763 + }, + { + "epoch": 0.79, + "learning_rate": 4.385810612723542e-05, + "loss": 2.9269, + "step": 2764 + }, + { + "epoch": 0.79, + "learning_rate": 4.379947229551452e-05, + "loss": 2.9936, + "step": 2765 + }, + { + "epoch": 0.79, + "learning_rate": 4.374083846379361e-05, + "loss": 2.901, + "step": 2766 + }, + { + "epoch": 0.79, + "learning_rate": 4.3682204632072707e-05, + "loss": 2.8971, + "step": 2767 + }, + { + "epoch": 0.79, + "learning_rate": 4.36235708003518e-05, + "loss": 2.9404, + "step": 2768 + }, + { + "epoch": 0.79, + "learning_rate": 4.35649369686309e-05, + "loss": 2.9661, + "step": 2769 + }, + { + "epoch": 0.79, + "learning_rate": 4.3506303136909996e-05, + "loss": 2.8264, + "step": 2770 + }, + { + "epoch": 0.79, + "learning_rate": 4.34476693051891e-05, + "loss": 2.977, + "step": 2771 + }, + { + "epoch": 0.79, + "learning_rate": 4.338903547346819e-05, + "loss": 2.8653, + "step": 2772 + }, + { + "epoch": 0.79, + "learning_rate": 4.333040164174729e-05, + "loss": 2.9227, + "step": 2773 + }, + { + "epoch": 0.79, + "learning_rate": 4.327176781002639e-05, + "loss": 2.9449, + "step": 2774 + }, + { + "epoch": 0.79, + "learning_rate": 4.321313397830549e-05, + "loss": 2.9402, + "step": 2775 + }, + { + "epoch": 0.79, + "learning_rate": 4.315450014658458e-05, + "loss": 2.8669, + "step": 2776 + }, + { + "epoch": 0.79, + "learning_rate": 4.3095866314863684e-05, + "loss": 2.971, + "step": 2777 + }, + { + "epoch": 0.79, + "learning_rate": 4.303723248314278e-05, + "loss": 2.8939, + "step": 2778 + }, + { + "epoch": 0.79, + "learning_rate": 4.297859865142187e-05, + "loss": 2.9303, + "step": 2779 + }, + { + "epoch": 0.79, + "learning_rate": 4.291996481970097e-05, + "loss": 2.9258, + "step": 2780 + }, + { + "epoch": 0.79, + "learning_rate": 4.286133098798006e-05, + "loss": 2.8606, + "step": 2781 + }, + { + "epoch": 0.79, + "learning_rate": 4.280269715625916e-05, + "loss": 2.7927, + "step": 2782 + }, + { + "epoch": 0.79, + "learning_rate": 4.2744063324538256e-05, + "loss": 2.9105, + "step": 2783 + }, + { + "epoch": 0.79, + "learning_rate": 4.268542949281736e-05, + "loss": 2.9119, + "step": 2784 + }, + { + "epoch": 0.79, + "learning_rate": 4.262679566109645e-05, + "loss": 3.0501, + "step": 2785 + }, + { + "epoch": 0.79, + "learning_rate": 4.256816182937555e-05, + "loss": 2.9486, + "step": 2786 + }, + { + "epoch": 0.79, + "learning_rate": 4.250952799765465e-05, + "loss": 2.9263, + "step": 2787 + }, + { + "epoch": 0.79, + "learning_rate": 4.245089416593375e-05, + "loss": 2.8871, + "step": 2788 + }, + { + "epoch": 0.79, + "learning_rate": 4.239226033421284e-05, + "loss": 2.9066, + "step": 2789 + }, + { + "epoch": 0.79, + "learning_rate": 4.2333626502491944e-05, + "loss": 2.956, + "step": 2790 + }, + { + "epoch": 0.79, + "learning_rate": 4.227499267077104e-05, + "loss": 3.0425, + "step": 2791 + }, + { + "epoch": 0.8, + "learning_rate": 4.221635883905013e-05, + "loss": 2.9842, + "step": 2792 + }, + { + "epoch": 0.8, + "learning_rate": 4.215772500732923e-05, + "loss": 2.9224, + "step": 2793 + }, + { + "epoch": 0.8, + "learning_rate": 4.209909117560833e-05, + "loss": 2.8652, + "step": 2794 + }, + { + "epoch": 0.8, + "learning_rate": 4.204045734388742e-05, + "loss": 2.9478, + "step": 2795 + }, + { + "epoch": 0.8, + "learning_rate": 4.198182351216652e-05, + "loss": 2.9776, + "step": 2796 + }, + { + "epoch": 0.8, + "learning_rate": 4.192318968044562e-05, + "loss": 2.9747, + "step": 2797 + }, + { + "epoch": 0.8, + "learning_rate": 4.186455584872472e-05, + "loss": 2.9551, + "step": 2798 + }, + { + "epoch": 0.8, + "learning_rate": 4.180592201700381e-05, + "loss": 2.8735, + "step": 2799 + }, + { + "epoch": 0.8, + "learning_rate": 4.1747288185282914e-05, + "loss": 2.8173, + "step": 2800 + }, + { + "epoch": 0.8, + "learning_rate": 4.168865435356201e-05, + "loss": 2.9245, + "step": 2801 + }, + { + "epoch": 0.8, + "learning_rate": 4.163002052184111e-05, + "loss": 2.9679, + "step": 2802 + }, + { + "epoch": 0.8, + "learning_rate": 4.1571386690120204e-05, + "loss": 2.9496, + "step": 2803 + }, + { + "epoch": 0.8, + "learning_rate": 4.15127528583993e-05, + "loss": 2.9174, + "step": 2804 + }, + { + "epoch": 0.8, + "learning_rate": 4.145411902667839e-05, + "loss": 2.9739, + "step": 2805 + }, + { + "epoch": 0.8, + "learning_rate": 4.1395485194957493e-05, + "loss": 2.9363, + "step": 2806 + }, + { + "epoch": 0.8, + "learning_rate": 4.133685136323659e-05, + "loss": 2.9018, + "step": 2807 + }, + { + "epoch": 0.8, + "learning_rate": 4.127821753151568e-05, + "loss": 2.9554, + "step": 2808 + }, + { + "epoch": 0.8, + "learning_rate": 4.121958369979478e-05, + "loss": 2.89, + "step": 2809 + }, + { + "epoch": 0.8, + "learning_rate": 4.116094986807388e-05, + "loss": 3.0036, + "step": 2810 + }, + { + "epoch": 0.8, + "learning_rate": 4.110231603635298e-05, + "loss": 2.9772, + "step": 2811 + }, + { + "epoch": 0.8, + "learning_rate": 4.104368220463207e-05, + "loss": 2.9327, + "step": 2812 + }, + { + "epoch": 0.8, + "learning_rate": 4.0985048372911174e-05, + "loss": 2.9426, + "step": 2813 + }, + { + "epoch": 0.8, + "learning_rate": 4.092641454119027e-05, + "loss": 2.9169, + "step": 2814 + }, + { + "epoch": 0.8, + "learning_rate": 4.086778070946937e-05, + "loss": 2.9826, + "step": 2815 + }, + { + "epoch": 0.8, + "learning_rate": 4.0809146877748464e-05, + "loss": 2.9663, + "step": 2816 + }, + { + "epoch": 0.8, + "eval_loss": 3.1972124576568604, + "eval_runtime": 2940.1891, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 2.323, + "step": 2816 + }, + { + "epoch": 0.8, + "learning_rate": 4.075051304602756e-05, + "loss": 2.9791, + "step": 2817 + }, + { + "epoch": 0.8, + "learning_rate": 4.069187921430665e-05, + "loss": 2.9606, + "step": 2818 + }, + { + "epoch": 0.8, + "learning_rate": 4.0633245382585753e-05, + "loss": 2.953, + "step": 2819 + }, + { + "epoch": 0.8, + "learning_rate": 4.057461155086485e-05, + "loss": 2.9999, + "step": 2820 + }, + { + "epoch": 0.8, + "learning_rate": 4.051597771914395e-05, + "loss": 2.7495, + "step": 2821 + }, + { + "epoch": 0.8, + "learning_rate": 4.045734388742304e-05, + "loss": 2.9623, + "step": 2822 + }, + { + "epoch": 0.8, + "learning_rate": 4.0398710055702144e-05, + "loss": 2.9181, + "step": 2823 + }, + { + "epoch": 0.8, + "learning_rate": 4.034007622398124e-05, + "loss": 2.9388, + "step": 2824 + }, + { + "epoch": 0.8, + "learning_rate": 4.028144239226034e-05, + "loss": 2.8328, + "step": 2825 + }, + { + "epoch": 0.8, + "learning_rate": 4.0222808560539434e-05, + "loss": 2.9606, + "step": 2826 + }, + { + "epoch": 0.81, + "learning_rate": 4.0164174728818535e-05, + "loss": 2.984, + "step": 2827 + }, + { + "epoch": 0.81, + "learning_rate": 4.010554089709763e-05, + "loss": 2.9417, + "step": 2828 + }, + { + "epoch": 0.81, + "learning_rate": 4.0046907065376724e-05, + "loss": 2.8587, + "step": 2829 + }, + { + "epoch": 0.81, + "learning_rate": 3.998827323365582e-05, + "loss": 2.9365, + "step": 2830 + }, + { + "epoch": 0.81, + "learning_rate": 3.992963940193492e-05, + "loss": 2.9864, + "step": 2831 + }, + { + "epoch": 0.81, + "learning_rate": 3.9871005570214013e-05, + "loss": 2.9776, + "step": 2832 + }, + { + "epoch": 0.81, + "learning_rate": 3.9812371738493115e-05, + "loss": 2.9485, + "step": 2833 + }, + { + "epoch": 0.81, + "learning_rate": 3.975373790677221e-05, + "loss": 2.9256, + "step": 2834 + }, + { + "epoch": 0.81, + "learning_rate": 3.969510407505131e-05, + "loss": 2.9352, + "step": 2835 + }, + { + "epoch": 0.81, + "learning_rate": 3.9636470243330404e-05, + "loss": 2.9328, + "step": 2836 + }, + { + "epoch": 0.81, + "learning_rate": 3.95778364116095e-05, + "loss": 2.9756, + "step": 2837 + }, + { + "epoch": 0.81, + "learning_rate": 3.95192025798886e-05, + "loss": 2.875, + "step": 2838 + }, + { + "epoch": 0.81, + "learning_rate": 3.9460568748167694e-05, + "loss": 2.9342, + "step": 2839 + }, + { + "epoch": 0.81, + "learning_rate": 3.9401934916446795e-05, + "loss": 2.908, + "step": 2840 + }, + { + "epoch": 0.81, + "learning_rate": 3.934330108472589e-05, + "loss": 3.0439, + "step": 2841 + }, + { + "epoch": 0.81, + "learning_rate": 3.9284667253004984e-05, + "loss": 3.0013, + "step": 2842 + }, + { + "epoch": 0.81, + "learning_rate": 3.922603342128408e-05, + "loss": 2.9613, + "step": 2843 + }, + { + "epoch": 0.81, + "learning_rate": 3.916739958956318e-05, + "loss": 2.9669, + "step": 2844 + }, + { + "epoch": 0.81, + "learning_rate": 3.9108765757842274e-05, + "loss": 2.9951, + "step": 2845 + }, + { + "epoch": 0.81, + "learning_rate": 3.9050131926121375e-05, + "loss": 2.9353, + "step": 2846 + }, + { + "epoch": 0.81, + "learning_rate": 3.899149809440047e-05, + "loss": 2.9397, + "step": 2847 + }, + { + "epoch": 0.81, + "learning_rate": 3.893286426267957e-05, + "loss": 2.927, + "step": 2848 + }, + { + "epoch": 0.81, + "learning_rate": 3.8874230430958664e-05, + "loss": 2.8031, + "step": 2849 + }, + { + "epoch": 0.81, + "learning_rate": 3.8815596599237765e-05, + "loss": 2.9521, + "step": 2850 + }, + { + "epoch": 0.81, + "learning_rate": 3.875696276751686e-05, + "loss": 2.8733, + "step": 2851 + }, + { + "epoch": 0.81, + "learning_rate": 3.869832893579596e-05, + "loss": 2.9939, + "step": 2852 + }, + { + "epoch": 0.81, + "learning_rate": 3.8639695104075055e-05, + "loss": 2.866, + "step": 2853 + }, + { + "epoch": 0.81, + "learning_rate": 3.858106127235415e-05, + "loss": 2.896, + "step": 2854 + }, + { + "epoch": 0.81, + "learning_rate": 3.8522427440633244e-05, + "loss": 3.0004, + "step": 2855 + }, + { + "epoch": 0.81, + "learning_rate": 3.8463793608912345e-05, + "loss": 2.895, + "step": 2856 + }, + { + "epoch": 0.81, + "learning_rate": 3.840515977719144e-05, + "loss": 2.9782, + "step": 2857 + }, + { + "epoch": 0.81, + "learning_rate": 3.834652594547054e-05, + "loss": 2.896, + "step": 2858 + }, + { + "epoch": 0.81, + "learning_rate": 3.8287892113749635e-05, + "loss": 2.9681, + "step": 2859 + }, + { + "epoch": 0.81, + "learning_rate": 3.8229258282028736e-05, + "loss": 2.9011, + "step": 2860 + }, + { + "epoch": 0.81, + "learning_rate": 3.817062445030783e-05, + "loss": 3.0072, + "step": 2861 + }, + { + "epoch": 0.81, + "learning_rate": 3.811199061858693e-05, + "loss": 2.8798, + "step": 2862 + }, + { + "epoch": 0.82, + "learning_rate": 3.8053356786866025e-05, + "loss": 2.8416, + "step": 2863 + }, + { + "epoch": 0.82, + "learning_rate": 3.799472295514512e-05, + "loss": 2.8416, + "step": 2864 + }, + { + "epoch": 0.82, + "learning_rate": 3.793608912342422e-05, + "loss": 2.9927, + "step": 2865 + }, + { + "epoch": 0.82, + "learning_rate": 3.7877455291703315e-05, + "loss": 2.8251, + "step": 2866 + }, + { + "epoch": 0.82, + "learning_rate": 3.781882145998241e-05, + "loss": 2.8113, + "step": 2867 + }, + { + "epoch": 0.82, + "learning_rate": 3.7760187628261504e-05, + "loss": 2.8884, + "step": 2868 + }, + { + "epoch": 0.82, + "learning_rate": 3.7701553796540605e-05, + "loss": 2.9539, + "step": 2869 + }, + { + "epoch": 0.82, + "learning_rate": 3.76429199648197e-05, + "loss": 2.9121, + "step": 2870 + }, + { + "epoch": 0.82, + "learning_rate": 3.75842861330988e-05, + "loss": 2.9654, + "step": 2871 + }, + { + "epoch": 0.82, + "learning_rate": 3.7525652301377895e-05, + "loss": 2.9765, + "step": 2872 + }, + { + "epoch": 0.82, + "learning_rate": 3.7467018469656996e-05, + "loss": 2.7985, + "step": 2873 + }, + { + "epoch": 0.82, + "learning_rate": 3.740838463793609e-05, + "loss": 2.9105, + "step": 2874 + }, + { + "epoch": 0.82, + "learning_rate": 3.734975080621519e-05, + "loss": 2.9365, + "step": 2875 + }, + { + "epoch": 0.82, + "learning_rate": 3.7291116974494285e-05, + "loss": 2.9142, + "step": 2876 + }, + { + "epoch": 0.82, + "learning_rate": 3.7232483142773387e-05, + "loss": 3.0456, + "step": 2877 + }, + { + "epoch": 0.82, + "learning_rate": 3.717384931105248e-05, + "loss": 2.9197, + "step": 2878 + }, + { + "epoch": 0.82, + "learning_rate": 3.7115215479331575e-05, + "loss": 2.8813, + "step": 2879 + }, + { + "epoch": 0.82, + "learning_rate": 3.705658164761067e-05, + "loss": 2.909, + "step": 2880 + }, + { + "epoch": 0.82, + "learning_rate": 3.699794781588977e-05, + "loss": 2.7869, + "step": 2881 + }, + { + "epoch": 0.82, + "learning_rate": 3.6939313984168865e-05, + "loss": 2.9119, + "step": 2882 + }, + { + "epoch": 0.82, + "learning_rate": 3.6880680152447966e-05, + "loss": 2.9985, + "step": 2883 + }, + { + "epoch": 0.82, + "learning_rate": 3.682204632072706e-05, + "loss": 2.949, + "step": 2884 + }, + { + "epoch": 0.82, + "learning_rate": 3.676341248900616e-05, + "loss": 2.9885, + "step": 2885 + }, + { + "epoch": 0.82, + "learning_rate": 3.6704778657285256e-05, + "loss": 2.9018, + "step": 2886 + }, + { + "epoch": 0.82, + "learning_rate": 3.664614482556436e-05, + "loss": 3.0304, + "step": 2887 + }, + { + "epoch": 0.82, + "learning_rate": 3.658751099384345e-05, + "loss": 2.8653, + "step": 2888 + }, + { + "epoch": 0.82, + "learning_rate": 3.652887716212255e-05, + "loss": 3.0575, + "step": 2889 + }, + { + "epoch": 0.82, + "learning_rate": 3.6470243330401647e-05, + "loss": 2.9043, + "step": 2890 + }, + { + "epoch": 0.82, + "learning_rate": 3.641160949868074e-05, + "loss": 3.0285, + "step": 2891 + }, + { + "epoch": 0.82, + "learning_rate": 3.6352975666959835e-05, + "loss": 3.032, + "step": 2892 + }, + { + "epoch": 0.82, + "learning_rate": 3.629434183523893e-05, + "loss": 3.0223, + "step": 2893 + }, + { + "epoch": 0.82, + "learning_rate": 3.623570800351803e-05, + "loss": 2.9317, + "step": 2894 + }, + { + "epoch": 0.82, + "learning_rate": 3.6177074171797125e-05, + "loss": 2.9454, + "step": 2895 + }, + { + "epoch": 0.82, + "learning_rate": 3.6118440340076226e-05, + "loss": 2.9658, + "step": 2896 + }, + { + "epoch": 0.82, + "learning_rate": 3.605980650835532e-05, + "loss": 2.9197, + "step": 2897 + }, + { + "epoch": 0.83, + "learning_rate": 3.600117267663442e-05, + "loss": 3.0083, + "step": 2898 + }, + { + "epoch": 0.83, + "learning_rate": 3.5942538844913516e-05, + "loss": 2.9619, + "step": 2899 + }, + { + "epoch": 0.83, + "learning_rate": 3.588390501319262e-05, + "loss": 2.9261, + "step": 2900 + }, + { + "epoch": 0.83, + "learning_rate": 3.582527118147171e-05, + "loss": 2.9368, + "step": 2901 + }, + { + "epoch": 0.83, + "learning_rate": 3.576663734975081e-05, + "loss": 2.9306, + "step": 2902 + }, + { + "epoch": 0.83, + "learning_rate": 3.5708003518029907e-05, + "loss": 2.9412, + "step": 2903 + }, + { + "epoch": 0.83, + "learning_rate": 3.5649369686309e-05, + "loss": 2.902, + "step": 2904 + }, + { + "epoch": 0.83, + "learning_rate": 3.5590735854588095e-05, + "loss": 2.8198, + "step": 2905 + }, + { + "epoch": 0.83, + "learning_rate": 3.5532102022867196e-05, + "loss": 2.8804, + "step": 2906 + }, + { + "epoch": 0.83, + "learning_rate": 3.547346819114629e-05, + "loss": 2.8733, + "step": 2907 + }, + { + "epoch": 0.83, + "learning_rate": 3.541483435942539e-05, + "loss": 2.8826, + "step": 2908 + }, + { + "epoch": 0.83, + "learning_rate": 3.5356200527704486e-05, + "loss": 2.8929, + "step": 2909 + }, + { + "epoch": 0.83, + "learning_rate": 3.529756669598359e-05, + "loss": 2.8247, + "step": 2910 + }, + { + "epoch": 0.83, + "learning_rate": 3.523893286426268e-05, + "loss": 2.802, + "step": 2911 + }, + { + "epoch": 0.83, + "learning_rate": 3.518029903254178e-05, + "loss": 3.0505, + "step": 2912 + }, + { + "epoch": 0.83, + "learning_rate": 3.512166520082088e-05, + "loss": 2.9077, + "step": 2913 + }, + { + "epoch": 0.83, + "learning_rate": 3.506303136909997e-05, + "loss": 2.9342, + "step": 2914 + }, + { + "epoch": 0.83, + "learning_rate": 3.5004397537379066e-05, + "loss": 2.9597, + "step": 2915 + }, + { + "epoch": 0.83, + "learning_rate": 3.494576370565817e-05, + "loss": 2.9522, + "step": 2916 + }, + { + "epoch": 0.83, + "learning_rate": 3.488712987393726e-05, + "loss": 2.8228, + "step": 2917 + }, + { + "epoch": 0.83, + "learning_rate": 3.482849604221636e-05, + "loss": 2.9209, + "step": 2918 + }, + { + "epoch": 0.83, + "learning_rate": 3.4769862210495456e-05, + "loss": 2.9564, + "step": 2919 + }, + { + "epoch": 0.83, + "learning_rate": 3.471122837877456e-05, + "loss": 2.9294, + "step": 2920 + }, + { + "epoch": 0.83, + "learning_rate": 3.465259454705365e-05, + "loss": 2.9424, + "step": 2921 + }, + { + "epoch": 0.83, + "learning_rate": 3.4593960715332746e-05, + "loss": 2.9069, + "step": 2922 + }, + { + "epoch": 0.83, + "learning_rate": 3.453532688361185e-05, + "loss": 2.9826, + "step": 2923 + }, + { + "epoch": 0.83, + "learning_rate": 3.447669305189094e-05, + "loss": 2.8924, + "step": 2924 + }, + { + "epoch": 0.83, + "learning_rate": 3.441805922017004e-05, + "loss": 2.8812, + "step": 2925 + }, + { + "epoch": 0.83, + "learning_rate": 3.435942538844914e-05, + "loss": 2.8904, + "step": 2926 + }, + { + "epoch": 0.83, + "learning_rate": 3.430079155672823e-05, + "loss": 2.9383, + "step": 2927 + }, + { + "epoch": 0.83, + "learning_rate": 3.4242157725007326e-05, + "loss": 2.8946, + "step": 2928 + }, + { + "epoch": 0.83, + "learning_rate": 3.418352389328643e-05, + "loss": 2.9094, + "step": 2929 + }, + { + "epoch": 0.83, + "learning_rate": 3.412489006156552e-05, + "loss": 2.8616, + "step": 2930 + }, + { + "epoch": 0.83, + "learning_rate": 3.406625622984462e-05, + "loss": 2.9005, + "step": 2931 + }, + { + "epoch": 0.83, + "learning_rate": 3.4007622398123716e-05, + "loss": 3.0549, + "step": 2932 + }, + { + "epoch": 0.84, + "learning_rate": 3.394898856640282e-05, + "loss": 2.8597, + "step": 2933 + }, + { + "epoch": 0.84, + "learning_rate": 3.389035473468191e-05, + "loss": 2.8892, + "step": 2934 + }, + { + "epoch": 0.84, + "learning_rate": 3.383172090296101e-05, + "loss": 2.9433, + "step": 2935 + }, + { + "epoch": 0.84, + "learning_rate": 3.377308707124011e-05, + "loss": 2.7032, + "step": 2936 + }, + { + "epoch": 0.84, + "learning_rate": 3.371445323951921e-05, + "loss": 2.9428, + "step": 2937 + }, + { + "epoch": 0.84, + "learning_rate": 3.36558194077983e-05, + "loss": 2.9722, + "step": 2938 + }, + { + "epoch": 0.84, + "learning_rate": 3.35971855760774e-05, + "loss": 2.8409, + "step": 2939 + }, + { + "epoch": 0.84, + "learning_rate": 3.353855174435649e-05, + "loss": 2.9067, + "step": 2940 + }, + { + "epoch": 0.84, + "learning_rate": 3.347991791263559e-05, + "loss": 2.6999, + "step": 2941 + }, + { + "epoch": 0.84, + "learning_rate": 3.342128408091469e-05, + "loss": 2.8469, + "step": 2942 + }, + { + "epoch": 0.84, + "learning_rate": 3.336265024919379e-05, + "loss": 2.9083, + "step": 2943 + }, + { + "epoch": 0.84, + "learning_rate": 3.330401641747288e-05, + "loss": 2.9588, + "step": 2944 + }, + { + "epoch": 0.84, + "learning_rate": 3.324538258575198e-05, + "loss": 2.8951, + "step": 2945 + }, + { + "epoch": 0.84, + "learning_rate": 3.318674875403108e-05, + "loss": 2.898, + "step": 2946 + }, + { + "epoch": 0.84, + "learning_rate": 3.312811492231018e-05, + "loss": 3.0032, + "step": 2947 + }, + { + "epoch": 0.84, + "learning_rate": 3.306948109058927e-05, + "loss": 2.8667, + "step": 2948 + }, + { + "epoch": 0.84, + "learning_rate": 3.301084725886837e-05, + "loss": 2.911, + "step": 2949 + }, + { + "epoch": 0.84, + "learning_rate": 3.295221342714747e-05, + "loss": 2.9187, + "step": 2950 + }, + { + "epoch": 0.84, + "learning_rate": 3.289357959542656e-05, + "loss": 2.9395, + "step": 2951 + }, + { + "epoch": 0.84, + "learning_rate": 3.283494576370566e-05, + "loss": 2.8935, + "step": 2952 + }, + { + "epoch": 0.84, + "learning_rate": 3.277631193198475e-05, + "loss": 2.9818, + "step": 2953 + }, + { + "epoch": 0.84, + "learning_rate": 3.271767810026385e-05, + "loss": 2.9324, + "step": 2954 + }, + { + "epoch": 0.84, + "learning_rate": 3.265904426854295e-05, + "loss": 2.9162, + "step": 2955 + }, + { + "epoch": 0.84, + "learning_rate": 3.260041043682205e-05, + "loss": 2.9753, + "step": 2956 + }, + { + "epoch": 0.84, + "learning_rate": 3.254177660510114e-05, + "loss": 2.9244, + "step": 2957 + }, + { + "epoch": 0.84, + "learning_rate": 3.248314277338024e-05, + "loss": 2.9027, + "step": 2958 + }, + { + "epoch": 0.84, + "learning_rate": 3.242450894165934e-05, + "loss": 2.8489, + "step": 2959 + }, + { + "epoch": 0.84, + "learning_rate": 3.236587510993844e-05, + "loss": 2.9391, + "step": 2960 + }, + { + "epoch": 0.84, + "learning_rate": 3.230724127821753e-05, + "loss": 2.8787, + "step": 2961 + }, + { + "epoch": 0.84, + "learning_rate": 3.2248607446496634e-05, + "loss": 2.8082, + "step": 2962 + }, + { + "epoch": 0.84, + "learning_rate": 3.218997361477573e-05, + "loss": 2.8509, + "step": 2963 + }, + { + "epoch": 0.84, + "learning_rate": 3.213133978305482e-05, + "loss": 2.8784, + "step": 2964 + }, + { + "epoch": 0.84, + "learning_rate": 3.207270595133392e-05, + "loss": 2.9297, + "step": 2965 + }, + { + "epoch": 0.84, + "learning_rate": 3.201407211961302e-05, + "loss": 2.8596, + "step": 2966 + }, + { + "epoch": 0.84, + "learning_rate": 3.195543828789211e-05, + "loss": 2.8075, + "step": 2967 + }, + { + "epoch": 0.85, + "learning_rate": 3.1896804456171213e-05, + "loss": 2.9482, + "step": 2968 + }, + { + "epoch": 0.85, + "learning_rate": 3.183817062445031e-05, + "loss": 2.8439, + "step": 2969 + }, + { + "epoch": 0.85, + "learning_rate": 3.177953679272941e-05, + "loss": 2.9943, + "step": 2970 + }, + { + "epoch": 0.85, + "learning_rate": 3.17209029610085e-05, + "loss": 2.8396, + "step": 2971 + }, + { + "epoch": 0.85, + "learning_rate": 3.1662269129287604e-05, + "loss": 2.7967, + "step": 2972 + }, + { + "epoch": 0.85, + "learning_rate": 3.16036352975667e-05, + "loss": 2.9674, + "step": 2973 + }, + { + "epoch": 0.85, + "learning_rate": 3.15450014658458e-05, + "loss": 2.8858, + "step": 2974 + }, + { + "epoch": 0.85, + "learning_rate": 3.1486367634124894e-05, + "loss": 2.8675, + "step": 2975 + }, + { + "epoch": 0.85, + "learning_rate": 3.142773380240399e-05, + "loss": 2.9529, + "step": 2976 + }, + { + "epoch": 0.85, + "learning_rate": 3.136909997068308e-05, + "loss": 2.8245, + "step": 2977 + }, + { + "epoch": 0.85, + "learning_rate": 3.131046613896218e-05, + "loss": 2.9489, + "step": 2978 + }, + { + "epoch": 0.85, + "learning_rate": 3.125183230724128e-05, + "loss": 2.8507, + "step": 2979 + }, + { + "epoch": 0.85, + "learning_rate": 3.119319847552037e-05, + "loss": 2.8228, + "step": 2980 + }, + { + "epoch": 0.85, + "learning_rate": 3.1134564643799474e-05, + "loss": 2.9688, + "step": 2981 + }, + { + "epoch": 0.85, + "learning_rate": 3.107593081207857e-05, + "loss": 2.8488, + "step": 2982 + }, + { + "epoch": 0.85, + "learning_rate": 3.101729698035767e-05, + "loss": 2.8388, + "step": 2983 + }, + { + "epoch": 0.85, + "learning_rate": 3.095866314863676e-05, + "loss": 2.8861, + "step": 2984 + }, + { + "epoch": 0.85, + "learning_rate": 3.0900029316915864e-05, + "loss": 2.9253, + "step": 2985 + }, + { + "epoch": 0.85, + "learning_rate": 3.084139548519496e-05, + "loss": 2.8586, + "step": 2986 + }, + { + "epoch": 0.85, + "learning_rate": 3.078276165347406e-05, + "loss": 2.8695, + "step": 2987 + }, + { + "epoch": 0.85, + "learning_rate": 3.0724127821753154e-05, + "loss": 2.9596, + "step": 2988 + }, + { + "epoch": 0.85, + "learning_rate": 3.066549399003225e-05, + "loss": 2.8596, + "step": 2989 + }, + { + "epoch": 0.85, + "learning_rate": 3.060686015831134e-05, + "loss": 2.9009, + "step": 2990 + }, + { + "epoch": 0.85, + "learning_rate": 3.0548226326590444e-05, + "loss": 2.8724, + "step": 2991 + }, + { + "epoch": 0.85, + "learning_rate": 3.048959249486954e-05, + "loss": 2.8179, + "step": 2992 + }, + { + "epoch": 0.85, + "eval_loss": 3.1793813705444336, + "eval_runtime": 2939.5659, + "eval_samples_per_second": 6.969, + "eval_steps_per_second": 2.323, + "step": 2992 + }, + { + "epoch": 0.85, + "learning_rate": 3.043095866314864e-05, + "loss": 2.8459, + "step": 2993 + }, + { + "epoch": 0.85, + "learning_rate": 3.0372324831427734e-05, + "loss": 2.8515, + "step": 2994 + }, + { + "epoch": 0.85, + "learning_rate": 3.0313690999706835e-05, + "loss": 2.79, + "step": 2995 + }, + { + "epoch": 0.85, + "learning_rate": 3.025505716798593e-05, + "loss": 2.8396, + "step": 2996 + }, + { + "epoch": 0.85, + "learning_rate": 3.019642333626503e-05, + "loss": 2.7678, + "step": 2997 + }, + { + "epoch": 0.85, + "learning_rate": 3.0137789504544124e-05, + "loss": 2.895, + "step": 2998 + }, + { + "epoch": 0.85, + "learning_rate": 3.0079155672823222e-05, + "loss": 2.8841, + "step": 2999 + }, + { + "epoch": 0.85, + "learning_rate": 3.0020521841102316e-05, + "loss": 2.8433, + "step": 3000 + }, + { + "epoch": 0.85, + "learning_rate": 2.9961888009381417e-05, + "loss": 2.9279, + "step": 3001 + }, + { + "epoch": 0.85, + "learning_rate": 2.9903254177660512e-05, + "loss": 2.8772, + "step": 3002 + }, + { + "epoch": 0.86, + "learning_rate": 2.984462034593961e-05, + "loss": 2.8162, + "step": 3003 + }, + { + "epoch": 0.86, + "learning_rate": 2.9785986514218707e-05, + "loss": 2.8856, + "step": 3004 + }, + { + "epoch": 0.86, + "learning_rate": 2.97273526824978e-05, + "loss": 2.8785, + "step": 3005 + }, + { + "epoch": 0.86, + "learning_rate": 2.96687188507769e-05, + "loss": 2.8341, + "step": 3006 + }, + { + "epoch": 0.86, + "learning_rate": 2.9610085019055994e-05, + "loss": 2.894, + "step": 3007 + }, + { + "epoch": 0.86, + "learning_rate": 2.9551451187335095e-05, + "loss": 2.9335, + "step": 3008 + }, + { + "epoch": 0.86, + "learning_rate": 2.949281735561419e-05, + "loss": 2.8878, + "step": 3009 + }, + { + "epoch": 0.86, + "learning_rate": 2.9434183523893287e-05, + "loss": 2.8899, + "step": 3010 + }, + { + "epoch": 0.86, + "learning_rate": 2.9375549692172384e-05, + "loss": 2.8749, + "step": 3011 + }, + { + "epoch": 0.86, + "learning_rate": 2.9316915860451482e-05, + "loss": 2.9289, + "step": 3012 + }, + { + "epoch": 0.86, + "learning_rate": 2.9258282028730576e-05, + "loss": 2.9509, + "step": 3013 + }, + { + "epoch": 0.86, + "learning_rate": 2.9199648197009678e-05, + "loss": 2.9502, + "step": 3014 + }, + { + "epoch": 0.86, + "learning_rate": 2.9141014365288772e-05, + "loss": 2.9716, + "step": 3015 + }, + { + "epoch": 0.86, + "learning_rate": 2.908238053356787e-05, + "loss": 2.8487, + "step": 3016 + }, + { + "epoch": 0.86, + "learning_rate": 2.9023746701846964e-05, + "loss": 2.9988, + "step": 3017 + }, + { + "epoch": 0.86, + "learning_rate": 2.8965112870126065e-05, + "loss": 2.8262, + "step": 3018 + }, + { + "epoch": 0.86, + "learning_rate": 2.890647903840516e-05, + "loss": 2.8904, + "step": 3019 + }, + { + "epoch": 0.86, + "learning_rate": 2.884784520668426e-05, + "loss": 2.7983, + "step": 3020 + }, + { + "epoch": 0.86, + "learning_rate": 2.8789211374963355e-05, + "loss": 2.8936, + "step": 3021 + }, + { + "epoch": 0.86, + "learning_rate": 2.8730577543242452e-05, + "loss": 2.9329, + "step": 3022 + }, + { + "epoch": 0.86, + "learning_rate": 2.8671943711521547e-05, + "loss": 2.9266, + "step": 3023 + }, + { + "epoch": 0.86, + "learning_rate": 2.8613309879800648e-05, + "loss": 2.8731, + "step": 3024 + }, + { + "epoch": 0.86, + "learning_rate": 2.8554676048079742e-05, + "loss": 2.9438, + "step": 3025 + }, + { + "epoch": 0.86, + "learning_rate": 2.8496042216358843e-05, + "loss": 2.8051, + "step": 3026 + }, + { + "epoch": 0.86, + "learning_rate": 2.8437408384637938e-05, + "loss": 2.8465, + "step": 3027 + }, + { + "epoch": 0.86, + "learning_rate": 2.8378774552917035e-05, + "loss": 2.89, + "step": 3028 + }, + { + "epoch": 0.86, + "learning_rate": 2.832014072119613e-05, + "loss": 2.9555, + "step": 3029 + }, + { + "epoch": 0.86, + "learning_rate": 2.826150688947523e-05, + "loss": 2.9402, + "step": 3030 + }, + { + "epoch": 0.86, + "learning_rate": 2.8202873057754325e-05, + "loss": 2.8841, + "step": 3031 + }, + { + "epoch": 0.86, + "learning_rate": 2.8144239226033426e-05, + "loss": 2.9948, + "step": 3032 + }, + { + "epoch": 0.86, + "learning_rate": 2.808560539431252e-05, + "loss": 2.8503, + "step": 3033 + }, + { + "epoch": 0.86, + "learning_rate": 2.8026971562591615e-05, + "loss": 2.9322, + "step": 3034 + }, + { + "epoch": 0.86, + "learning_rate": 2.7968337730870712e-05, + "loss": 2.812, + "step": 3035 + }, + { + "epoch": 0.86, + "learning_rate": 2.7909703899149807e-05, + "loss": 2.8251, + "step": 3036 + }, + { + "epoch": 0.86, + "learning_rate": 2.7851070067428908e-05, + "loss": 2.9553, + "step": 3037 + }, + { + "epoch": 0.87, + "learning_rate": 2.7792436235708002e-05, + "loss": 2.942, + "step": 3038 + }, + { + "epoch": 0.87, + "learning_rate": 2.7733802403987103e-05, + "loss": 2.9353, + "step": 3039 + }, + { + "epoch": 0.87, + "learning_rate": 2.7675168572266198e-05, + "loss": 2.8833, + "step": 3040 + }, + { + "epoch": 0.87, + "learning_rate": 2.7616534740545295e-05, + "loss": 2.8876, + "step": 3041 + }, + { + "epoch": 0.87, + "learning_rate": 2.755790090882439e-05, + "loss": 2.9271, + "step": 3042 + }, + { + "epoch": 0.87, + "learning_rate": 2.749926707710349e-05, + "loss": 2.9971, + "step": 3043 + }, + { + "epoch": 0.87, + "learning_rate": 2.7440633245382585e-05, + "loss": 2.8194, + "step": 3044 + }, + { + "epoch": 0.87, + "learning_rate": 2.7381999413661686e-05, + "loss": 2.9262, + "step": 3045 + }, + { + "epoch": 0.87, + "learning_rate": 2.732336558194078e-05, + "loss": 2.9073, + "step": 3046 + }, + { + "epoch": 0.87, + "learning_rate": 2.7264731750219878e-05, + "loss": 2.9261, + "step": 3047 + }, + { + "epoch": 0.87, + "learning_rate": 2.7206097918498972e-05, + "loss": 2.8828, + "step": 3048 + }, + { + "epoch": 0.87, + "learning_rate": 2.7147464086778074e-05, + "loss": 2.9443, + "step": 3049 + }, + { + "epoch": 0.87, + "learning_rate": 2.7088830255057168e-05, + "loss": 2.9206, + "step": 3050 + }, + { + "epoch": 0.87, + "learning_rate": 2.703019642333627e-05, + "loss": 2.8375, + "step": 3051 + }, + { + "epoch": 0.87, + "learning_rate": 2.6971562591615363e-05, + "loss": 2.9127, + "step": 3052 + }, + { + "epoch": 0.87, + "learning_rate": 2.691292875989446e-05, + "loss": 2.8895, + "step": 3053 + }, + { + "epoch": 0.87, + "learning_rate": 2.6854294928173555e-05, + "loss": 2.9373, + "step": 3054 + }, + { + "epoch": 0.87, + "learning_rate": 2.6795661096452656e-05, + "loss": 3.0038, + "step": 3055 + }, + { + "epoch": 0.87, + "learning_rate": 2.673702726473175e-05, + "loss": 2.9043, + "step": 3056 + }, + { + "epoch": 0.87, + "learning_rate": 2.6678393433010852e-05, + "loss": 2.8431, + "step": 3057 + }, + { + "epoch": 0.87, + "learning_rate": 2.6619759601289946e-05, + "loss": 2.94, + "step": 3058 + }, + { + "epoch": 0.87, + "learning_rate": 2.6561125769569044e-05, + "loss": 2.9705, + "step": 3059 + }, + { + "epoch": 0.87, + "learning_rate": 2.6502491937848138e-05, + "loss": 2.8743, + "step": 3060 + }, + { + "epoch": 0.87, + "learning_rate": 2.6443858106127232e-05, + "loss": 2.9339, + "step": 3061 + }, + { + "epoch": 0.87, + "learning_rate": 2.6385224274406334e-05, + "loss": 2.8471, + "step": 3062 + }, + { + "epoch": 0.87, + "learning_rate": 2.6326590442685428e-05, + "loss": 2.9231, + "step": 3063 + }, + { + "epoch": 0.87, + "learning_rate": 2.626795661096453e-05, + "loss": 2.895, + "step": 3064 + }, + { + "epoch": 0.87, + "learning_rate": 2.6209322779243623e-05, + "loss": 2.8238, + "step": 3065 + }, + { + "epoch": 0.87, + "learning_rate": 2.615068894752272e-05, + "loss": 2.8911, + "step": 3066 + }, + { + "epoch": 0.87, + "learning_rate": 2.6092055115801815e-05, + "loss": 2.9757, + "step": 3067 + }, + { + "epoch": 0.87, + "learning_rate": 2.6033421284080916e-05, + "loss": 2.8924, + "step": 3068 + }, + { + "epoch": 0.87, + "learning_rate": 2.597478745236001e-05, + "loss": 2.9346, + "step": 3069 + }, + { + "epoch": 0.87, + "learning_rate": 2.5916153620639112e-05, + "loss": 2.909, + "step": 3070 + }, + { + "epoch": 0.87, + "learning_rate": 2.5857519788918206e-05, + "loss": 2.9504, + "step": 3071 + }, + { + "epoch": 0.87, + "learning_rate": 2.5798885957197304e-05, + "loss": 2.929, + "step": 3072 + }, + { + "epoch": 0.88, + "learning_rate": 2.5740252125476398e-05, + "loss": 3.0145, + "step": 3073 + }, + { + "epoch": 0.88, + "learning_rate": 2.56816182937555e-05, + "loss": 2.8655, + "step": 3074 + }, + { + "epoch": 0.88, + "learning_rate": 2.5622984462034594e-05, + "loss": 2.9567, + "step": 3075 + }, + { + "epoch": 0.88, + "learning_rate": 2.5564350630313695e-05, + "loss": 2.8592, + "step": 3076 + }, + { + "epoch": 0.88, + "learning_rate": 2.550571679859279e-05, + "loss": 2.8847, + "step": 3077 + }, + { + "epoch": 0.88, + "learning_rate": 2.5447082966871887e-05, + "loss": 2.8842, + "step": 3078 + }, + { + "epoch": 0.88, + "learning_rate": 2.538844913515098e-05, + "loss": 2.9071, + "step": 3079 + }, + { + "epoch": 0.88, + "learning_rate": 2.5329815303430082e-05, + "loss": 2.8896, + "step": 3080 + }, + { + "epoch": 0.88, + "learning_rate": 2.5271181471709176e-05, + "loss": 2.9599, + "step": 3081 + }, + { + "epoch": 0.88, + "learning_rate": 2.5212547639988278e-05, + "loss": 2.9521, + "step": 3082 + }, + { + "epoch": 0.88, + "learning_rate": 2.5153913808267372e-05, + "loss": 2.8697, + "step": 3083 + }, + { + "epoch": 0.88, + "learning_rate": 2.509527997654647e-05, + "loss": 2.8008, + "step": 3084 + }, + { + "epoch": 0.88, + "learning_rate": 2.5036646144825564e-05, + "loss": 2.9064, + "step": 3085 + }, + { + "epoch": 0.88, + "learning_rate": 2.497801231310466e-05, + "loss": 2.8085, + "step": 3086 + }, + { + "epoch": 0.88, + "learning_rate": 2.491937848138376e-05, + "loss": 2.8273, + "step": 3087 + }, + { + "epoch": 0.88, + "learning_rate": 2.4860744649662857e-05, + "loss": 2.8658, + "step": 3088 + }, + { + "epoch": 0.88, + "learning_rate": 2.4802110817941955e-05, + "loss": 2.9475, + "step": 3089 + }, + { + "epoch": 0.88, + "learning_rate": 2.4743476986221052e-05, + "loss": 2.9246, + "step": 3090 + }, + { + "epoch": 0.88, + "learning_rate": 2.4684843154500147e-05, + "loss": 2.8969, + "step": 3091 + }, + { + "epoch": 0.88, + "learning_rate": 2.4626209322779244e-05, + "loss": 2.8539, + "step": 3092 + }, + { + "epoch": 0.88, + "learning_rate": 2.4567575491058342e-05, + "loss": 2.8667, + "step": 3093 + }, + { + "epoch": 0.88, + "learning_rate": 2.450894165933744e-05, + "loss": 3.0136, + "step": 3094 + }, + { + "epoch": 0.88, + "learning_rate": 2.4450307827616538e-05, + "loss": 2.885, + "step": 3095 + }, + { + "epoch": 0.88, + "learning_rate": 2.4391673995895635e-05, + "loss": 2.8927, + "step": 3096 + }, + { + "epoch": 0.88, + "learning_rate": 2.433304016417473e-05, + "loss": 2.8121, + "step": 3097 + }, + { + "epoch": 0.88, + "learning_rate": 2.4274406332453827e-05, + "loss": 2.9706, + "step": 3098 + }, + { + "epoch": 0.88, + "learning_rate": 2.4215772500732925e-05, + "loss": 2.7851, + "step": 3099 + }, + { + "epoch": 0.88, + "learning_rate": 2.4157138669012023e-05, + "loss": 2.9091, + "step": 3100 + }, + { + "epoch": 0.88, + "learning_rate": 2.409850483729112e-05, + "loss": 2.9729, + "step": 3101 + }, + { + "epoch": 0.88, + "learning_rate": 2.4039871005570218e-05, + "loss": 2.8444, + "step": 3102 + }, + { + "epoch": 0.88, + "learning_rate": 2.3981237173849312e-05, + "loss": 2.8912, + "step": 3103 + }, + { + "epoch": 0.88, + "learning_rate": 2.3922603342128407e-05, + "loss": 2.9623, + "step": 3104 + }, + { + "epoch": 0.88, + "learning_rate": 2.3863969510407504e-05, + "loss": 2.8596, + "step": 3105 + }, + { + "epoch": 0.88, + "learning_rate": 2.3805335678686602e-05, + "loss": 2.9232, + "step": 3106 + }, + { + "epoch": 0.88, + "learning_rate": 2.37467018469657e-05, + "loss": 2.8836, + "step": 3107 + }, + { + "epoch": 0.89, + "learning_rate": 2.3688068015244798e-05, + "loss": 2.9154, + "step": 3108 + }, + { + "epoch": 0.89, + "learning_rate": 2.3629434183523895e-05, + "loss": 2.8813, + "step": 3109 + }, + { + "epoch": 0.89, + "learning_rate": 2.357080035180299e-05, + "loss": 2.8312, + "step": 3110 + }, + { + "epoch": 0.89, + "learning_rate": 2.3512166520082087e-05, + "loss": 2.9847, + "step": 3111 + }, + { + "epoch": 0.89, + "learning_rate": 2.3453532688361185e-05, + "loss": 2.8094, + "step": 3112 + }, + { + "epoch": 0.89, + "learning_rate": 2.3394898856640283e-05, + "loss": 2.9362, + "step": 3113 + }, + { + "epoch": 0.89, + "learning_rate": 2.333626502491938e-05, + "loss": 2.6475, + "step": 3114 + }, + { + "epoch": 0.89, + "learning_rate": 2.3277631193198475e-05, + "loss": 2.9112, + "step": 3115 + }, + { + "epoch": 0.89, + "learning_rate": 2.3218997361477572e-05, + "loss": 2.9251, + "step": 3116 + }, + { + "epoch": 0.89, + "learning_rate": 2.316036352975667e-05, + "loss": 2.9166, + "step": 3117 + }, + { + "epoch": 0.89, + "learning_rate": 2.3101729698035768e-05, + "loss": 3.003, + "step": 3118 + }, + { + "epoch": 0.89, + "learning_rate": 2.3043095866314866e-05, + "loss": 2.8892, + "step": 3119 + }, + { + "epoch": 0.89, + "learning_rate": 2.2984462034593963e-05, + "loss": 2.8297, + "step": 3120 + }, + { + "epoch": 0.89, + "learning_rate": 2.2925828202873058e-05, + "loss": 2.9204, + "step": 3121 + }, + { + "epoch": 0.89, + "learning_rate": 2.2867194371152155e-05, + "loss": 2.9136, + "step": 3122 + }, + { + "epoch": 0.89, + "learning_rate": 2.2808560539431253e-05, + "loss": 2.8663, + "step": 3123 + }, + { + "epoch": 0.89, + "learning_rate": 2.274992670771035e-05, + "loss": 2.8577, + "step": 3124 + }, + { + "epoch": 0.89, + "learning_rate": 2.269129287598945e-05, + "loss": 2.8965, + "step": 3125 + }, + { + "epoch": 0.89, + "learning_rate": 2.2632659044268546e-05, + "loss": 2.8784, + "step": 3126 + }, + { + "epoch": 0.89, + "learning_rate": 2.257402521254764e-05, + "loss": 2.8439, + "step": 3127 + }, + { + "epoch": 0.89, + "learning_rate": 2.2515391380826738e-05, + "loss": 2.95, + "step": 3128 + }, + { + "epoch": 0.89, + "learning_rate": 2.2456757549105836e-05, + "loss": 2.9214, + "step": 3129 + }, + { + "epoch": 0.89, + "learning_rate": 2.2398123717384934e-05, + "loss": 2.8409, + "step": 3130 + }, + { + "epoch": 0.89, + "learning_rate": 2.233948988566403e-05, + "loss": 2.8276, + "step": 3131 + }, + { + "epoch": 0.89, + "learning_rate": 2.2280856053943126e-05, + "loss": 2.9065, + "step": 3132 + }, + { + "epoch": 0.89, + "learning_rate": 2.2222222222222223e-05, + "loss": 2.8985, + "step": 3133 + }, + { + "epoch": 0.89, + "learning_rate": 2.2163588390501318e-05, + "loss": 2.917, + "step": 3134 + }, + { + "epoch": 0.89, + "learning_rate": 2.2104954558780415e-05, + "loss": 2.8759, + "step": 3135 + }, + { + "epoch": 0.89, + "learning_rate": 2.2046320727059513e-05, + "loss": 2.9307, + "step": 3136 + }, + { + "epoch": 0.89, + "learning_rate": 2.198768689533861e-05, + "loss": 2.9063, + "step": 3137 + }, + { + "epoch": 0.89, + "learning_rate": 2.192905306361771e-05, + "loss": 2.8303, + "step": 3138 + }, + { + "epoch": 0.89, + "learning_rate": 2.1870419231896806e-05, + "loss": 2.9144, + "step": 3139 + }, + { + "epoch": 0.89, + "learning_rate": 2.18117854001759e-05, + "loss": 2.9602, + "step": 3140 + }, + { + "epoch": 0.89, + "learning_rate": 2.1753151568454998e-05, + "loss": 2.9569, + "step": 3141 + }, + { + "epoch": 0.89, + "learning_rate": 2.1694517736734096e-05, + "loss": 2.9886, + "step": 3142 + }, + { + "epoch": 0.89, + "learning_rate": 2.1635883905013194e-05, + "loss": 2.8674, + "step": 3143 + }, + { + "epoch": 0.9, + "learning_rate": 2.157725007329229e-05, + "loss": 2.7881, + "step": 3144 + }, + { + "epoch": 0.9, + "learning_rate": 2.151861624157139e-05, + "loss": 2.8678, + "step": 3145 + }, + { + "epoch": 0.9, + "learning_rate": 2.1459982409850483e-05, + "loss": 2.83, + "step": 3146 + }, + { + "epoch": 0.9, + "learning_rate": 2.140134857812958e-05, + "loss": 2.8217, + "step": 3147 + }, + { + "epoch": 0.9, + "learning_rate": 2.134271474640868e-05, + "loss": 2.8975, + "step": 3148 + }, + { + "epoch": 0.9, + "learning_rate": 2.1284080914687776e-05, + "loss": 2.8039, + "step": 3149 + }, + { + "epoch": 0.9, + "learning_rate": 2.1225447082966874e-05, + "loss": 2.8653, + "step": 3150 + }, + { + "epoch": 0.9, + "learning_rate": 2.1166813251245972e-05, + "loss": 2.8633, + "step": 3151 + }, + { + "epoch": 0.9, + "learning_rate": 2.1108179419525066e-05, + "loss": 2.9128, + "step": 3152 + }, + { + "epoch": 0.9, + "learning_rate": 2.1049545587804164e-05, + "loss": 2.9029, + "step": 3153 + }, + { + "epoch": 0.9, + "learning_rate": 2.099091175608326e-05, + "loss": 2.7887, + "step": 3154 + }, + { + "epoch": 0.9, + "learning_rate": 2.093227792436236e-05, + "loss": 2.822, + "step": 3155 + }, + { + "epoch": 0.9, + "learning_rate": 2.0873644092641457e-05, + "loss": 2.9808, + "step": 3156 + }, + { + "epoch": 0.9, + "learning_rate": 2.0815010260920555e-05, + "loss": 2.9651, + "step": 3157 + }, + { + "epoch": 0.9, + "learning_rate": 2.075637642919965e-05, + "loss": 2.8369, + "step": 3158 + }, + { + "epoch": 0.9, + "learning_rate": 2.0697742597478747e-05, + "loss": 2.8609, + "step": 3159 + }, + { + "epoch": 0.9, + "learning_rate": 2.063910876575784e-05, + "loss": 2.8827, + "step": 3160 + }, + { + "epoch": 0.9, + "learning_rate": 2.058047493403694e-05, + "loss": 2.8084, + "step": 3161 + }, + { + "epoch": 0.9, + "learning_rate": 2.0521841102316036e-05, + "loss": 2.7938, + "step": 3162 + }, + { + "epoch": 0.9, + "learning_rate": 2.0463207270595134e-05, + "loss": 2.9767, + "step": 3163 + }, + { + "epoch": 0.9, + "learning_rate": 2.0404573438874232e-05, + "loss": 2.9485, + "step": 3164 + }, + { + "epoch": 0.9, + "learning_rate": 2.0345939607153326e-05, + "loss": 2.9543, + "step": 3165 + }, + { + "epoch": 0.9, + "learning_rate": 2.0287305775432424e-05, + "loss": 2.9186, + "step": 3166 + }, + { + "epoch": 0.9, + "learning_rate": 2.022867194371152e-05, + "loss": 2.9111, + "step": 3167 + }, + { + "epoch": 0.9, + "learning_rate": 2.017003811199062e-05, + "loss": 2.836, + "step": 3168 + }, + { + "epoch": 0.9, + "eval_loss": 3.160200357437134, + "eval_runtime": 2939.8509, + "eval_samples_per_second": 6.969, + "eval_steps_per_second": 2.323, + "step": 3168 + }, + { + "epoch": 0.9, + "learning_rate": 2.0111404280269717e-05, + "loss": 2.8292, + "step": 3169 + }, + { + "epoch": 0.9, + "learning_rate": 2.0052770448548815e-05, + "loss": 2.8157, + "step": 3170 + }, + { + "epoch": 0.9, + "learning_rate": 1.999413661682791e-05, + "loss": 2.8245, + "step": 3171 + }, + { + "epoch": 0.9, + "learning_rate": 1.9935502785107007e-05, + "loss": 2.9398, + "step": 3172 + }, + { + "epoch": 0.9, + "learning_rate": 1.9876868953386104e-05, + "loss": 2.796, + "step": 3173 + }, + { + "epoch": 0.9, + "learning_rate": 1.9818235121665202e-05, + "loss": 2.8846, + "step": 3174 + }, + { + "epoch": 0.9, + "learning_rate": 1.97596012899443e-05, + "loss": 2.7891, + "step": 3175 + }, + { + "epoch": 0.9, + "learning_rate": 1.9700967458223398e-05, + "loss": 2.9822, + "step": 3176 + }, + { + "epoch": 0.9, + "learning_rate": 1.9642333626502492e-05, + "loss": 2.9676, + "step": 3177 + }, + { + "epoch": 0.9, + "learning_rate": 1.958369979478159e-05, + "loss": 2.8681, + "step": 3178 + }, + { + "epoch": 0.91, + "learning_rate": 1.9525065963060687e-05, + "loss": 2.8187, + "step": 3179 + }, + { + "epoch": 0.91, + "learning_rate": 1.9466432131339785e-05, + "loss": 2.8531, + "step": 3180 + }, + { + "epoch": 0.91, + "learning_rate": 1.9407798299618883e-05, + "loss": 2.8477, + "step": 3181 + }, + { + "epoch": 0.91, + "learning_rate": 1.934916446789798e-05, + "loss": 2.9817, + "step": 3182 + }, + { + "epoch": 0.91, + "learning_rate": 1.9290530636177075e-05, + "loss": 2.8434, + "step": 3183 + }, + { + "epoch": 0.91, + "learning_rate": 1.9231896804456172e-05, + "loss": 2.9335, + "step": 3184 + }, + { + "epoch": 0.91, + "learning_rate": 1.917326297273527e-05, + "loss": 2.7555, + "step": 3185 + }, + { + "epoch": 0.91, + "learning_rate": 1.9114629141014368e-05, + "loss": 2.83, + "step": 3186 + }, + { + "epoch": 0.91, + "learning_rate": 1.9055995309293466e-05, + "loss": 2.7177, + "step": 3187 + }, + { + "epoch": 0.91, + "learning_rate": 1.899736147757256e-05, + "loss": 2.8393, + "step": 3188 + }, + { + "epoch": 0.91, + "learning_rate": 1.8938727645851658e-05, + "loss": 2.9234, + "step": 3189 + }, + { + "epoch": 0.91, + "learning_rate": 1.8880093814130752e-05, + "loss": 2.9678, + "step": 3190 + }, + { + "epoch": 0.91, + "learning_rate": 1.882145998240985e-05, + "loss": 2.9529, + "step": 3191 + }, + { + "epoch": 0.91, + "learning_rate": 1.8762826150688947e-05, + "loss": 2.9177, + "step": 3192 + }, + { + "epoch": 0.91, + "learning_rate": 1.8704192318968045e-05, + "loss": 2.8308, + "step": 3193 + }, + { + "epoch": 0.91, + "learning_rate": 1.8645558487247143e-05, + "loss": 2.8068, + "step": 3194 + }, + { + "epoch": 0.91, + "learning_rate": 1.858692465552624e-05, + "loss": 2.8895, + "step": 3195 + }, + { + "epoch": 0.91, + "learning_rate": 1.8528290823805335e-05, + "loss": 2.8562, + "step": 3196 + }, + { + "epoch": 0.91, + "learning_rate": 1.8469656992084432e-05, + "loss": 2.9185, + "step": 3197 + }, + { + "epoch": 0.91, + "learning_rate": 1.841102316036353e-05, + "loss": 2.8922, + "step": 3198 + }, + { + "epoch": 0.91, + "learning_rate": 1.8352389328642628e-05, + "loss": 2.7684, + "step": 3199 + }, + { + "epoch": 0.91, + "learning_rate": 1.8293755496921726e-05, + "loss": 2.9996, + "step": 3200 + }, + { + "epoch": 0.91, + "learning_rate": 1.8235121665200823e-05, + "loss": 2.8695, + "step": 3201 + }, + { + "epoch": 0.91, + "learning_rate": 1.8176487833479918e-05, + "loss": 2.8365, + "step": 3202 + }, + { + "epoch": 0.91, + "learning_rate": 1.8117854001759015e-05, + "loss": 2.8094, + "step": 3203 + }, + { + "epoch": 0.91, + "learning_rate": 1.8059220170038113e-05, + "loss": 2.9036, + "step": 3204 + }, + { + "epoch": 0.91, + "learning_rate": 1.800058633831721e-05, + "loss": 2.7422, + "step": 3205 + }, + { + "epoch": 0.91, + "learning_rate": 1.794195250659631e-05, + "loss": 2.84, + "step": 3206 + }, + { + "epoch": 0.91, + "learning_rate": 1.7883318674875406e-05, + "loss": 2.8855, + "step": 3207 + }, + { + "epoch": 0.91, + "learning_rate": 1.78246848431545e-05, + "loss": 2.8716, + "step": 3208 + }, + { + "epoch": 0.91, + "learning_rate": 1.7766051011433598e-05, + "loss": 2.9577, + "step": 3209 + }, + { + "epoch": 0.91, + "learning_rate": 1.7707417179712696e-05, + "loss": 2.9218, + "step": 3210 + }, + { + "epoch": 0.91, + "learning_rate": 1.7648783347991794e-05, + "loss": 2.8567, + "step": 3211 + }, + { + "epoch": 0.91, + "learning_rate": 1.759014951627089e-05, + "loss": 2.941, + "step": 3212 + }, + { + "epoch": 0.91, + "learning_rate": 1.7531515684549986e-05, + "loss": 2.8864, + "step": 3213 + }, + { + "epoch": 0.92, + "learning_rate": 1.7472881852829083e-05, + "loss": 2.8474, + "step": 3214 + }, + { + "epoch": 0.92, + "learning_rate": 1.741424802110818e-05, + "loss": 2.8396, + "step": 3215 + }, + { + "epoch": 0.92, + "learning_rate": 1.735561418938728e-05, + "loss": 2.813, + "step": 3216 + }, + { + "epoch": 0.92, + "learning_rate": 1.7296980357666373e-05, + "loss": 2.8872, + "step": 3217 + }, + { + "epoch": 0.92, + "learning_rate": 1.723834652594547e-05, + "loss": 2.9066, + "step": 3218 + }, + { + "epoch": 0.92, + "learning_rate": 1.717971269422457e-05, + "loss": 2.9, + "step": 3219 + }, + { + "epoch": 0.92, + "learning_rate": 1.7121078862503663e-05, + "loss": 2.9115, + "step": 3220 + }, + { + "epoch": 0.92, + "learning_rate": 1.706244503078276e-05, + "loss": 2.7752, + "step": 3221 + }, + { + "epoch": 0.92, + "learning_rate": 1.7003811199061858e-05, + "loss": 2.9374, + "step": 3222 + }, + { + "epoch": 0.92, + "learning_rate": 1.6945177367340956e-05, + "loss": 2.8273, + "step": 3223 + }, + { + "epoch": 0.92, + "learning_rate": 1.6886543535620054e-05, + "loss": 2.7782, + "step": 3224 + }, + { + "epoch": 0.92, + "learning_rate": 1.682790970389915e-05, + "loss": 2.9504, + "step": 3225 + }, + { + "epoch": 0.92, + "learning_rate": 1.6769275872178246e-05, + "loss": 2.8504, + "step": 3226 + }, + { + "epoch": 0.92, + "learning_rate": 1.6710642040457343e-05, + "loss": 2.8606, + "step": 3227 + }, + { + "epoch": 0.92, + "learning_rate": 1.665200820873644e-05, + "loss": 2.7783, + "step": 3228 + }, + { + "epoch": 0.92, + "learning_rate": 1.659337437701554e-05, + "loss": 2.8033, + "step": 3229 + }, + { + "epoch": 0.92, + "learning_rate": 1.6534740545294636e-05, + "loss": 2.8692, + "step": 3230 + }, + { + "epoch": 0.92, + "learning_rate": 1.6476106713573734e-05, + "loss": 2.8594, + "step": 3231 + }, + { + "epoch": 0.92, + "learning_rate": 1.641747288185283e-05, + "loss": 2.8659, + "step": 3232 + }, + { + "epoch": 0.92, + "learning_rate": 1.6358839050131926e-05, + "loss": 2.8606, + "step": 3233 + }, + { + "epoch": 0.92, + "learning_rate": 1.6300205218411024e-05, + "loss": 2.7963, + "step": 3234 + }, + { + "epoch": 0.92, + "learning_rate": 1.624157138669012e-05, + "loss": 2.8598, + "step": 3235 + }, + { + "epoch": 0.92, + "learning_rate": 1.618293755496922e-05, + "loss": 2.8262, + "step": 3236 + }, + { + "epoch": 0.92, + "learning_rate": 1.6124303723248317e-05, + "loss": 2.8283, + "step": 3237 + }, + { + "epoch": 0.92, + "learning_rate": 1.606566989152741e-05, + "loss": 3.0016, + "step": 3238 + }, + { + "epoch": 0.92, + "learning_rate": 1.600703605980651e-05, + "loss": 2.8427, + "step": 3239 + }, + { + "epoch": 0.92, + "learning_rate": 1.5948402228085607e-05, + "loss": 3.0162, + "step": 3240 + }, + { + "epoch": 0.92, + "learning_rate": 1.5889768396364704e-05, + "loss": 2.7804, + "step": 3241 + }, + { + "epoch": 0.92, + "learning_rate": 1.5831134564643802e-05, + "loss": 2.8232, + "step": 3242 + }, + { + "epoch": 0.92, + "learning_rate": 1.57725007329229e-05, + "loss": 2.9136, + "step": 3243 + }, + { + "epoch": 0.92, + "learning_rate": 1.5713866901201994e-05, + "loss": 2.8262, + "step": 3244 + }, + { + "epoch": 0.92, + "learning_rate": 1.565523306948109e-05, + "loss": 2.8576, + "step": 3245 + }, + { + "epoch": 0.92, + "learning_rate": 1.5596599237760186e-05, + "loss": 2.85, + "step": 3246 + }, + { + "epoch": 0.92, + "learning_rate": 1.5537965406039284e-05, + "loss": 2.9155, + "step": 3247 + }, + { + "epoch": 0.92, + "learning_rate": 1.547933157431838e-05, + "loss": 2.9287, + "step": 3248 + }, + { + "epoch": 0.93, + "learning_rate": 1.542069774259748e-05, + "loss": 2.9685, + "step": 3249 + }, + { + "epoch": 0.93, + "learning_rate": 1.5362063910876577e-05, + "loss": 2.9452, + "step": 3250 + }, + { + "epoch": 0.93, + "learning_rate": 1.530343007915567e-05, + "loss": 2.839, + "step": 3251 + }, + { + "epoch": 0.93, + "learning_rate": 1.524479624743477e-05, + "loss": 2.8205, + "step": 3252 + }, + { + "epoch": 0.93, + "learning_rate": 1.5186162415713867e-05, + "loss": 2.965, + "step": 3253 + }, + { + "epoch": 0.93, + "learning_rate": 1.5127528583992964e-05, + "loss": 2.9346, + "step": 3254 + }, + { + "epoch": 0.93, + "learning_rate": 1.5068894752272062e-05, + "loss": 2.8178, + "step": 3255 + }, + { + "epoch": 0.93, + "learning_rate": 1.5010260920551158e-05, + "loss": 2.7931, + "step": 3256 + }, + { + "epoch": 0.93, + "learning_rate": 1.4951627088830256e-05, + "loss": 2.8281, + "step": 3257 + }, + { + "epoch": 0.93, + "learning_rate": 1.4892993257109354e-05, + "loss": 2.8357, + "step": 3258 + }, + { + "epoch": 0.93, + "learning_rate": 1.483435942538845e-05, + "loss": 2.8288, + "step": 3259 + }, + { + "epoch": 0.93, + "learning_rate": 1.4775725593667547e-05, + "loss": 2.8825, + "step": 3260 + }, + { + "epoch": 0.93, + "learning_rate": 1.4717091761946643e-05, + "loss": 2.7498, + "step": 3261 + }, + { + "epoch": 0.93, + "learning_rate": 1.4658457930225741e-05, + "loss": 2.806, + "step": 3262 + }, + { + "epoch": 0.93, + "learning_rate": 1.4599824098504839e-05, + "loss": 2.9031, + "step": 3263 + }, + { + "epoch": 0.93, + "learning_rate": 1.4541190266783935e-05, + "loss": 2.8699, + "step": 3264 + }, + { + "epoch": 0.93, + "learning_rate": 1.4482556435063032e-05, + "loss": 2.9579, + "step": 3265 + }, + { + "epoch": 0.93, + "learning_rate": 1.442392260334213e-05, + "loss": 2.7564, + "step": 3266 + }, + { + "epoch": 0.93, + "learning_rate": 1.4365288771621226e-05, + "loss": 2.9202, + "step": 3267 + }, + { + "epoch": 0.93, + "learning_rate": 1.4306654939900324e-05, + "loss": 2.8848, + "step": 3268 + }, + { + "epoch": 0.93, + "learning_rate": 1.4248021108179422e-05, + "loss": 2.8895, + "step": 3269 + }, + { + "epoch": 0.93, + "learning_rate": 1.4189387276458518e-05, + "loss": 2.8821, + "step": 3270 + }, + { + "epoch": 0.93, + "learning_rate": 1.4130753444737615e-05, + "loss": 2.9572, + "step": 3271 + }, + { + "epoch": 0.93, + "learning_rate": 1.4072119613016713e-05, + "loss": 2.9551, + "step": 3272 + }, + { + "epoch": 0.93, + "learning_rate": 1.4013485781295807e-05, + "loss": 2.7267, + "step": 3273 + }, + { + "epoch": 0.93, + "learning_rate": 1.3954851949574903e-05, + "loss": 2.7621, + "step": 3274 + }, + { + "epoch": 0.93, + "learning_rate": 1.3896218117854001e-05, + "loss": 2.8636, + "step": 3275 + }, + { + "epoch": 0.93, + "learning_rate": 1.3837584286133099e-05, + "loss": 2.8203, + "step": 3276 + }, + { + "epoch": 0.93, + "learning_rate": 1.3778950454412195e-05, + "loss": 2.7836, + "step": 3277 + }, + { + "epoch": 0.93, + "learning_rate": 1.3720316622691292e-05, + "loss": 2.8999, + "step": 3278 + }, + { + "epoch": 0.93, + "learning_rate": 1.366168279097039e-05, + "loss": 2.851, + "step": 3279 + }, + { + "epoch": 0.93, + "learning_rate": 1.3603048959249486e-05, + "loss": 2.862, + "step": 3280 + }, + { + "epoch": 0.93, + "learning_rate": 1.3544415127528584e-05, + "loss": 2.8626, + "step": 3281 + }, + { + "epoch": 0.93, + "learning_rate": 1.3485781295807682e-05, + "loss": 2.9474, + "step": 3282 + }, + { + "epoch": 0.93, + "learning_rate": 1.3427147464086778e-05, + "loss": 2.8565, + "step": 3283 + }, + { + "epoch": 0.94, + "learning_rate": 1.3368513632365875e-05, + "loss": 2.9163, + "step": 3284 + }, + { + "epoch": 0.94, + "learning_rate": 1.3309879800644973e-05, + "loss": 2.8821, + "step": 3285 + }, + { + "epoch": 0.94, + "learning_rate": 1.3251245968924069e-05, + "loss": 2.8121, + "step": 3286 + }, + { + "epoch": 0.94, + "learning_rate": 1.3192612137203167e-05, + "loss": 2.9233, + "step": 3287 + }, + { + "epoch": 0.94, + "learning_rate": 1.3133978305482264e-05, + "loss": 2.9479, + "step": 3288 + }, + { + "epoch": 0.94, + "learning_rate": 1.307534447376136e-05, + "loss": 2.8446, + "step": 3289 + }, + { + "epoch": 0.94, + "learning_rate": 1.3016710642040458e-05, + "loss": 2.8088, + "step": 3290 + }, + { + "epoch": 0.94, + "learning_rate": 1.2958076810319556e-05, + "loss": 2.8534, + "step": 3291 + }, + { + "epoch": 0.94, + "learning_rate": 1.2899442978598652e-05, + "loss": 2.7208, + "step": 3292 + }, + { + "epoch": 0.94, + "learning_rate": 1.284080914687775e-05, + "loss": 2.8408, + "step": 3293 + }, + { + "epoch": 0.94, + "learning_rate": 1.2782175315156847e-05, + "loss": 2.8883, + "step": 3294 + }, + { + "epoch": 0.94, + "learning_rate": 1.2723541483435943e-05, + "loss": 2.8794, + "step": 3295 + }, + { + "epoch": 0.94, + "learning_rate": 1.2664907651715041e-05, + "loss": 2.8654, + "step": 3296 + }, + { + "epoch": 0.94, + "learning_rate": 1.2606273819994139e-05, + "loss": 2.8625, + "step": 3297 + }, + { + "epoch": 0.94, + "learning_rate": 1.2547639988273235e-05, + "loss": 2.9146, + "step": 3298 + }, + { + "epoch": 0.94, + "learning_rate": 1.248900615655233e-05, + "loss": 2.839, + "step": 3299 + }, + { + "epoch": 0.94, + "learning_rate": 1.2430372324831428e-05, + "loss": 2.9716, + "step": 3300 + }, + { + "epoch": 0.94, + "learning_rate": 1.2371738493110526e-05, + "loss": 2.8428, + "step": 3301 + }, + { + "epoch": 0.94, + "learning_rate": 1.2313104661389622e-05, + "loss": 2.9266, + "step": 3302 + }, + { + "epoch": 0.94, + "learning_rate": 1.225447082966872e-05, + "loss": 2.8197, + "step": 3303 + }, + { + "epoch": 0.94, + "learning_rate": 1.2195836997947818e-05, + "loss": 2.9407, + "step": 3304 + }, + { + "epoch": 0.94, + "learning_rate": 1.2137203166226914e-05, + "loss": 2.8513, + "step": 3305 + }, + { + "epoch": 0.94, + "learning_rate": 1.2078569334506011e-05, + "loss": 2.8183, + "step": 3306 + }, + { + "epoch": 0.94, + "learning_rate": 1.2019935502785109e-05, + "loss": 2.8864, + "step": 3307 + }, + { + "epoch": 0.94, + "learning_rate": 1.1961301671064203e-05, + "loss": 2.9143, + "step": 3308 + }, + { + "epoch": 0.94, + "learning_rate": 1.1902667839343301e-05, + "loss": 2.8703, + "step": 3309 + }, + { + "epoch": 0.94, + "learning_rate": 1.1844034007622399e-05, + "loss": 2.9207, + "step": 3310 + }, + { + "epoch": 0.94, + "learning_rate": 1.1785400175901495e-05, + "loss": 2.9206, + "step": 3311 + }, + { + "epoch": 0.94, + "learning_rate": 1.1726766344180592e-05, + "loss": 2.8417, + "step": 3312 + }, + { + "epoch": 0.94, + "learning_rate": 1.166813251245969e-05, + "loss": 2.7817, + "step": 3313 + }, + { + "epoch": 0.94, + "learning_rate": 1.1609498680738786e-05, + "loss": 2.8918, + "step": 3314 + }, + { + "epoch": 0.94, + "learning_rate": 1.1550864849017884e-05, + "loss": 2.8252, + "step": 3315 + }, + { + "epoch": 0.94, + "learning_rate": 1.1492231017296982e-05, + "loss": 2.8636, + "step": 3316 + }, + { + "epoch": 0.94, + "learning_rate": 1.1433597185576078e-05, + "loss": 2.9615, + "step": 3317 + }, + { + "epoch": 0.94, + "learning_rate": 1.1374963353855175e-05, + "loss": 2.9234, + "step": 3318 + }, + { + "epoch": 0.95, + "learning_rate": 1.1316329522134273e-05, + "loss": 2.8497, + "step": 3319 + }, + { + "epoch": 0.95, + "learning_rate": 1.1257695690413369e-05, + "loss": 2.9749, + "step": 3320 + }, + { + "epoch": 0.95, + "learning_rate": 1.1199061858692467e-05, + "loss": 2.9241, + "step": 3321 + }, + { + "epoch": 0.95, + "learning_rate": 1.1140428026971563e-05, + "loss": 2.873, + "step": 3322 + }, + { + "epoch": 0.95, + "learning_rate": 1.1081794195250659e-05, + "loss": 2.851, + "step": 3323 + }, + { + "epoch": 0.95, + "learning_rate": 1.1023160363529757e-05, + "loss": 2.8547, + "step": 3324 + }, + { + "epoch": 0.95, + "learning_rate": 1.0964526531808854e-05, + "loss": 2.8402, + "step": 3325 + }, + { + "epoch": 0.95, + "learning_rate": 1.090589270008795e-05, + "loss": 2.9167, + "step": 3326 + }, + { + "epoch": 0.95, + "learning_rate": 1.0847258868367048e-05, + "loss": 2.8132, + "step": 3327 + }, + { + "epoch": 0.95, + "learning_rate": 1.0788625036646146e-05, + "loss": 2.9386, + "step": 3328 + }, + { + "epoch": 0.95, + "learning_rate": 1.0729991204925242e-05, + "loss": 2.8653, + "step": 3329 + }, + { + "epoch": 0.95, + "learning_rate": 1.067135737320434e-05, + "loss": 2.862, + "step": 3330 + }, + { + "epoch": 0.95, + "learning_rate": 1.0612723541483437e-05, + "loss": 2.7229, + "step": 3331 + }, + { + "epoch": 0.95, + "learning_rate": 1.0554089709762533e-05, + "loss": 2.8003, + "step": 3332 + }, + { + "epoch": 0.95, + "learning_rate": 1.049545587804163e-05, + "loss": 2.8708, + "step": 3333 + }, + { + "epoch": 0.95, + "learning_rate": 1.0436822046320728e-05, + "loss": 2.8852, + "step": 3334 + }, + { + "epoch": 0.95, + "learning_rate": 1.0378188214599825e-05, + "loss": 2.8857, + "step": 3335 + }, + { + "epoch": 0.95, + "learning_rate": 1.031955438287892e-05, + "loss": 2.7855, + "step": 3336 + }, + { + "epoch": 0.95, + "learning_rate": 1.0260920551158018e-05, + "loss": 2.8554, + "step": 3337 + }, + { + "epoch": 0.95, + "learning_rate": 1.0202286719437116e-05, + "loss": 2.8743, + "step": 3338 + }, + { + "epoch": 0.95, + "learning_rate": 1.0143652887716212e-05, + "loss": 2.8712, + "step": 3339 + }, + { + "epoch": 0.95, + "learning_rate": 1.008501905599531e-05, + "loss": 3.0037, + "step": 3340 + }, + { + "epoch": 0.95, + "learning_rate": 1.0026385224274407e-05, + "loss": 2.8358, + "step": 3341 + }, + { + "epoch": 0.95, + "learning_rate": 9.967751392553503e-06, + "loss": 2.869, + "step": 3342 + }, + { + "epoch": 0.95, + "learning_rate": 9.909117560832601e-06, + "loss": 2.8204, + "step": 3343 + }, + { + "epoch": 0.95, + "learning_rate": 9.850483729111699e-06, + "loss": 2.9861, + "step": 3344 + }, + { + "epoch": 0.95, + "eval_loss": 3.1451783180236816, + "eval_runtime": 2939.0886, + "eval_samples_per_second": 6.971, + "eval_steps_per_second": 2.324, + "step": 3344 + }, + { + "epoch": 0.95, + "learning_rate": 9.791849897390795e-06, + "loss": 2.8953, + "step": 3345 + }, + { + "epoch": 0.95, + "learning_rate": 9.733216065669893e-06, + "loss": 2.8766, + "step": 3346 + }, + { + "epoch": 0.95, + "learning_rate": 9.67458223394899e-06, + "loss": 2.9005, + "step": 3347 + }, + { + "epoch": 0.95, + "learning_rate": 9.615948402228086e-06, + "loss": 2.8427, + "step": 3348 + }, + { + "epoch": 0.95, + "learning_rate": 9.557314570507184e-06, + "loss": 2.8354, + "step": 3349 + }, + { + "epoch": 0.95, + "learning_rate": 9.49868073878628e-06, + "loss": 2.8795, + "step": 3350 + }, + { + "epoch": 0.95, + "learning_rate": 9.440046907065376e-06, + "loss": 2.8449, + "step": 3351 + }, + { + "epoch": 0.95, + "learning_rate": 9.381413075344474e-06, + "loss": 2.9221, + "step": 3352 + }, + { + "epoch": 0.95, + "learning_rate": 9.322779243623571e-06, + "loss": 2.8642, + "step": 3353 + }, + { + "epoch": 0.96, + "learning_rate": 9.264145411902667e-06, + "loss": 2.8635, + "step": 3354 + }, + { + "epoch": 0.96, + "learning_rate": 9.205511580181765e-06, + "loss": 2.9202, + "step": 3355 + }, + { + "epoch": 0.96, + "learning_rate": 9.146877748460863e-06, + "loss": 2.8686, + "step": 3356 + }, + { + "epoch": 0.96, + "learning_rate": 9.088243916739959e-06, + "loss": 2.9704, + "step": 3357 + }, + { + "epoch": 0.96, + "learning_rate": 9.029610085019057e-06, + "loss": 2.8669, + "step": 3358 + }, + { + "epoch": 0.96, + "learning_rate": 8.970976253298154e-06, + "loss": 2.8702, + "step": 3359 + }, + { + "epoch": 0.96, + "learning_rate": 8.91234242157725e-06, + "loss": 2.8498, + "step": 3360 + }, + { + "epoch": 0.96, + "learning_rate": 8.853708589856348e-06, + "loss": 2.8873, + "step": 3361 + }, + { + "epoch": 0.96, + "learning_rate": 8.795074758135446e-06, + "loss": 2.8563, + "step": 3362 + }, + { + "epoch": 0.96, + "learning_rate": 8.736440926414542e-06, + "loss": 2.9415, + "step": 3363 + }, + { + "epoch": 0.96, + "learning_rate": 8.67780709469364e-06, + "loss": 2.8061, + "step": 3364 + }, + { + "epoch": 0.96, + "learning_rate": 8.619173262972735e-06, + "loss": 2.7686, + "step": 3365 + }, + { + "epoch": 0.96, + "learning_rate": 8.560539431251831e-06, + "loss": 2.9245, + "step": 3366 + }, + { + "epoch": 0.96, + "learning_rate": 8.501905599530929e-06, + "loss": 2.882, + "step": 3367 + }, + { + "epoch": 0.96, + "learning_rate": 8.443271767810027e-06, + "loss": 2.7816, + "step": 3368 + }, + { + "epoch": 0.96, + "learning_rate": 8.384637936089123e-06, + "loss": 2.9509, + "step": 3369 + }, + { + "epoch": 0.96, + "learning_rate": 8.32600410436822e-06, + "loss": 2.8633, + "step": 3370 + }, + { + "epoch": 0.96, + "learning_rate": 8.267370272647318e-06, + "loss": 2.9092, + "step": 3371 + }, + { + "epoch": 0.96, + "learning_rate": 8.208736440926414e-06, + "loss": 2.8923, + "step": 3372 + }, + { + "epoch": 0.96, + "learning_rate": 8.150102609205512e-06, + "loss": 2.8727, + "step": 3373 + }, + { + "epoch": 0.96, + "learning_rate": 8.09146877748461e-06, + "loss": 2.8452, + "step": 3374 + }, + { + "epoch": 0.96, + "learning_rate": 8.032834945763706e-06, + "loss": 2.8718, + "step": 3375 + }, + { + "epoch": 0.96, + "learning_rate": 7.974201114042803e-06, + "loss": 2.8629, + "step": 3376 + }, + { + "epoch": 0.96, + "learning_rate": 7.915567282321901e-06, + "loss": 2.9427, + "step": 3377 + }, + { + "epoch": 0.96, + "learning_rate": 7.856933450600997e-06, + "loss": 2.8703, + "step": 3378 + }, + { + "epoch": 0.96, + "learning_rate": 7.798299618880093e-06, + "loss": 2.851, + "step": 3379 + }, + { + "epoch": 0.96, + "learning_rate": 7.73966578715919e-06, + "loss": 2.922, + "step": 3380 + }, + { + "epoch": 0.96, + "learning_rate": 7.681031955438289e-06, + "loss": 2.9027, + "step": 3381 + }, + { + "epoch": 0.96, + "learning_rate": 7.622398123717385e-06, + "loss": 2.8629, + "step": 3382 + }, + { + "epoch": 0.96, + "learning_rate": 7.563764291996482e-06, + "loss": 2.8226, + "step": 3383 + }, + { + "epoch": 0.96, + "learning_rate": 7.505130460275579e-06, + "loss": 2.9194, + "step": 3384 + }, + { + "epoch": 0.96, + "learning_rate": 7.446496628554677e-06, + "loss": 2.8865, + "step": 3385 + }, + { + "epoch": 0.96, + "learning_rate": 7.387862796833774e-06, + "loss": 2.7308, + "step": 3386 + }, + { + "epoch": 0.96, + "learning_rate": 7.3292289651128705e-06, + "loss": 2.809, + "step": 3387 + }, + { + "epoch": 0.96, + "learning_rate": 7.270595133391967e-06, + "loss": 2.8759, + "step": 3388 + }, + { + "epoch": 0.97, + "learning_rate": 7.211961301671065e-06, + "loss": 2.9575, + "step": 3389 + }, + { + "epoch": 0.97, + "learning_rate": 7.153327469950162e-06, + "loss": 2.8045, + "step": 3390 + }, + { + "epoch": 0.97, + "learning_rate": 7.094693638229259e-06, + "loss": 2.8413, + "step": 3391 + }, + { + "epoch": 0.97, + "learning_rate": 7.0360598065083565e-06, + "loss": 2.8926, + "step": 3392 + }, + { + "epoch": 0.97, + "learning_rate": 6.977425974787452e-06, + "loss": 2.8948, + "step": 3393 + }, + { + "epoch": 0.97, + "learning_rate": 6.918792143066549e-06, + "loss": 2.855, + "step": 3394 + }, + { + "epoch": 0.97, + "learning_rate": 6.860158311345646e-06, + "loss": 2.8372, + "step": 3395 + }, + { + "epoch": 0.97, + "learning_rate": 6.801524479624743e-06, + "loss": 2.8268, + "step": 3396 + }, + { + "epoch": 0.97, + "learning_rate": 6.742890647903841e-06, + "loss": 2.8013, + "step": 3397 + }, + { + "epoch": 0.97, + "learning_rate": 6.684256816182938e-06, + "loss": 2.8179, + "step": 3398 + }, + { + "epoch": 0.97, + "learning_rate": 6.6256229844620345e-06, + "loss": 2.875, + "step": 3399 + }, + { + "epoch": 0.97, + "learning_rate": 6.566989152741132e-06, + "loss": 2.8707, + "step": 3400 + }, + { + "epoch": 0.97, + "learning_rate": 6.508355321020229e-06, + "loss": 2.8618, + "step": 3401 + }, + { + "epoch": 0.97, + "learning_rate": 6.449721489299326e-06, + "loss": 2.8761, + "step": 3402 + }, + { + "epoch": 0.97, + "learning_rate": 6.391087657578424e-06, + "loss": 2.8959, + "step": 3403 + }, + { + "epoch": 0.97, + "learning_rate": 6.3324538258575205e-06, + "loss": 2.8393, + "step": 3404 + }, + { + "epoch": 0.97, + "learning_rate": 6.273819994136617e-06, + "loss": 2.9073, + "step": 3405 + }, + { + "epoch": 0.97, + "learning_rate": 6.215186162415714e-06, + "loss": 2.9331, + "step": 3406 + }, + { + "epoch": 0.97, + "learning_rate": 6.156552330694811e-06, + "loss": 2.8051, + "step": 3407 + }, + { + "epoch": 0.97, + "learning_rate": 6.097918498973909e-06, + "loss": 2.8643, + "step": 3408 + }, + { + "epoch": 0.97, + "learning_rate": 6.039284667253006e-06, + "loss": 2.9364, + "step": 3409 + }, + { + "epoch": 0.97, + "learning_rate": 5.980650835532102e-06, + "loss": 2.8516, + "step": 3410 + }, + { + "epoch": 0.97, + "learning_rate": 5.922017003811199e-06, + "loss": 2.9023, + "step": 3411 + }, + { + "epoch": 0.97, + "learning_rate": 5.863383172090296e-06, + "loss": 2.863, + "step": 3412 + }, + { + "epoch": 0.97, + "learning_rate": 5.804749340369393e-06, + "loss": 2.8365, + "step": 3413 + }, + { + "epoch": 0.97, + "learning_rate": 5.746115508648491e-06, + "loss": 2.8407, + "step": 3414 + }, + { + "epoch": 0.97, + "learning_rate": 5.687481676927588e-06, + "loss": 2.7719, + "step": 3415 + }, + { + "epoch": 0.97, + "learning_rate": 5.6288478452066845e-06, + "loss": 2.8312, + "step": 3416 + }, + { + "epoch": 0.97, + "learning_rate": 5.570214013485781e-06, + "loss": 2.8941, + "step": 3417 + }, + { + "epoch": 0.97, + "learning_rate": 5.511580181764878e-06, + "loss": 2.8098, + "step": 3418 + }, + { + "epoch": 0.97, + "learning_rate": 5.452946350043975e-06, + "loss": 2.8687, + "step": 3419 + }, + { + "epoch": 0.97, + "learning_rate": 5.394312518323073e-06, + "loss": 2.8045, + "step": 3420 + }, + { + "epoch": 0.97, + "learning_rate": 5.33567868660217e-06, + "loss": 2.6793, + "step": 3421 + }, + { + "epoch": 0.97, + "learning_rate": 5.2770448548812665e-06, + "loss": 2.8539, + "step": 3422 + }, + { + "epoch": 0.97, + "learning_rate": 5.218411023160364e-06, + "loss": 2.8605, + "step": 3423 + }, + { + "epoch": 0.98, + "learning_rate": 5.15977719143946e-06, + "loss": 2.834, + "step": 3424 + }, + { + "epoch": 0.98, + "learning_rate": 5.101143359718558e-06, + "loss": 2.8213, + "step": 3425 + }, + { + "epoch": 0.98, + "learning_rate": 5.042509527997655e-06, + "loss": 2.8129, + "step": 3426 + }, + { + "epoch": 0.98, + "learning_rate": 4.983875696276752e-06, + "loss": 2.9176, + "step": 3427 + }, + { + "epoch": 0.98, + "learning_rate": 4.925241864555849e-06, + "loss": 2.895, + "step": 3428 + }, + { + "epoch": 0.98, + "learning_rate": 4.866608032834946e-06, + "loss": 2.7044, + "step": 3429 + }, + { + "epoch": 0.98, + "learning_rate": 4.807974201114043e-06, + "loss": 2.891, + "step": 3430 + }, + { + "epoch": 0.98, + "learning_rate": 4.74934036939314e-06, + "loss": 2.7879, + "step": 3431 + }, + { + "epoch": 0.98, + "learning_rate": 4.690706537672237e-06, + "loss": 2.8971, + "step": 3432 + }, + { + "epoch": 0.98, + "learning_rate": 4.632072705951334e-06, + "loss": 2.8768, + "step": 3433 + }, + { + "epoch": 0.98, + "learning_rate": 4.573438874230431e-06, + "loss": 2.7405, + "step": 3434 + }, + { + "epoch": 0.98, + "learning_rate": 4.514805042509528e-06, + "loss": 2.8209, + "step": 3435 + }, + { + "epoch": 0.98, + "learning_rate": 4.456171210788625e-06, + "loss": 2.8784, + "step": 3436 + }, + { + "epoch": 0.98, + "learning_rate": 4.397537379067723e-06, + "loss": 2.7199, + "step": 3437 + }, + { + "epoch": 0.98, + "learning_rate": 4.33890354734682e-06, + "loss": 2.912, + "step": 3438 + }, + { + "epoch": 0.98, + "learning_rate": 4.280269715625916e-06, + "loss": 2.8641, + "step": 3439 + }, + { + "epoch": 0.98, + "learning_rate": 4.221635883905013e-06, + "loss": 2.7801, + "step": 3440 + }, + { + "epoch": 0.98, + "learning_rate": 4.16300205218411e-06, + "loss": 2.8512, + "step": 3441 + }, + { + "epoch": 0.98, + "learning_rate": 4.104368220463207e-06, + "loss": 2.7977, + "step": 3442 + }, + { + "epoch": 0.98, + "learning_rate": 4.045734388742305e-06, + "loss": 2.8873, + "step": 3443 + }, + { + "epoch": 0.98, + "learning_rate": 3.987100557021402e-06, + "loss": 2.8778, + "step": 3444 + }, + { + "epoch": 0.98, + "learning_rate": 3.9284667253004985e-06, + "loss": 2.731, + "step": 3445 + }, + { + "epoch": 0.98, + "learning_rate": 3.869832893579595e-06, + "loss": 2.9343, + "step": 3446 + }, + { + "epoch": 0.98, + "learning_rate": 3.8111990618586927e-06, + "loss": 2.7655, + "step": 3447 + }, + { + "epoch": 0.98, + "learning_rate": 3.7525652301377895e-06, + "loss": 2.9145, + "step": 3448 + }, + { + "epoch": 0.98, + "learning_rate": 3.693931398416887e-06, + "loss": 2.7719, + "step": 3449 + }, + { + "epoch": 0.98, + "learning_rate": 3.6352975666959837e-06, + "loss": 2.8715, + "step": 3450 + }, + { + "epoch": 0.98, + "learning_rate": 3.576663734975081e-06, + "loss": 2.7961, + "step": 3451 + }, + { + "epoch": 0.98, + "learning_rate": 3.5180299032541783e-06, + "loss": 2.8308, + "step": 3452 + }, + { + "epoch": 0.98, + "learning_rate": 3.4593960715332747e-06, + "loss": 2.9132, + "step": 3453 + }, + { + "epoch": 0.98, + "learning_rate": 3.4007622398123716e-06, + "loss": 2.87, + "step": 3454 + }, + { + "epoch": 0.98, + "learning_rate": 3.342128408091469e-06, + "loss": 2.8685, + "step": 3455 + }, + { + "epoch": 0.98, + "learning_rate": 3.283494576370566e-06, + "loss": 2.9182, + "step": 3456 + }, + { + "epoch": 0.98, + "learning_rate": 3.224860744649663e-06, + "loss": 2.8645, + "step": 3457 + }, + { + "epoch": 0.98, + "learning_rate": 3.1662269129287603e-06, + "loss": 2.8448, + "step": 3458 + }, + { + "epoch": 0.98, + "learning_rate": 3.107593081207857e-06, + "loss": 2.9324, + "step": 3459 + }, + { + "epoch": 0.99, + "learning_rate": 3.0489592494869544e-06, + "loss": 2.8694, + "step": 3460 + }, + { + "epoch": 0.99, + "learning_rate": 2.990325417766051e-06, + "loss": 2.6942, + "step": 3461 + }, + { + "epoch": 0.99, + "learning_rate": 2.931691586045148e-06, + "loss": 2.8406, + "step": 3462 + }, + { + "epoch": 0.99, + "learning_rate": 2.8730577543242454e-06, + "loss": 2.8226, + "step": 3463 + }, + { + "epoch": 0.99, + "learning_rate": 2.8144239226033423e-06, + "loss": 2.8704, + "step": 3464 + }, + { + "epoch": 0.99, + "learning_rate": 2.755790090882439e-06, + "loss": 2.8336, + "step": 3465 + }, + { + "epoch": 0.99, + "learning_rate": 2.6971562591615364e-06, + "loss": 2.8579, + "step": 3466 + }, + { + "epoch": 0.99, + "learning_rate": 2.6385224274406333e-06, + "loss": 2.8385, + "step": 3467 + }, + { + "epoch": 0.99, + "learning_rate": 2.57988859571973e-06, + "loss": 2.93, + "step": 3468 + }, + { + "epoch": 0.99, + "learning_rate": 2.5212547639988274e-06, + "loss": 3.0071, + "step": 3469 + }, + { + "epoch": 0.99, + "learning_rate": 2.4626209322779247e-06, + "loss": 2.8082, + "step": 3470 + }, + { + "epoch": 0.99, + "learning_rate": 2.4039871005570216e-06, + "loss": 2.8131, + "step": 3471 + }, + { + "epoch": 0.99, + "learning_rate": 2.3453532688361184e-06, + "loss": 2.9279, + "step": 3472 + }, + { + "epoch": 0.99, + "learning_rate": 2.2867194371152157e-06, + "loss": 2.8262, + "step": 3473 + }, + { + "epoch": 0.99, + "learning_rate": 2.2280856053943126e-06, + "loss": 2.8332, + "step": 3474 + }, + { + "epoch": 0.99, + "learning_rate": 2.16945177367341e-06, + "loss": 2.8972, + "step": 3475 + }, + { + "epoch": 0.99, + "learning_rate": 2.1108179419525067e-06, + "loss": 2.8598, + "step": 3476 + }, + { + "epoch": 0.99, + "learning_rate": 2.0521841102316036e-06, + "loss": 2.8697, + "step": 3477 + }, + { + "epoch": 0.99, + "learning_rate": 1.993550278510701e-06, + "loss": 2.8586, + "step": 3478 + }, + { + "epoch": 0.99, + "learning_rate": 1.9349164467897977e-06, + "loss": 2.8463, + "step": 3479 + }, + { + "epoch": 0.99, + "learning_rate": 1.8762826150688948e-06, + "loss": 2.7904, + "step": 3480 + }, + { + "epoch": 0.99, + "learning_rate": 1.8176487833479918e-06, + "loss": 2.8856, + "step": 3481 + }, + { + "epoch": 0.99, + "learning_rate": 1.7590149516270891e-06, + "loss": 2.8751, + "step": 3482 + }, + { + "epoch": 0.99, + "learning_rate": 1.7003811199061858e-06, + "loss": 2.8387, + "step": 3483 + }, + { + "epoch": 0.99, + "learning_rate": 1.641747288185283e-06, + "loss": 2.8042, + "step": 3484 + }, + { + "epoch": 0.99, + "learning_rate": 1.5831134564643801e-06, + "loss": 2.7155, + "step": 3485 + }, + { + "epoch": 0.99, + "learning_rate": 1.5244796247434772e-06, + "loss": 2.7341, + "step": 3486 + }, + { + "epoch": 0.99, + "learning_rate": 1.465845793022574e-06, + "loss": 2.9217, + "step": 3487 + }, + { + "epoch": 0.99, + "learning_rate": 1.4072119613016711e-06, + "loss": 2.8898, + "step": 3488 + }, + { + "epoch": 0.99, + "learning_rate": 1.3485781295807682e-06, + "loss": 2.8511, + "step": 3489 + }, + { + "epoch": 0.99, + "learning_rate": 1.289944297859865e-06, + "loss": 2.7788, + "step": 3490 + }, + { + "epoch": 0.99, + "learning_rate": 1.2313104661389623e-06, + "loss": 2.7344, + "step": 3491 + }, + { + "epoch": 0.99, + "learning_rate": 1.1726766344180592e-06, + "loss": 2.8135, + "step": 3492 + }, + { + "epoch": 0.99, + "learning_rate": 1.1140428026971563e-06, + "loss": 2.8553, + "step": 3493 + }, + { + "epoch": 0.99, + "learning_rate": 1.0554089709762534e-06, + "loss": 2.7931, + "step": 3494 + }, + { + "epoch": 1.0, + "learning_rate": 9.967751392553504e-07, + "loss": 2.8761, + "step": 3495 + }, + { + "epoch": 1.0, + "learning_rate": 9.381413075344474e-07, + "loss": 2.8658, + "step": 3496 + }, + { + "epoch": 1.0, + "learning_rate": 8.795074758135446e-07, + "loss": 2.8199, + "step": 3497 + }, + { + "epoch": 1.0, + "learning_rate": 8.208736440926415e-07, + "loss": 2.8918, + "step": 3498 + }, + { + "epoch": 1.0, + "learning_rate": 7.622398123717386e-07, + "loss": 2.8117, + "step": 3499 + }, + { + "epoch": 1.0, + "learning_rate": 7.036059806508356e-07, + "loss": 2.8405, + "step": 3500 + }, + { + "epoch": 1.0, + "learning_rate": 6.449721489299325e-07, + "loss": 2.9002, + "step": 3501 + }, + { + "epoch": 1.0, + "learning_rate": 5.863383172090296e-07, + "loss": 2.8177, + "step": 3502 + }, + { + "epoch": 1.0, + "learning_rate": 5.277044854881267e-07, + "loss": 2.9016, + "step": 3503 + }, + { + "epoch": 1.0, + "learning_rate": 4.690706537672237e-07, + "loss": 2.87, + "step": 3504 + }, + { + "epoch": 1.0, + "learning_rate": 4.1043682204632077e-07, + "loss": 2.8673, + "step": 3505 + }, + { + "epoch": 1.0, + "learning_rate": 3.518029903254178e-07, + "loss": 2.8511, + "step": 3506 + }, + { + "epoch": 1.0, + "learning_rate": 2.931691586045148e-07, + "loss": 2.8761, + "step": 3507 + }, + { + "epoch": 1.0, + "learning_rate": 2.3453532688361185e-07, + "loss": 2.8007, + "step": 3508 + }, + { + "epoch": 1.0, + "learning_rate": 1.759014951627089e-07, + "loss": 2.8836, + "step": 3509 + }, + { + "epoch": 1.0, + "learning_rate": 1.1726766344180592e-07, + "loss": 2.8452, + "step": 3510 + }, + { + "epoch": 1.0, + "learning_rate": 5.863383172090296e-08, + "loss": 2.8898, + "step": 3511 + } + ], + "logging_steps": 1, + "max_steps": 3511, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 275654859816960.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3511/training_args.bin b/checkpoint-3511/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a6a0a162a1968593802fa9f39616b18ddfb3473 --- /dev/null +++ b/checkpoint-3511/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e9dbdd00abc69c750dbf4df28e6ed1ebd106a3aaa97f6b846aed50c15a29a6d +size 6587 diff --git a/checkpoint-3511/zero_to_fp32.py b/checkpoint-3511/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-3511/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-59/config.json b/checkpoint-59/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2599737ed66369800e7a6efe6cdf23d0cfe85382 --- /dev/null +++ b/checkpoint-59/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "cyberagent/calm2-7b-chat", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 1, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 500000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.34.1", + "use_cache": false, + "vocab_size": 65024 +} diff --git a/checkpoint-59/generation_config.json b/checkpoint-59/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28d913eb61a7fd74338a6f1ff8d2efb149f99dbc --- /dev/null +++ b/checkpoint-59/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "pad_token_id": 1, + "transformers_version": "4.34.1" +} diff --git a/checkpoint-59/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-59/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07c2a325bb14bdb7d4e2a284331eb553604a15f0 --- /dev/null +++ b/checkpoint-59/global_step59/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106bdd16cd8f82425fb3053417c9fcf7f0fe4e56e548bfe62ccbeb4219ccd1b1 +size 28036079603 diff --git a/checkpoint-59/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-59/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f5b4e6eecc117fa8ef3566d645e2965dc816927 --- /dev/null +++ b/checkpoint-59/global_step59/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77596b3e82e205d79189e7094109d54b6da18783f47eae3589f2b95c27dfcb18 +size 28036079603 diff --git a/checkpoint-59/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-59/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d252f3c2117c64d72ac76f2b5e316bd30eff7195 --- /dev/null +++ b/checkpoint-59/global_step59/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550e422fd5b16a81dfac00c33087e93372a43407bcd1b529fd6a16cd30db99c0 +size 28036079603 diff --git a/checkpoint-59/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-59/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2e86aca6201c93dda0d99c13145f0164772908c --- /dev/null +++ b/checkpoint-59/global_step59/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8097a536be1e65d24250d2a432dbf67d9415af0b2e055e89c0299797803ea333 +size 138326 diff --git a/checkpoint-59/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-59/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68be944e07d7c3326b8e45b76181b75ce696883a --- /dev/null +++ b/checkpoint-59/global_step59/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32ef62f6c3cd0af6c40d27aa812c069631fb030743521f6cb510b7de8835ad7 +size 138326 diff --git a/checkpoint-59/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-59/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ddc9cec2f1021aa1035d1be692fd7a36ec97019 --- /dev/null +++ b/checkpoint-59/global_step59/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc582b3aa3b761819dd3574da334aff13b9699d1aa7de6e61c32abe3259c443 +size 138326 diff --git a/checkpoint-59/latest b/checkpoint-59/latest new file mode 100644 index 0000000000000000000000000000000000000000..099fa08342218cca7c00fb7043635561ebda9695 --- /dev/null +++ b/checkpoint-59/latest @@ -0,0 +1 @@ +global_step59 \ No newline at end of file diff --git a/checkpoint-59/pytorch_model-00001-of-00002.bin b/checkpoint-59/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..e8f5641c0e81715ec212d4c8fb4df07c211bffd7 --- /dev/null +++ b/checkpoint-59/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d1955a7f90cb97a91a0a801b19287f49e0d90c52286e1cf3c83a90356f1d54e +size 9976594142 diff --git a/checkpoint-59/pytorch_model-00002-of-00002.bin b/checkpoint-59/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..2710e7fadc4e3d22eda4f089ce4a16a5ad05f0c1 --- /dev/null +++ b/checkpoint-59/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90aac6bd4b26fc9f62af0bcf2375856406f447dea534ccdf5d3337ddde6817e2 +size 4041391035 diff --git a/checkpoint-59/pytorch_model.bin.index.json b/checkpoint-59/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..85c4314f00522f62839f8495ec3f0d9adec0fcfa --- /dev/null +++ b/checkpoint-59/pytorch_model.bin.index.json @@ -0,0 +1,266 @@ +{ + "metadata": { + "total_size": 14017896448 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/checkpoint-59/rng_state_0.pth b/checkpoint-59/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..417d52d6958ce1d6e6c0e711d6eb0a68a1f1ae42 --- /dev/null +++ b/checkpoint-59/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1653d5b0e09c2d93759ad31b0bca034b949c5beacbcec854b9c133c18ff0f1 +size 16631 diff --git a/checkpoint-59/rng_state_1.pth b/checkpoint-59/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6389ab11d2699189dff857d5cf6911645ac491a7 --- /dev/null +++ b/checkpoint-59/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:718a356e2faee3d07e0416c137f3bcdc0c70d127268ae7202882018ffa03e320 +size 16631 diff --git a/checkpoint-59/rng_state_2.pth b/checkpoint-59/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..553871298ac3918ceef48b7e90ee7784f4afe077 --- /dev/null +++ b/checkpoint-59/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ab1200db9bd16e014659660734c61fe08517897cef6b3efe97c366790250f5 +size 16631 diff --git a/checkpoint-59/trainer_state.json b/checkpoint-59/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4791ef1473c23b96c6c4b3fc6205a5d29a03f8d4 --- /dev/null +++ b/checkpoint-59/trainer_state.json @@ -0,0 +1,533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9957805907172996, + "eval_steps": 3, + "global_step": 59, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "learning_rate": 0.0, + "loss": 3.5879, + "step": 1 + }, + { + "epoch": 0.02, + "eval_loss": 5.454614639282227, + "eval_runtime": 38.769, + "eval_samples_per_second": 8.718, + "eval_steps_per_second": 2.915, + "step": 1 + }, + { + "epoch": 0.03, + "learning_rate": 2.0000000000000003e-06, + "loss": 5.5874, + "step": 2 + }, + { + "epoch": 0.05, + "learning_rate": 4.000000000000001e-06, + "loss": 5.5473, + "step": 3 + }, + { + "epoch": 0.05, + "eval_loss": 5.365917682647705, + "eval_runtime": 38.6587, + "eval_samples_per_second": 8.743, + "eval_steps_per_second": 2.923, + "step": 3 + }, + { + "epoch": 0.07, + "learning_rate": 6e-06, + "loss": 5.4992, + "step": 4 + }, + { + "epoch": 0.08, + "learning_rate": 8.000000000000001e-06, + "loss": 4.2092, + "step": 5 + }, + { + "epoch": 0.1, + "learning_rate": 1e-05, + "loss": 4.3209, + "step": 6 + }, + { + "epoch": 0.1, + "eval_loss": 3.893451452255249, + "eval_runtime": 38.6883, + "eval_samples_per_second": 8.737, + "eval_steps_per_second": 2.921, + "step": 6 + }, + { + "epoch": 0.12, + "learning_rate": 1.2e-05, + "loss": 4.052, + "step": 7 + }, + { + "epoch": 0.14, + "learning_rate": 1.4000000000000001e-05, + "loss": 3.8525, + "step": 8 + }, + { + "epoch": 0.15, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.452, + "step": 9 + }, + { + "epoch": 0.15, + "eval_loss": 3.324516534805298, + "eval_runtime": 38.6537, + "eval_samples_per_second": 8.744, + "eval_steps_per_second": 2.923, + "step": 9 + }, + { + "epoch": 0.17, + "learning_rate": 1.8e-05, + "loss": 3.4402, + "step": 10 + }, + { + "epoch": 0.19, + "learning_rate": 2e-05, + "loss": 3.5562, + "step": 11 + }, + { + "epoch": 0.2, + "learning_rate": 2.2000000000000003e-05, + "loss": 3.3865, + "step": 12 + }, + { + "epoch": 0.2, + "eval_loss": 3.2463653087615967, + "eval_runtime": 38.6438, + "eval_samples_per_second": 8.747, + "eval_steps_per_second": 2.924, + "step": 12 + }, + { + "epoch": 0.22, + "learning_rate": 2.4e-05, + "loss": 3.3706, + "step": 13 + }, + { + "epoch": 0.24, + "learning_rate": 2.6000000000000002e-05, + "loss": 3.3094, + "step": 14 + }, + { + "epoch": 0.25, + "learning_rate": 2.8000000000000003e-05, + "loss": 3.4042, + "step": 15 + }, + { + "epoch": 0.25, + "eval_loss": 3.2070071697235107, + "eval_runtime": 38.664, + "eval_samples_per_second": 8.742, + "eval_steps_per_second": 2.923, + "step": 15 + }, + { + "epoch": 0.27, + "learning_rate": 3e-05, + "loss": 3.2641, + "step": 16 + }, + { + "epoch": 0.29, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.2077, + "step": 17 + }, + { + "epoch": 0.3, + "learning_rate": 3.4000000000000007e-05, + "loss": 3.1833, + "step": 18 + }, + { + "epoch": 0.3, + "eval_loss": 3.172358274459839, + "eval_runtime": 38.6613, + "eval_samples_per_second": 8.743, + "eval_steps_per_second": 2.923, + "step": 18 + }, + { + "epoch": 0.32, + "learning_rate": 3.6e-05, + "loss": 3.2509, + "step": 19 + }, + { + "epoch": 0.34, + "learning_rate": 3.8e-05, + "loss": 3.2675, + "step": 20 + }, + { + "epoch": 0.35, + "learning_rate": 4e-05, + "loss": 3.2255, + "step": 21 + }, + { + "epoch": 0.35, + "eval_loss": 3.1417272090911865, + "eval_runtime": 38.6879, + "eval_samples_per_second": 8.737, + "eval_steps_per_second": 2.921, + "step": 21 + }, + { + "epoch": 0.37, + "learning_rate": 4.2e-05, + "loss": 3.1394, + "step": 22 + }, + { + "epoch": 0.39, + "learning_rate": 4.4000000000000006e-05, + "loss": 3.1616, + "step": 23 + }, + { + "epoch": 0.41, + "learning_rate": 4.600000000000001e-05, + "loss": 3.1794, + "step": 24 + }, + { + "epoch": 0.41, + "eval_loss": 3.105272054672241, + "eval_runtime": 38.6846, + "eval_samples_per_second": 8.737, + "eval_steps_per_second": 2.921, + "step": 24 + }, + { + "epoch": 0.42, + "learning_rate": 4.8e-05, + "loss": 3.0956, + "step": 25 + }, + { + "epoch": 0.44, + "learning_rate": 5e-05, + "loss": 3.1482, + "step": 26 + }, + { + "epoch": 0.46, + "learning_rate": 5.2000000000000004e-05, + "loss": 3.1397, + "step": 27 + }, + { + "epoch": 0.46, + "eval_loss": 3.0812368392944336, + "eval_runtime": 38.7031, + "eval_samples_per_second": 8.733, + "eval_steps_per_second": 2.92, + "step": 27 + }, + { + "epoch": 0.47, + "learning_rate": 5.4000000000000005e-05, + "loss": 3.1913, + "step": 28 + }, + { + "epoch": 0.49, + "learning_rate": 5.6000000000000006e-05, + "loss": 3.0505, + "step": 29 + }, + { + "epoch": 0.51, + "learning_rate": 5.8e-05, + "loss": 3.2152, + "step": 30 + }, + { + "epoch": 0.51, + "eval_loss": 3.0692708492279053, + "eval_runtime": 38.6647, + "eval_samples_per_second": 8.742, + "eval_steps_per_second": 2.923, + "step": 30 + }, + { + "epoch": 0.52, + "learning_rate": 6e-05, + "loss": 3.062, + "step": 31 + }, + { + "epoch": 0.54, + "learning_rate": 6.2e-05, + "loss": 3.0408, + "step": 32 + }, + { + "epoch": 0.56, + "learning_rate": 6.400000000000001e-05, + "loss": 3.1555, + "step": 33 + }, + { + "epoch": 0.56, + "eval_loss": 3.062750816345215, + "eval_runtime": 38.6431, + "eval_samples_per_second": 8.747, + "eval_steps_per_second": 2.924, + "step": 33 + }, + { + "epoch": 0.57, + "learning_rate": 6.6e-05, + "loss": 3.0227, + "step": 34 + }, + { + "epoch": 0.59, + "learning_rate": 6.800000000000001e-05, + "loss": 2.959, + "step": 35 + }, + { + "epoch": 0.61, + "learning_rate": 7e-05, + "loss": 3.0286, + "step": 36 + }, + { + "epoch": 0.61, + "eval_loss": 3.050220012664795, + "eval_runtime": 38.6284, + "eval_samples_per_second": 8.75, + "eval_steps_per_second": 2.925, + "step": 36 + }, + { + "epoch": 0.62, + "learning_rate": 7.2e-05, + "loss": 3.0433, + "step": 37 + }, + { + "epoch": 0.64, + "learning_rate": 7.4e-05, + "loss": 2.9642, + "step": 38 + }, + { + "epoch": 0.66, + "learning_rate": 7.6e-05, + "loss": 2.998, + "step": 39 + }, + { + "epoch": 0.66, + "eval_loss": 3.0451812744140625, + "eval_runtime": 38.6036, + "eval_samples_per_second": 8.756, + "eval_steps_per_second": 2.927, + "step": 39 + }, + { + "epoch": 0.68, + "learning_rate": 7.800000000000001e-05, + "loss": 3.0317, + "step": 40 + }, + { + "epoch": 0.69, + "learning_rate": 8e-05, + "loss": 3.0231, + "step": 41 + }, + { + "epoch": 0.71, + "learning_rate": 8.2e-05, + "loss": 3.0289, + "step": 42 + }, + { + "epoch": 0.71, + "eval_loss": 3.0393357276916504, + "eval_runtime": 38.7085, + "eval_samples_per_second": 8.732, + "eval_steps_per_second": 2.919, + "step": 42 + }, + { + "epoch": 0.73, + "learning_rate": 8.4e-05, + "loss": 2.9359, + "step": 43 + }, + { + "epoch": 0.74, + "learning_rate": 8.6e-05, + "loss": 3.0932, + "step": 44 + }, + { + "epoch": 0.76, + "learning_rate": 8.800000000000001e-05, + "loss": 3.0782, + "step": 45 + }, + { + "epoch": 0.76, + "eval_loss": 3.0284407138824463, + "eval_runtime": 38.6437, + "eval_samples_per_second": 8.747, + "eval_steps_per_second": 2.924, + "step": 45 + }, + { + "epoch": 0.78, + "learning_rate": 9e-05, + "loss": 2.984, + "step": 46 + }, + { + "epoch": 0.79, + "learning_rate": 9.200000000000001e-05, + "loss": 2.8464, + "step": 47 + }, + { + "epoch": 0.81, + "learning_rate": 9.4e-05, + "loss": 2.9876, + "step": 48 + }, + { + "epoch": 0.81, + "eval_loss": 3.0273585319519043, + "eval_runtime": 38.6372, + "eval_samples_per_second": 8.748, + "eval_steps_per_second": 2.925, + "step": 48 + }, + { + "epoch": 0.83, + "learning_rate": 9.6e-05, + "loss": 3.1017, + "step": 49 + }, + { + "epoch": 0.84, + "learning_rate": 9.8e-05, + "loss": 3.0528, + "step": 50 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001, + "loss": 3.0262, + "step": 51 + }, + { + "epoch": 0.86, + "eval_loss": 3.0559024810791016, + "eval_runtime": 38.6899, + "eval_samples_per_second": 8.736, + "eval_steps_per_second": 2.921, + "step": 51 + }, + { + "epoch": 0.88, + "learning_rate": 0.00010200000000000001, + "loss": 2.9996, + "step": 52 + }, + { + "epoch": 0.89, + "learning_rate": 0.00010400000000000001, + "loss": 2.9649, + "step": 53 + }, + { + "epoch": 0.91, + "learning_rate": 0.00010600000000000002, + "loss": 3.0604, + "step": 54 + }, + { + "epoch": 0.91, + "eval_loss": 3.0412142276763916, + "eval_runtime": 38.6646, + "eval_samples_per_second": 8.742, + "eval_steps_per_second": 2.923, + "step": 54 + }, + { + "epoch": 0.93, + "learning_rate": 0.00010800000000000001, + "loss": 2.9577, + "step": 55 + }, + { + "epoch": 0.95, + "learning_rate": 0.00011000000000000002, + "loss": 3.0167, + "step": 56 + }, + { + "epoch": 0.96, + "learning_rate": 0.00011200000000000001, + "loss": 2.9368, + "step": 57 + }, + { + "epoch": 0.96, + "eval_loss": 3.0456135272979736, + "eval_runtime": 38.6549, + "eval_samples_per_second": 8.744, + "eval_steps_per_second": 2.923, + "step": 57 + }, + { + "epoch": 0.98, + "learning_rate": 0.00011399999999999999, + "loss": 3.0352, + "step": 58 + }, + { + "epoch": 1.0, + "learning_rate": 0.000116, + "loss": 2.8919, + "step": 59 + } + ], + "logging_steps": 1, + "max_steps": 59, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 4612895539200.0, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-59/training_args.bin b/checkpoint-59/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..433b5a5520af53ecfab71d759aafdbecab0c1a5a --- /dev/null +++ b/checkpoint-59/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee9ec53a82ab7ccd0b56a0e210031dd5230906ab55f7bbb5351d53946650df69 +size 6587 diff --git a/checkpoint-59/zero_to_fp32.py b/checkpoint-59/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-59/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-618/config.json b/checkpoint-618/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2599737ed66369800e7a6efe6cdf23d0cfe85382 --- /dev/null +++ b/checkpoint-618/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "cyberagent/calm2-7b-chat", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 1, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 500000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.34.1", + "use_cache": false, + "vocab_size": 65024 +} diff --git a/checkpoint-618/generation_config.json b/checkpoint-618/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28d913eb61a7fd74338a6f1ff8d2efb149f99dbc --- /dev/null +++ b/checkpoint-618/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "pad_token_id": 1, + "transformers_version": "4.34.1" +} diff --git a/checkpoint-618/global_step618/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-618/global_step618/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ac625f19c3f1823f08583e884b3235ad52fe438 --- /dev/null +++ b/checkpoint-618/global_step618/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfb4514c3718fcaf0b44817b8b6282dfd700e0d12b937ecbf66871de6ff19c09 +size 28035802551 diff --git a/checkpoint-618/global_step618/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-618/global_step618/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e31024f7e07d77d1d61f1d75ff9f2249249ef9a0 --- /dev/null +++ b/checkpoint-618/global_step618/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9290cc1480f6205a0c20b10d09202764a1fa808d4fdd800776a1625bee45ce +size 28035803191 diff --git a/checkpoint-618/global_step618/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-618/global_step618/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1f322f3534cd3e95645868f4eeec25f0f290961 --- /dev/null +++ b/checkpoint-618/global_step618/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36ceff325ba6cd1535f1dfceccd0fd34a624bd63b1f33ee5278c713aee01c6f +size 28035802743 diff --git a/checkpoint-618/global_step618/mp_rank_00_model_states.pt b/checkpoint-618/global_step618/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6f6f5d348fe27ec1a30d6391c045392e34effa8 --- /dev/null +++ b/checkpoint-618/global_step618/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae8e77bdb20476e11de017ea64f26b80360300bcbfa293b237730d077a5329e1 +size 14017976195 diff --git a/checkpoint-618/latest b/checkpoint-618/latest new file mode 100644 index 0000000000000000000000000000000000000000..a7ca56e5787da62bc2c53baa649c9ff38129cc72 --- /dev/null +++ b/checkpoint-618/latest @@ -0,0 +1 @@ +global_step618 \ No newline at end of file diff --git a/checkpoint-618/pytorch_model-00001-of-00002.bin b/checkpoint-618/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..864fc0b38d1f1eb617cd42051155b48f87f5522b --- /dev/null +++ b/checkpoint-618/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:245595c67e8712c740f414577e89a49e851bc1110aef9b159f85ad73d7bf63c1 +size 9976594142 diff --git a/checkpoint-618/pytorch_model-00002-of-00002.bin b/checkpoint-618/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..7e241c0b73ced2835c44d4b94c06710a4d0229fe --- /dev/null +++ b/checkpoint-618/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b965f08ffaf99a0f921d52572ee779b6be1f96afb3031cad865be6cbb5bfe6f +size 4041391035 diff --git a/checkpoint-618/pytorch_model.bin.index.json b/checkpoint-618/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..85c4314f00522f62839f8495ec3f0d9adec0fcfa --- /dev/null +++ b/checkpoint-618/pytorch_model.bin.index.json @@ -0,0 +1,266 @@ +{ + "metadata": { + "total_size": 14017896448 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/checkpoint-618/rng_state_0.pth b/checkpoint-618/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c5297819e590cdd0e0c5e45081474a5317d4cb2a --- /dev/null +++ b/checkpoint-618/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f4f4eabd3d3209be5ecfa7748b59c9bcebe66f8280e04423295c3adb56fdda8 +size 16631 diff --git a/checkpoint-618/rng_state_1.pth b/checkpoint-618/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d750e695b456b3d23917a06ebedc9a4f550485df --- /dev/null +++ b/checkpoint-618/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e5d459c15b3659d339b29f90d9c6d4fdbf6c828b592cb47110d9ed8c71e113f +size 16631 diff --git a/checkpoint-618/rng_state_2.pth b/checkpoint-618/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..87527de5273a8cd55db32cb792e814afc37f4dae --- /dev/null +++ b/checkpoint-618/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b4bd6bae9c008f0e9d18f4569f4dddb1adc43f27c07518ce3f88803299dc53b +size 16631 diff --git a/checkpoint-618/trainer_state.json b/checkpoint-618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..628cc7ea64590d538b28e455cee9e1fa07ac9747 --- /dev/null +++ b/checkpoint-618/trainer_state.json @@ -0,0 +1,3887 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9611650485436893, + "eval_steps": 31, + "global_step": 618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.0, + "loss": 4.59, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 4.240383148193359, + "eval_runtime": 7.311, + "eval_samples_per_second": 164.273, + "eval_steps_per_second": 54.849, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 4.252, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "loss": 4.2054, + "step": 3 + }, + { + "epoch": 0.02, + "learning_rate": 6e-06, + "loss": 4.1624, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 8.000000000000001e-06, + "loss": 3.9787, + "step": 5 + }, + { + "epoch": 0.03, + "learning_rate": 1e-05, + "loss": 3.7979, + "step": 6 + }, + { + "epoch": 0.03, + "learning_rate": 1.2e-05, + "loss": 3.8982, + "step": 7 + }, + { + "epoch": 0.04, + "learning_rate": 1.4000000000000001e-05, + "loss": 3.805, + "step": 8 + }, + { + "epoch": 0.04, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.7176, + "step": 9 + }, + { + "epoch": 0.05, + "learning_rate": 1.8e-05, + "loss": 3.4755, + "step": 10 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 3.6401, + "step": 11 + }, + { + "epoch": 0.06, + "learning_rate": 2.2000000000000003e-05, + "loss": 3.5615, + "step": 12 + }, + { + "epoch": 0.06, + "learning_rate": 2.4e-05, + "loss": 3.5286, + "step": 13 + }, + { + "epoch": 0.07, + "learning_rate": 2.6000000000000002e-05, + "loss": 3.5437, + "step": 14 + }, + { + "epoch": 0.07, + "learning_rate": 2.8000000000000003e-05, + "loss": 3.5163, + "step": 15 + }, + { + "epoch": 0.08, + "learning_rate": 3e-05, + "loss": 3.4108, + "step": 16 + }, + { + "epoch": 0.08, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.3637, + "step": 17 + }, + { + "epoch": 0.09, + "learning_rate": 3.4000000000000007e-05, + "loss": 3.3538, + "step": 18 + }, + { + "epoch": 0.09, + "learning_rate": 3.6e-05, + "loss": 3.3819, + "step": 19 + }, + { + "epoch": 0.1, + "learning_rate": 3.8e-05, + "loss": 3.2511, + "step": 20 + }, + { + "epoch": 0.1, + "learning_rate": 4e-05, + "loss": 3.3211, + "step": 21 + }, + { + "epoch": 0.11, + "learning_rate": 4.2e-05, + "loss": 3.2764, + "step": 22 + }, + { + "epoch": 0.11, + "learning_rate": 4.4000000000000006e-05, + "loss": 3.0653, + "step": 23 + }, + { + "epoch": 0.12, + "learning_rate": 4.600000000000001e-05, + "loss": 3.0859, + "step": 24 + }, + { + "epoch": 0.12, + "learning_rate": 4.8e-05, + "loss": 3.0804, + "step": 25 + }, + { + "epoch": 0.13, + "learning_rate": 5e-05, + "loss": 2.9774, + "step": 26 + }, + { + "epoch": 0.13, + "learning_rate": 5.2000000000000004e-05, + "loss": 2.9269, + "step": 27 + }, + { + "epoch": 0.14, + "learning_rate": 5.4000000000000005e-05, + "loss": 3.0926, + "step": 28 + }, + { + "epoch": 0.14, + "learning_rate": 5.6000000000000006e-05, + "loss": 2.9725, + "step": 29 + }, + { + "epoch": 0.15, + "learning_rate": 5.8e-05, + "loss": 3.0293, + "step": 30 + }, + { + "epoch": 0.15, + "learning_rate": 6e-05, + "loss": 3.0903, + "step": 31 + }, + { + "epoch": 0.15, + "eval_loss": 2.9962990283966064, + "eval_runtime": 7.3671, + "eval_samples_per_second": 163.022, + "eval_steps_per_second": 54.431, + "step": 31 + }, + { + "epoch": 0.16, + "learning_rate": 6.2e-05, + "loss": 2.9903, + "step": 32 + }, + { + "epoch": 0.16, + "learning_rate": 6.400000000000001e-05, + "loss": 3.0196, + "step": 33 + }, + { + "epoch": 0.17, + "learning_rate": 6.6e-05, + "loss": 3.0288, + "step": 34 + }, + { + "epoch": 0.17, + "learning_rate": 6.800000000000001e-05, + "loss": 3.0071, + "step": 35 + }, + { + "epoch": 0.17, + "learning_rate": 7e-05, + "loss": 3.0393, + "step": 36 + }, + { + "epoch": 0.18, + "learning_rate": 7.2e-05, + "loss": 2.9937, + "step": 37 + }, + { + "epoch": 0.18, + "learning_rate": 7.4e-05, + "loss": 2.9988, + "step": 38 + }, + { + "epoch": 0.19, + "learning_rate": 7.6e-05, + "loss": 2.9331, + "step": 39 + }, + { + "epoch": 0.19, + "learning_rate": 7.800000000000001e-05, + "loss": 3.0414, + "step": 40 + }, + { + "epoch": 0.2, + "learning_rate": 8e-05, + "loss": 3.0237, + "step": 41 + }, + { + "epoch": 0.2, + "learning_rate": 8.2e-05, + "loss": 2.9664, + "step": 42 + }, + { + "epoch": 0.21, + "learning_rate": 8.4e-05, + "loss": 2.8639, + "step": 43 + }, + { + "epoch": 0.21, + "learning_rate": 8.6e-05, + "loss": 2.8562, + "step": 44 + }, + { + "epoch": 0.22, + "learning_rate": 8.800000000000001e-05, + "loss": 2.9632, + "step": 45 + }, + { + "epoch": 0.22, + "learning_rate": 9e-05, + "loss": 2.946, + "step": 46 + }, + { + "epoch": 0.23, + "learning_rate": 9.200000000000001e-05, + "loss": 2.8428, + "step": 47 + }, + { + "epoch": 0.23, + "learning_rate": 9.4e-05, + "loss": 2.9827, + "step": 48 + }, + { + "epoch": 0.24, + "learning_rate": 9.6e-05, + "loss": 2.9512, + "step": 49 + }, + { + "epoch": 0.24, + "learning_rate": 9.8e-05, + "loss": 2.8997, + "step": 50 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001, + "loss": 2.9762, + "step": 51 + }, + { + "epoch": 0.25, + "learning_rate": 0.00010200000000000001, + "loss": 3.0429, + "step": 52 + }, + { + "epoch": 0.26, + "learning_rate": 0.00010400000000000001, + "loss": 3.0223, + "step": 53 + }, + { + "epoch": 0.26, + "learning_rate": 0.00010600000000000002, + "loss": 3.0007, + "step": 54 + }, + { + "epoch": 0.27, + "learning_rate": 0.00010800000000000001, + "loss": 3.0436, + "step": 55 + }, + { + "epoch": 0.27, + "learning_rate": 0.00011000000000000002, + "loss": 3.0151, + "step": 56 + }, + { + "epoch": 0.28, + "learning_rate": 0.00011200000000000001, + "loss": 2.9909, + "step": 57 + }, + { + "epoch": 0.28, + "learning_rate": 0.00011399999999999999, + "loss": 2.9942, + "step": 58 + }, + { + "epoch": 0.29, + "learning_rate": 0.000116, + "loss": 3.0098, + "step": 59 + }, + { + "epoch": 0.29, + "learning_rate": 0.000118, + "loss": 3.0353, + "step": 60 + }, + { + "epoch": 0.3, + "learning_rate": 0.00012, + "loss": 3.0671, + "step": 61 + }, + { + "epoch": 0.3, + "learning_rate": 0.000122, + "loss": 2.9824, + "step": 62 + }, + { + "epoch": 0.3, + "eval_loss": 3.022158145904541, + "eval_runtime": 7.3702, + "eval_samples_per_second": 162.953, + "eval_steps_per_second": 54.408, + "step": 62 + }, + { + "epoch": 0.31, + "learning_rate": 0.000124, + "loss": 3.0207, + "step": 63 + }, + { + "epoch": 0.31, + "learning_rate": 0.000126, + "loss": 2.9048, + "step": 64 + }, + { + "epoch": 0.32, + "learning_rate": 0.00012800000000000002, + "loss": 3.0518, + "step": 65 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013000000000000002, + "loss": 3.0854, + "step": 66 + }, + { + "epoch": 0.33, + "learning_rate": 0.000132, + "loss": 3.0317, + "step": 67 + }, + { + "epoch": 0.33, + "learning_rate": 0.000134, + "loss": 3.0313, + "step": 68 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013600000000000003, + "loss": 3.0753, + "step": 69 + }, + { + "epoch": 0.34, + "learning_rate": 0.000138, + "loss": 2.9999, + "step": 70 + }, + { + "epoch": 0.34, + "learning_rate": 0.00014, + "loss": 3.0423, + "step": 71 + }, + { + "epoch": 0.35, + "learning_rate": 0.000142, + "loss": 2.9642, + "step": 72 + }, + { + "epoch": 0.35, + "learning_rate": 0.000144, + "loss": 2.9575, + "step": 73 + }, + { + "epoch": 0.36, + "learning_rate": 0.000146, + "loss": 2.9854, + "step": 74 + }, + { + "epoch": 0.36, + "learning_rate": 0.000148, + "loss": 2.9729, + "step": 75 + }, + { + "epoch": 0.37, + "learning_rate": 0.00015000000000000001, + "loss": 2.9176, + "step": 76 + }, + { + "epoch": 0.37, + "learning_rate": 0.000152, + "loss": 2.947, + "step": 77 + }, + { + "epoch": 0.38, + "learning_rate": 0.000154, + "loss": 3.0542, + "step": 78 + }, + { + "epoch": 0.38, + "learning_rate": 0.00015600000000000002, + "loss": 3.0718, + "step": 79 + }, + { + "epoch": 0.39, + "learning_rate": 0.00015800000000000002, + "loss": 3.027, + "step": 80 + }, + { + "epoch": 0.39, + "learning_rate": 0.00016, + "loss": 3.1764, + "step": 81 + }, + { + "epoch": 0.4, + "learning_rate": 0.000162, + "loss": 3.1091, + "step": 82 + }, + { + "epoch": 0.4, + "learning_rate": 0.000164, + "loss": 3.0931, + "step": 83 + }, + { + "epoch": 0.41, + "learning_rate": 0.000166, + "loss": 3.2712, + "step": 84 + }, + { + "epoch": 0.41, + "learning_rate": 0.000168, + "loss": 3.3353, + "step": 85 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017, + "loss": 3.4876, + "step": 86 + }, + { + "epoch": 0.42, + "learning_rate": 0.000172, + "loss": 3.3383, + "step": 87 + }, + { + "epoch": 0.43, + "learning_rate": 0.000174, + "loss": 3.1497, + "step": 88 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017600000000000002, + "loss": 3.1029, + "step": 89 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017800000000000002, + "loss": 3.1484, + "step": 90 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018, + "loss": 3.1156, + "step": 91 + }, + { + "epoch": 0.45, + "learning_rate": 0.000182, + "loss": 3.2557, + "step": 92 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018400000000000003, + "loss": 3.173, + "step": 93 + }, + { + "epoch": 0.45, + "eval_loss": 3.150902032852173, + "eval_runtime": 7.5676, + "eval_samples_per_second": 158.703, + "eval_steps_per_second": 52.989, + "step": 93 + }, + { + "epoch": 0.46, + "learning_rate": 0.00018600000000000002, + "loss": 3.128, + "step": 94 + }, + { + "epoch": 0.46, + "learning_rate": 0.000188, + "loss": 3.146, + "step": 95 + }, + { + "epoch": 0.47, + "learning_rate": 0.00019, + "loss": 3.194, + "step": 96 + }, + { + "epoch": 0.47, + "learning_rate": 0.000192, + "loss": 3.0987, + "step": 97 + }, + { + "epoch": 0.48, + "learning_rate": 0.000194, + "loss": 3.2405, + "step": 98 + }, + { + "epoch": 0.48, + "learning_rate": 0.000196, + "loss": 3.1568, + "step": 99 + }, + { + "epoch": 0.49, + "learning_rate": 0.00019800000000000002, + "loss": 3.1488, + "step": 100 + }, + { + "epoch": 0.49, + "learning_rate": 0.0002, + "loss": 3.2105, + "step": 101 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001996138996138996, + "loss": 3.2575, + "step": 102 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019922779922779924, + "loss": 3.1921, + "step": 103 + }, + { + "epoch": 0.5, + "learning_rate": 0.00019884169884169884, + "loss": 3.2369, + "step": 104 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019845559845559847, + "loss": 3.1031, + "step": 105 + }, + { + "epoch": 0.51, + "learning_rate": 0.00019806949806949807, + "loss": 3.2618, + "step": 106 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001976833976833977, + "loss": 3.2034, + "step": 107 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001972972972972973, + "loss": 3.2094, + "step": 108 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019691119691119693, + "loss": 3.1894, + "step": 109 + }, + { + "epoch": 0.53, + "learning_rate": 0.00019652509652509653, + "loss": 3.1614, + "step": 110 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019613899613899616, + "loss": 3.176, + "step": 111 + }, + { + "epoch": 0.54, + "learning_rate": 0.00019575289575289576, + "loss": 3.2153, + "step": 112 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001953667953667954, + "loss": 3.0923, + "step": 113 + }, + { + "epoch": 0.55, + "learning_rate": 0.000194980694980695, + "loss": 3.2878, + "step": 114 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019459459459459462, + "loss": 3.0605, + "step": 115 + }, + { + "epoch": 0.56, + "learning_rate": 0.00019420849420849422, + "loss": 3.1282, + "step": 116 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019382239382239382, + "loss": 3.1204, + "step": 117 + }, + { + "epoch": 0.57, + "learning_rate": 0.00019343629343629345, + "loss": 3.0932, + "step": 118 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019305019305019305, + "loss": 3.2913, + "step": 119 + }, + { + "epoch": 0.58, + "learning_rate": 0.00019266409266409268, + "loss": 3.1809, + "step": 120 + }, + { + "epoch": 0.59, + "learning_rate": 0.00019227799227799228, + "loss": 3.2408, + "step": 121 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001918918918918919, + "loss": 3.8133, + "step": 122 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001915057915057915, + "loss": 3.1869, + "step": 123 + }, + { + "epoch": 0.6, + "learning_rate": 0.00019111969111969114, + "loss": 3.1053, + "step": 124 + }, + { + "epoch": 0.6, + "eval_loss": 3.205463409423828, + "eval_runtime": 7.3677, + "eval_samples_per_second": 163.009, + "eval_steps_per_second": 54.427, + "step": 124 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019073359073359074, + "loss": 3.1206, + "step": 125 + }, + { + "epoch": 0.61, + "learning_rate": 0.00019034749034749037, + "loss": 3.1233, + "step": 126 + }, + { + "epoch": 0.62, + "learning_rate": 0.00018996138996138997, + "loss": 3.0673, + "step": 127 + }, + { + "epoch": 0.62, + "learning_rate": 0.0001895752895752896, + "loss": 3.1314, + "step": 128 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001891891891891892, + "loss": 3.1997, + "step": 129 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001888030888030888, + "loss": 3.1298, + "step": 130 + }, + { + "epoch": 0.64, + "learning_rate": 0.00018841698841698843, + "loss": 3.1821, + "step": 131 + }, + { + "epoch": 0.64, + "learning_rate": 0.00018803088803088803, + "loss": 3.2418, + "step": 132 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018764478764478766, + "loss": 3.1543, + "step": 133 + }, + { + "epoch": 0.65, + "learning_rate": 0.00018725868725868726, + "loss": 3.2136, + "step": 134 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001868725868725869, + "loss": 3.3314, + "step": 135 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001864864864864865, + "loss": 3.2328, + "step": 136 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018610038610038612, + "loss": 3.2225, + "step": 137 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018571428571428572, + "loss": 3.1159, + "step": 138 + }, + { + "epoch": 0.67, + "learning_rate": 0.00018532818532818535, + "loss": 3.0339, + "step": 139 + }, + { + "epoch": 0.68, + "learning_rate": 0.00018494208494208495, + "loss": 3.2672, + "step": 140 + }, + { + "epoch": 0.68, + "learning_rate": 0.00018455598455598458, + "loss": 3.1237, + "step": 141 + }, + { + "epoch": 0.69, + "learning_rate": 0.00018416988416988418, + "loss": 3.1692, + "step": 142 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001837837837837838, + "loss": 3.3243, + "step": 143 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001833976833976834, + "loss": 3.0264, + "step": 144 + }, + { + "epoch": 0.7, + "learning_rate": 0.000183011583011583, + "loss": 3.2045, + "step": 145 + }, + { + "epoch": 0.71, + "learning_rate": 0.00018262548262548264, + "loss": 3.1966, + "step": 146 + }, + { + "epoch": 0.71, + "learning_rate": 0.00018223938223938224, + "loss": 3.0587, + "step": 147 + }, + { + "epoch": 0.72, + "learning_rate": 0.00018185328185328187, + "loss": 3.2979, + "step": 148 + }, + { + "epoch": 0.72, + "learning_rate": 0.00018146718146718147, + "loss": 3.1549, + "step": 149 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001810810810810811, + "loss": 3.1682, + "step": 150 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001806949806949807, + "loss": 3.3214, + "step": 151 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018030888030888032, + "loss": 3.1783, + "step": 152 + }, + { + "epoch": 0.74, + "learning_rate": 0.00017992277992277993, + "loss": 3.2268, + "step": 153 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017953667953667955, + "loss": 3.2843, + "step": 154 + }, + { + "epoch": 0.75, + "learning_rate": 0.00017915057915057916, + "loss": 3.3124, + "step": 155 + }, + { + "epoch": 0.75, + "eval_loss": 3.1898298263549805, + "eval_runtime": 7.5884, + "eval_samples_per_second": 158.268, + "eval_steps_per_second": 52.844, + "step": 155 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017876447876447878, + "loss": 3.1855, + "step": 156 + }, + { + "epoch": 0.76, + "learning_rate": 0.00017837837837837839, + "loss": 3.2096, + "step": 157 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017799227799227801, + "loss": 3.2174, + "step": 158 + }, + { + "epoch": 0.77, + "learning_rate": 0.00017760617760617762, + "loss": 3.2775, + "step": 159 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017722007722007722, + "loss": 3.2065, + "step": 160 + }, + { + "epoch": 0.78, + "learning_rate": 0.00017683397683397684, + "loss": 3.2905, + "step": 161 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017644787644787645, + "loss": 3.1591, + "step": 162 + }, + { + "epoch": 0.79, + "learning_rate": 0.00017606177606177607, + "loss": 3.2721, + "step": 163 + }, + { + "epoch": 0.8, + "learning_rate": 0.00017567567567567568, + "loss": 3.1743, + "step": 164 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001752895752895753, + "loss": 3.234, + "step": 165 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001749034749034749, + "loss": 3.2775, + "step": 166 + }, + { + "epoch": 0.81, + "learning_rate": 0.00017451737451737453, + "loss": 3.2317, + "step": 167 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017413127413127413, + "loss": 3.0691, + "step": 168 + }, + { + "epoch": 0.82, + "learning_rate": 0.00017374517374517376, + "loss": 3.1793, + "step": 169 + }, + { + "epoch": 0.83, + "learning_rate": 0.00017335907335907336, + "loss": 3.2259, + "step": 170 + }, + { + "epoch": 0.83, + "learning_rate": 0.000172972972972973, + "loss": 3.1813, + "step": 171 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001725868725868726, + "loss": 3.2416, + "step": 172 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001722007722007722, + "loss": 3.2016, + "step": 173 + }, + { + "epoch": 0.84, + "learning_rate": 0.00017181467181467182, + "loss": 3.1766, + "step": 174 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017142857142857143, + "loss": 3.0861, + "step": 175 + }, + { + "epoch": 0.85, + "learning_rate": 0.00017104247104247105, + "loss": 3.2104, + "step": 176 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017065637065637065, + "loss": 3.273, + "step": 177 + }, + { + "epoch": 0.86, + "learning_rate": 0.00017027027027027028, + "loss": 3.2371, + "step": 178 + }, + { + "epoch": 0.87, + "learning_rate": 0.00016988416988416988, + "loss": 3.2654, + "step": 179 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001694980694980695, + "loss": 3.1812, + "step": 180 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016911196911196911, + "loss": 3.2781, + "step": 181 + }, + { + "epoch": 0.88, + "learning_rate": 0.00016872586872586874, + "loss": 3.1611, + "step": 182 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016833976833976834, + "loss": 3.0902, + "step": 183 + }, + { + "epoch": 0.89, + "learning_rate": 0.00016795366795366797, + "loss": 3.2414, + "step": 184 + }, + { + "epoch": 0.9, + "learning_rate": 0.00016756756756756757, + "loss": 3.1472, + "step": 185 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001671814671814672, + "loss": 3.1761, + "step": 186 + }, + { + "epoch": 0.9, + "eval_loss": 3.1637256145477295, + "eval_runtime": 7.3669, + "eval_samples_per_second": 163.026, + "eval_steps_per_second": 54.432, + "step": 186 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001667953667953668, + "loss": 3.1409, + "step": 187 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001664092664092664, + "loss": 3.2262, + "step": 188 + }, + { + "epoch": 0.92, + "learning_rate": 0.00016602316602316603, + "loss": 3.105, + "step": 189 + }, + { + "epoch": 0.92, + "learning_rate": 0.00016563706563706563, + "loss": 3.2596, + "step": 190 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016525096525096526, + "loss": 3.1528, + "step": 191 + }, + { + "epoch": 0.93, + "learning_rate": 0.00016486486486486486, + "loss": 3.1561, + "step": 192 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001644787644787645, + "loss": 3.2552, + "step": 193 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001640926640926641, + "loss": 3.0347, + "step": 194 + }, + { + "epoch": 0.95, + "learning_rate": 0.00016370656370656372, + "loss": 3.0418, + "step": 195 + }, + { + "epoch": 0.95, + "learning_rate": 0.00016332046332046332, + "loss": 3.0838, + "step": 196 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016293436293436295, + "loss": 3.1867, + "step": 197 + }, + { + "epoch": 0.96, + "learning_rate": 0.00016254826254826255, + "loss": 3.0373, + "step": 198 + }, + { + "epoch": 0.97, + "learning_rate": 0.00016216216216216218, + "loss": 2.9896, + "step": 199 + }, + { + "epoch": 0.97, + "learning_rate": 0.00016177606177606178, + "loss": 3.1511, + "step": 200 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001613899613899614, + "loss": 3.1029, + "step": 201 + }, + { + "epoch": 0.98, + "learning_rate": 0.000161003861003861, + "loss": 3.2193, + "step": 202 + }, + { + "epoch": 0.99, + "learning_rate": 0.0001606177606177606, + "loss": 3.2214, + "step": 203 + }, + { + "epoch": 0.99, + "learning_rate": 0.00016023166023166024, + "loss": 3.1428, + "step": 204 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015984555984555984, + "loss": 3.1259, + "step": 205 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015945945945945947, + "loss": 3.2007, + "step": 206 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015907335907335907, + "loss": 3.1123, + "step": 207 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001586872586872587, + "loss": 3.3417, + "step": 208 + }, + { + "epoch": 1.01, + "learning_rate": 0.0001583011583011583, + "loss": 3.089, + "step": 209 + }, + { + "epoch": 1.02, + "learning_rate": 0.00015791505791505793, + "loss": 3.0972, + "step": 210 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015752895752895753, + "loss": 2.1341, + "step": 211 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015714285714285716, + "loss": 1.9415, + "step": 212 + }, + { + "epoch": 1.01, + "learning_rate": 0.00015675675675675676, + "loss": 1.959, + "step": 213 + }, + { + "epoch": 1.02, + "learning_rate": 0.0001563706563706564, + "loss": 1.857, + "step": 214 + }, + { + "epoch": 1.02, + "learning_rate": 0.000155984555984556, + "loss": 1.8255, + "step": 215 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015559845559845562, + "loss": 1.6538, + "step": 216 + }, + { + "epoch": 1.03, + "learning_rate": 0.00015521235521235522, + "loss": 1.9162, + "step": 217 + }, + { + "epoch": 1.03, + "eval_loss": 3.399775981903076, + "eval_runtime": 7.3711, + "eval_samples_per_second": 162.933, + "eval_steps_per_second": 54.401, + "step": 217 + }, + { + "epoch": 1.04, + "learning_rate": 0.00015482625482625482, + "loss": 1.8293, + "step": 218 + }, + { + "epoch": 1.04, + "learning_rate": 0.00015444015444015445, + "loss": 1.8539, + "step": 219 + }, + { + "epoch": 1.05, + "learning_rate": 0.00015405405405405405, + "loss": 1.7888, + "step": 220 + }, + { + "epoch": 1.05, + "learning_rate": 0.00015366795366795368, + "loss": 1.7813, + "step": 221 + }, + { + "epoch": 1.06, + "learning_rate": 0.00015328185328185328, + "loss": 1.8911, + "step": 222 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001528957528957529, + "loss": 1.839, + "step": 223 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001525096525096525, + "loss": 1.8223, + "step": 224 + }, + { + "epoch": 1.07, + "learning_rate": 0.00015212355212355214, + "loss": 1.825, + "step": 225 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015173745173745174, + "loss": 1.7962, + "step": 226 + }, + { + "epoch": 1.08, + "learning_rate": 0.00015135135135135137, + "loss": 1.8117, + "step": 227 + }, + { + "epoch": 1.09, + "learning_rate": 0.00015096525096525097, + "loss": 1.9061, + "step": 228 + }, + { + "epoch": 1.09, + "learning_rate": 0.0001505791505791506, + "loss": 1.8982, + "step": 229 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001501930501930502, + "loss": 1.9087, + "step": 230 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001498069498069498, + "loss": 1.9664, + "step": 231 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014942084942084943, + "loss": 1.7921, + "step": 232 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014903474903474903, + "loss": 1.9163, + "step": 233 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014864864864864866, + "loss": 1.8759, + "step": 234 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014826254826254826, + "loss": 1.9262, + "step": 235 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001478764478764479, + "loss": 1.9063, + "step": 236 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001474903474903475, + "loss": 1.8577, + "step": 237 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014710424710424712, + "loss": 1.7452, + "step": 238 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014671814671814672, + "loss": 1.9344, + "step": 239 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014633204633204635, + "loss": 1.7575, + "step": 240 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014594594594594595, + "loss": 1.7707, + "step": 241 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014555984555984558, + "loss": 1.8945, + "step": 242 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014517374517374518, + "loss": 1.8379, + "step": 243 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001447876447876448, + "loss": 1.9021, + "step": 244 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001444015444015444, + "loss": 1.844, + "step": 245 + }, + { + "epoch": 1.17, + "learning_rate": 0.000144015444015444, + "loss": 1.9396, + "step": 246 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014362934362934364, + "loss": 2.0305, + "step": 247 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014324324324324324, + "loss": 1.8985, + "step": 248 + }, + { + "epoch": 1.18, + "eval_loss": 3.330674409866333, + "eval_runtime": 7.364, + "eval_samples_per_second": 163.091, + "eval_steps_per_second": 54.454, + "step": 248 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014285714285714287, + "loss": 1.8457, + "step": 249 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014247104247104247, + "loss": 1.8213, + "step": 250 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001420849420849421, + "loss": 1.7586, + "step": 251 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001416988416988417, + "loss": 1.8669, + "step": 252 + }, + { + "epoch": 1.21, + "learning_rate": 0.00014131274131274133, + "loss": 1.9476, + "step": 253 + }, + { + "epoch": 1.21, + "learning_rate": 0.00014092664092664093, + "loss": 1.8525, + "step": 254 + }, + { + "epoch": 1.22, + "learning_rate": 0.00014054054054054056, + "loss": 2.0163, + "step": 255 + }, + { + "epoch": 1.22, + "learning_rate": 0.00014015444015444016, + "loss": 1.9186, + "step": 256 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013976833976833979, + "loss": 1.9528, + "step": 257 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001393822393822394, + "loss": 2.2483, + "step": 258 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013899613899613902, + "loss": 1.8889, + "step": 259 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013861003861003862, + "loss": 2.0137, + "step": 260 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013822393822393822, + "loss": 1.9397, + "step": 261 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013783783783783785, + "loss": 1.8241, + "step": 262 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013745173745173745, + "loss": 1.9685, + "step": 263 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013706563706563708, + "loss": 1.9909, + "step": 264 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013667953667953668, + "loss": 1.8653, + "step": 265 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001362934362934363, + "loss": 1.9163, + "step": 266 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001359073359073359, + "loss": 1.9765, + "step": 267 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013552123552123554, + "loss": 1.788, + "step": 268 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013513513513513514, + "loss": 1.8103, + "step": 269 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013474903474903477, + "loss": 2.0086, + "step": 270 + }, + { + "epoch": 1.3, + "learning_rate": 0.00013436293436293437, + "loss": 1.9448, + "step": 271 + }, + { + "epoch": 1.3, + "learning_rate": 0.000133976833976834, + "loss": 1.8598, + "step": 272 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001335907335907336, + "loss": 2.0792, + "step": 273 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001332046332046332, + "loss": 1.7766, + "step": 274 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013281853281853283, + "loss": 1.9329, + "step": 275 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013243243243243243, + "loss": 1.9933, + "step": 276 + }, + { + "epoch": 1.33, + "learning_rate": 0.00013204633204633206, + "loss": 1.9371, + "step": 277 + }, + { + "epoch": 1.33, + "learning_rate": 0.00013166023166023166, + "loss": 1.9629, + "step": 278 + }, + { + "epoch": 1.33, + "learning_rate": 0.00013127413127413129, + "loss": 2.0488, + "step": 279 + }, + { + "epoch": 1.33, + "eval_loss": 3.333745241165161, + "eval_runtime": 7.361, + "eval_samples_per_second": 163.157, + "eval_steps_per_second": 54.476, + "step": 279 + }, + { + "epoch": 1.34, + "learning_rate": 0.0001308880308880309, + "loss": 2.0148, + "step": 280 + }, + { + "epoch": 1.34, + "learning_rate": 0.00013050193050193052, + "loss": 1.8416, + "step": 281 + }, + { + "epoch": 1.35, + "learning_rate": 0.00013011583011583012, + "loss": 2.1004, + "step": 282 + }, + { + "epoch": 1.35, + "learning_rate": 0.00012972972972972974, + "loss": 1.8308, + "step": 283 + }, + { + "epoch": 1.36, + "learning_rate": 0.00012934362934362935, + "loss": 1.9441, + "step": 284 + }, + { + "epoch": 1.36, + "learning_rate": 0.00012895752895752897, + "loss": 2.083, + "step": 285 + }, + { + "epoch": 1.37, + "learning_rate": 0.00012857142857142858, + "loss": 1.8198, + "step": 286 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001281853281853282, + "loss": 2.0069, + "step": 287 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001277992277992278, + "loss": 2.0146, + "step": 288 + }, + { + "epoch": 1.38, + "learning_rate": 0.0001274131274131274, + "loss": 1.8554, + "step": 289 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012702702702702703, + "loss": 1.972, + "step": 290 + }, + { + "epoch": 1.39, + "learning_rate": 0.00012664092664092664, + "loss": 1.9583, + "step": 291 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012625482625482626, + "loss": 1.8567, + "step": 292 + }, + { + "epoch": 1.4, + "learning_rate": 0.00012586872586872587, + "loss": 2.0031, + "step": 293 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001254826254826255, + "loss": 1.9725, + "step": 294 + }, + { + "epoch": 1.41, + "learning_rate": 0.0001250965250965251, + "loss": 1.9517, + "step": 295 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012471042471042472, + "loss": 1.7436, + "step": 296 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012432432432432433, + "loss": 1.9968, + "step": 297 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012393822393822395, + "loss": 1.8299, + "step": 298 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012355212355212355, + "loss": 2.1024, + "step": 299 + }, + { + "epoch": 1.44, + "learning_rate": 0.00012316602316602318, + "loss": 1.8099, + "step": 300 + }, + { + "epoch": 1.44, + "learning_rate": 0.00012277992277992278, + "loss": 1.9761, + "step": 301 + }, + { + "epoch": 1.45, + "learning_rate": 0.0001223938223938224, + "loss": 2.1201, + "step": 302 + }, + { + "epoch": 1.45, + "learning_rate": 0.00012200772200772201, + "loss": 1.9268, + "step": 303 + }, + { + "epoch": 1.46, + "learning_rate": 0.00012162162162162163, + "loss": 1.8136, + "step": 304 + }, + { + "epoch": 1.46, + "learning_rate": 0.00012123552123552124, + "loss": 2.0362, + "step": 305 + }, + { + "epoch": 1.47, + "learning_rate": 0.00012084942084942086, + "loss": 2.0653, + "step": 306 + }, + { + "epoch": 1.47, + "learning_rate": 0.00012046332046332047, + "loss": 2.022, + "step": 307 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012007722007722009, + "loss": 1.9317, + "step": 308 + }, + { + "epoch": 1.48, + "learning_rate": 0.0001196911196911197, + "loss": 2.0455, + "step": 309 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011930501930501932, + "loss": 2.0812, + "step": 310 + }, + { + "epoch": 1.49, + "eval_loss": 3.341298818588257, + "eval_runtime": 7.3644, + "eval_samples_per_second": 163.081, + "eval_steps_per_second": 54.451, + "step": 310 + }, + { + "epoch": 1.49, + "learning_rate": 0.00011891891891891893, + "loss": 2.0609, + "step": 311 + }, + { + "epoch": 1.5, + "learning_rate": 0.00011853281853281855, + "loss": 1.9708, + "step": 312 + }, + { + "epoch": 1.5, + "learning_rate": 0.00011814671814671816, + "loss": 1.9968, + "step": 313 + }, + { + "epoch": 1.5, + "learning_rate": 0.00011776061776061778, + "loss": 2.0283, + "step": 314 + }, + { + "epoch": 1.51, + "learning_rate": 0.00011737451737451739, + "loss": 2.0142, + "step": 315 + }, + { + "epoch": 1.51, + "learning_rate": 0.00011698841698841701, + "loss": 2.026, + "step": 316 + }, + { + "epoch": 1.52, + "learning_rate": 0.0001166023166023166, + "loss": 2.1965, + "step": 317 + }, + { + "epoch": 1.52, + "learning_rate": 0.00011621621621621621, + "loss": 1.984, + "step": 318 + }, + { + "epoch": 1.53, + "learning_rate": 0.00011583011583011582, + "loss": 2.0699, + "step": 319 + }, + { + "epoch": 1.53, + "learning_rate": 0.00011544401544401544, + "loss": 1.864, + "step": 320 + }, + { + "epoch": 1.54, + "learning_rate": 0.00011505791505791505, + "loss": 2.0219, + "step": 321 + }, + { + "epoch": 1.54, + "learning_rate": 0.00011467181467181467, + "loss": 1.9162, + "step": 322 + }, + { + "epoch": 1.55, + "learning_rate": 0.00011428571428571428, + "loss": 1.9092, + "step": 323 + }, + { + "epoch": 1.55, + "learning_rate": 0.0001138996138996139, + "loss": 2.0932, + "step": 324 + }, + { + "epoch": 1.56, + "learning_rate": 0.00011351351351351351, + "loss": 2.0975, + "step": 325 + }, + { + "epoch": 1.56, + "learning_rate": 0.00011312741312741313, + "loss": 2.1674, + "step": 326 + }, + { + "epoch": 1.57, + "learning_rate": 0.00011274131274131274, + "loss": 1.8444, + "step": 327 + }, + { + "epoch": 1.57, + "learning_rate": 0.00011235521235521236, + "loss": 1.9696, + "step": 328 + }, + { + "epoch": 1.58, + "learning_rate": 0.00011196911196911197, + "loss": 1.943, + "step": 329 + }, + { + "epoch": 1.58, + "learning_rate": 0.00011158301158301159, + "loss": 2.1044, + "step": 330 + }, + { + "epoch": 1.59, + "learning_rate": 0.0001111969111969112, + "loss": 2.2068, + "step": 331 + }, + { + "epoch": 1.59, + "learning_rate": 0.00011081081081081082, + "loss": 2.0958, + "step": 332 + }, + { + "epoch": 1.6, + "learning_rate": 0.00011042471042471043, + "loss": 1.9789, + "step": 333 + }, + { + "epoch": 1.6, + "learning_rate": 0.00011003861003861005, + "loss": 1.8663, + "step": 334 + }, + { + "epoch": 1.61, + "learning_rate": 0.00010965250965250966, + "loss": 2.0499, + "step": 335 + }, + { + "epoch": 1.61, + "learning_rate": 0.00010926640926640928, + "loss": 1.935, + "step": 336 + }, + { + "epoch": 1.62, + "learning_rate": 0.00010888030888030889, + "loss": 2.0021, + "step": 337 + }, + { + "epoch": 1.62, + "learning_rate": 0.0001084942084942085, + "loss": 1.953, + "step": 338 + }, + { + "epoch": 1.63, + "learning_rate": 0.00010810810810810812, + "loss": 2.0466, + "step": 339 + }, + { + "epoch": 1.63, + "learning_rate": 0.00010772200772200774, + "loss": 1.9709, + "step": 340 + }, + { + "epoch": 1.64, + "learning_rate": 0.00010733590733590735, + "loss": 1.8884, + "step": 341 + }, + { + "epoch": 1.64, + "eval_loss": 3.323758602142334, + "eval_runtime": 7.3665, + "eval_samples_per_second": 163.036, + "eval_steps_per_second": 54.436, + "step": 341 + }, + { + "epoch": 1.64, + "learning_rate": 0.00010694980694980697, + "loss": 2.0557, + "step": 342 + }, + { + "epoch": 1.65, + "learning_rate": 0.00010656370656370658, + "loss": 2.0345, + "step": 343 + }, + { + "epoch": 1.65, + "learning_rate": 0.0001061776061776062, + "loss": 1.8173, + "step": 344 + }, + { + "epoch": 1.66, + "learning_rate": 0.00010579150579150581, + "loss": 1.9598, + "step": 345 + }, + { + "epoch": 1.66, + "learning_rate": 0.0001054054054054054, + "loss": 2.0323, + "step": 346 + }, + { + "epoch": 1.67, + "learning_rate": 0.00010501930501930501, + "loss": 1.9284, + "step": 347 + }, + { + "epoch": 1.67, + "learning_rate": 0.00010463320463320463, + "loss": 2.1235, + "step": 348 + }, + { + "epoch": 1.67, + "learning_rate": 0.00010424710424710424, + "loss": 1.9426, + "step": 349 + }, + { + "epoch": 1.68, + "learning_rate": 0.00010386100386100386, + "loss": 1.8692, + "step": 350 + }, + { + "epoch": 1.68, + "learning_rate": 0.00010347490347490347, + "loss": 1.9559, + "step": 351 + }, + { + "epoch": 1.69, + "learning_rate": 0.00010308880308880309, + "loss": 2.0407, + "step": 352 + }, + { + "epoch": 1.69, + "learning_rate": 0.0001027027027027027, + "loss": 1.9356, + "step": 353 + }, + { + "epoch": 1.7, + "learning_rate": 0.00010231660231660232, + "loss": 1.9055, + "step": 354 + }, + { + "epoch": 1.7, + "learning_rate": 0.00010193050193050193, + "loss": 1.9776, + "step": 355 + }, + { + "epoch": 1.71, + "learning_rate": 0.00010154440154440155, + "loss": 1.8996, + "step": 356 + }, + { + "epoch": 1.71, + "learning_rate": 0.00010115830115830116, + "loss": 1.8893, + "step": 357 + }, + { + "epoch": 1.72, + "learning_rate": 0.00010077220077220078, + "loss": 1.9621, + "step": 358 + }, + { + "epoch": 1.72, + "learning_rate": 0.00010038610038610039, + "loss": 1.9447, + "step": 359 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001, + "loss": 1.9646, + "step": 360 + }, + { + "epoch": 1.73, + "learning_rate": 9.961389961389962e-05, + "loss": 1.9459, + "step": 361 + }, + { + "epoch": 1.74, + "learning_rate": 9.922779922779923e-05, + "loss": 2.0168, + "step": 362 + }, + { + "epoch": 1.74, + "learning_rate": 9.884169884169885e-05, + "loss": 2.0021, + "step": 363 + }, + { + "epoch": 1.75, + "learning_rate": 9.845559845559846e-05, + "loss": 1.7642, + "step": 364 + }, + { + "epoch": 1.75, + "learning_rate": 9.806949806949808e-05, + "loss": 2.0138, + "step": 365 + }, + { + "epoch": 1.76, + "learning_rate": 9.76833976833977e-05, + "loss": 2.0108, + "step": 366 + }, + { + "epoch": 1.76, + "learning_rate": 9.729729729729731e-05, + "loss": 2.0955, + "step": 367 + }, + { + "epoch": 1.77, + "learning_rate": 9.691119691119691e-05, + "loss": 2.1705, + "step": 368 + }, + { + "epoch": 1.77, + "learning_rate": 9.652509652509652e-05, + "loss": 1.8494, + "step": 369 + }, + { + "epoch": 1.78, + "learning_rate": 9.613899613899614e-05, + "loss": 1.8676, + "step": 370 + }, + { + "epoch": 1.78, + "learning_rate": 9.575289575289575e-05, + "loss": 1.821, + "step": 371 + }, + { + "epoch": 1.79, + "learning_rate": 9.536679536679537e-05, + "loss": 2.0281, + "step": 372 + }, + { + "epoch": 1.79, + "eval_loss": 3.2718801498413086, + "eval_runtime": 7.3652, + "eval_samples_per_second": 163.063, + "eval_steps_per_second": 54.445, + "step": 372 + }, + { + "epoch": 1.79, + "learning_rate": 9.498069498069498e-05, + "loss": 2.1556, + "step": 373 + }, + { + "epoch": 1.8, + "learning_rate": 9.45945945945946e-05, + "loss": 1.9643, + "step": 374 + }, + { + "epoch": 1.8, + "learning_rate": 9.420849420849421e-05, + "loss": 2.0287, + "step": 375 + }, + { + "epoch": 1.81, + "learning_rate": 9.382239382239383e-05, + "loss": 1.927, + "step": 376 + }, + { + "epoch": 1.81, + "learning_rate": 9.343629343629344e-05, + "loss": 1.9838, + "step": 377 + }, + { + "epoch": 1.82, + "learning_rate": 9.305019305019306e-05, + "loss": 1.9065, + "step": 378 + }, + { + "epoch": 1.82, + "learning_rate": 9.266409266409267e-05, + "loss": 2.056, + "step": 379 + }, + { + "epoch": 1.83, + "learning_rate": 9.227799227799229e-05, + "loss": 1.9277, + "step": 380 + }, + { + "epoch": 1.83, + "learning_rate": 9.18918918918919e-05, + "loss": 1.7797, + "step": 381 + }, + { + "epoch": 1.83, + "learning_rate": 9.15057915057915e-05, + "loss": 1.8818, + "step": 382 + }, + { + "epoch": 1.84, + "learning_rate": 9.111969111969112e-05, + "loss": 1.947, + "step": 383 + }, + { + "epoch": 1.84, + "learning_rate": 9.073359073359073e-05, + "loss": 1.942, + "step": 384 + }, + { + "epoch": 1.85, + "learning_rate": 9.034749034749035e-05, + "loss": 1.9998, + "step": 385 + }, + { + "epoch": 1.85, + "learning_rate": 8.996138996138996e-05, + "loss": 1.8805, + "step": 386 + }, + { + "epoch": 1.86, + "learning_rate": 8.957528957528958e-05, + "loss": 1.8903, + "step": 387 + }, + { + "epoch": 1.86, + "learning_rate": 8.918918918918919e-05, + "loss": 1.9189, + "step": 388 + }, + { + "epoch": 1.87, + "learning_rate": 8.880308880308881e-05, + "loss": 2.0308, + "step": 389 + }, + { + "epoch": 1.87, + "learning_rate": 8.841698841698842e-05, + "loss": 2.0768, + "step": 390 + }, + { + "epoch": 1.88, + "learning_rate": 8.803088803088804e-05, + "loss": 1.9168, + "step": 391 + }, + { + "epoch": 1.88, + "learning_rate": 8.764478764478765e-05, + "loss": 1.8967, + "step": 392 + }, + { + "epoch": 1.89, + "learning_rate": 8.725868725868727e-05, + "loss": 1.9347, + "step": 393 + }, + { + "epoch": 1.89, + "learning_rate": 8.687258687258688e-05, + "loss": 1.8273, + "step": 394 + }, + { + "epoch": 1.9, + "learning_rate": 8.64864864864865e-05, + "loss": 1.9801, + "step": 395 + }, + { + "epoch": 1.9, + "learning_rate": 8.61003861003861e-05, + "loss": 2.0002, + "step": 396 + }, + { + "epoch": 1.91, + "learning_rate": 8.571428571428571e-05, + "loss": 2.0318, + "step": 397 + }, + { + "epoch": 1.91, + "learning_rate": 8.532818532818533e-05, + "loss": 1.8399, + "step": 398 + }, + { + "epoch": 1.92, + "learning_rate": 8.494208494208494e-05, + "loss": 1.8956, + "step": 399 + }, + { + "epoch": 1.92, + "learning_rate": 8.455598455598456e-05, + "loss": 2.0156, + "step": 400 + }, + { + "epoch": 1.93, + "learning_rate": 8.416988416988417e-05, + "loss": 1.9499, + "step": 401 + }, + { + "epoch": 1.93, + "learning_rate": 8.378378378378379e-05, + "loss": 1.8823, + "step": 402 + }, + { + "epoch": 1.94, + "learning_rate": 8.33976833976834e-05, + "loss": 2.1344, + "step": 403 + }, + { + "epoch": 1.94, + "eval_loss": 3.2487306594848633, + "eval_runtime": 7.3645, + "eval_samples_per_second": 163.08, + "eval_steps_per_second": 54.451, + "step": 403 + }, + { + "epoch": 1.94, + "learning_rate": 8.301158301158302e-05, + "loss": 1.9887, + "step": 404 + }, + { + "epoch": 1.95, + "learning_rate": 8.262548262548263e-05, + "loss": 2.0445, + "step": 405 + }, + { + "epoch": 1.95, + "learning_rate": 8.223938223938225e-05, + "loss": 1.8847, + "step": 406 + }, + { + "epoch": 1.96, + "learning_rate": 8.185328185328186e-05, + "loss": 1.8461, + "step": 407 + }, + { + "epoch": 1.96, + "learning_rate": 8.146718146718148e-05, + "loss": 1.9106, + "step": 408 + }, + { + "epoch": 1.97, + "learning_rate": 8.108108108108109e-05, + "loss": 2.0067, + "step": 409 + }, + { + "epoch": 1.97, + "learning_rate": 8.06949806949807e-05, + "loss": 1.9705, + "step": 410 + }, + { + "epoch": 1.98, + "learning_rate": 8.03088803088803e-05, + "loss": 1.8092, + "step": 411 + }, + { + "epoch": 1.98, + "learning_rate": 7.992277992277992e-05, + "loss": 1.8563, + "step": 412 + }, + { + "epoch": 1.99, + "learning_rate": 7.953667953667954e-05, + "loss": 1.8833, + "step": 413 + }, + { + "epoch": 1.99, + "learning_rate": 7.915057915057915e-05, + "loss": 1.9905, + "step": 414 + }, + { + "epoch": 2.0, + "learning_rate": 7.876447876447877e-05, + "loss": 2.0448, + "step": 415 + }, + { + "epoch": 2.0, + "learning_rate": 7.837837837837838e-05, + "loss": 1.9066, + "step": 416 + }, + { + "epoch": 2.0, + "learning_rate": 7.7992277992278e-05, + "loss": 1.8585, + "step": 417 + }, + { + "epoch": 2.01, + "learning_rate": 7.760617760617761e-05, + "loss": 2.0163, + "step": 418 + }, + { + "epoch": 2.01, + "learning_rate": 7.722007722007723e-05, + "loss": 1.8571, + "step": 419 + }, + { + "epoch": 2.02, + "learning_rate": 7.683397683397684e-05, + "loss": 2.0083, + "step": 420 + }, + { + "epoch": 2.0, + "learning_rate": 7.644787644787645e-05, + "loss": 0.6158, + "step": 421 + }, + { + "epoch": 2.01, + "learning_rate": 7.606177606177607e-05, + "loss": 0.7386, + "step": 422 + }, + { + "epoch": 2.01, + "learning_rate": 7.567567567567568e-05, + "loss": 0.7067, + "step": 423 + }, + { + "epoch": 2.02, + "learning_rate": 7.52895752895753e-05, + "loss": 0.6173, + "step": 424 + }, + { + "epoch": 2.02, + "learning_rate": 7.49034749034749e-05, + "loss": 0.5876, + "step": 425 + }, + { + "epoch": 2.03, + "learning_rate": 7.451737451737452e-05, + "loss": 0.5948, + "step": 426 + }, + { + "epoch": 2.03, + "learning_rate": 7.413127413127413e-05, + "loss": 0.5593, + "step": 427 + }, + { + "epoch": 2.04, + "learning_rate": 7.374517374517374e-05, + "loss": 0.5989, + "step": 428 + }, + { + "epoch": 2.04, + "learning_rate": 7.335907335907336e-05, + "loss": 0.5699, + "step": 429 + }, + { + "epoch": 2.05, + "learning_rate": 7.297297297297297e-05, + "loss": 0.5719, + "step": 430 + }, + { + "epoch": 2.05, + "learning_rate": 7.258687258687259e-05, + "loss": 0.4928, + "step": 431 + }, + { + "epoch": 2.06, + "learning_rate": 7.22007722007722e-05, + "loss": 0.4713, + "step": 432 + }, + { + "epoch": 2.06, + "learning_rate": 7.181467181467182e-05, + "loss": 0.6161, + "step": 433 + }, + { + "epoch": 2.07, + "learning_rate": 7.142857142857143e-05, + "loss": 0.566, + "step": 434 + }, + { + "epoch": 2.07, + "eval_loss": 4.280820369720459, + "eval_runtime": 7.3682, + "eval_samples_per_second": 162.998, + "eval_steps_per_second": 54.423, + "step": 434 + }, + { + "epoch": 2.07, + "learning_rate": 7.104247104247105e-05, + "loss": 0.5182, + "step": 435 + }, + { + "epoch": 2.08, + "learning_rate": 7.065637065637066e-05, + "loss": 0.6347, + "step": 436 + }, + { + "epoch": 2.08, + "learning_rate": 7.027027027027028e-05, + "loss": 0.6002, + "step": 437 + }, + { + "epoch": 2.09, + "learning_rate": 6.988416988416989e-05, + "loss": 0.5696, + "step": 438 + }, + { + "epoch": 2.09, + "learning_rate": 6.949806949806951e-05, + "loss": 0.5535, + "step": 439 + }, + { + "epoch": 2.1, + "learning_rate": 6.911196911196911e-05, + "loss": 0.5263, + "step": 440 + }, + { + "epoch": 2.1, + "learning_rate": 6.872586872586872e-05, + "loss": 0.5342, + "step": 441 + }, + { + "epoch": 2.11, + "learning_rate": 6.833976833976834e-05, + "loss": 0.4946, + "step": 442 + }, + { + "epoch": 2.11, + "learning_rate": 6.795366795366795e-05, + "loss": 0.5402, + "step": 443 + }, + { + "epoch": 2.12, + "learning_rate": 6.756756756756757e-05, + "loss": 0.5005, + "step": 444 + }, + { + "epoch": 2.12, + "learning_rate": 6.718146718146718e-05, + "loss": 0.6038, + "step": 445 + }, + { + "epoch": 2.13, + "learning_rate": 6.67953667953668e-05, + "loss": 0.5123, + "step": 446 + }, + { + "epoch": 2.13, + "learning_rate": 6.640926640926641e-05, + "loss": 0.558, + "step": 447 + }, + { + "epoch": 2.14, + "learning_rate": 6.602316602316603e-05, + "loss": 0.4858, + "step": 448 + }, + { + "epoch": 2.14, + "learning_rate": 6.563706563706564e-05, + "loss": 0.6183, + "step": 449 + }, + { + "epoch": 2.15, + "learning_rate": 6.525096525096526e-05, + "loss": 0.5093, + "step": 450 + }, + { + "epoch": 2.15, + "learning_rate": 6.486486486486487e-05, + "loss": 0.4336, + "step": 451 + }, + { + "epoch": 2.16, + "learning_rate": 6.447876447876449e-05, + "loss": 0.653, + "step": 452 + }, + { + "epoch": 2.16, + "learning_rate": 6.40926640926641e-05, + "loss": 0.5675, + "step": 453 + }, + { + "epoch": 2.17, + "learning_rate": 6.37065637065637e-05, + "loss": 0.5146, + "step": 454 + }, + { + "epoch": 2.17, + "learning_rate": 6.332046332046332e-05, + "loss": 0.4988, + "step": 455 + }, + { + "epoch": 2.17, + "learning_rate": 6.293436293436293e-05, + "loss": 0.5216, + "step": 456 + }, + { + "epoch": 2.18, + "learning_rate": 6.254826254826255e-05, + "loss": 0.5886, + "step": 457 + }, + { + "epoch": 2.18, + "learning_rate": 6.216216216216216e-05, + "loss": 0.5856, + "step": 458 + }, + { + "epoch": 2.19, + "learning_rate": 6.177606177606178e-05, + "loss": 0.4837, + "step": 459 + }, + { + "epoch": 2.19, + "learning_rate": 6.138996138996139e-05, + "loss": 0.6044, + "step": 460 + }, + { + "epoch": 2.2, + "learning_rate": 6.100386100386101e-05, + "loss": 0.5276, + "step": 461 + }, + { + "epoch": 2.2, + "learning_rate": 6.061776061776062e-05, + "loss": 0.4752, + "step": 462 + }, + { + "epoch": 2.21, + "learning_rate": 6.023166023166024e-05, + "loss": 0.5702, + "step": 463 + }, + { + "epoch": 2.21, + "learning_rate": 5.984555984555985e-05, + "loss": 0.4758, + "step": 464 + }, + { + "epoch": 2.22, + "learning_rate": 5.9459459459459466e-05, + "loss": 0.573, + "step": 465 + }, + { + "epoch": 2.22, + "eval_loss": 4.131713390350342, + "eval_runtime": 7.3741, + "eval_samples_per_second": 162.866, + "eval_steps_per_second": 54.379, + "step": 465 + }, + { + "epoch": 2.22, + "learning_rate": 5.907335907335908e-05, + "loss": 0.5373, + "step": 466 + }, + { + "epoch": 2.23, + "learning_rate": 5.8687258687258696e-05, + "loss": 0.5611, + "step": 467 + }, + { + "epoch": 2.23, + "learning_rate": 5.83011583011583e-05, + "loss": 0.5744, + "step": 468 + }, + { + "epoch": 2.24, + "learning_rate": 5.791505791505791e-05, + "loss": 0.4818, + "step": 469 + }, + { + "epoch": 2.24, + "learning_rate": 5.752895752895753e-05, + "loss": 0.4519, + "step": 470 + }, + { + "epoch": 2.25, + "learning_rate": 5.714285714285714e-05, + "loss": 0.4295, + "step": 471 + }, + { + "epoch": 2.25, + "learning_rate": 5.6756756756756757e-05, + "loss": 0.4755, + "step": 472 + }, + { + "epoch": 2.26, + "learning_rate": 5.637065637065637e-05, + "loss": 0.501, + "step": 473 + }, + { + "epoch": 2.26, + "learning_rate": 5.5984555984555986e-05, + "loss": 0.449, + "step": 474 + }, + { + "epoch": 2.27, + "learning_rate": 5.55984555984556e-05, + "loss": 0.4914, + "step": 475 + }, + { + "epoch": 2.27, + "learning_rate": 5.5212355212355216e-05, + "loss": 0.5153, + "step": 476 + }, + { + "epoch": 2.28, + "learning_rate": 5.482625482625483e-05, + "loss": 0.5433, + "step": 477 + }, + { + "epoch": 2.28, + "learning_rate": 5.4440154440154445e-05, + "loss": 0.5248, + "step": 478 + }, + { + "epoch": 2.29, + "learning_rate": 5.405405405405406e-05, + "loss": 0.5453, + "step": 479 + }, + { + "epoch": 2.29, + "learning_rate": 5.3667953667953675e-05, + "loss": 0.5288, + "step": 480 + }, + { + "epoch": 2.3, + "learning_rate": 5.328185328185329e-05, + "loss": 0.532, + "step": 481 + }, + { + "epoch": 2.3, + "learning_rate": 5.2895752895752905e-05, + "loss": 0.5139, + "step": 482 + }, + { + "epoch": 2.31, + "learning_rate": 5.2509652509652506e-05, + "loss": 0.5175, + "step": 483 + }, + { + "epoch": 2.31, + "learning_rate": 5.212355212355212e-05, + "loss": 0.6227, + "step": 484 + }, + { + "epoch": 2.32, + "learning_rate": 5.1737451737451736e-05, + "loss": 0.567, + "step": 485 + }, + { + "epoch": 2.32, + "learning_rate": 5.135135135135135e-05, + "loss": 0.5636, + "step": 486 + }, + { + "epoch": 2.33, + "learning_rate": 5.0965250965250965e-05, + "loss": 0.5367, + "step": 487 + }, + { + "epoch": 2.33, + "learning_rate": 5.057915057915058e-05, + "loss": 0.6016, + "step": 488 + }, + { + "epoch": 2.33, + "learning_rate": 5.0193050193050195e-05, + "loss": 0.4492, + "step": 489 + }, + { + "epoch": 2.34, + "learning_rate": 4.980694980694981e-05, + "loss": 0.5329, + "step": 490 + }, + { + "epoch": 2.34, + "learning_rate": 4.9420849420849425e-05, + "loss": 0.503, + "step": 491 + }, + { + "epoch": 2.35, + "learning_rate": 4.903474903474904e-05, + "loss": 0.4799, + "step": 492 + }, + { + "epoch": 2.35, + "learning_rate": 4.8648648648648654e-05, + "loss": 0.454, + "step": 493 + }, + { + "epoch": 2.36, + "learning_rate": 4.826254826254826e-05, + "loss": 0.5555, + "step": 494 + }, + { + "epoch": 2.36, + "learning_rate": 4.787644787644788e-05, + "loss": 0.5925, + "step": 495 + }, + { + "epoch": 2.37, + "learning_rate": 4.749034749034749e-05, + "loss": 0.5557, + "step": 496 + }, + { + "epoch": 2.37, + "eval_loss": 4.199349403381348, + "eval_runtime": 7.3742, + "eval_samples_per_second": 162.866, + "eval_steps_per_second": 54.379, + "step": 496 + }, + { + "epoch": 2.37, + "learning_rate": 4.710424710424711e-05, + "loss": 0.5157, + "step": 497 + }, + { + "epoch": 2.38, + "learning_rate": 4.671814671814672e-05, + "loss": 0.5538, + "step": 498 + }, + { + "epoch": 2.38, + "learning_rate": 4.6332046332046336e-05, + "loss": 0.6174, + "step": 499 + }, + { + "epoch": 2.39, + "learning_rate": 4.594594594594595e-05, + "loss": 0.4592, + "step": 500 + }, + { + "epoch": 2.39, + "learning_rate": 4.555984555984556e-05, + "loss": 0.4557, + "step": 501 + }, + { + "epoch": 2.4, + "learning_rate": 4.5173745173745174e-05, + "loss": 0.5154, + "step": 502 + }, + { + "epoch": 2.4, + "learning_rate": 4.478764478764479e-05, + "loss": 0.4909, + "step": 503 + }, + { + "epoch": 2.41, + "learning_rate": 4.4401544401544404e-05, + "loss": 0.4755, + "step": 504 + }, + { + "epoch": 2.41, + "learning_rate": 4.401544401544402e-05, + "loss": 0.592, + "step": 505 + }, + { + "epoch": 2.42, + "learning_rate": 4.3629343629343633e-05, + "loss": 0.5014, + "step": 506 + }, + { + "epoch": 2.42, + "learning_rate": 4.324324324324325e-05, + "loss": 0.4928, + "step": 507 + }, + { + "epoch": 2.43, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.5352, + "step": 508 + }, + { + "epoch": 2.43, + "learning_rate": 4.247104247104247e-05, + "loss": 0.5457, + "step": 509 + }, + { + "epoch": 2.44, + "learning_rate": 4.2084942084942086e-05, + "loss": 0.5182, + "step": 510 + }, + { + "epoch": 2.44, + "learning_rate": 4.16988416988417e-05, + "loss": 0.527, + "step": 511 + }, + { + "epoch": 2.45, + "learning_rate": 4.1312741312741316e-05, + "loss": 0.4961, + "step": 512 + }, + { + "epoch": 2.45, + "learning_rate": 4.092664092664093e-05, + "loss": 0.4988, + "step": 513 + }, + { + "epoch": 2.46, + "learning_rate": 4.0540540540540545e-05, + "loss": 0.5314, + "step": 514 + }, + { + "epoch": 2.46, + "learning_rate": 4.015444015444015e-05, + "loss": 0.5523, + "step": 515 + }, + { + "epoch": 2.47, + "learning_rate": 3.976833976833977e-05, + "loss": 0.4368, + "step": 516 + }, + { + "epoch": 2.47, + "learning_rate": 3.938223938223938e-05, + "loss": 0.5184, + "step": 517 + }, + { + "epoch": 2.48, + "learning_rate": 3.8996138996139e-05, + "loss": 0.6171, + "step": 518 + }, + { + "epoch": 2.48, + "learning_rate": 3.861003861003861e-05, + "loss": 0.5357, + "step": 519 + }, + { + "epoch": 2.49, + "learning_rate": 3.822393822393823e-05, + "loss": 0.5664, + "step": 520 + }, + { + "epoch": 2.49, + "learning_rate": 3.783783783783784e-05, + "loss": 0.4805, + "step": 521 + }, + { + "epoch": 2.5, + "learning_rate": 3.745173745173745e-05, + "loss": 0.4562, + "step": 522 + }, + { + "epoch": 2.5, + "learning_rate": 3.7065637065637065e-05, + "loss": 0.5238, + "step": 523 + }, + { + "epoch": 2.5, + "learning_rate": 3.667953667953668e-05, + "loss": 0.4338, + "step": 524 + }, + { + "epoch": 2.51, + "learning_rate": 3.6293436293436295e-05, + "loss": 0.5656, + "step": 525 + }, + { + "epoch": 2.51, + "learning_rate": 3.590733590733591e-05, + "loss": 0.4496, + "step": 526 + }, + { + "epoch": 2.52, + "learning_rate": 3.5521235521235524e-05, + "loss": 0.4997, + "step": 527 + }, + { + "epoch": 2.52, + "eval_loss": 4.19807767868042, + "eval_runtime": 7.3698, + "eval_samples_per_second": 162.962, + "eval_steps_per_second": 54.411, + "step": 527 + }, + { + "epoch": 2.52, + "learning_rate": 3.513513513513514e-05, + "loss": 0.4531, + "step": 528 + }, + { + "epoch": 2.53, + "learning_rate": 3.4749034749034754e-05, + "loss": 0.5048, + "step": 529 + }, + { + "epoch": 2.53, + "learning_rate": 3.436293436293436e-05, + "loss": 0.5195, + "step": 530 + }, + { + "epoch": 2.54, + "learning_rate": 3.397683397683398e-05, + "loss": 0.4885, + "step": 531 + }, + { + "epoch": 2.54, + "learning_rate": 3.359073359073359e-05, + "loss": 0.6774, + "step": 532 + }, + { + "epoch": 2.55, + "learning_rate": 3.3204633204633207e-05, + "loss": 0.4755, + "step": 533 + }, + { + "epoch": 2.55, + "learning_rate": 3.281853281853282e-05, + "loss": 0.5164, + "step": 534 + }, + { + "epoch": 2.56, + "learning_rate": 3.2432432432432436e-05, + "loss": 0.4748, + "step": 535 + }, + { + "epoch": 2.56, + "learning_rate": 3.204633204633205e-05, + "loss": 0.5656, + "step": 536 + }, + { + "epoch": 2.57, + "learning_rate": 3.166023166023166e-05, + "loss": 0.5167, + "step": 537 + }, + { + "epoch": 2.57, + "learning_rate": 3.1274131274131274e-05, + "loss": 0.5101, + "step": 538 + }, + { + "epoch": 2.58, + "learning_rate": 3.088803088803089e-05, + "loss": 0.4965, + "step": 539 + }, + { + "epoch": 2.58, + "learning_rate": 3.0501930501930504e-05, + "loss": 0.5549, + "step": 540 + }, + { + "epoch": 2.59, + "learning_rate": 3.011583011583012e-05, + "loss": 0.4873, + "step": 541 + }, + { + "epoch": 2.59, + "learning_rate": 2.9729729729729733e-05, + "loss": 0.5093, + "step": 542 + }, + { + "epoch": 2.6, + "learning_rate": 2.9343629343629348e-05, + "loss": 0.4897, + "step": 543 + }, + { + "epoch": 2.6, + "learning_rate": 2.8957528957528956e-05, + "loss": 0.5128, + "step": 544 + }, + { + "epoch": 2.61, + "learning_rate": 2.857142857142857e-05, + "loss": 0.4829, + "step": 545 + }, + { + "epoch": 2.61, + "learning_rate": 2.8185328185328186e-05, + "loss": 0.4853, + "step": 546 + }, + { + "epoch": 2.62, + "learning_rate": 2.77992277992278e-05, + "loss": 0.5499, + "step": 547 + }, + { + "epoch": 2.62, + "learning_rate": 2.7413127413127415e-05, + "loss": 0.59, + "step": 548 + }, + { + "epoch": 2.63, + "learning_rate": 2.702702702702703e-05, + "loss": 0.5169, + "step": 549 + }, + { + "epoch": 2.63, + "learning_rate": 2.6640926640926645e-05, + "loss": 0.5642, + "step": 550 + }, + { + "epoch": 2.64, + "learning_rate": 2.6254826254826253e-05, + "loss": 0.5745, + "step": 551 + }, + { + "epoch": 2.64, + "learning_rate": 2.5868725868725868e-05, + "loss": 0.4845, + "step": 552 + }, + { + "epoch": 2.65, + "learning_rate": 2.5482625482625483e-05, + "loss": 0.5198, + "step": 553 + }, + { + "epoch": 2.65, + "learning_rate": 2.5096525096525097e-05, + "loss": 0.5402, + "step": 554 + }, + { + "epoch": 2.66, + "learning_rate": 2.4710424710424712e-05, + "loss": 0.5122, + "step": 555 + }, + { + "epoch": 2.66, + "learning_rate": 2.4324324324324327e-05, + "loss": 0.5769, + "step": 556 + }, + { + "epoch": 2.67, + "learning_rate": 2.393822393822394e-05, + "loss": 0.5519, + "step": 557 + }, + { + "epoch": 2.67, + "learning_rate": 2.3552123552123553e-05, + "loss": 0.465, + "step": 558 + }, + { + "epoch": 2.67, + "eval_loss": 4.177217483520508, + "eval_runtime": 7.3742, + "eval_samples_per_second": 162.865, + "eval_steps_per_second": 54.379, + "step": 558 + }, + { + "epoch": 2.67, + "learning_rate": 2.3166023166023168e-05, + "loss": 0.4816, + "step": 559 + }, + { + "epoch": 2.68, + "learning_rate": 2.277992277992278e-05, + "loss": 0.4428, + "step": 560 + }, + { + "epoch": 2.68, + "learning_rate": 2.2393822393822394e-05, + "loss": 0.4969, + "step": 561 + }, + { + "epoch": 2.69, + "learning_rate": 2.200772200772201e-05, + "loss": 0.4891, + "step": 562 + }, + { + "epoch": 2.69, + "learning_rate": 2.1621621621621624e-05, + "loss": 0.4082, + "step": 563 + }, + { + "epoch": 2.7, + "learning_rate": 2.1235521235521236e-05, + "loss": 0.4735, + "step": 564 + }, + { + "epoch": 2.7, + "learning_rate": 2.084942084942085e-05, + "loss": 0.5121, + "step": 565 + }, + { + "epoch": 2.71, + "learning_rate": 2.0463320463320465e-05, + "loss": 0.4696, + "step": 566 + }, + { + "epoch": 2.71, + "learning_rate": 2.0077220077220077e-05, + "loss": 0.397, + "step": 567 + }, + { + "epoch": 2.72, + "learning_rate": 1.969111969111969e-05, + "loss": 0.5271, + "step": 568 + }, + { + "epoch": 2.72, + "learning_rate": 1.9305019305019306e-05, + "loss": 0.4974, + "step": 569 + }, + { + "epoch": 2.73, + "learning_rate": 1.891891891891892e-05, + "loss": 0.4814, + "step": 570 + }, + { + "epoch": 2.73, + "learning_rate": 1.8532818532818533e-05, + "loss": 0.5565, + "step": 571 + }, + { + "epoch": 2.74, + "learning_rate": 1.8146718146718147e-05, + "loss": 0.4737, + "step": 572 + }, + { + "epoch": 2.74, + "learning_rate": 1.7760617760617762e-05, + "loss": 0.4448, + "step": 573 + }, + { + "epoch": 2.75, + "learning_rate": 1.7374517374517377e-05, + "loss": 0.4886, + "step": 574 + }, + { + "epoch": 2.75, + "learning_rate": 1.698841698841699e-05, + "loss": 0.5197, + "step": 575 + }, + { + "epoch": 2.76, + "learning_rate": 1.6602316602316603e-05, + "loss": 0.4688, + "step": 576 + }, + { + "epoch": 2.76, + "learning_rate": 1.6216216216216218e-05, + "loss": 0.5649, + "step": 577 + }, + { + "epoch": 2.77, + "learning_rate": 1.583011583011583e-05, + "loss": 0.5026, + "step": 578 + }, + { + "epoch": 2.77, + "learning_rate": 1.5444015444015444e-05, + "loss": 0.5832, + "step": 579 + }, + { + "epoch": 2.78, + "learning_rate": 1.505791505791506e-05, + "loss": 0.5995, + "step": 580 + }, + { + "epoch": 2.78, + "learning_rate": 1.4671814671814674e-05, + "loss": 0.5342, + "step": 581 + }, + { + "epoch": 2.79, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.5465, + "step": 582 + }, + { + "epoch": 2.79, + "learning_rate": 1.38996138996139e-05, + "loss": 0.5165, + "step": 583 + }, + { + "epoch": 2.8, + "learning_rate": 1.3513513513513515e-05, + "loss": 0.4594, + "step": 584 + }, + { + "epoch": 2.8, + "learning_rate": 1.3127413127413127e-05, + "loss": 0.4448, + "step": 585 + }, + { + "epoch": 2.81, + "learning_rate": 1.2741312741312741e-05, + "loss": 0.5148, + "step": 586 + }, + { + "epoch": 2.81, + "learning_rate": 1.2355212355212356e-05, + "loss": 0.5255, + "step": 587 + }, + { + "epoch": 2.82, + "learning_rate": 1.196911196911197e-05, + "loss": 0.4979, + "step": 588 + }, + { + "epoch": 2.82, + "learning_rate": 1.1583011583011584e-05, + "loss": 0.4531, + "step": 589 + }, + { + "epoch": 2.82, + "eval_loss": 4.171577453613281, + "eval_runtime": 7.3719, + "eval_samples_per_second": 162.916, + "eval_steps_per_second": 54.396, + "step": 589 + }, + { + "epoch": 2.83, + "learning_rate": 1.1196911196911197e-05, + "loss": 0.5339, + "step": 590 + }, + { + "epoch": 2.83, + "learning_rate": 1.0810810810810812e-05, + "loss": 0.5242, + "step": 591 + }, + { + "epoch": 2.83, + "learning_rate": 1.0424710424710425e-05, + "loss": 0.5266, + "step": 592 + }, + { + "epoch": 2.84, + "learning_rate": 1.0038610038610038e-05, + "loss": 0.5188, + "step": 593 + }, + { + "epoch": 2.84, + "learning_rate": 9.652509652509653e-06, + "loss": 0.459, + "step": 594 + }, + { + "epoch": 2.85, + "learning_rate": 9.266409266409266e-06, + "loss": 0.3489, + "step": 595 + }, + { + "epoch": 2.85, + "learning_rate": 8.880308880308881e-06, + "loss": 0.5022, + "step": 596 + }, + { + "epoch": 2.86, + "learning_rate": 8.494208494208494e-06, + "loss": 0.4513, + "step": 597 + }, + { + "epoch": 2.86, + "learning_rate": 8.108108108108109e-06, + "loss": 0.4338, + "step": 598 + }, + { + "epoch": 2.87, + "learning_rate": 7.722007722007722e-06, + "loss": 0.5263, + "step": 599 + }, + { + "epoch": 2.87, + "learning_rate": 7.335907335907337e-06, + "loss": 0.4898, + "step": 600 + }, + { + "epoch": 2.88, + "learning_rate": 6.94980694980695e-06, + "loss": 0.5212, + "step": 601 + }, + { + "epoch": 2.88, + "learning_rate": 6.563706563706563e-06, + "loss": 0.5355, + "step": 602 + }, + { + "epoch": 2.89, + "learning_rate": 6.177606177606178e-06, + "loss": 0.5444, + "step": 603 + }, + { + "epoch": 2.89, + "learning_rate": 5.791505791505792e-06, + "loss": 0.5875, + "step": 604 + }, + { + "epoch": 2.9, + "learning_rate": 5.405405405405406e-06, + "loss": 0.4989, + "step": 605 + }, + { + "epoch": 2.9, + "learning_rate": 5.019305019305019e-06, + "loss": 0.5159, + "step": 606 + }, + { + "epoch": 2.91, + "learning_rate": 4.633204633204633e-06, + "loss": 0.3956, + "step": 607 + }, + { + "epoch": 2.91, + "learning_rate": 4.247104247104247e-06, + "loss": 0.491, + "step": 608 + }, + { + "epoch": 2.92, + "learning_rate": 3.861003861003861e-06, + "loss": 0.4454, + "step": 609 + }, + { + "epoch": 2.92, + "learning_rate": 3.474903474903475e-06, + "loss": 0.4844, + "step": 610 + }, + { + "epoch": 2.93, + "learning_rate": 3.088803088803089e-06, + "loss": 0.4972, + "step": 611 + }, + { + "epoch": 2.93, + "learning_rate": 2.702702702702703e-06, + "loss": 0.4617, + "step": 612 + }, + { + "epoch": 2.94, + "learning_rate": 2.3166023166023166e-06, + "loss": 0.4683, + "step": 613 + }, + { + "epoch": 2.94, + "learning_rate": 1.9305019305019305e-06, + "loss": 0.5815, + "step": 614 + }, + { + "epoch": 2.95, + "learning_rate": 1.5444015444015445e-06, + "loss": 0.4502, + "step": 615 + }, + { + "epoch": 2.95, + "learning_rate": 1.1583011583011583e-06, + "loss": 0.477, + "step": 616 + }, + { + "epoch": 2.96, + "learning_rate": 7.722007722007723e-07, + "loss": 0.4836, + "step": 617 + }, + { + "epoch": 2.96, + "learning_rate": 3.8610038610038613e-07, + "loss": 0.4523, + "step": 618 + } + ], + "logging_steps": 1, + "max_steps": 618, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 3.072196311043277e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-618/training_args.bin b/checkpoint-618/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..50ac99f4d954fadcdfa89859b41976b03aed10b8 --- /dev/null +++ b/checkpoint-618/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40caa3ffe88e39fb8f17ca2f4b2952df2344fe3de9435a3b5cb8662a65ff745d +size 6011 diff --git a/checkpoint-618/zero_to_fp32.py b/checkpoint-618/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-618/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-7023/config.json b/checkpoint-7023/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2599737ed66369800e7a6efe6cdf23d0cfe85382 --- /dev/null +++ b/checkpoint-7023/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "cyberagent/calm2-7b-chat", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 32768, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "pad_token_id": 1, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 500000, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.34.1", + "use_cache": false, + "vocab_size": 65024 +} diff --git a/checkpoint-7023/generation_config.json b/checkpoint-7023/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28d913eb61a7fd74338a6f1ff8d2efb149f99dbc --- /dev/null +++ b/checkpoint-7023/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 0, + "eos_token_id": 0, + "pad_token_id": 1, + "transformers_version": "4.34.1" +} diff --git a/checkpoint-7023/global_step7023/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-7023/global_step7023/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4298303252a8695cc5f9efb897c60b681c095440 --- /dev/null +++ b/checkpoint-7023/global_step7023/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e8409e28a4dd8f2e63534c508bc67d87af2dbd5c6b0336662f2ca3a91662d6 +size 28035802551 diff --git a/checkpoint-7023/global_step7023/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-7023/global_step7023/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63a7f499cc9029e60d6aeab52903fa612cd39b35 --- /dev/null +++ b/checkpoint-7023/global_step7023/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e85d657856b0f6cc46ae1e6216abd7526c54162c5a05e9e5fba260967d65cf88 +size 28035803191 diff --git a/checkpoint-7023/global_step7023/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-7023/global_step7023/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..852922c69560480a0325a1c0fa75ce3c683be2d4 --- /dev/null +++ b/checkpoint-7023/global_step7023/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7562098a345b2b184cee07c155b61311325cab4af5b7a2c52a303418a4b05e12 +size 28035802743 diff --git a/checkpoint-7023/global_step7023/mp_rank_00_model_states.pt b/checkpoint-7023/global_step7023/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00f1227b915fe97ec64ef50ec9929359547a400e --- /dev/null +++ b/checkpoint-7023/global_step7023/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3368151ecad4d39f99b6d87bc625040badf3e1a30e5d2661d80369f1bc849beb +size 14017976195 diff --git a/checkpoint-7023/latest b/checkpoint-7023/latest new file mode 100644 index 0000000000000000000000000000000000000000..11e86ced5e2ee87a633f6430512f2532c2f93790 --- /dev/null +++ b/checkpoint-7023/latest @@ -0,0 +1 @@ +global_step7023 \ No newline at end of file diff --git a/checkpoint-7023/pytorch_model-00001-of-00002.bin b/checkpoint-7023/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..5eed5df07a9e78b4a5a320066e469d875d128d6a --- /dev/null +++ b/checkpoint-7023/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b98f04a3fb8a808b5aa704219ccfd3291621abd6be915d74f50d4196fe863d +size 9976594142 diff --git a/checkpoint-7023/pytorch_model-00002-of-00002.bin b/checkpoint-7023/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..465ba3d759058bce4ffa7b99d6fc61eb59b952ef --- /dev/null +++ b/checkpoint-7023/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a8d0dc9dd398a7009e3683075aebf94673b76a22a2100d00841dd0fa3f7095c +size 4041391035 diff --git a/checkpoint-7023/pytorch_model.bin.index.json b/checkpoint-7023/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..85c4314f00522f62839f8495ec3f0d9adec0fcfa --- /dev/null +++ b/checkpoint-7023/pytorch_model.bin.index.json @@ -0,0 +1,266 @@ +{ + "metadata": { + "total_size": 14017896448 + }, + "weight_map": { + "lm_head.weight": "pytorch_model-00002-of-00002.bin", + "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w12.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.mlp.swiglu.w3.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin", + "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w12.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.mlp.swiglu.w3.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin", + "model.norm.weight": "pytorch_model-00002-of-00002.bin" + } +} diff --git a/checkpoint-7023/rng_state_0.pth b/checkpoint-7023/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..417d52d6958ce1d6e6c0e711d6eb0a68a1f1ae42 --- /dev/null +++ b/checkpoint-7023/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1653d5b0e09c2d93759ad31b0bca034b949c5beacbcec854b9c133c18ff0f1 +size 16631 diff --git a/checkpoint-7023/rng_state_1.pth b/checkpoint-7023/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..6389ab11d2699189dff857d5cf6911645ac491a7 --- /dev/null +++ b/checkpoint-7023/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:718a356e2faee3d07e0416c137f3bcdc0c70d127268ae7202882018ffa03e320 +size 16631 diff --git a/checkpoint-7023/rng_state_2.pth b/checkpoint-7023/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..553871298ac3918ceef48b7e90ee7784f4afe077 --- /dev/null +++ b/checkpoint-7023/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ab1200db9bd16e014659660734c61fe08517897cef6b3efe97c366790250f5 +size 16631 diff --git a/checkpoint-7023/trainer_state.json b/checkpoint-7023/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2fe9742845b2a2e4ff4818e415a23f11edf26211 --- /dev/null +++ b/checkpoint-7023/trainer_state.json @@ -0,0 +1,42317 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999288104221542, + "eval_steps": 352, + "global_step": 7023, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.0, + "loss": 3.5899, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 4.178114414215088, + "eval_runtime": 473.0271, + "eval_samples_per_second": 43.31, + "eval_steps_per_second": 14.437, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 4.1967, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 4.000000000000001e-06, + "loss": 4.2584, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 6e-06, + "loss": 4.2861, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 8.000000000000001e-06, + "loss": 3.449, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1e-05, + "loss": 3.6379, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 1.2e-05, + "loss": 3.5514, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 1.4000000000000001e-05, + "loss": 3.3639, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.2568, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 1.8e-05, + "loss": 3.3578, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 3.3858, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 2.2000000000000003e-05, + "loss": 3.4463, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 2.4e-05, + "loss": 3.3023, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 2.6000000000000002e-05, + "loss": 3.3967, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 2.8000000000000003e-05, + "loss": 3.3402, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 3e-05, + "loss": 3.2368, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.3075, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 3.4000000000000007e-05, + "loss": 3.3434, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 3.6e-05, + "loss": 3.3037, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 3.8e-05, + "loss": 3.2338, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 4e-05, + "loss": 3.1854, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 4.2e-05, + "loss": 3.2481, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 4.4000000000000006e-05, + "loss": 3.2006, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 4.600000000000001e-05, + "loss": 3.2349, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 4.8e-05, + "loss": 3.2476, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 3.169, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 5.2000000000000004e-05, + "loss": 3.2349, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 5.4000000000000005e-05, + "loss": 3.2007, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 5.6000000000000006e-05, + "loss": 3.2467, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 5.8e-05, + "loss": 3.2974, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 6e-05, + "loss": 3.0882, + "step": 31 + }, + { + "epoch": 0.0, + "learning_rate": 6.2e-05, + "loss": 3.1722, + "step": 32 + }, + { + "epoch": 0.0, + "learning_rate": 6.400000000000001e-05, + "loss": 3.1964, + "step": 33 + }, + { + "epoch": 0.0, + "learning_rate": 6.6e-05, + "loss": 3.2419, + "step": 34 + }, + { + "epoch": 0.0, + "learning_rate": 6.800000000000001e-05, + "loss": 3.1016, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 7e-05, + "loss": 3.1917, + "step": 36 + }, + { + "epoch": 0.01, + "learning_rate": 7.2e-05, + "loss": 3.2126, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 7.4e-05, + "loss": 3.2239, + "step": 38 + }, + { + "epoch": 0.01, + "learning_rate": 7.6e-05, + "loss": 3.1186, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 7.800000000000001e-05, + "loss": 3.2016, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 8e-05, + "loss": 3.2209, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 8.2e-05, + "loss": 3.2119, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 8.4e-05, + "loss": 3.3246, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 8.6e-05, + "loss": 3.1526, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 8.800000000000001e-05, + "loss": 3.22, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 9e-05, + "loss": 3.1782, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 9.200000000000001e-05, + "loss": 3.1501, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 9.4e-05, + "loss": 3.2063, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 9.6e-05, + "loss": 3.2535, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 9.8e-05, + "loss": 3.1148, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 3.232, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 0.00010200000000000001, + "loss": 3.1592, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 0.00010400000000000001, + "loss": 3.3169, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 0.00010600000000000002, + "loss": 3.2689, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 0.00010800000000000001, + "loss": 3.2544, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011000000000000002, + "loss": 3.2462, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011200000000000001, + "loss": 3.2163, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 0.00011399999999999999, + "loss": 3.1604, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 0.000116, + "loss": 3.2952, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 0.000118, + "loss": 3.2749, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00012, + "loss": 3.2169, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 0.000122, + "loss": 3.2749, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 0.000124, + "loss": 3.26, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 0.000126, + "loss": 3.2608, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 0.00012800000000000002, + "loss": 3.1959, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 0.00013000000000000002, + "loss": 3.1977, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 0.000132, + "loss": 3.2184, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 0.000134, + "loss": 3.1538, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 0.00013600000000000003, + "loss": 3.3864, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 0.000138, + "loss": 3.2612, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 0.00014, + "loss": 3.3267, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 0.000142, + "loss": 3.3697, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 0.000144, + "loss": 3.2061, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 0.000146, + "loss": 3.1879, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 0.000148, + "loss": 3.2455, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 3.2665, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 0.000152, + "loss": 3.3222, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 0.000154, + "loss": 3.2912, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015600000000000002, + "loss": 3.3734, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015800000000000002, + "loss": 3.2328, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 0.00016, + "loss": 3.3124, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 0.000162, + "loss": 3.2042, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 0.000164, + "loss": 3.3126, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 0.000166, + "loss": 3.3218, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 0.000168, + "loss": 3.3561, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017, + "loss": 3.4065, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 0.000172, + "loss": 3.4102, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 0.000174, + "loss": 3.4352, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017600000000000002, + "loss": 3.2285, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 0.00017800000000000002, + "loss": 3.3196, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 0.00018, + "loss": 3.4187, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 0.000182, + "loss": 3.505, + "step": 92 + }, + { + "epoch": 0.01, + "learning_rate": 0.00018400000000000003, + "loss": 3.4067, + "step": 93 + }, + { + "epoch": 0.01, + "learning_rate": 0.00018600000000000002, + "loss": 3.4128, + "step": 94 + }, + { + "epoch": 0.01, + "learning_rate": 0.000188, + "loss": 3.4732, + "step": 95 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019, + "loss": 3.3481, + "step": 96 + }, + { + "epoch": 0.01, + "learning_rate": 0.000192, + "loss": 3.4073, + "step": 97 + }, + { + "epoch": 0.01, + "learning_rate": 0.000194, + "loss": 3.5063, + "step": 98 + }, + { + "epoch": 0.01, + "learning_rate": 0.000196, + "loss": 3.3983, + "step": 99 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019800000000000002, + "loss": 3.4428, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 3.3764, + "step": 101 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999711107901199, + "loss": 3.3813, + "step": 102 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001999422215802398, + "loss": 3.4354, + "step": 103 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019991333237035966, + "loss": 3.2258, + "step": 104 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019988444316047956, + "loss": 3.3943, + "step": 105 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019985555395059945, + "loss": 3.3837, + "step": 106 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019982666474071937, + "loss": 3.3826, + "step": 107 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019979777553083924, + "loss": 3.4589, + "step": 108 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019976888632095913, + "loss": 3.3665, + "step": 109 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019973999711107903, + "loss": 3.4585, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019971110790119892, + "loss": 3.5142, + "step": 111 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001996822186913188, + "loss": 3.4871, + "step": 112 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019965332948143868, + "loss": 3.4486, + "step": 113 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019962444027155858, + "loss": 3.5011, + "step": 114 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019959555106167847, + "loss": 3.3932, + "step": 115 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019956666185179836, + "loss": 3.5907, + "step": 116 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019953777264191826, + "loss": 3.4969, + "step": 117 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019950888343203815, + "loss": 3.5815, + "step": 118 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019947999422215804, + "loss": 3.4717, + "step": 119 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019945110501227794, + "loss": 3.4238, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001994222158023978, + "loss": 3.5099, + "step": 121 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001993933265925177, + "loss": 3.5688, + "step": 122 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001993644373826376, + "loss": 3.4204, + "step": 123 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019933554817275749, + "loss": 3.5261, + "step": 124 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019930665896287738, + "loss": 3.6011, + "step": 125 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019927776975299725, + "loss": 3.5179, + "step": 126 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019924888054311714, + "loss": 3.5101, + "step": 127 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019921999133323706, + "loss": 3.5243, + "step": 128 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019919110212335696, + "loss": 3.6438, + "step": 129 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019916221291347682, + "loss": 3.4501, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019913332370359672, + "loss": 3.4317, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001991044344937166, + "loss": 3.5163, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001990755452838365, + "loss": 3.4548, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019904665607395637, + "loss": 3.5115, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019901776686407626, + "loss": 3.5406, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019898887765419616, + "loss": 3.4877, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019895998844431605, + "loss": 3.567, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019893109923443595, + "loss": 3.5098, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019890221002455584, + "loss": 3.4999, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019887332081467573, + "loss": 3.515, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019884443160479563, + "loss": 3.6411, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019881554239491552, + "loss": 3.6045, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001987866531850354, + "loss": 3.5639, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019875776397515528, + "loss": 3.3698, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019872887476527518, + "loss": 3.5527, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019869998555539507, + "loss": 3.4809, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019867109634551496, + "loss": 3.5931, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019864220713563483, + "loss": 3.4159, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019861331792575475, + "loss": 3.5496, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019858442871587464, + "loss": 3.5335, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019855553950599454, + "loss": 3.4063, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001985266502961144, + "loss": 3.4679, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001984977610862343, + "loss": 3.4876, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001984688718763542, + "loss": 3.5482, + "step": 154 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019843998266647409, + "loss": 3.6088, + "step": 155 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019841109345659395, + "loss": 3.5708, + "step": 156 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019838220424671385, + "loss": 3.4716, + "step": 157 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019835331503683374, + "loss": 3.6032, + "step": 158 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019832442582695366, + "loss": 3.4731, + "step": 159 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019829553661707353, + "loss": 3.4766, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019826664740719342, + "loss": 3.4977, + "step": 161 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019823775819731332, + "loss": 3.4669, + "step": 162 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001982088689874332, + "loss": 3.6169, + "step": 163 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001981799797775531, + "loss": 3.4544, + "step": 164 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019815109056767297, + "loss": 3.5422, + "step": 165 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019812220135779286, + "loss": 3.6311, + "step": 166 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019809331214791276, + "loss": 3.513, + "step": 167 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019806442293803265, + "loss": 3.5973, + "step": 168 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019803553372815255, + "loss": 3.4974, + "step": 169 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019800664451827244, + "loss": 3.6382, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019797775530839233, + "loss": 3.5406, + "step": 171 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019794886609851223, + "loss": 3.596, + "step": 172 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019791997688863212, + "loss": 3.5675, + "step": 173 + }, + { + "epoch": 0.02, + "learning_rate": 0.000197891087678752, + "loss": 3.4857, + "step": 174 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019786219846887188, + "loss": 3.5948, + "step": 175 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019783330925899178, + "loss": 3.5377, + "step": 176 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019780442004911167, + "loss": 3.4115, + "step": 177 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019777553083923154, + "loss": 3.5925, + "step": 178 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019774664162935143, + "loss": 3.6072, + "step": 179 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019771775241947135, + "loss": 3.4459, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019768886320959124, + "loss": 3.675, + "step": 181 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001976599739997111, + "loss": 3.5013, + "step": 182 + }, + { + "epoch": 0.03, + "learning_rate": 0.000197631084789831, + "loss": 3.5503, + "step": 183 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001976021955799509, + "loss": 3.4639, + "step": 184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001975733063700708, + "loss": 3.6222, + "step": 185 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001975444171601907, + "loss": 3.6222, + "step": 186 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019751552795031055, + "loss": 3.5586, + "step": 187 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019748663874043045, + "loss": 3.353, + "step": 188 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019745774953055034, + "loss": 3.5714, + "step": 189 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019742886032067023, + "loss": 3.5703, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019739997111079013, + "loss": 3.4602, + "step": 191 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019737108190091002, + "loss": 3.5076, + "step": 192 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019734219269102992, + "loss": 3.6046, + "step": 193 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001973133034811498, + "loss": 3.4482, + "step": 194 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001972844142712697, + "loss": 3.4445, + "step": 195 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019725552506138957, + "loss": 3.5888, + "step": 196 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019722663585150946, + "loss": 3.2793, + "step": 197 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019719774664162936, + "loss": 3.4957, + "step": 198 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019716885743174925, + "loss": 3.6092, + "step": 199 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019713996822186912, + "loss": 3.5575, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019711107901198904, + "loss": 3.4428, + "step": 201 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019708218980210893, + "loss": 3.4696, + "step": 202 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019705330059222883, + "loss": 3.6291, + "step": 203 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001970244113823487, + "loss": 3.5864, + "step": 204 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001969955221724686, + "loss": 3.6399, + "step": 205 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019696663296258848, + "loss": 3.547, + "step": 206 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019693774375270838, + "loss": 3.5143, + "step": 207 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019690885454282827, + "loss": 3.4557, + "step": 208 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019687996533294814, + "loss": 3.4584, + "step": 209 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019685107612306803, + "loss": 3.4184, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019682218691318795, + "loss": 3.5372, + "step": 211 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019679329770330784, + "loss": 3.4767, + "step": 212 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001967644084934277, + "loss": 3.5187, + "step": 213 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001967355192835476, + "loss": 3.5174, + "step": 214 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001967066300736675, + "loss": 3.5481, + "step": 215 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001966777408637874, + "loss": 3.5149, + "step": 216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001966488516539073, + "loss": 3.5833, + "step": 217 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019661996244402715, + "loss": 3.3972, + "step": 218 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019659107323414705, + "loss": 3.5472, + "step": 219 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019656218402426694, + "loss": 3.5609, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019653329481438683, + "loss": 3.6299, + "step": 221 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019650440560450673, + "loss": 3.4379, + "step": 222 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019647551639462662, + "loss": 3.5986, + "step": 223 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019644662718474652, + "loss": 3.5838, + "step": 224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001964177379748664, + "loss": 3.4134, + "step": 225 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019638884876498628, + "loss": 3.3992, + "step": 226 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019635995955510617, + "loss": 3.5541, + "step": 227 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019633107034522606, + "loss": 3.5869, + "step": 228 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019630218113534596, + "loss": 3.5119, + "step": 229 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019627329192546585, + "loss": 3.4892, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019624440271558572, + "loss": 3.6, + "step": 231 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019621551350570564, + "loss": 3.4632, + "step": 232 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019618662429582553, + "loss": 3.5046, + "step": 233 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019615773508594543, + "loss": 3.5646, + "step": 234 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001961288458760653, + "loss": 3.5583, + "step": 235 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001960999566661852, + "loss": 3.5467, + "step": 236 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019607106745630508, + "loss": 3.3647, + "step": 237 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019604217824642498, + "loss": 3.6019, + "step": 238 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019601328903654484, + "loss": 3.3073, + "step": 239 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019598439982666474, + "loss": 3.5884, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019595551061678463, + "loss": 3.509, + "step": 241 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019592662140690452, + "loss": 3.3664, + "step": 242 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019589773219702442, + "loss": 3.5424, + "step": 243 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001958688429871443, + "loss": 3.3911, + "step": 244 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001958399537772642, + "loss": 3.4583, + "step": 245 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001958110645673841, + "loss": 3.5823, + "step": 246 + }, + { + "epoch": 0.04, + "learning_rate": 0.000195782175357504, + "loss": 3.5662, + "step": 247 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019575328614762386, + "loss": 3.472, + "step": 248 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019572439693774375, + "loss": 3.5061, + "step": 249 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019569550772786365, + "loss": 3.5083, + "step": 250 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019566661851798354, + "loss": 3.4507, + "step": 251 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019563772930810344, + "loss": 3.4526, + "step": 252 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019560884009822333, + "loss": 3.5103, + "step": 253 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019557995088834322, + "loss": 3.5585, + "step": 254 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019555106167846312, + "loss": 3.505, + "step": 255 + }, + { + "epoch": 0.04, + "learning_rate": 0.000195522172468583, + "loss": 3.4499, + "step": 256 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019549328325870288, + "loss": 3.3568, + "step": 257 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019546439404882277, + "loss": 3.7057, + "step": 258 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019543550483894266, + "loss": 3.517, + "step": 259 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019540661562906256, + "loss": 3.4745, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019537772641918243, + "loss": 3.4541, + "step": 261 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019534883720930232, + "loss": 3.5701, + "step": 262 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001953199479994222, + "loss": 3.6835, + "step": 263 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019529105878954213, + "loss": 3.5116, + "step": 264 + }, + { + "epoch": 0.04, + "learning_rate": 0.000195262169579662, + "loss": 3.3648, + "step": 265 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001952332803697819, + "loss": 3.3722, + "step": 266 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001952043911599018, + "loss": 3.3931, + "step": 267 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019517550195002168, + "loss": 3.5807, + "step": 268 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019514661274014158, + "loss": 3.5416, + "step": 269 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019511772353026144, + "loss": 3.582, + "step": 270 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019508883432038134, + "loss": 3.4652, + "step": 271 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019505994511050123, + "loss": 3.5732, + "step": 272 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019503105590062112, + "loss": 3.5297, + "step": 273 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019500216669074102, + "loss": 3.7003, + "step": 274 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001949732774808609, + "loss": 3.5983, + "step": 275 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001949443882709808, + "loss": 3.6118, + "step": 276 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001949154990611007, + "loss": 3.4791, + "step": 277 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001948866098512206, + "loss": 3.6391, + "step": 278 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019485772064134046, + "loss": 3.5959, + "step": 279 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019482883143146035, + "loss": 3.5101, + "step": 280 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019479994222158025, + "loss": 3.5986, + "step": 281 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019477105301170014, + "loss": 3.5414, + "step": 282 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019474216380182, + "loss": 3.6653, + "step": 283 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001947132745919399, + "loss": 3.5027, + "step": 284 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019468438538205982, + "loss": 3.4249, + "step": 285 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019465549617217972, + "loss": 3.4433, + "step": 286 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019462660696229958, + "loss": 3.6223, + "step": 287 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019459771775241948, + "loss": 3.5467, + "step": 288 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019456882854253937, + "loss": 3.61, + "step": 289 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019453993933265926, + "loss": 3.6137, + "step": 290 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019451105012277916, + "loss": 3.5088, + "step": 291 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019448216091289903, + "loss": 3.5359, + "step": 292 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019445327170301892, + "loss": 3.5458, + "step": 293 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001944243824931388, + "loss": 3.5681, + "step": 294 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019439549328325873, + "loss": 3.4299, + "step": 295 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001943666040733786, + "loss": 3.5176, + "step": 296 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001943377148634985, + "loss": 3.641, + "step": 297 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001943088256536184, + "loss": 3.4167, + "step": 298 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019427993644373828, + "loss": 3.4261, + "step": 299 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019425104723385818, + "loss": 3.4559, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019422215802397804, + "loss": 3.5327, + "step": 301 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019419326881409794, + "loss": 3.4398, + "step": 302 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019416437960421783, + "loss": 3.5033, + "step": 303 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019413549039433772, + "loss": 3.6118, + "step": 304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001941066011844576, + "loss": 3.4967, + "step": 305 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001940777119745775, + "loss": 3.4717, + "step": 306 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001940488227646974, + "loss": 3.355, + "step": 307 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001940199335548173, + "loss": 3.5887, + "step": 308 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019399104434493717, + "loss": 3.6378, + "step": 309 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019396215513505706, + "loss": 3.5147, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019393326592517695, + "loss": 3.4631, + "step": 311 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019390437671529685, + "loss": 3.5992, + "step": 312 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019387548750541674, + "loss": 3.5586, + "step": 313 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001938465982955366, + "loss": 3.5399, + "step": 314 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001938177090856565, + "loss": 3.4912, + "step": 315 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019378881987577642, + "loss": 3.6064, + "step": 316 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019375993066589632, + "loss": 3.7006, + "step": 317 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019373104145601618, + "loss": 3.5503, + "step": 318 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019370215224613608, + "loss": 3.5618, + "step": 319 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019367326303625597, + "loss": 3.651, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019364437382637587, + "loss": 3.5181, + "step": 321 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019361548461649576, + "loss": 3.706, + "step": 322 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019358659540661563, + "loss": 3.601, + "step": 323 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019355770619673552, + "loss": 3.5891, + "step": 324 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001935288169868554, + "loss": 3.3504, + "step": 325 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001934999277769753, + "loss": 3.4464, + "step": 326 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001934710385670952, + "loss": 3.5102, + "step": 327 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001934421493572151, + "loss": 3.5796, + "step": 328 + }, + { + "epoch": 0.05, + "learning_rate": 0.000193413260147335, + "loss": 3.5751, + "step": 329 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019338437093745488, + "loss": 3.4911, + "step": 330 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019335548172757475, + "loss": 3.5161, + "step": 331 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019332659251769464, + "loss": 3.4412, + "step": 332 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019329770330781454, + "loss": 3.4896, + "step": 333 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019326881409793443, + "loss": 3.5323, + "step": 334 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019323992488805432, + "loss": 3.633, + "step": 335 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001932110356781742, + "loss": 3.5541, + "step": 336 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001931821464682941, + "loss": 3.5204, + "step": 337 + }, + { + "epoch": 0.05, + "learning_rate": 0.000193153257258414, + "loss": 3.7043, + "step": 338 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001931243680485339, + "loss": 3.5533, + "step": 339 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019309547883865377, + "loss": 3.5269, + "step": 340 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019306658962877366, + "loss": 3.4716, + "step": 341 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019303770041889355, + "loss": 3.4324, + "step": 342 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019300881120901345, + "loss": 3.5459, + "step": 343 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019297992199913334, + "loss": 3.543, + "step": 344 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001929510327892532, + "loss": 3.5318, + "step": 345 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001929221435793731, + "loss": 3.6708, + "step": 346 + }, + { + "epoch": 0.05, + "learning_rate": 0.000192893254369493, + "loss": 3.5583, + "step": 347 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001928643651596129, + "loss": 3.5955, + "step": 348 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019283547594973278, + "loss": 3.4893, + "step": 349 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019280658673985268, + "loss": 3.6199, + "step": 350 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019277769752997257, + "loss": 3.4667, + "step": 351 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019274880832009247, + "loss": 3.4689, + "step": 352 + }, + { + "epoch": 0.05, + "eval_loss": 3.59875750541687, + "eval_runtime": 472.4811, + "eval_samples_per_second": 43.36, + "eval_steps_per_second": 14.453, + "step": 352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019271991911021233, + "loss": 3.5819, + "step": 353 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019269102990033223, + "loss": 3.5459, + "step": 354 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019266214069045212, + "loss": 3.5412, + "step": 355 + }, + { + "epoch": 0.05, + "learning_rate": 0.000192633251480572, + "loss": 3.5603, + "step": 356 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001926043622706919, + "loss": 3.3498, + "step": 357 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001925754730608118, + "loss": 3.2999, + "step": 358 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001925465838509317, + "loss": 3.5822, + "step": 359 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001925176946410516, + "loss": 3.583, + "step": 360 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019248880543117148, + "loss": 3.6066, + "step": 361 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019245991622129135, + "loss": 3.6497, + "step": 362 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019243102701141124, + "loss": 3.551, + "step": 363 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019240213780153114, + "loss": 3.3119, + "step": 364 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019237324859165103, + "loss": 3.5062, + "step": 365 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001923443593817709, + "loss": 3.5501, + "step": 366 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001923154701718908, + "loss": 3.5859, + "step": 367 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019228658096201069, + "loss": 3.5105, + "step": 368 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001922576917521306, + "loss": 3.5117, + "step": 369 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019222880254225047, + "loss": 3.5607, + "step": 370 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019219991333237037, + "loss": 3.5971, + "step": 371 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019217102412249026, + "loss": 3.573, + "step": 372 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019214213491261015, + "loss": 3.5984, + "step": 373 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019211324570273005, + "loss": 3.4534, + "step": 374 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019208435649284991, + "loss": 3.5523, + "step": 375 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001920554672829698, + "loss": 3.5324, + "step": 376 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001920265780730897, + "loss": 3.5271, + "step": 377 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001919976888632096, + "loss": 3.5481, + "step": 378 + }, + { + "epoch": 0.05, + "learning_rate": 0.0001919687996533295, + "loss": 3.5117, + "step": 379 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019193991044344938, + "loss": 3.3971, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019191102123356928, + "loss": 3.5115, + "step": 381 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019188213202368917, + "loss": 3.51, + "step": 382 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019185324281380907, + "loss": 3.6974, + "step": 383 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019182435360392893, + "loss": 3.5188, + "step": 384 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019179546439404883, + "loss": 3.5516, + "step": 385 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019176657518416872, + "loss": 3.6164, + "step": 386 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019173768597428861, + "loss": 3.6524, + "step": 387 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019170879676440848, + "loss": 3.6024, + "step": 388 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019167990755452837, + "loss": 3.4035, + "step": 389 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001916510183446483, + "loss": 3.3601, + "step": 390 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001916221291347682, + "loss": 3.5517, + "step": 391 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019159323992488806, + "loss": 3.4734, + "step": 392 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019156435071500795, + "loss": 3.4639, + "step": 393 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019153546150512784, + "loss": 3.4812, + "step": 394 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019150657229524774, + "loss": 3.7109, + "step": 395 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019147768308536763, + "loss": 3.4681, + "step": 396 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001914487938754875, + "loss": 3.6094, + "step": 397 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001914199046656074, + "loss": 3.5162, + "step": 398 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019139101545572729, + "loss": 3.4222, + "step": 399 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001913621262458472, + "loss": 3.577, + "step": 400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019133323703596707, + "loss": 3.6015, + "step": 401 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019130434782608697, + "loss": 3.6354, + "step": 402 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019127545861620686, + "loss": 3.4482, + "step": 403 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019124656940632675, + "loss": 3.5481, + "step": 404 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019121768019644665, + "loss": 3.4145, + "step": 405 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019118879098656652, + "loss": 3.365, + "step": 406 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001911599017766864, + "loss": 3.5916, + "step": 407 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001911310125668063, + "loss": 3.6518, + "step": 408 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001911021233569262, + "loss": 3.6107, + "step": 409 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001910732341470461, + "loss": 3.549, + "step": 410 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019104434493716598, + "loss": 3.6271, + "step": 411 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019101545572728588, + "loss": 3.5452, + "step": 412 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019098656651740577, + "loss": 3.6264, + "step": 413 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019095767730752564, + "loss": 3.5031, + "step": 414 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019092878809764553, + "loss": 3.5594, + "step": 415 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019089989888776543, + "loss": 3.3923, + "step": 416 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019087100967788532, + "loss": 3.3375, + "step": 417 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019084212046800521, + "loss": 3.5472, + "step": 418 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019081323125812508, + "loss": 3.6317, + "step": 419 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019078434204824497, + "loss": 3.6466, + "step": 420 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001907554528383649, + "loss": 3.5929, + "step": 421 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001907265636284848, + "loss": 3.4864, + "step": 422 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019069767441860466, + "loss": 3.5866, + "step": 423 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019066878520872455, + "loss": 3.3665, + "step": 424 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019063989599884444, + "loss": 3.5244, + "step": 425 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019061100678896434, + "loss": 3.3427, + "step": 426 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019058211757908423, + "loss": 3.6314, + "step": 427 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001905532283692041, + "loss": 3.5961, + "step": 428 + }, + { + "epoch": 0.06, + "learning_rate": 0.000190524339159324, + "loss": 3.6028, + "step": 429 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019049544994944389, + "loss": 3.565, + "step": 430 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001904665607395638, + "loss": 3.6294, + "step": 431 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019043767152968367, + "loss": 3.5588, + "step": 432 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019040878231980357, + "loss": 3.5346, + "step": 433 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019037989310992346, + "loss": 3.5715, + "step": 434 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019035100390004335, + "loss": 3.6058, + "step": 435 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019032211469016322, + "loss": 3.6087, + "step": 436 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019029322548028312, + "loss": 3.5346, + "step": 437 + }, + { + "epoch": 0.06, + "learning_rate": 0.000190264336270403, + "loss": 3.5482, + "step": 438 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001902354470605229, + "loss": 3.553, + "step": 439 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001902065578506428, + "loss": 3.5424, + "step": 440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019017766864076266, + "loss": 3.3804, + "step": 441 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019014877943088258, + "loss": 3.5441, + "step": 442 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019011989022100248, + "loss": 3.5132, + "step": 443 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019009100101112237, + "loss": 3.5497, + "step": 444 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019006211180124224, + "loss": 3.5957, + "step": 445 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019003322259136213, + "loss": 3.5656, + "step": 446 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019000433338148203, + "loss": 3.5914, + "step": 447 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018997544417160192, + "loss": 3.5543, + "step": 448 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018994655496172181, + "loss": 3.6238, + "step": 449 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018991766575184168, + "loss": 3.4488, + "step": 450 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018988877654196157, + "loss": 3.5148, + "step": 451 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001898598873320815, + "loss": 3.6172, + "step": 452 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018983099812220136, + "loss": 3.5025, + "step": 453 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018980210891232126, + "loss": 3.4935, + "step": 454 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018977321970244115, + "loss": 3.4364, + "step": 455 + }, + { + "epoch": 0.06, + "learning_rate": 0.00018974433049256104, + "loss": 3.5489, + "step": 456 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018971544128268094, + "loss": 3.5546, + "step": 457 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001896865520728008, + "loss": 3.4535, + "step": 458 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001896576628629207, + "loss": 3.545, + "step": 459 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001896287736530406, + "loss": 3.5972, + "step": 460 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018959988444316049, + "loss": 3.6178, + "step": 461 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018957099523328038, + "loss": 3.3621, + "step": 462 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018954210602340027, + "loss": 3.4738, + "step": 463 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018951321681352017, + "loss": 3.6158, + "step": 464 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018948432760364006, + "loss": 3.4559, + "step": 465 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018945543839375995, + "loss": 3.4635, + "step": 466 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018942654918387982, + "loss": 3.7056, + "step": 467 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018939765997399972, + "loss": 3.5761, + "step": 468 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001893687707641196, + "loss": 3.537, + "step": 469 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001893398815542395, + "loss": 3.6607, + "step": 470 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001893109923443594, + "loss": 3.4395, + "step": 471 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018928210313447926, + "loss": 3.4993, + "step": 472 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018925321392459918, + "loss": 3.4373, + "step": 473 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018922432471471908, + "loss": 3.5638, + "step": 474 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018919543550483895, + "loss": 3.5201, + "step": 475 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018916654629495884, + "loss": 3.3825, + "step": 476 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018913765708507873, + "loss": 3.4802, + "step": 477 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018910876787519863, + "loss": 3.4747, + "step": 478 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018907987866531852, + "loss": 3.4511, + "step": 479 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001890509894554384, + "loss": 3.5461, + "step": 480 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018902210024555828, + "loss": 3.455, + "step": 481 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018899321103567817, + "loss": 3.5123, + "step": 482 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018896432182579807, + "loss": 3.5906, + "step": 483 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018893543261591796, + "loss": 3.421, + "step": 484 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018890654340603786, + "loss": 3.5119, + "step": 485 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018887765419615775, + "loss": 3.6598, + "step": 486 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018884876498627764, + "loss": 3.5187, + "step": 487 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018881987577639754, + "loss": 3.5036, + "step": 488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001887909865665174, + "loss": 3.6146, + "step": 489 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001887620973566373, + "loss": 3.6127, + "step": 490 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001887332081467572, + "loss": 3.494, + "step": 491 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018870431893687709, + "loss": 3.5811, + "step": 492 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018867542972699695, + "loss": 3.6014, + "step": 493 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018864654051711687, + "loss": 3.4939, + "step": 494 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018861765130723677, + "loss": 3.5964, + "step": 495 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018858876209735666, + "loss": 3.5323, + "step": 496 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018855987288747653, + "loss": 3.5668, + "step": 497 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018853098367759642, + "loss": 3.5236, + "step": 498 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018850209446771632, + "loss": 3.6417, + "step": 499 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001884732052578362, + "loss": 3.7174, + "step": 500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001884443160479561, + "loss": 3.4265, + "step": 501 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018841542683807597, + "loss": 3.4595, + "step": 502 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018838653762819586, + "loss": 3.3742, + "step": 503 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018835764841831576, + "loss": 3.6812, + "step": 504 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018832875920843568, + "loss": 3.64, + "step": 505 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018829986999855555, + "loss": 3.4502, + "step": 506 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018827098078867544, + "loss": 3.4054, + "step": 507 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018824209157879533, + "loss": 3.4928, + "step": 508 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018821320236891523, + "loss": 3.4226, + "step": 509 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018818431315903512, + "loss": 3.5288, + "step": 510 + }, + { + "epoch": 0.07, + "learning_rate": 0.000188155423949155, + "loss": 3.5659, + "step": 511 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018812653473927488, + "loss": 3.5009, + "step": 512 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018809764552939477, + "loss": 3.4401, + "step": 513 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018806875631951467, + "loss": 3.5149, + "step": 514 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018803986710963456, + "loss": 3.5233, + "step": 515 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018801097789975446, + "loss": 3.4313, + "step": 516 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018798208868987435, + "loss": 3.5574, + "step": 517 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018795319947999424, + "loss": 3.5375, + "step": 518 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001879243102701141, + "loss": 3.5875, + "step": 519 + }, + { + "epoch": 0.07, + "learning_rate": 0.000187895421060234, + "loss": 3.4169, + "step": 520 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001878665318503539, + "loss": 3.4563, + "step": 521 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001878376426404738, + "loss": 3.3797, + "step": 522 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018780875343059369, + "loss": 3.4075, + "step": 523 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018777986422071355, + "loss": 3.5773, + "step": 524 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018775097501083345, + "loss": 3.5523, + "step": 525 + }, + { + "epoch": 0.07, + "learning_rate": 0.00018772208580095337, + "loss": 3.5288, + "step": 526 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018769319659107326, + "loss": 3.6048, + "step": 527 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018766430738119313, + "loss": 3.56, + "step": 528 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018763541817131302, + "loss": 3.5448, + "step": 529 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018760652896143292, + "loss": 3.6165, + "step": 530 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001875776397515528, + "loss": 3.495, + "step": 531 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001875487505416727, + "loss": 3.6955, + "step": 532 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018751986133179257, + "loss": 3.4293, + "step": 533 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018749097212191246, + "loss": 3.4176, + "step": 534 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018746208291203236, + "loss": 3.5641, + "step": 535 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018743319370215228, + "loss": 3.6007, + "step": 536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018740430449227215, + "loss": 3.4111, + "step": 537 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018737541528239204, + "loss": 3.456, + "step": 538 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018734652607251193, + "loss": 3.507, + "step": 539 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018731763686263183, + "loss": 3.6267, + "step": 540 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001872887476527517, + "loss": 3.5538, + "step": 541 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001872598584428716, + "loss": 3.4112, + "step": 542 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018723096923299148, + "loss": 3.5022, + "step": 543 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018720208002311138, + "loss": 3.6393, + "step": 544 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018717319081323127, + "loss": 3.6233, + "step": 545 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018714430160335114, + "loss": 3.651, + "step": 546 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018711541239347106, + "loss": 3.6567, + "step": 547 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018708652318359095, + "loss": 3.3348, + "step": 548 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018705763397371084, + "loss": 3.5277, + "step": 549 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001870287447638307, + "loss": 3.4866, + "step": 550 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001869998555539506, + "loss": 3.5642, + "step": 551 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001869709663440705, + "loss": 3.4392, + "step": 552 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001869420771341904, + "loss": 3.6856, + "step": 553 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018691318792431029, + "loss": 3.4941, + "step": 554 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018688429871443015, + "loss": 3.5972, + "step": 555 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018685540950455005, + "loss": 3.5312, + "step": 556 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018682652029466997, + "loss": 3.5929, + "step": 557 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018679763108478986, + "loss": 3.6768, + "step": 558 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018676874187490973, + "loss": 3.5386, + "step": 559 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018673985266502962, + "loss": 3.4734, + "step": 560 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018671096345514952, + "loss": 3.5429, + "step": 561 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001866820742452694, + "loss": 3.4172, + "step": 562 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018665318503538928, + "loss": 3.5298, + "step": 563 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018662429582550917, + "loss": 3.547, + "step": 564 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018659540661562906, + "loss": 3.4804, + "step": 565 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018656651740574896, + "loss": 3.5119, + "step": 566 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018653762819586885, + "loss": 3.5192, + "step": 567 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018650873898598875, + "loss": 3.5618, + "step": 568 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018647984977610864, + "loss": 3.5257, + "step": 569 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018645096056622853, + "loss": 3.5611, + "step": 570 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018642207135634843, + "loss": 3.3794, + "step": 571 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001863931821464683, + "loss": 3.4684, + "step": 572 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001863642929365882, + "loss": 3.5756, + "step": 573 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018633540372670808, + "loss": 3.6312, + "step": 574 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018630651451682798, + "loss": 3.447, + "step": 575 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018627762530694787, + "loss": 3.5265, + "step": 576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018624873609706774, + "loss": 3.5244, + "step": 577 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018621984688718766, + "loss": 3.5743, + "step": 578 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018619095767730755, + "loss": 3.4801, + "step": 579 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018616206846742742, + "loss": 3.5325, + "step": 580 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001861331792575473, + "loss": 3.603, + "step": 581 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001861042900476672, + "loss": 3.4021, + "step": 582 + }, + { + "epoch": 0.08, + "learning_rate": 0.0001860754008377871, + "loss": 3.5223, + "step": 583 + }, + { + "epoch": 0.08, + "learning_rate": 0.000186046511627907, + "loss": 3.5119, + "step": 584 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018601762241802686, + "loss": 3.4831, + "step": 585 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018598873320814675, + "loss": 3.4985, + "step": 586 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018595984399826665, + "loss": 3.6217, + "step": 587 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018593095478838654, + "loss": 3.5672, + "step": 588 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018590206557850643, + "loss": 3.4158, + "step": 589 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018587317636862633, + "loss": 3.5823, + "step": 590 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018584428715874622, + "loss": 3.5354, + "step": 591 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018581539794886612, + "loss": 3.479, + "step": 592 + }, + { + "epoch": 0.08, + "learning_rate": 0.000185786508738986, + "loss": 3.4195, + "step": 593 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018575761952910588, + "loss": 3.6027, + "step": 594 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018572873031922577, + "loss": 3.6142, + "step": 595 + }, + { + "epoch": 0.08, + "learning_rate": 0.00018569984110934566, + "loss": 3.5468, + "step": 596 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018567095189946556, + "loss": 3.504, + "step": 597 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018564206268958545, + "loss": 3.5126, + "step": 598 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018561317347970535, + "loss": 3.4528, + "step": 599 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018558428426982524, + "loss": 3.3295, + "step": 600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018555539505994513, + "loss": 3.5697, + "step": 601 + }, + { + "epoch": 0.09, + "learning_rate": 0.000185526505850065, + "loss": 3.5069, + "step": 602 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001854976166401849, + "loss": 3.5449, + "step": 603 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001854687274303048, + "loss": 3.4998, + "step": 604 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018543983822042468, + "loss": 3.2614, + "step": 605 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018541094901054458, + "loss": 3.5806, + "step": 606 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018538205980066444, + "loss": 3.568, + "step": 607 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018535317059078434, + "loss": 3.6575, + "step": 608 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018532428138090426, + "loss": 3.4329, + "step": 609 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018529539217102415, + "loss": 3.5881, + "step": 610 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018526650296114402, + "loss": 3.4916, + "step": 611 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001852376137512639, + "loss": 3.5186, + "step": 612 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001852087245413838, + "loss": 3.528, + "step": 613 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001851798353315037, + "loss": 3.4139, + "step": 614 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001851509461216236, + "loss": 3.5184, + "step": 615 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018512205691174346, + "loss": 3.5681, + "step": 616 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018509316770186335, + "loss": 3.5774, + "step": 617 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018506427849198325, + "loss": 3.5222, + "step": 618 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018503538928210314, + "loss": 3.561, + "step": 619 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018500650007222303, + "loss": 3.4017, + "step": 620 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018497761086234293, + "loss": 3.4531, + "step": 621 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018494872165246282, + "loss": 3.5615, + "step": 622 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018491983244258272, + "loss": 3.5878, + "step": 623 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018489094323270258, + "loss": 3.6159, + "step": 624 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018486205402282248, + "loss": 3.496, + "step": 625 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018483316481294237, + "loss": 3.4758, + "step": 626 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018480427560306226, + "loss": 3.4669, + "step": 627 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018477538639318216, + "loss": 3.6198, + "step": 628 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018474649718330203, + "loss": 3.5373, + "step": 629 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018471760797342195, + "loss": 3.5283, + "step": 630 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018468871876354184, + "loss": 3.5688, + "step": 631 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018465982955366173, + "loss": 3.439, + "step": 632 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001846309403437816, + "loss": 3.4053, + "step": 633 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001846020511339015, + "loss": 3.3194, + "step": 634 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001845731619240214, + "loss": 3.4613, + "step": 635 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018454427271414128, + "loss": 3.5199, + "step": 636 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018451538350426118, + "loss": 3.5138, + "step": 637 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018448649429438104, + "loss": 3.5518, + "step": 638 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018445760508450094, + "loss": 3.5416, + "step": 639 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018442871587462083, + "loss": 3.4615, + "step": 640 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018439982666474075, + "loss": 3.638, + "step": 641 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018437093745486062, + "loss": 3.5444, + "step": 642 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001843420482449805, + "loss": 3.638, + "step": 643 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001843131590351004, + "loss": 3.451, + "step": 644 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001842842698252203, + "loss": 3.5095, + "step": 645 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018425538061534017, + "loss": 3.6475, + "step": 646 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018422649140546006, + "loss": 3.5567, + "step": 647 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018419760219557995, + "loss": 3.4693, + "step": 648 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018416871298569985, + "loss": 3.6224, + "step": 649 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018413982377581974, + "loss": 3.4074, + "step": 650 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018411093456593963, + "loss": 3.4079, + "step": 651 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018408204535605953, + "loss": 3.6761, + "step": 652 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018405315614617942, + "loss": 3.4546, + "step": 653 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018402426693629932, + "loss": 3.4474, + "step": 654 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018399537772641918, + "loss": 3.5165, + "step": 655 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018396648851653908, + "loss": 3.5854, + "step": 656 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018393759930665897, + "loss": 3.5001, + "step": 657 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018390871009677886, + "loss": 3.5853, + "step": 658 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018387982088689876, + "loss": 3.5338, + "step": 659 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018385093167701863, + "loss": 3.5332, + "step": 660 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018382204246713852, + "loss": 3.5204, + "step": 661 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018379315325725844, + "loss": 3.4821, + "step": 662 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018376426404737833, + "loss": 3.4963, + "step": 663 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001837353748374982, + "loss": 3.5454, + "step": 664 + }, + { + "epoch": 0.09, + "learning_rate": 0.0001837064856276181, + "loss": 3.5903, + "step": 665 + }, + { + "epoch": 0.09, + "learning_rate": 0.000183677596417738, + "loss": 3.5212, + "step": 666 + }, + { + "epoch": 0.09, + "learning_rate": 0.00018364870720785788, + "loss": 3.5175, + "step": 667 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018361981799797775, + "loss": 3.5781, + "step": 668 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018359092878809764, + "loss": 3.4063, + "step": 669 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018356203957821754, + "loss": 3.3629, + "step": 670 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018353315036833743, + "loss": 3.512, + "step": 671 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018350426115845732, + "loss": 3.5793, + "step": 672 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018347537194857722, + "loss": 3.4232, + "step": 673 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001834464827386971, + "loss": 3.608, + "step": 674 + }, + { + "epoch": 0.1, + "learning_rate": 0.000183417593528817, + "loss": 3.4862, + "step": 675 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001833887043189369, + "loss": 3.5062, + "step": 676 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018335981510905677, + "loss": 3.5194, + "step": 677 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018333092589917666, + "loss": 3.4268, + "step": 678 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018330203668929655, + "loss": 3.4015, + "step": 679 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018327314747941645, + "loss": 3.5977, + "step": 680 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018324425826953634, + "loss": 3.4977, + "step": 681 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001832153690596562, + "loss": 3.5338, + "step": 682 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018318647984977613, + "loss": 3.5607, + "step": 683 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018315759063989602, + "loss": 3.562, + "step": 684 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018312870143001592, + "loss": 3.5139, + "step": 685 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018309981222013578, + "loss": 3.5853, + "step": 686 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018307092301025568, + "loss": 3.5065, + "step": 687 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018304203380037557, + "loss": 3.5055, + "step": 688 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018301314459049546, + "loss": 3.494, + "step": 689 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018298425538061533, + "loss": 3.563, + "step": 690 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018295536617073523, + "loss": 3.5116, + "step": 691 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018292647696085512, + "loss": 3.5446, + "step": 692 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018289758775097504, + "loss": 3.4678, + "step": 693 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001828686985410949, + "loss": 3.5063, + "step": 694 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001828398093312148, + "loss": 3.49, + "step": 695 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001828109201213347, + "loss": 3.5029, + "step": 696 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001827820309114546, + "loss": 3.504, + "step": 697 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018275314170157448, + "loss": 3.5638, + "step": 698 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018272425249169435, + "loss": 3.5541, + "step": 699 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018269536328181424, + "loss": 3.5315, + "step": 700 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018266647407193414, + "loss": 3.5832, + "step": 701 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018263758486205403, + "loss": 3.5319, + "step": 702 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018260869565217392, + "loss": 3.5295, + "step": 703 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018257980644229382, + "loss": 3.519, + "step": 704 + }, + { + "epoch": 0.1, + "eval_loss": 3.620858669281006, + "eval_runtime": 471.8255, + "eval_samples_per_second": 43.421, + "eval_steps_per_second": 14.474, + "step": 704 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001825509172324137, + "loss": 3.4535, + "step": 705 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001825220280225336, + "loss": 3.33, + "step": 706 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018249313881265347, + "loss": 3.4858, + "step": 707 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018246424960277337, + "loss": 3.4598, + "step": 708 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018243536039289326, + "loss": 3.5521, + "step": 709 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018240647118301315, + "loss": 3.5254, + "step": 710 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018237758197313305, + "loss": 3.5396, + "step": 711 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018234869276325291, + "loss": 3.5268, + "step": 712 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001823198035533728, + "loss": 3.6661, + "step": 713 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018229091434349273, + "loss": 3.5742, + "step": 714 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018226202513361262, + "loss": 3.6033, + "step": 715 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001822331359237325, + "loss": 3.5636, + "step": 716 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018220424671385238, + "loss": 3.5835, + "step": 717 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018217535750397228, + "loss": 3.5446, + "step": 718 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018214646829409217, + "loss": 3.5002, + "step": 719 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018211757908421206, + "loss": 3.3511, + "step": 720 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018208868987433193, + "loss": 3.4461, + "step": 721 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018205980066445183, + "loss": 3.4288, + "step": 722 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018203091145457172, + "loss": 3.4757, + "step": 723 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001820020222446916, + "loss": 3.4678, + "step": 724 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001819731330348115, + "loss": 3.3662, + "step": 725 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001819442438249314, + "loss": 3.6651, + "step": 726 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001819153546150513, + "loss": 3.5741, + "step": 727 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001818864654051712, + "loss": 3.636, + "step": 728 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018185757619529106, + "loss": 3.5109, + "step": 729 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018182868698541095, + "loss": 3.4351, + "step": 730 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018179979777553084, + "loss": 3.495, + "step": 731 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018177090856565074, + "loss": 3.5606, + "step": 732 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018174201935577063, + "loss": 3.511, + "step": 733 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001817131301458905, + "loss": 3.4717, + "step": 734 + }, + { + "epoch": 0.1, + "learning_rate": 0.00018168424093601042, + "loss": 3.5888, + "step": 735 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001816553517261303, + "loss": 3.3341, + "step": 736 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001816264625162502, + "loss": 3.4462, + "step": 737 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018159757330637007, + "loss": 3.5978, + "step": 738 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018156868409648997, + "loss": 3.5568, + "step": 739 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018153979488660986, + "loss": 3.4909, + "step": 740 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018151090567672975, + "loss": 3.4277, + "step": 741 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018148201646684965, + "loss": 3.5154, + "step": 742 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018145312725696951, + "loss": 3.5742, + "step": 743 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001814242380470894, + "loss": 3.4974, + "step": 744 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001813953488372093, + "loss": 3.4369, + "step": 745 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018136645962732922, + "loss": 3.5458, + "step": 746 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001813375704174491, + "loss": 3.5238, + "step": 747 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018130868120756898, + "loss": 3.4968, + "step": 748 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018127979199768888, + "loss": 3.5419, + "step": 749 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018125090278780877, + "loss": 3.3761, + "step": 750 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018122201357792864, + "loss": 3.6181, + "step": 751 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018119312436804853, + "loss": 3.5131, + "step": 752 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018116423515816843, + "loss": 3.4547, + "step": 753 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018113534594828832, + "loss": 3.5306, + "step": 754 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001811064567384082, + "loss": 3.5011, + "step": 755 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001810775675285281, + "loss": 3.5871, + "step": 756 + }, + { + "epoch": 0.11, + "learning_rate": 0.000181048678318648, + "loss": 3.4811, + "step": 757 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001810197891087679, + "loss": 3.544, + "step": 758 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001809908998988878, + "loss": 3.5499, + "step": 759 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018096201068900766, + "loss": 3.6015, + "step": 760 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018093312147912755, + "loss": 3.4873, + "step": 761 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018090423226924744, + "loss": 3.5266, + "step": 762 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018087534305936734, + "loss": 3.5291, + "step": 763 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018084645384948723, + "loss": 3.465, + "step": 764 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001808175646396071, + "loss": 3.6005, + "step": 765 + }, + { + "epoch": 0.11, + "learning_rate": 0.000180788675429727, + "loss": 3.5141, + "step": 766 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001807597862198469, + "loss": 3.5216, + "step": 767 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001807308970099668, + "loss": 3.4973, + "step": 768 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018070200780008667, + "loss": 3.3215, + "step": 769 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018067311859020657, + "loss": 3.4744, + "step": 770 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018064422938032646, + "loss": 3.5823, + "step": 771 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018061534017044635, + "loss": 3.5777, + "step": 772 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018058645096056622, + "loss": 3.506, + "step": 773 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018055756175068611, + "loss": 3.4742, + "step": 774 + }, + { + "epoch": 0.11, + "learning_rate": 0.000180528672540806, + "loss": 3.5748, + "step": 775 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001804997833309259, + "loss": 3.6195, + "step": 776 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001804708941210458, + "loss": 3.3161, + "step": 777 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001804420049111657, + "loss": 3.5649, + "step": 778 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018041311570128558, + "loss": 3.5283, + "step": 779 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018038422649140548, + "loss": 3.6222, + "step": 780 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018035533728152537, + "loss": 3.5523, + "step": 781 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018032644807164524, + "loss": 3.5396, + "step": 782 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018029755886176513, + "loss": 3.5446, + "step": 783 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018026866965188503, + "loss": 3.6128, + "step": 784 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018023978044200492, + "loss": 3.4507, + "step": 785 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001802108912321248, + "loss": 3.4899, + "step": 786 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018018200202224468, + "loss": 3.6131, + "step": 787 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001801531128123646, + "loss": 3.5835, + "step": 788 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001801242236024845, + "loss": 3.5657, + "step": 789 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001800953343926044, + "loss": 3.5208, + "step": 790 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018006644518272426, + "loss": 3.3454, + "step": 791 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018003755597284415, + "loss": 3.5561, + "step": 792 + }, + { + "epoch": 0.11, + "learning_rate": 0.00018000866676296404, + "loss": 3.5146, + "step": 793 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017997977755308394, + "loss": 3.5223, + "step": 794 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001799508883432038, + "loss": 3.4998, + "step": 795 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001799219991333237, + "loss": 3.3141, + "step": 796 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001798931099234436, + "loss": 3.3507, + "step": 797 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001798642207135635, + "loss": 3.6048, + "step": 798 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017983533150368338, + "loss": 3.6106, + "step": 799 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017980644229380327, + "loss": 3.4502, + "step": 800 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017977755308392317, + "loss": 3.514, + "step": 801 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017974866387404306, + "loss": 3.6228, + "step": 802 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017971977466416295, + "loss": 3.5165, + "step": 803 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017969088545428282, + "loss": 3.4641, + "step": 804 + }, + { + "epoch": 0.11, + "learning_rate": 0.00017966199624440271, + "loss": 3.5434, + "step": 805 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001796331070345226, + "loss": 3.5272, + "step": 806 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001796042178246425, + "loss": 3.4221, + "step": 807 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001795753286147624, + "loss": 3.4645, + "step": 808 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001795464394048823, + "loss": 3.6072, + "step": 809 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017951755019500218, + "loss": 3.6087, + "step": 810 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017948866098512208, + "loss": 3.5988, + "step": 811 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017945977177524197, + "loss": 3.6459, + "step": 812 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017943088256536184, + "loss": 3.4854, + "step": 813 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017940199335548173, + "loss": 3.4805, + "step": 814 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017937310414560163, + "loss": 3.435, + "step": 815 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017934421493572152, + "loss": 3.5022, + "step": 816 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001793153257258414, + "loss": 3.6624, + "step": 817 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017928643651596128, + "loss": 3.5145, + "step": 818 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001792575473060812, + "loss": 3.4939, + "step": 819 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001792286580962011, + "loss": 3.4377, + "step": 820 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017919976888632096, + "loss": 3.5569, + "step": 821 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017917087967644086, + "loss": 3.5119, + "step": 822 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017914199046656075, + "loss": 3.5576, + "step": 823 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017911310125668064, + "loss": 3.4825, + "step": 824 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017908421204680054, + "loss": 3.5241, + "step": 825 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001790553228369204, + "loss": 3.4232, + "step": 826 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001790264336270403, + "loss": 3.429, + "step": 827 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001789975444171602, + "loss": 3.5378, + "step": 828 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001789686552072801, + "loss": 3.604, + "step": 829 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017893976599739998, + "loss": 3.6098, + "step": 830 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017891087678751987, + "loss": 3.4572, + "step": 831 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017888198757763977, + "loss": 3.4124, + "step": 832 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017885309836775966, + "loss": 3.5226, + "step": 833 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017882420915787953, + "loss": 3.5553, + "step": 834 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017879531994799942, + "loss": 3.5273, + "step": 835 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017876643073811932, + "loss": 3.5051, + "step": 836 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001787375415282392, + "loss": 3.5559, + "step": 837 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001787086523183591, + "loss": 3.605, + "step": 838 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017867976310847897, + "loss": 3.4009, + "step": 839 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001786508738985989, + "loss": 3.4324, + "step": 840 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017862198468871878, + "loss": 3.5615, + "step": 841 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017859309547883868, + "loss": 3.458, + "step": 842 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017856420626895854, + "loss": 3.4252, + "step": 843 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017853531705907844, + "loss": 3.4661, + "step": 844 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017850642784919833, + "loss": 3.4558, + "step": 845 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017847753863931823, + "loss": 3.5175, + "step": 846 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017844864942943812, + "loss": 3.5751, + "step": 847 + }, + { + "epoch": 0.12, + "learning_rate": 0.000178419760219558, + "loss": 3.5493, + "step": 848 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017839087100967788, + "loss": 3.6275, + "step": 849 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001783619817997978, + "loss": 3.5884, + "step": 850 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001783330925899177, + "loss": 3.519, + "step": 851 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017830420338003756, + "loss": 3.6041, + "step": 852 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017827531417015746, + "loss": 3.4436, + "step": 853 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017824642496027735, + "loss": 3.5615, + "step": 854 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017821753575039724, + "loss": 3.4636, + "step": 855 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001781886465405171, + "loss": 3.4988, + "step": 856 + }, + { + "epoch": 0.12, + "learning_rate": 0.000178159757330637, + "loss": 3.4776, + "step": 857 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001781308681207569, + "loss": 3.5114, + "step": 858 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001781019789108768, + "loss": 3.5242, + "step": 859 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017807308970099669, + "loss": 3.3103, + "step": 860 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017804420049111658, + "loss": 3.5707, + "step": 861 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017801531128123647, + "loss": 3.5229, + "step": 862 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017798642207135637, + "loss": 3.548, + "step": 863 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017795753286147626, + "loss": 3.4077, + "step": 864 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017792864365159613, + "loss": 3.5361, + "step": 865 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017789975444171602, + "loss": 3.569, + "step": 866 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017787086523183592, + "loss": 3.5805, + "step": 867 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001778419760219558, + "loss": 3.5906, + "step": 868 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001778130868120757, + "loss": 3.5841, + "step": 869 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017778419760219557, + "loss": 3.3957, + "step": 870 + }, + { + "epoch": 0.12, + "learning_rate": 0.0001777553083923155, + "loss": 3.4768, + "step": 871 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017772641918243538, + "loss": 3.4825, + "step": 872 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017769752997255528, + "loss": 3.4059, + "step": 873 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017766864076267514, + "loss": 3.4614, + "step": 874 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017763975155279504, + "loss": 3.4827, + "step": 875 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017761086234291493, + "loss": 3.559, + "step": 876 + }, + { + "epoch": 0.12, + "learning_rate": 0.00017758197313303483, + "loss": 3.5241, + "step": 877 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001775530839231547, + "loss": 3.5867, + "step": 878 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001775241947132746, + "loss": 3.5094, + "step": 879 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017749530550339448, + "loss": 3.3131, + "step": 880 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017746641629351437, + "loss": 3.4676, + "step": 881 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017743752708363427, + "loss": 3.6489, + "step": 882 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017740863787375416, + "loss": 3.5398, + "step": 883 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017737974866387406, + "loss": 3.5125, + "step": 884 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017735085945399395, + "loss": 3.5213, + "step": 885 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017732197024411384, + "loss": 3.4799, + "step": 886 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001772930810342337, + "loss": 3.4404, + "step": 887 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001772641918243536, + "loss": 3.478, + "step": 888 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001772353026144735, + "loss": 3.6084, + "step": 889 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001772064134045934, + "loss": 3.5364, + "step": 890 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017717752419471329, + "loss": 3.6162, + "step": 891 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017714863498483318, + "loss": 3.4878, + "step": 892 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017711974577495307, + "loss": 3.444, + "step": 893 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017709085656507297, + "loss": 3.5809, + "step": 894 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017706196735519286, + "loss": 3.5722, + "step": 895 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017703307814531273, + "loss": 3.5038, + "step": 896 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017700418893543262, + "loss": 3.5314, + "step": 897 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017697529972555252, + "loss": 3.3688, + "step": 898 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001769464105156724, + "loss": 3.6036, + "step": 899 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017691752130579228, + "loss": 3.615, + "step": 900 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017688863209591217, + "loss": 3.3827, + "step": 901 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017685974288603206, + "loss": 3.538, + "step": 902 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017683085367615198, + "loss": 3.4969, + "step": 903 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017680196446627185, + "loss": 3.5288, + "step": 904 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017677307525639175, + "loss": 3.6388, + "step": 905 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017674418604651164, + "loss": 3.5413, + "step": 906 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017671529683663153, + "loss": 3.5144, + "step": 907 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017668640762675143, + "loss": 3.3827, + "step": 908 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001766575184168713, + "loss": 3.4117, + "step": 909 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001766286292069912, + "loss": 3.5603, + "step": 910 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017659973999711108, + "loss": 3.446, + "step": 911 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017657085078723097, + "loss": 3.5327, + "step": 912 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017654196157735087, + "loss": 3.3939, + "step": 913 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017651307236747076, + "loss": 3.3279, + "step": 914 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017648418315759066, + "loss": 3.5229, + "step": 915 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017645529394771055, + "loss": 3.4836, + "step": 916 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017642640473783044, + "loss": 3.5151, + "step": 917 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001763975155279503, + "loss": 3.5036, + "step": 918 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001763686263180702, + "loss": 3.3613, + "step": 919 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001763397371081901, + "loss": 3.416, + "step": 920 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017631084789831, + "loss": 3.5282, + "step": 921 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017628195868842986, + "loss": 3.4353, + "step": 922 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017625306947854975, + "loss": 3.5206, + "step": 923 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017622418026866967, + "loss": 3.5056, + "step": 924 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017619529105878957, + "loss": 3.6753, + "step": 925 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017616640184890943, + "loss": 3.5445, + "step": 926 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017613751263902933, + "loss": 3.54, + "step": 927 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017610862342914922, + "loss": 3.5076, + "step": 928 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017607973421926912, + "loss": 3.341, + "step": 929 + }, + { + "epoch": 0.13, + "learning_rate": 0.000176050845009389, + "loss": 3.5414, + "step": 930 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017602195579950888, + "loss": 3.5658, + "step": 931 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017599306658962877, + "loss": 3.4999, + "step": 932 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017596417737974866, + "loss": 3.5007, + "step": 933 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017593528816986858, + "loss": 3.5981, + "step": 934 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017590639895998845, + "loss": 3.4566, + "step": 935 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017587750975010835, + "loss": 3.6219, + "step": 936 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017584862054022824, + "loss": 3.4128, + "step": 937 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017581973133034813, + "loss": 3.3593, + "step": 938 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017579084212046803, + "loss": 3.4622, + "step": 939 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001757619529105879, + "loss": 3.6345, + "step": 940 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001757330637007078, + "loss": 3.5757, + "step": 941 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017570417449082768, + "loss": 3.4232, + "step": 942 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017567528528094757, + "loss": 3.2727, + "step": 943 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017564639607106744, + "loss": 3.5383, + "step": 944 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017561750686118736, + "loss": 3.4219, + "step": 945 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017558861765130726, + "loss": 3.3755, + "step": 946 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017555972844142715, + "loss": 3.4804, + "step": 947 + }, + { + "epoch": 0.13, + "learning_rate": 0.00017553083923154702, + "loss": 3.5328, + "step": 948 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001755019500216669, + "loss": 3.4785, + "step": 949 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001754730608117868, + "loss": 3.4262, + "step": 950 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001754441716019067, + "loss": 3.5284, + "step": 951 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001754152823920266, + "loss": 3.5174, + "step": 952 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017538639318214646, + "loss": 3.5466, + "step": 953 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017535750397226635, + "loss": 3.5061, + "step": 954 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017532861476238627, + "loss": 3.3919, + "step": 955 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017529972555250617, + "loss": 3.4249, + "step": 956 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017527083634262603, + "loss": 3.5617, + "step": 957 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017524194713274593, + "loss": 3.5016, + "step": 958 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017521305792286582, + "loss": 3.4626, + "step": 959 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017518416871298572, + "loss": 3.3283, + "step": 960 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017515527950310558, + "loss": 3.5361, + "step": 961 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017512639029322548, + "loss": 3.5714, + "step": 962 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017509750108334537, + "loss": 3.6037, + "step": 963 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017506861187346526, + "loss": 3.4375, + "step": 964 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017503972266358516, + "loss": 3.4013, + "step": 965 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017501083345370505, + "loss": 3.3052, + "step": 966 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017498194424382495, + "loss": 3.5285, + "step": 967 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017495305503394484, + "loss": 3.477, + "step": 968 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017492416582406473, + "loss": 3.4421, + "step": 969 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001748952766141846, + "loss": 3.5492, + "step": 970 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001748663874043045, + "loss": 3.5483, + "step": 971 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001748374981944244, + "loss": 3.4165, + "step": 972 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017480860898454428, + "loss": 3.5144, + "step": 973 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017477971977466418, + "loss": 3.6121, + "step": 974 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017475083056478404, + "loss": 3.6716, + "step": 975 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017472194135490396, + "loss": 3.5422, + "step": 976 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017469305214502386, + "loss": 3.5777, + "step": 977 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017466416293514375, + "loss": 3.406, + "step": 978 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017463527372526362, + "loss": 3.5384, + "step": 979 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001746063845153835, + "loss": 3.4563, + "step": 980 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001745774953055034, + "loss": 3.481, + "step": 981 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001745486060956233, + "loss": 3.3857, + "step": 982 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017451971688574317, + "loss": 3.43, + "step": 983 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017449082767586306, + "loss": 3.4773, + "step": 984 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017446193846598295, + "loss": 3.5036, + "step": 985 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017443304925610285, + "loss": 3.4525, + "step": 986 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017440416004622274, + "loss": 3.3283, + "step": 987 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017437527083634263, + "loss": 3.5363, + "step": 988 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017434638162646253, + "loss": 3.5391, + "step": 989 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017431749241658242, + "loss": 3.582, + "step": 990 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017428860320670232, + "loss": 3.6314, + "step": 991 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017425971399682218, + "loss": 3.53, + "step": 992 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017423082478694208, + "loss": 3.3956, + "step": 993 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017420193557706197, + "loss": 3.4508, + "step": 994 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017417304636718186, + "loss": 3.512, + "step": 995 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017414415715730176, + "loss": 3.3145, + "step": 996 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017411526794742165, + "loss": 3.5319, + "step": 997 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017408637873754155, + "loss": 3.4918, + "step": 998 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017405748952766144, + "loss": 3.4096, + "step": 999 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017402860031778133, + "loss": 3.6008, + "step": 1000 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001739997111079012, + "loss": 3.2475, + "step": 1001 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001739708218980211, + "loss": 3.6255, + "step": 1002 + }, + { + "epoch": 0.14, + "learning_rate": 0.000173941932688141, + "loss": 3.491, + "step": 1003 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017391304347826088, + "loss": 3.3948, + "step": 1004 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017388415426838075, + "loss": 3.5865, + "step": 1005 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017385526505850064, + "loss": 3.5071, + "step": 1006 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017382637584862056, + "loss": 3.546, + "step": 1007 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017379748663874046, + "loss": 3.5369, + "step": 1008 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017376859742886032, + "loss": 3.4211, + "step": 1009 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017373970821898022, + "loss": 3.4982, + "step": 1010 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001737108190091001, + "loss": 3.4012, + "step": 1011 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017368192979922, + "loss": 3.4594, + "step": 1012 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001736530405893399, + "loss": 3.5041, + "step": 1013 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017362415137945977, + "loss": 3.497, + "step": 1014 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017359526216957966, + "loss": 3.3829, + "step": 1015 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017356637295969955, + "loss": 3.4956, + "step": 1016 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017353748374981945, + "loss": 3.6394, + "step": 1017 + }, + { + "epoch": 0.14, + "learning_rate": 0.00017350859453993934, + "loss": 3.629, + "step": 1018 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017347970533005923, + "loss": 3.5818, + "step": 1019 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017345081612017913, + "loss": 3.4767, + "step": 1020 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017342192691029902, + "loss": 3.6218, + "step": 1021 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017339303770041892, + "loss": 3.555, + "step": 1022 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017336414849053878, + "loss": 3.697, + "step": 1023 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017333525928065868, + "loss": 3.5714, + "step": 1024 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017330637007077857, + "loss": 3.4656, + "step": 1025 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017327748086089846, + "loss": 3.488, + "step": 1026 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017324859165101833, + "loss": 3.5766, + "step": 1027 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017321970244113825, + "loss": 3.6, + "step": 1028 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017319081323125815, + "loss": 3.662, + "step": 1029 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017316192402137804, + "loss": 3.4725, + "step": 1030 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001731330348114979, + "loss": 3.4792, + "step": 1031 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001731041456016178, + "loss": 3.143, + "step": 1032 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001730752563917377, + "loss": 3.5497, + "step": 1033 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001730463671818576, + "loss": 3.362, + "step": 1034 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017301747797197748, + "loss": 3.5541, + "step": 1035 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017298858876209735, + "loss": 3.4184, + "step": 1036 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017295969955221724, + "loss": 3.4563, + "step": 1037 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017293081034233714, + "loss": 3.4454, + "step": 1038 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017290192113245706, + "loss": 3.497, + "step": 1039 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017287303192257692, + "loss": 3.5273, + "step": 1040 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017284414271269682, + "loss": 3.3877, + "step": 1041 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001728152535028167, + "loss": 3.5384, + "step": 1042 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001727863642929366, + "loss": 3.5564, + "step": 1043 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001727574750830565, + "loss": 3.5066, + "step": 1044 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017272858587317637, + "loss": 3.3032, + "step": 1045 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017269969666329626, + "loss": 3.4389, + "step": 1046 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017267080745341615, + "loss": 3.5787, + "step": 1047 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017264191824353605, + "loss": 3.442, + "step": 1048 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017261302903365594, + "loss": 3.5452, + "step": 1049 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017258413982377583, + "loss": 3.3751, + "step": 1050 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017255525061389573, + "loss": 3.316, + "step": 1051 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017252636140401562, + "loss": 3.5407, + "step": 1052 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001724974721941355, + "loss": 3.3702, + "step": 1053 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017246858298425538, + "loss": 3.4369, + "step": 1054 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017243969377437528, + "loss": 3.438, + "step": 1055 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017241080456449517, + "loss": 3.4168, + "step": 1056 + }, + { + "epoch": 0.15, + "eval_loss": 3.6160545349121094, + "eval_runtime": 472.1217, + "eval_samples_per_second": 43.393, + "eval_steps_per_second": 14.464, + "step": 1056 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017238191535461506, + "loss": 3.4049, + "step": 1057 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017235302614473493, + "loss": 3.5561, + "step": 1058 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017232413693485482, + "loss": 3.5493, + "step": 1059 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017229524772497475, + "loss": 3.5245, + "step": 1060 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017226635851509464, + "loss": 3.565, + "step": 1061 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001722374693052145, + "loss": 3.5078, + "step": 1062 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001722085800953344, + "loss": 3.5372, + "step": 1063 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001721796908854543, + "loss": 3.4221, + "step": 1064 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001721508016755742, + "loss": 3.5337, + "step": 1065 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017212191246569408, + "loss": 3.3991, + "step": 1066 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017209302325581395, + "loss": 3.5483, + "step": 1067 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017206413404593384, + "loss": 3.4667, + "step": 1068 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017203524483605374, + "loss": 3.612, + "step": 1069 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017200635562617363, + "loss": 3.485, + "step": 1070 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017197746641629352, + "loss": 3.516, + "step": 1071 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017194857720641342, + "loss": 3.5625, + "step": 1072 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001719196879965333, + "loss": 3.4549, + "step": 1073 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001718907987866532, + "loss": 3.6844, + "step": 1074 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017186190957677307, + "loss": 3.5497, + "step": 1075 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017183302036689297, + "loss": 3.3467, + "step": 1076 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017180413115701286, + "loss": 3.5433, + "step": 1077 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017177524194713275, + "loss": 3.42, + "step": 1078 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017174635273725265, + "loss": 3.4682, + "step": 1079 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017171746352737251, + "loss": 3.4201, + "step": 1080 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017168857431749243, + "loss": 3.5717, + "step": 1081 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017165968510761233, + "loss": 3.453, + "step": 1082 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017163079589773222, + "loss": 3.6388, + "step": 1083 + }, + { + "epoch": 0.15, + "learning_rate": 0.0001716019066878521, + "loss": 3.5073, + "step": 1084 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017157301747797198, + "loss": 3.529, + "step": 1085 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017154412826809188, + "loss": 3.5095, + "step": 1086 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017151523905821177, + "loss": 3.4548, + "step": 1087 + }, + { + "epoch": 0.15, + "learning_rate": 0.00017148634984833164, + "loss": 3.3342, + "step": 1088 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017145746063845153, + "loss": 3.3946, + "step": 1089 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017142857142857143, + "loss": 3.5994, + "step": 1090 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017139968221869135, + "loss": 3.5959, + "step": 1091 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001713707930088112, + "loss": 3.507, + "step": 1092 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001713419037989311, + "loss": 3.6017, + "step": 1093 + }, + { + "epoch": 0.16, + "learning_rate": 0.000171313014589051, + "loss": 3.3432, + "step": 1094 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001712841253791709, + "loss": 3.4669, + "step": 1095 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001712552361692908, + "loss": 3.4304, + "step": 1096 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017122634695941065, + "loss": 3.5036, + "step": 1097 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017119745774953055, + "loss": 3.3196, + "step": 1098 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017116856853965044, + "loss": 3.4165, + "step": 1099 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017113967932977034, + "loss": 3.3619, + "step": 1100 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017111079011989023, + "loss": 3.546, + "step": 1101 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017108190091001012, + "loss": 3.3974, + "step": 1102 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017105301170013002, + "loss": 3.4734, + "step": 1103 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001710241224902499, + "loss": 3.4852, + "step": 1104 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001709952332803698, + "loss": 3.4784, + "step": 1105 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017096634407048967, + "loss": 3.5011, + "step": 1106 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017093745486060957, + "loss": 3.6002, + "step": 1107 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017090856565072946, + "loss": 3.4947, + "step": 1108 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017087967644084935, + "loss": 3.5663, + "step": 1109 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017085078723096922, + "loss": 3.411, + "step": 1110 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017082189802108911, + "loss": 3.5298, + "step": 1111 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017079300881120904, + "loss": 3.4539, + "step": 1112 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017076411960132893, + "loss": 3.3371, + "step": 1113 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001707352303914488, + "loss": 3.4276, + "step": 1114 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001707063411815687, + "loss": 3.5796, + "step": 1115 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017067745197168858, + "loss": 3.4728, + "step": 1116 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017064856276180848, + "loss": 3.4893, + "step": 1117 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017061967355192837, + "loss": 3.575, + "step": 1118 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017059078434204824, + "loss": 3.5082, + "step": 1119 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017056189513216813, + "loss": 3.3607, + "step": 1120 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017053300592228803, + "loss": 3.49, + "step": 1121 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017050411671240792, + "loss": 3.4581, + "step": 1122 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001704752275025278, + "loss": 3.5894, + "step": 1123 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001704463382926477, + "loss": 3.3818, + "step": 1124 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001704174490827676, + "loss": 3.7091, + "step": 1125 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001703885598728875, + "loss": 3.5166, + "step": 1126 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001703596706630074, + "loss": 3.547, + "step": 1127 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017033078145312726, + "loss": 3.392, + "step": 1128 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017030189224324715, + "loss": 3.6263, + "step": 1129 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017027300303336704, + "loss": 3.5654, + "step": 1130 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017024411382348694, + "loss": 3.547, + "step": 1131 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001702152246136068, + "loss": 3.5592, + "step": 1132 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017018633540372672, + "loss": 3.4518, + "step": 1133 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017015744619384662, + "loss": 3.55, + "step": 1134 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001701285569839665, + "loss": 3.3945, + "step": 1135 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017009966777408638, + "loss": 3.4937, + "step": 1136 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017007077856420627, + "loss": 3.5241, + "step": 1137 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017004188935432617, + "loss": 3.4706, + "step": 1138 + }, + { + "epoch": 0.16, + "learning_rate": 0.00017001300014444606, + "loss": 3.4276, + "step": 1139 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016998411093456595, + "loss": 3.569, + "step": 1140 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016995522172468582, + "loss": 3.5449, + "step": 1141 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016992633251480571, + "loss": 3.3942, + "step": 1142 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001698974433049256, + "loss": 3.5298, + "step": 1143 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016986855409504553, + "loss": 3.5164, + "step": 1144 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001698396648851654, + "loss": 3.5107, + "step": 1145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001698107756752853, + "loss": 3.5288, + "step": 1146 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016978188646540518, + "loss": 3.2389, + "step": 1147 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016975299725552508, + "loss": 3.3991, + "step": 1148 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016972410804564497, + "loss": 3.3714, + "step": 1149 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016969521883576484, + "loss": 3.5075, + "step": 1150 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016966632962588473, + "loss": 3.4373, + "step": 1151 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016963744041600463, + "loss": 3.5701, + "step": 1152 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016960855120612452, + "loss": 3.4354, + "step": 1153 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001695796619962444, + "loss": 3.3822, + "step": 1154 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001695507727863643, + "loss": 3.3253, + "step": 1155 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001695218835764842, + "loss": 3.5329, + "step": 1156 + }, + { + "epoch": 0.16, + "learning_rate": 0.0001694929943666041, + "loss": 3.5455, + "step": 1157 + }, + { + "epoch": 0.16, + "learning_rate": 0.00016946410515672396, + "loss": 3.4045, + "step": 1158 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016943521594684386, + "loss": 3.5935, + "step": 1159 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016940632673696375, + "loss": 3.4058, + "step": 1160 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016937743752708364, + "loss": 3.5842, + "step": 1161 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016934854831720354, + "loss": 3.429, + "step": 1162 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001693196591073234, + "loss": 3.4422, + "step": 1163 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001692907698974433, + "loss": 3.5908, + "step": 1164 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016926188068756322, + "loss": 3.6408, + "step": 1165 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001692329914776831, + "loss": 3.4388, + "step": 1166 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016920410226780298, + "loss": 3.5309, + "step": 1167 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016917521305792287, + "loss": 3.3306, + "step": 1168 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016914632384804277, + "loss": 3.5048, + "step": 1169 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016911743463816266, + "loss": 3.6547, + "step": 1170 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016908854542828255, + "loss": 3.5393, + "step": 1171 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016905965621840242, + "loss": 3.4526, + "step": 1172 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016903076700852231, + "loss": 3.5734, + "step": 1173 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001690018777986422, + "loss": 3.4974, + "step": 1174 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001689729885887621, + "loss": 3.5531, + "step": 1175 + }, + { + "epoch": 0.17, + "learning_rate": 0.000168944099378882, + "loss": 3.6911, + "step": 1176 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001689152101690019, + "loss": 3.5546, + "step": 1177 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016888632095912178, + "loss": 3.5638, + "step": 1178 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016885743174924168, + "loss": 3.5208, + "step": 1179 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016882854253936154, + "loss": 3.5458, + "step": 1180 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016879965332948144, + "loss": 3.4993, + "step": 1181 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016877076411960133, + "loss": 3.5327, + "step": 1182 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016874187490972123, + "loss": 3.6325, + "step": 1183 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016871298569984112, + "loss": 3.5104, + "step": 1184 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016868409648996099, + "loss": 3.3472, + "step": 1185 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001686552072800809, + "loss": 3.2308, + "step": 1186 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001686263180702008, + "loss": 3.5748, + "step": 1187 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001685974288603207, + "loss": 3.5129, + "step": 1188 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016856853965044056, + "loss": 3.2358, + "step": 1189 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016853965044056046, + "loss": 3.6463, + "step": 1190 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016851076123068035, + "loss": 3.4268, + "step": 1191 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016848187202080024, + "loss": 3.5219, + "step": 1192 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016845298281092014, + "loss": 3.5516, + "step": 1193 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016842409360104, + "loss": 3.5834, + "step": 1194 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001683952043911599, + "loss": 3.4691, + "step": 1195 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016836631518127982, + "loss": 3.5435, + "step": 1196 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016833742597139969, + "loss": 3.4956, + "step": 1197 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016830853676151958, + "loss": 3.5069, + "step": 1198 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016827964755163947, + "loss": 3.411, + "step": 1199 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016825075834175937, + "loss": 3.5438, + "step": 1200 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016822186913187926, + "loss": 3.5069, + "step": 1201 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016819297992199913, + "loss": 3.5991, + "step": 1202 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016816409071211902, + "loss": 3.3631, + "step": 1203 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016813520150223891, + "loss": 3.5341, + "step": 1204 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001681063122923588, + "loss": 3.4704, + "step": 1205 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001680774230824787, + "loss": 3.3993, + "step": 1206 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001680485338725986, + "loss": 3.3834, + "step": 1207 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001680196446627185, + "loss": 3.5037, + "step": 1208 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016799075545283838, + "loss": 3.5927, + "step": 1209 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016796186624295828, + "loss": 3.625, + "step": 1210 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016793297703307814, + "loss": 3.5295, + "step": 1211 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016790408782319804, + "loss": 3.3316, + "step": 1212 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016787519861331793, + "loss": 3.5151, + "step": 1213 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016784630940343783, + "loss": 3.4836, + "step": 1214 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016781742019355772, + "loss": 3.5249, + "step": 1215 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016778853098367759, + "loss": 3.3766, + "step": 1216 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001677596417737975, + "loss": 3.5304, + "step": 1217 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001677307525639174, + "loss": 3.4835, + "step": 1218 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016770186335403727, + "loss": 3.7327, + "step": 1219 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016767297414415716, + "loss": 3.4814, + "step": 1220 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016764408493427706, + "loss": 3.4791, + "step": 1221 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016761519572439695, + "loss": 3.5619, + "step": 1222 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016758630651451684, + "loss": 3.5676, + "step": 1223 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001675574173046367, + "loss": 3.3664, + "step": 1224 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001675285280947566, + "loss": 3.3978, + "step": 1225 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001674996388848765, + "loss": 3.5553, + "step": 1226 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016747074967499642, + "loss": 3.383, + "step": 1227 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016744186046511629, + "loss": 3.5232, + "step": 1228 + }, + { + "epoch": 0.17, + "learning_rate": 0.00016741297125523618, + "loss": 3.3742, + "step": 1229 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016738408204535607, + "loss": 3.4612, + "step": 1230 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016735519283547597, + "loss": 3.5645, + "step": 1231 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016732630362559586, + "loss": 3.4287, + "step": 1232 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016729741441571573, + "loss": 3.5151, + "step": 1233 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016726852520583562, + "loss": 3.5459, + "step": 1234 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016723963599595551, + "loss": 3.6089, + "step": 1235 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001672107467860754, + "loss": 3.5577, + "step": 1236 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016718185757619528, + "loss": 3.6152, + "step": 1237 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001671529683663152, + "loss": 3.5544, + "step": 1238 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001671240791564351, + "loss": 3.519, + "step": 1239 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016709518994655498, + "loss": 3.4751, + "step": 1240 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016706630073667485, + "loss": 3.4899, + "step": 1241 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016703741152679474, + "loss": 3.3328, + "step": 1242 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016700852231691464, + "loss": 3.3789, + "step": 1243 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016697963310703453, + "loss": 3.4779, + "step": 1244 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016695074389715443, + "loss": 3.4797, + "step": 1245 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001669218546872743, + "loss": 3.4116, + "step": 1246 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001668929654773942, + "loss": 3.5402, + "step": 1247 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001668640762675141, + "loss": 3.4977, + "step": 1248 + }, + { + "epoch": 0.18, + "learning_rate": 0.000166835187057634, + "loss": 3.5899, + "step": 1249 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016680629784775387, + "loss": 3.4811, + "step": 1250 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016677740863787376, + "loss": 3.5071, + "step": 1251 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016674851942799366, + "loss": 3.5884, + "step": 1252 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016671963021811355, + "loss": 3.5284, + "step": 1253 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016669074100823344, + "loss": 3.3557, + "step": 1254 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001666618517983533, + "loss": 3.3826, + "step": 1255 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001666329625884732, + "loss": 3.5119, + "step": 1256 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001666040733785931, + "loss": 3.4175, + "step": 1257 + }, + { + "epoch": 0.18, + "learning_rate": 0.000166575184168713, + "loss": 3.4374, + "step": 1258 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016654629495883289, + "loss": 3.4862, + "step": 1259 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016651740574895278, + "loss": 3.3808, + "step": 1260 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016648851653907267, + "loss": 3.2596, + "step": 1261 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016645962732919257, + "loss": 3.4759, + "step": 1262 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016643073811931243, + "loss": 3.5809, + "step": 1263 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016640184890943233, + "loss": 3.5571, + "step": 1264 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016637295969955222, + "loss": 3.4506, + "step": 1265 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016634407048967212, + "loss": 3.496, + "step": 1266 + }, + { + "epoch": 0.18, + "learning_rate": 0.000166315181279792, + "loss": 3.5072, + "step": 1267 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016628629206991188, + "loss": 3.4787, + "step": 1268 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001662574028600318, + "loss": 3.4776, + "step": 1269 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001662285136501517, + "loss": 3.4604, + "step": 1270 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016619962444027158, + "loss": 3.4504, + "step": 1271 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016617073523039145, + "loss": 3.4307, + "step": 1272 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016614184602051134, + "loss": 3.4938, + "step": 1273 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016611295681063124, + "loss": 3.3841, + "step": 1274 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016608406760075113, + "loss": 3.0949, + "step": 1275 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016605517839087103, + "loss": 3.5167, + "step": 1276 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001660262891809909, + "loss": 3.3093, + "step": 1277 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001659973999711108, + "loss": 3.3595, + "step": 1278 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016596851076123068, + "loss": 3.4556, + "step": 1279 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001659396215513506, + "loss": 3.564, + "step": 1280 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016591073234147047, + "loss": 3.6636, + "step": 1281 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016588184313159036, + "loss": 3.5133, + "step": 1282 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016585295392171026, + "loss": 3.5039, + "step": 1283 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016582406471183015, + "loss": 3.4521, + "step": 1284 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016579517550195002, + "loss": 3.4918, + "step": 1285 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001657662862920699, + "loss": 3.3308, + "step": 1286 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001657373970821898, + "loss": 3.5266, + "step": 1287 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001657085078723097, + "loss": 3.5503, + "step": 1288 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001656796186624296, + "loss": 3.4942, + "step": 1289 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016565072945254949, + "loss": 3.3854, + "step": 1290 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016562184024266938, + "loss": 3.5463, + "step": 1291 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016559295103278927, + "loss": 3.3519, + "step": 1292 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016556406182290917, + "loss": 3.3959, + "step": 1293 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016553517261302903, + "loss": 3.3282, + "step": 1294 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016550628340314893, + "loss": 3.4241, + "step": 1295 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016547739419326882, + "loss": 3.5177, + "step": 1296 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016544850498338872, + "loss": 3.4661, + "step": 1297 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001654196157735086, + "loss": 3.4302, + "step": 1298 + }, + { + "epoch": 0.18, + "learning_rate": 0.00016539072656362848, + "loss": 3.5691, + "step": 1299 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016536183735374837, + "loss": 3.47, + "step": 1300 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001653329481438683, + "loss": 3.4886, + "step": 1301 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016530405893398816, + "loss": 3.6058, + "step": 1302 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016527516972410805, + "loss": 3.2861, + "step": 1303 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016524628051422794, + "loss": 3.3734, + "step": 1304 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016521739130434784, + "loss": 3.581, + "step": 1305 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016518850209446773, + "loss": 3.4347, + "step": 1306 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001651596128845876, + "loss": 3.4264, + "step": 1307 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001651307236747075, + "loss": 3.5391, + "step": 1308 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001651018344648274, + "loss": 3.3605, + "step": 1309 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016507294525494728, + "loss": 3.4065, + "step": 1310 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016504405604506717, + "loss": 3.4079, + "step": 1311 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016501516683518707, + "loss": 3.5497, + "step": 1312 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016498627762530696, + "loss": 3.4103, + "step": 1313 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016495738841542686, + "loss": 3.4212, + "step": 1314 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016492849920554675, + "loss": 3.5213, + "step": 1315 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016489960999566662, + "loss": 3.3493, + "step": 1316 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001648707207857865, + "loss": 3.5141, + "step": 1317 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001648418315759064, + "loss": 3.3057, + "step": 1318 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001648129423660263, + "loss": 3.5041, + "step": 1319 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001647840531561462, + "loss": 3.5368, + "step": 1320 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016475516394626606, + "loss": 3.422, + "step": 1321 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016472627473638598, + "loss": 3.5449, + "step": 1322 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016469738552650587, + "loss": 3.5309, + "step": 1323 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016466849631662574, + "loss": 3.4928, + "step": 1324 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016463960710674563, + "loss": 3.5239, + "step": 1325 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016461071789686553, + "loss": 3.4141, + "step": 1326 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016458182868698542, + "loss": 3.5027, + "step": 1327 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016455293947710532, + "loss": 3.4508, + "step": 1328 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016452405026722518, + "loss": 3.4924, + "step": 1329 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016449516105734508, + "loss": 3.4917, + "step": 1330 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016446627184746497, + "loss": 3.5257, + "step": 1331 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001644373826375849, + "loss": 3.5661, + "step": 1332 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016440849342770476, + "loss": 3.4663, + "step": 1333 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016437960421782465, + "loss": 3.4531, + "step": 1334 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016435071500794455, + "loss": 3.4828, + "step": 1335 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016432182579806444, + "loss": 3.4431, + "step": 1336 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016429293658818433, + "loss": 3.4394, + "step": 1337 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001642640473783042, + "loss": 3.4915, + "step": 1338 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001642351581684241, + "loss": 3.5472, + "step": 1339 + }, + { + "epoch": 0.19, + "learning_rate": 0.000164206268958544, + "loss": 3.3655, + "step": 1340 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016417737974866388, + "loss": 3.4857, + "step": 1341 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016414849053878377, + "loss": 3.4078, + "step": 1342 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016411960132890367, + "loss": 3.5025, + "step": 1343 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016409071211902356, + "loss": 3.4253, + "step": 1344 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016406182290914346, + "loss": 3.5449, + "step": 1345 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016403293369926332, + "loss": 3.5968, + "step": 1346 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016400404448938322, + "loss": 3.2201, + "step": 1347 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001639751552795031, + "loss": 3.5532, + "step": 1348 + }, + { + "epoch": 0.19, + "learning_rate": 0.000163946266069623, + "loss": 3.51, + "step": 1349 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001639173768597429, + "loss": 3.4733, + "step": 1350 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016388848764986276, + "loss": 3.603, + "step": 1351 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016385959843998266, + "loss": 3.4945, + "step": 1352 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016383070923010258, + "loss": 3.5108, + "step": 1353 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016380182002022247, + "loss": 3.5857, + "step": 1354 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016377293081034234, + "loss": 3.376, + "step": 1355 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016374404160046223, + "loss": 3.5224, + "step": 1356 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016371515239058213, + "loss": 3.4412, + "step": 1357 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016368626318070202, + "loss": 3.4796, + "step": 1358 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016365737397082192, + "loss": 3.5574, + "step": 1359 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016362848476094178, + "loss": 3.3681, + "step": 1360 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016359959555106168, + "loss": 3.462, + "step": 1361 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016357070634118157, + "loss": 3.3938, + "step": 1362 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016354181713130146, + "loss": 3.628, + "step": 1363 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016351292792142136, + "loss": 3.5116, + "step": 1364 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016348403871154125, + "loss": 3.5233, + "step": 1365 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016345514950166115, + "loss": 3.2241, + "step": 1366 + }, + { + "epoch": 0.19, + "learning_rate": 0.00016342626029178104, + "loss": 3.4676, + "step": 1367 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001633973710819009, + "loss": 3.5448, + "step": 1368 + }, + { + "epoch": 0.19, + "learning_rate": 0.0001633684818720208, + "loss": 3.5828, + "step": 1369 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001633395926621407, + "loss": 3.5154, + "step": 1370 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001633107034522606, + "loss": 3.4541, + "step": 1371 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016328181424238048, + "loss": 3.4759, + "step": 1372 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016325292503250035, + "loss": 3.4711, + "step": 1373 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016322403582262027, + "loss": 3.563, + "step": 1374 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016319514661274016, + "loss": 3.3796, + "step": 1375 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016316625740286006, + "loss": 3.4592, + "step": 1376 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016313736819297992, + "loss": 3.4986, + "step": 1377 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016310847898309982, + "loss": 3.4143, + "step": 1378 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001630795897732197, + "loss": 3.6112, + "step": 1379 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001630507005633396, + "loss": 3.3096, + "step": 1380 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001630218113534595, + "loss": 3.4938, + "step": 1381 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016299292214357937, + "loss": 3.4479, + "step": 1382 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016296403293369926, + "loss": 3.4046, + "step": 1383 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016293514372381915, + "loss": 3.4154, + "step": 1384 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016290625451393907, + "loss": 3.5072, + "step": 1385 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016287736530405894, + "loss": 3.4145, + "step": 1386 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016284847609417883, + "loss": 3.4102, + "step": 1387 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016281958688429873, + "loss": 3.5517, + "step": 1388 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016279069767441862, + "loss": 3.4998, + "step": 1389 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001627618084645385, + "loss": 3.5467, + "step": 1390 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016273291925465838, + "loss": 3.4933, + "step": 1391 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016270403004477828, + "loss": 3.5235, + "step": 1392 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016267514083489817, + "loss": 3.5461, + "step": 1393 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016264625162501806, + "loss": 3.3848, + "step": 1394 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016261736241513796, + "loss": 3.3974, + "step": 1395 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016258847320525785, + "loss": 3.37, + "step": 1396 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016255958399537775, + "loss": 3.6363, + "step": 1397 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016253069478549764, + "loss": 3.4066, + "step": 1398 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001625018055756175, + "loss": 3.345, + "step": 1399 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001624729163657374, + "loss": 3.4497, + "step": 1400 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001624440271558573, + "loss": 3.5384, + "step": 1401 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001624151379459772, + "loss": 3.5991, + "step": 1402 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016238624873609708, + "loss": 3.4692, + "step": 1403 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016235735952621695, + "loss": 3.4696, + "step": 1404 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016232847031633687, + "loss": 3.5949, + "step": 1405 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016229958110645676, + "loss": 3.3968, + "step": 1406 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016227069189657666, + "loss": 3.5565, + "step": 1407 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016224180268669652, + "loss": 3.4906, + "step": 1408 + }, + { + "epoch": 0.2, + "eval_loss": 3.605428695678711, + "eval_runtime": 471.784, + "eval_samples_per_second": 43.425, + "eval_steps_per_second": 14.475, + "step": 1408 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016221291347681642, + "loss": 3.3684, + "step": 1409 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001621840242669363, + "loss": 3.4983, + "step": 1410 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001621551350570562, + "loss": 3.5674, + "step": 1411 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016212624584717607, + "loss": 3.3104, + "step": 1412 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016209735663729597, + "loss": 3.5129, + "step": 1413 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016206846742741586, + "loss": 3.2757, + "step": 1414 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016203957821753575, + "loss": 3.5879, + "step": 1415 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016201068900765565, + "loss": 3.5024, + "step": 1416 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016198179979777554, + "loss": 3.6142, + "step": 1417 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016195291058789543, + "loss": 3.5676, + "step": 1418 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016192402137801533, + "loss": 3.5295, + "step": 1419 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016189513216813522, + "loss": 3.5358, + "step": 1420 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001618662429582551, + "loss": 3.4825, + "step": 1421 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016183735374837498, + "loss": 3.3527, + "step": 1422 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016180846453849488, + "loss": 3.5883, + "step": 1423 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016177957532861477, + "loss": 3.407, + "step": 1424 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016175068611873466, + "loss": 3.5245, + "step": 1425 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016172179690885456, + "loss": 3.4443, + "step": 1426 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016169290769897445, + "loss": 3.3916, + "step": 1427 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016166401848909435, + "loss": 3.3604, + "step": 1428 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001616351292792142, + "loss": 3.475, + "step": 1429 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001616062400693341, + "loss": 3.6084, + "step": 1430 + }, + { + "epoch": 0.2, + "learning_rate": 0.000161577350859454, + "loss": 3.5751, + "step": 1431 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001615484616495739, + "loss": 3.485, + "step": 1432 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001615195724396938, + "loss": 3.5544, + "step": 1433 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016149068322981365, + "loss": 3.5604, + "step": 1434 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016146179401993355, + "loss": 3.5081, + "step": 1435 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016143290481005344, + "loss": 3.4819, + "step": 1436 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016140401560017336, + "loss": 3.4616, + "step": 1437 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016137512639029323, + "loss": 3.5589, + "step": 1438 + }, + { + "epoch": 0.2, + "learning_rate": 0.00016134623718041312, + "loss": 3.4721, + "step": 1439 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016131734797053302, + "loss": 3.5011, + "step": 1440 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001612884587606529, + "loss": 3.5185, + "step": 1441 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001612595695507728, + "loss": 3.5604, + "step": 1442 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016123068034089267, + "loss": 3.577, + "step": 1443 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016120179113101257, + "loss": 3.3364, + "step": 1444 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016117290192113246, + "loss": 3.4346, + "step": 1445 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016114401271125235, + "loss": 3.5262, + "step": 1446 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016111512350137225, + "loss": 3.4903, + "step": 1447 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016108623429149214, + "loss": 3.2734, + "step": 1448 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016105734508161203, + "loss": 3.5319, + "step": 1449 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016102845587173193, + "loss": 3.4755, + "step": 1450 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001609995666618518, + "loss": 3.3927, + "step": 1451 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001609706774519717, + "loss": 3.523, + "step": 1452 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016094178824209158, + "loss": 3.448, + "step": 1453 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016091289903221148, + "loss": 3.5125, + "step": 1454 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016088400982233137, + "loss": 3.3653, + "step": 1455 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016085512061245124, + "loss": 3.5488, + "step": 1456 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016082623140257113, + "loss": 3.3857, + "step": 1457 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016079734219269105, + "loss": 3.4681, + "step": 1458 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016076845298281095, + "loss": 3.5075, + "step": 1459 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001607395637729308, + "loss": 3.4557, + "step": 1460 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001607106745630507, + "loss": 3.5808, + "step": 1461 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001606817853531706, + "loss": 3.6319, + "step": 1462 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001606528961432905, + "loss": 3.3235, + "step": 1463 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001606240069334104, + "loss": 3.4253, + "step": 1464 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016059511772353025, + "loss": 3.4317, + "step": 1465 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016056622851365015, + "loss": 3.3315, + "step": 1466 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016053733930377004, + "loss": 3.389, + "step": 1467 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016050845009388996, + "loss": 3.5869, + "step": 1468 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016047956088400983, + "loss": 3.4756, + "step": 1469 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016045067167412972, + "loss": 3.5431, + "step": 1470 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016042178246424962, + "loss": 3.5496, + "step": 1471 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001603928932543695, + "loss": 3.4815, + "step": 1472 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016036400404448938, + "loss": 3.4222, + "step": 1473 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016033511483460927, + "loss": 3.3973, + "step": 1474 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016030622562472917, + "loss": 3.5076, + "step": 1475 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016027733641484906, + "loss": 3.5157, + "step": 1476 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016024844720496895, + "loss": 3.3839, + "step": 1477 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016021955799508882, + "loss": 3.4792, + "step": 1478 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016019066878520874, + "loss": 3.4945, + "step": 1479 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016016177957532863, + "loss": 3.5246, + "step": 1480 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016013289036544853, + "loss": 3.3951, + "step": 1481 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001601040011555684, + "loss": 3.548, + "step": 1482 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001600751119456883, + "loss": 3.5085, + "step": 1483 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016004622273580818, + "loss": 3.3503, + "step": 1484 + }, + { + "epoch": 0.21, + "learning_rate": 0.00016001733352592808, + "loss": 3.5932, + "step": 1485 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015998844431604797, + "loss": 3.4716, + "step": 1486 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015995955510616784, + "loss": 3.6247, + "step": 1487 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015993066589628773, + "loss": 3.5502, + "step": 1488 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015990177668640765, + "loss": 3.4689, + "step": 1489 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015987288747652755, + "loss": 3.4942, + "step": 1490 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001598439982666474, + "loss": 3.4845, + "step": 1491 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001598151090567673, + "loss": 3.4916, + "step": 1492 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001597862198468872, + "loss": 3.3864, + "step": 1493 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001597573306370071, + "loss": 3.4457, + "step": 1494 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015972844142712696, + "loss": 3.3781, + "step": 1495 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015969955221724685, + "loss": 3.3533, + "step": 1496 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015967066300736675, + "loss": 3.5256, + "step": 1497 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015964177379748664, + "loss": 3.5881, + "step": 1498 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015961288458760654, + "loss": 3.4377, + "step": 1499 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015958399537772643, + "loss": 3.4979, + "step": 1500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015955510616784632, + "loss": 3.2714, + "step": 1501 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015952621695796622, + "loss": 3.5759, + "step": 1502 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001594973277480861, + "loss": 3.5093, + "step": 1503 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015946843853820598, + "loss": 3.2263, + "step": 1504 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015943954932832587, + "loss": 3.417, + "step": 1505 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015941066011844577, + "loss": 3.3492, + "step": 1506 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015938177090856566, + "loss": 3.4252, + "step": 1507 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015935288169868555, + "loss": 3.3727, + "step": 1508 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015932399248880542, + "loss": 3.5917, + "step": 1509 + }, + { + "epoch": 0.21, + "learning_rate": 0.00015929510327892534, + "loss": 3.4071, + "step": 1510 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015926621406904523, + "loss": 3.4108, + "step": 1511 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015923732485916513, + "loss": 3.5325, + "step": 1512 + }, + { + "epoch": 0.22, + "learning_rate": 0.000159208435649285, + "loss": 3.4885, + "step": 1513 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001591795464394049, + "loss": 3.4643, + "step": 1514 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015915065722952478, + "loss": 3.6002, + "step": 1515 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015912176801964468, + "loss": 3.3506, + "step": 1516 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015909287880976454, + "loss": 3.3864, + "step": 1517 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015906398959988444, + "loss": 3.4929, + "step": 1518 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015903510039000433, + "loss": 3.508, + "step": 1519 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015900621118012423, + "loss": 3.4869, + "step": 1520 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015897732197024412, + "loss": 3.4724, + "step": 1521 + }, + { + "epoch": 0.22, + "learning_rate": 0.000158948432760364, + "loss": 3.4725, + "step": 1522 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001589195435504839, + "loss": 3.4134, + "step": 1523 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001588906543406038, + "loss": 3.3603, + "step": 1524 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001588617651307237, + "loss": 3.4585, + "step": 1525 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015883287592084356, + "loss": 3.5225, + "step": 1526 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015880398671096345, + "loss": 3.4008, + "step": 1527 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015877509750108335, + "loss": 3.5919, + "step": 1528 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015874620829120324, + "loss": 3.5007, + "step": 1529 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015871731908132314, + "loss": 3.538, + "step": 1530 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015868842987144303, + "loss": 3.413, + "step": 1531 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015865954066156292, + "loss": 3.5105, + "step": 1532 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015863065145168282, + "loss": 3.3884, + "step": 1533 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001586017622418027, + "loss": 3.5485, + "step": 1534 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015857287303192258, + "loss": 3.5247, + "step": 1535 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015854398382204247, + "loss": 3.441, + "step": 1536 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015851509461216237, + "loss": 3.351, + "step": 1537 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015848620540228226, + "loss": 3.3742, + "step": 1538 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015845731619240213, + "loss": 3.4401, + "step": 1539 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015842842698252202, + "loss": 3.5104, + "step": 1540 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015839953777264191, + "loss": 3.3604, + "step": 1541 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015837064856276184, + "loss": 3.5065, + "step": 1542 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001583417593528817, + "loss": 3.3764, + "step": 1543 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001583128701430016, + "loss": 3.5065, + "step": 1544 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001582839809331215, + "loss": 3.4282, + "step": 1545 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015825509172324138, + "loss": 3.4819, + "step": 1546 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015822620251336128, + "loss": 3.4324, + "step": 1547 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015819731330348114, + "loss": 3.4524, + "step": 1548 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015816842409360104, + "loss": 3.5542, + "step": 1549 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015813953488372093, + "loss": 3.6157, + "step": 1550 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015811064567384083, + "loss": 3.4819, + "step": 1551 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015808175646396072, + "loss": 3.3803, + "step": 1552 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001580528672540806, + "loss": 3.2802, + "step": 1553 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001580239780442005, + "loss": 3.4394, + "step": 1554 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001579950888343204, + "loss": 3.3469, + "step": 1555 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015796619962444027, + "loss": 3.495, + "step": 1556 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015793731041456016, + "loss": 3.5226, + "step": 1557 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015790842120468006, + "loss": 3.4985, + "step": 1558 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015787953199479995, + "loss": 3.5033, + "step": 1559 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015785064278491984, + "loss": 3.3505, + "step": 1560 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001578217535750397, + "loss": 3.39, + "step": 1561 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001577928643651596, + "loss": 3.5471, + "step": 1562 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015776397515527952, + "loss": 3.4033, + "step": 1563 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015773508594539942, + "loss": 3.5035, + "step": 1564 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015770619673551928, + "loss": 3.5532, + "step": 1565 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015767730752563918, + "loss": 3.3897, + "step": 1566 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015764841831575907, + "loss": 3.5054, + "step": 1567 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015761952910587897, + "loss": 3.3783, + "step": 1568 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015759063989599886, + "loss": 3.588, + "step": 1569 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015756175068611873, + "loss": 3.4535, + "step": 1570 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015753286147623862, + "loss": 3.3799, + "step": 1571 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015750397226635851, + "loss": 3.396, + "step": 1572 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015747508305647844, + "loss": 3.385, + "step": 1573 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001574461938465983, + "loss": 3.1487, + "step": 1574 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001574173046367182, + "loss": 3.3268, + "step": 1575 + }, + { + "epoch": 0.22, + "learning_rate": 0.0001573884154268381, + "loss": 3.4515, + "step": 1576 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015735952621695798, + "loss": 3.5814, + "step": 1577 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015733063700707785, + "loss": 3.4728, + "step": 1578 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015730174779719774, + "loss": 3.5349, + "step": 1579 + }, + { + "epoch": 0.22, + "learning_rate": 0.00015727285858731764, + "loss": 3.5144, + "step": 1580 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015724396937743753, + "loss": 3.4857, + "step": 1581 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015721508016755743, + "loss": 3.5092, + "step": 1582 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001571861909576773, + "loss": 3.3719, + "step": 1583 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001571573017477972, + "loss": 3.4875, + "step": 1584 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001571284125379171, + "loss": 3.1237, + "step": 1585 + }, + { + "epoch": 0.23, + "learning_rate": 0.000157099523328037, + "loss": 3.4557, + "step": 1586 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015707063411815687, + "loss": 3.3924, + "step": 1587 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015704174490827676, + "loss": 3.4948, + "step": 1588 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015701285569839666, + "loss": 3.505, + "step": 1589 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015698396648851655, + "loss": 3.4542, + "step": 1590 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015695507727863644, + "loss": 3.4866, + "step": 1591 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001569261880687563, + "loss": 3.435, + "step": 1592 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001568972988588762, + "loss": 3.4887, + "step": 1593 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015686840964899612, + "loss": 3.3567, + "step": 1594 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015683952043911602, + "loss": 3.4085, + "step": 1595 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015681063122923588, + "loss": 3.4561, + "step": 1596 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015678174201935578, + "loss": 3.5834, + "step": 1597 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015675285280947567, + "loss": 3.3371, + "step": 1598 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015672396359959557, + "loss": 3.374, + "step": 1599 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015669507438971543, + "loss": 3.419, + "step": 1600 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015666618517983533, + "loss": 3.5542, + "step": 1601 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015663729596995522, + "loss": 3.3417, + "step": 1602 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015660840676007511, + "loss": 3.3868, + "step": 1603 + }, + { + "epoch": 0.23, + "learning_rate": 0.000156579517550195, + "loss": 3.4926, + "step": 1604 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001565506283403149, + "loss": 3.3727, + "step": 1605 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001565217391304348, + "loss": 3.4854, + "step": 1606 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001564928499205547, + "loss": 3.4923, + "step": 1607 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015646396071067458, + "loss": 3.4982, + "step": 1608 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015643507150079445, + "loss": 3.5484, + "step": 1609 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015640618229091434, + "loss": 3.4584, + "step": 1610 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015637729308103424, + "loss": 3.4635, + "step": 1611 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015634840387115413, + "loss": 3.336, + "step": 1612 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015631951466127403, + "loss": 3.4563, + "step": 1613 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001562906254513939, + "loss": 3.4882, + "step": 1614 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001562617362415138, + "loss": 3.4069, + "step": 1615 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001562328470316337, + "loss": 3.3776, + "step": 1616 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001562039578217536, + "loss": 3.4601, + "step": 1617 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015617506861187347, + "loss": 3.4655, + "step": 1618 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015614617940199336, + "loss": 3.4411, + "step": 1619 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015611729019211326, + "loss": 3.424, + "step": 1620 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015608840098223315, + "loss": 3.5681, + "step": 1621 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015605951177235302, + "loss": 3.5628, + "step": 1622 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001560306225624729, + "loss": 3.6016, + "step": 1623 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001560017333525928, + "loss": 3.3233, + "step": 1624 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015597284414271272, + "loss": 3.4938, + "step": 1625 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001559439549328326, + "loss": 3.5279, + "step": 1626 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015591506572295249, + "loss": 3.351, + "step": 1627 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015588617651307238, + "loss": 3.4287, + "step": 1628 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015585728730319227, + "loss": 3.5784, + "step": 1629 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015582839809331217, + "loss": 3.5075, + "step": 1630 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015579950888343203, + "loss": 3.2393, + "step": 1631 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015577061967355193, + "loss": 3.44, + "step": 1632 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015574173046367182, + "loss": 3.4562, + "step": 1633 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015571284125379171, + "loss": 3.4367, + "step": 1634 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001556839520439116, + "loss": 3.3648, + "step": 1635 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001556550628340315, + "loss": 3.4151, + "step": 1636 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001556261736241514, + "loss": 3.4663, + "step": 1637 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001555972844142713, + "loss": 3.3799, + "step": 1638 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015556839520439118, + "loss": 3.4718, + "step": 1639 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015553950599451105, + "loss": 3.3207, + "step": 1640 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015551061678463094, + "loss": 3.5074, + "step": 1641 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015548172757475084, + "loss": 3.4117, + "step": 1642 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015545283836487073, + "loss": 3.4783, + "step": 1643 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001554239491549906, + "loss": 3.477, + "step": 1644 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001553950599451105, + "loss": 3.41, + "step": 1645 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001553661707352304, + "loss": 3.4416, + "step": 1646 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001553372815253503, + "loss": 3.5455, + "step": 1647 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015530839231547017, + "loss": 3.3198, + "step": 1648 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015527950310559007, + "loss": 3.5073, + "step": 1649 + }, + { + "epoch": 0.23, + "learning_rate": 0.00015525061389570996, + "loss": 3.3613, + "step": 1650 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015522172468582986, + "loss": 3.4919, + "step": 1651 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015519283547594975, + "loss": 3.371, + "step": 1652 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015516394626606962, + "loss": 3.5187, + "step": 1653 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001551350570561895, + "loss": 3.3989, + "step": 1654 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001551061678463094, + "loss": 3.4935, + "step": 1655 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001550772786364293, + "loss": 3.5323, + "step": 1656 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001550483894265492, + "loss": 3.5203, + "step": 1657 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015501950021666909, + "loss": 3.395, + "step": 1658 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015499061100678898, + "loss": 3.4209, + "step": 1659 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015496172179690887, + "loss": 3.5168, + "step": 1660 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015493283258702877, + "loss": 3.5469, + "step": 1661 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015490394337714863, + "loss": 3.4716, + "step": 1662 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015487505416726853, + "loss": 3.4165, + "step": 1663 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015484616495738842, + "loss": 3.3731, + "step": 1664 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015481727574750831, + "loss": 3.5362, + "step": 1665 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015478838653762818, + "loss": 3.5687, + "step": 1666 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001547594973277481, + "loss": 3.5811, + "step": 1667 + }, + { + "epoch": 0.24, + "learning_rate": 0.000154730608117868, + "loss": 3.4635, + "step": 1668 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001547017189079879, + "loss": 3.3834, + "step": 1669 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015467282969810776, + "loss": 3.4669, + "step": 1670 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015464394048822765, + "loss": 3.4839, + "step": 1671 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015461505127834754, + "loss": 3.4275, + "step": 1672 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015458616206846744, + "loss": 3.5108, + "step": 1673 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015455727285858733, + "loss": 3.5041, + "step": 1674 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001545283836487072, + "loss": 3.4974, + "step": 1675 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001544994944388271, + "loss": 3.3863, + "step": 1676 + }, + { + "epoch": 0.24, + "learning_rate": 0.000154470605228947, + "loss": 3.4526, + "step": 1677 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001544417160190669, + "loss": 3.5786, + "step": 1678 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015441282680918677, + "loss": 3.3061, + "step": 1679 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015438393759930667, + "loss": 3.4446, + "step": 1680 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015435504838942656, + "loss": 3.486, + "step": 1681 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015432615917954646, + "loss": 3.4333, + "step": 1682 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015429726996966632, + "loss": 3.3177, + "step": 1683 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015426838075978622, + "loss": 3.5748, + "step": 1684 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001542394915499061, + "loss": 3.495, + "step": 1685 + }, + { + "epoch": 0.24, + "learning_rate": 0.000154210602340026, + "loss": 3.4927, + "step": 1686 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001541817131301459, + "loss": 3.4502, + "step": 1687 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001541528239202658, + "loss": 3.4559, + "step": 1688 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015412393471038569, + "loss": 3.3388, + "step": 1689 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015409504550050558, + "loss": 3.437, + "step": 1690 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015406615629062547, + "loss": 3.3642, + "step": 1691 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015403726708074534, + "loss": 3.4297, + "step": 1692 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015400837787086523, + "loss": 3.3197, + "step": 1693 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015397948866098513, + "loss": 3.5311, + "step": 1694 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015395059945110502, + "loss": 3.3404, + "step": 1695 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015392171024122492, + "loss": 3.4481, + "step": 1696 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015389282103134478, + "loss": 3.4041, + "step": 1697 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015386393182146468, + "loss": 3.4734, + "step": 1698 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001538350426115846, + "loss": 3.5042, + "step": 1699 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001538061534017045, + "loss": 3.3428, + "step": 1700 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015377726419182436, + "loss": 3.5199, + "step": 1701 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015374837498194425, + "loss": 3.3486, + "step": 1702 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015371948577206414, + "loss": 3.3362, + "step": 1703 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015369059656218404, + "loss": 3.3543, + "step": 1704 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001536617073523039, + "loss": 3.4948, + "step": 1705 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001536328181424238, + "loss": 3.4326, + "step": 1706 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001536039289325437, + "loss": 3.3218, + "step": 1707 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001535750397226636, + "loss": 3.4999, + "step": 1708 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015354615051278348, + "loss": 3.4222, + "step": 1709 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015351726130290337, + "loss": 3.0983, + "step": 1710 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015348837209302327, + "loss": 3.447, + "step": 1711 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015345948288314316, + "loss": 3.4486, + "step": 1712 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015343059367326306, + "loss": 3.398, + "step": 1713 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015340170446338292, + "loss": 3.3366, + "step": 1714 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015337281525350282, + "loss": 3.5535, + "step": 1715 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001533439260436227, + "loss": 3.4162, + "step": 1716 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001533150368337426, + "loss": 3.3429, + "step": 1717 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001532861476238625, + "loss": 3.4877, + "step": 1718 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015325725841398236, + "loss": 3.4085, + "step": 1719 + }, + { + "epoch": 0.24, + "learning_rate": 0.00015322836920410229, + "loss": 3.5294, + "step": 1720 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015319947999422218, + "loss": 3.3905, + "step": 1721 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015317059078434207, + "loss": 3.4958, + "step": 1722 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015314170157446194, + "loss": 3.4739, + "step": 1723 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015311281236458183, + "loss": 3.4774, + "step": 1724 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015308392315470173, + "loss": 3.4016, + "step": 1725 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015305503394482162, + "loss": 3.4329, + "step": 1726 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001530261447349415, + "loss": 3.4621, + "step": 1727 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015299725552506138, + "loss": 3.4899, + "step": 1728 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015296836631518128, + "loss": 3.397, + "step": 1729 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001529394771053012, + "loss": 3.4158, + "step": 1730 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015291058789542106, + "loss": 3.455, + "step": 1731 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015288169868554096, + "loss": 3.5955, + "step": 1732 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015285280947566085, + "loss": 3.4992, + "step": 1733 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015282392026578074, + "loss": 3.1957, + "step": 1734 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015279503105590064, + "loss": 3.4251, + "step": 1735 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001527661418460205, + "loss": 3.3176, + "step": 1736 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001527372526361404, + "loss": 3.5188, + "step": 1737 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001527083634262603, + "loss": 3.4461, + "step": 1738 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001526794742163802, + "loss": 3.3749, + "step": 1739 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015265058500650008, + "loss": 3.4516, + "step": 1740 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015262169579661997, + "loss": 3.5357, + "step": 1741 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015259280658673987, + "loss": 3.4015, + "step": 1742 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015256391737685976, + "loss": 3.5292, + "step": 1743 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015253502816697966, + "loss": 3.4333, + "step": 1744 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015250613895709952, + "loss": 3.3426, + "step": 1745 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015247724974721942, + "loss": 3.4477, + "step": 1746 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001524483605373393, + "loss": 3.4709, + "step": 1747 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001524194713274592, + "loss": 3.4584, + "step": 1748 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015239058211757907, + "loss": 3.448, + "step": 1749 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015236169290769896, + "loss": 3.4082, + "step": 1750 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015233280369781889, + "loss": 3.4354, + "step": 1751 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015230391448793878, + "loss": 3.4206, + "step": 1752 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015227502527805865, + "loss": 3.3987, + "step": 1753 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015224613606817854, + "loss": 3.3487, + "step": 1754 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015221724685829843, + "loss": 3.5518, + "step": 1755 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015218835764841833, + "loss": 3.341, + "step": 1756 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015215946843853822, + "loss": 3.4783, + "step": 1757 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001521305792286581, + "loss": 3.3948, + "step": 1758 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015210169001877798, + "loss": 3.2644, + "step": 1759 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015207280080889788, + "loss": 3.4853, + "step": 1760 + }, + { + "epoch": 0.25, + "eval_loss": 3.5950064659118652, + "eval_runtime": 471.9526, + "eval_samples_per_second": 43.409, + "eval_steps_per_second": 14.47, + "step": 1760 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015204391159901777, + "loss": 3.4051, + "step": 1761 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015201502238913766, + "loss": 3.3596, + "step": 1762 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015198613317925756, + "loss": 3.4631, + "step": 1763 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015195724396937745, + "loss": 3.416, + "step": 1764 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015192835475949735, + "loss": 3.4119, + "step": 1765 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015189946554961724, + "loss": 3.3318, + "step": 1766 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001518705763397371, + "loss": 3.4591, + "step": 1767 + }, + { + "epoch": 0.25, + "learning_rate": 0.000151841687129857, + "loss": 3.483, + "step": 1768 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001518127979199769, + "loss": 3.4671, + "step": 1769 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001517839087100968, + "loss": 3.3936, + "step": 1770 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015175501950021665, + "loss": 3.4857, + "step": 1771 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015172613029033657, + "loss": 3.3232, + "step": 1772 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015169724108045647, + "loss": 3.4324, + "step": 1773 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015166835187057636, + "loss": 3.4477, + "step": 1774 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015163946266069623, + "loss": 3.593, + "step": 1775 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015161057345081612, + "loss": 3.2523, + "step": 1776 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015158168424093602, + "loss": 3.4394, + "step": 1777 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001515527950310559, + "loss": 3.4362, + "step": 1778 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001515239058211758, + "loss": 3.5046, + "step": 1779 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015149501661129567, + "loss": 3.35, + "step": 1780 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015146612740141556, + "loss": 3.4668, + "step": 1781 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015143723819153546, + "loss": 3.2848, + "step": 1782 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015140834898165538, + "loss": 3.5605, + "step": 1783 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015137945977177525, + "loss": 3.3905, + "step": 1784 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015135057056189514, + "loss": 3.4506, + "step": 1785 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015132168135201503, + "loss": 3.4539, + "step": 1786 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015129279214213493, + "loss": 3.532, + "step": 1787 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015126390293225482, + "loss": 3.4072, + "step": 1788 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001512350137223747, + "loss": 3.5478, + "step": 1789 + }, + { + "epoch": 0.25, + "learning_rate": 0.00015120612451249458, + "loss": 3.5377, + "step": 1790 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015117723530261448, + "loss": 3.3438, + "step": 1791 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015114834609273437, + "loss": 3.4662, + "step": 1792 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015111945688285426, + "loss": 3.3743, + "step": 1793 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015109056767297416, + "loss": 3.4603, + "step": 1794 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015106167846309405, + "loss": 3.5661, + "step": 1795 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015103278925321395, + "loss": 3.6666, + "step": 1796 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001510039000433338, + "loss": 3.5021, + "step": 1797 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001509750108334537, + "loss": 3.4549, + "step": 1798 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001509461216235736, + "loss": 3.4754, + "step": 1799 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001509172324136935, + "loss": 3.3963, + "step": 1800 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001508883432038134, + "loss": 3.5162, + "step": 1801 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015085945399393325, + "loss": 3.4586, + "step": 1802 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015083056478405317, + "loss": 3.4553, + "step": 1803 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015080167557417307, + "loss": 3.4765, + "step": 1804 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015077278636429296, + "loss": 3.5084, + "step": 1805 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015074389715441283, + "loss": 3.3074, + "step": 1806 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015071500794453272, + "loss": 3.4487, + "step": 1807 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015068611873465262, + "loss": 3.4417, + "step": 1808 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001506572295247725, + "loss": 3.364, + "step": 1809 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001506283403148924, + "loss": 3.4581, + "step": 1810 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015059945110501227, + "loss": 3.4035, + "step": 1811 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015057056189513217, + "loss": 3.4665, + "step": 1812 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015054167268525206, + "loss": 3.4461, + "step": 1813 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015051278347537195, + "loss": 3.4914, + "step": 1814 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015048389426549185, + "loss": 3.3571, + "step": 1815 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015045500505561174, + "loss": 3.3196, + "step": 1816 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015042611584573163, + "loss": 3.4052, + "step": 1817 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015039722663585153, + "loss": 3.341, + "step": 1818 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001503683374259714, + "loss": 3.5346, + "step": 1819 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001503394482160913, + "loss": 3.2565, + "step": 1820 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015031055900621118, + "loss": 3.5817, + "step": 1821 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015028166979633108, + "loss": 3.5139, + "step": 1822 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015025278058645097, + "loss": 3.5845, + "step": 1823 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015022389137657086, + "loss": 3.4028, + "step": 1824 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015019500216669076, + "loss": 3.555, + "step": 1825 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015016611295681065, + "loss": 3.3109, + "step": 1826 + }, + { + "epoch": 0.26, + "learning_rate": 0.00015013722374693055, + "loss": 3.5086, + "step": 1827 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001501083345370504, + "loss": 3.3717, + "step": 1828 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001500794453271703, + "loss": 3.375, + "step": 1829 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001500505561172902, + "loss": 3.4272, + "step": 1830 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001500216669074101, + "loss": 3.5006, + "step": 1831 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014999277769752996, + "loss": 3.4847, + "step": 1832 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014996388848764985, + "loss": 3.4681, + "step": 1833 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014993499927776975, + "loss": 3.342, + "step": 1834 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014990611006788967, + "loss": 3.3611, + "step": 1835 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014987722085800954, + "loss": 3.3995, + "step": 1836 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014984833164812943, + "loss": 3.4514, + "step": 1837 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014981944243824932, + "loss": 3.5673, + "step": 1838 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014979055322836922, + "loss": 3.4605, + "step": 1839 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001497616640184891, + "loss": 3.5023, + "step": 1840 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014973277480860898, + "loss": 3.3265, + "step": 1841 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014970388559872887, + "loss": 3.4755, + "step": 1842 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014967499638884877, + "loss": 3.3242, + "step": 1843 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014964610717896866, + "loss": 3.3834, + "step": 1844 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014961721796908855, + "loss": 3.4341, + "step": 1845 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014958832875920845, + "loss": 3.5213, + "step": 1846 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014955943954932834, + "loss": 3.543, + "step": 1847 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014953055033944823, + "loss": 3.4442, + "step": 1848 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014950166112956813, + "loss": 3.5597, + "step": 1849 + }, + { + "epoch": 0.26, + "learning_rate": 0.000149472771919688, + "loss": 3.3681, + "step": 1850 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001494438827098079, + "loss": 3.3805, + "step": 1851 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014941499349992778, + "loss": 3.4571, + "step": 1852 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014938610429004768, + "loss": 3.444, + "step": 1853 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014935721508016754, + "loss": 3.4871, + "step": 1854 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014932832587028744, + "loss": 3.4849, + "step": 1855 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014929943666040736, + "loss": 3.2477, + "step": 1856 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014927054745052725, + "loss": 3.3903, + "step": 1857 + }, + { + "epoch": 0.26, + "learning_rate": 0.00014924165824064712, + "loss": 3.5671, + "step": 1858 + }, + { + "epoch": 0.26, + "learning_rate": 0.000149212769030767, + "loss": 3.2997, + "step": 1859 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001491838798208869, + "loss": 3.5099, + "step": 1860 + }, + { + "epoch": 0.26, + "learning_rate": 0.0001491549906110068, + "loss": 3.3259, + "step": 1861 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001491261014011267, + "loss": 3.4265, + "step": 1862 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014909721219124656, + "loss": 3.357, + "step": 1863 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014906832298136645, + "loss": 3.6089, + "step": 1864 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014903943377148635, + "loss": 3.4923, + "step": 1865 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014901054456160627, + "loss": 3.5827, + "step": 1866 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014898165535172614, + "loss": 3.4838, + "step": 1867 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014895276614184603, + "loss": 3.3623, + "step": 1868 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014892387693196592, + "loss": 3.4234, + "step": 1869 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014889498772208582, + "loss": 3.42, + "step": 1870 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001488660985122057, + "loss": 3.4147, + "step": 1871 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014883720930232558, + "loss": 3.4886, + "step": 1872 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014880832009244547, + "loss": 3.2949, + "step": 1873 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014877943088256537, + "loss": 3.4814, + "step": 1874 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014875054167268526, + "loss": 3.4084, + "step": 1875 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014872165246280513, + "loss": 3.4247, + "step": 1876 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014869276325292505, + "loss": 3.4906, + "step": 1877 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014866387404304494, + "loss": 3.4889, + "step": 1878 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014863498483316483, + "loss": 3.4885, + "step": 1879 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001486060956232847, + "loss": 3.4409, + "step": 1880 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001485772064134046, + "loss": 3.2834, + "step": 1881 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001485483172035245, + "loss": 3.4257, + "step": 1882 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014851942799364438, + "loss": 3.5979, + "step": 1883 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014849053878376428, + "loss": 3.3264, + "step": 1884 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014846164957388414, + "loss": 3.4631, + "step": 1885 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014843276036400404, + "loss": 3.4964, + "step": 1886 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014840387115412396, + "loss": 3.5215, + "step": 1887 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014837498194424385, + "loss": 3.4671, + "step": 1888 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014834609273436372, + "loss": 3.4544, + "step": 1889 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001483172035244836, + "loss": 3.4728, + "step": 1890 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001482883143146035, + "loss": 3.5478, + "step": 1891 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001482594251047234, + "loss": 3.3491, + "step": 1892 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001482305358948433, + "loss": 3.5396, + "step": 1893 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014820164668496316, + "loss": 3.4655, + "step": 1894 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014817275747508305, + "loss": 3.3832, + "step": 1895 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014814386826520295, + "loss": 3.485, + "step": 1896 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014811497905532284, + "loss": 3.5032, + "step": 1897 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014808608984544274, + "loss": 3.4827, + "step": 1898 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014805720063556263, + "loss": 3.4626, + "step": 1899 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014802831142568252, + "loss": 3.3802, + "step": 1900 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014799942221580242, + "loss": 3.5136, + "step": 1901 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014797053300592228, + "loss": 3.5037, + "step": 1902 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014794164379604218, + "loss": 3.5207, + "step": 1903 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014791275458616207, + "loss": 3.4731, + "step": 1904 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014788386537628197, + "loss": 3.4823, + "step": 1905 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014785497616640186, + "loss": 3.4794, + "step": 1906 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014782608695652173, + "loss": 3.4531, + "step": 1907 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014779719774664165, + "loss": 3.3548, + "step": 1908 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014776830853676154, + "loss": 3.4536, + "step": 1909 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014773941932688143, + "loss": 3.5259, + "step": 1910 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001477105301170013, + "loss": 3.4277, + "step": 1911 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001476816409071212, + "loss": 3.3827, + "step": 1912 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001476527516972411, + "loss": 3.4711, + "step": 1913 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014762386248736098, + "loss": 3.4863, + "step": 1914 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014759497327748088, + "loss": 3.5021, + "step": 1915 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014756608406760074, + "loss": 3.3306, + "step": 1916 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014753719485772064, + "loss": 3.5057, + "step": 1917 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014750830564784053, + "loss": 3.4257, + "step": 1918 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014747941643796042, + "loss": 3.3859, + "step": 1919 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014745052722808032, + "loss": 3.488, + "step": 1920 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001474216380182002, + "loss": 3.2959, + "step": 1921 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001473927488083201, + "loss": 3.5709, + "step": 1922 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014736385959844, + "loss": 3.3738, + "step": 1923 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014733497038855987, + "loss": 3.4031, + "step": 1924 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014730608117867976, + "loss": 3.3902, + "step": 1925 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014727719196879965, + "loss": 3.3969, + "step": 1926 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014724830275891955, + "loss": 3.3293, + "step": 1927 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014721941354903944, + "loss": 3.442, + "step": 1928 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014719052433915934, + "loss": 3.4116, + "step": 1929 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014716163512927923, + "loss": 3.5437, + "step": 1930 + }, + { + "epoch": 0.27, + "learning_rate": 0.00014713274591939912, + "loss": 3.4743, + "step": 1931 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014710385670951902, + "loss": 3.281, + "step": 1932 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014707496749963888, + "loss": 3.4105, + "step": 1933 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014704607828975878, + "loss": 3.6074, + "step": 1934 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014701718907987867, + "loss": 3.5347, + "step": 1935 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014698829986999857, + "loss": 3.4239, + "step": 1936 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014695941066011846, + "loss": 3.4338, + "step": 1937 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014693052145023833, + "loss": 3.4685, + "step": 1938 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014690163224035822, + "loss": 3.3967, + "step": 1939 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014687274303047814, + "loss": 3.4177, + "step": 1940 + }, + { + "epoch": 0.28, + "learning_rate": 0.000146843853820598, + "loss": 3.4269, + "step": 1941 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001468149646107179, + "loss": 3.4652, + "step": 1942 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001467860754008378, + "loss": 3.4627, + "step": 1943 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001467571861909577, + "loss": 3.5027, + "step": 1944 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014672829698107758, + "loss": 3.5213, + "step": 1945 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014669940777119745, + "loss": 3.5287, + "step": 1946 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014667051856131734, + "loss": 3.5134, + "step": 1947 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014664162935143724, + "loss": 3.439, + "step": 1948 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014661274014155713, + "loss": 3.4804, + "step": 1949 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014658385093167703, + "loss": 3.2259, + "step": 1950 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014655496172179692, + "loss": 3.4906, + "step": 1951 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001465260725119168, + "loss": 3.4134, + "step": 1952 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001464971833020367, + "loss": 3.5899, + "step": 1953 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001464682940921566, + "loss": 3.4322, + "step": 1954 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014643940488227647, + "loss": 3.2959, + "step": 1955 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014641051567239636, + "loss": 3.3999, + "step": 1956 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014638162646251625, + "loss": 3.444, + "step": 1957 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014635273725263615, + "loss": 3.4835, + "step": 1958 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014632384804275602, + "loss": 3.4171, + "step": 1959 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001462949588328759, + "loss": 3.3343, + "step": 1960 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014626606962299583, + "loss": 3.4857, + "step": 1961 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014623718041311572, + "loss": 3.2975, + "step": 1962 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001462082912032356, + "loss": 3.3762, + "step": 1963 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014617940199335548, + "loss": 3.5087, + "step": 1964 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014615051278347538, + "loss": 3.5289, + "step": 1965 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014612162357359527, + "loss": 3.4127, + "step": 1966 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014609273436371517, + "loss": 3.4432, + "step": 1967 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014606384515383503, + "loss": 3.4058, + "step": 1968 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014603495594395493, + "loss": 3.5486, + "step": 1969 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014600606673407482, + "loss": 3.491, + "step": 1970 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014597717752419474, + "loss": 3.3858, + "step": 1971 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001459482883143146, + "loss": 3.3609, + "step": 1972 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001459193991044345, + "loss": 3.4312, + "step": 1973 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001458905098945544, + "loss": 3.403, + "step": 1974 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001458616206846743, + "loss": 3.2376, + "step": 1975 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014583273147479418, + "loss": 3.4582, + "step": 1976 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014580384226491405, + "loss": 3.5404, + "step": 1977 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014577495305503394, + "loss": 3.5013, + "step": 1978 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014574606384515384, + "loss": 3.4204, + "step": 1979 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014571717463527373, + "loss": 3.3112, + "step": 1980 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014568828542539363, + "loss": 3.562, + "step": 1981 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014565939621551352, + "loss": 3.2597, + "step": 1982 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001456305070056334, + "loss": 3.4568, + "step": 1983 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001456016177957533, + "loss": 3.5787, + "step": 1984 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014557272858587317, + "loss": 3.4654, + "step": 1985 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014554383937599307, + "loss": 3.5657, + "step": 1986 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014551495016611296, + "loss": 3.4461, + "step": 1987 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014548606095623285, + "loss": 3.5129, + "step": 1988 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014545717174635275, + "loss": 3.5618, + "step": 1989 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014542828253647262, + "loss": 3.324, + "step": 1990 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001453993933265925, + "loss": 3.3, + "step": 1991 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014537050411671243, + "loss": 3.395, + "step": 1992 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014534161490683232, + "loss": 3.5426, + "step": 1993 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001453127256969522, + "loss": 3.4614, + "step": 1994 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014528383648707208, + "loss": 3.3017, + "step": 1995 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014525494727719198, + "loss": 3.4191, + "step": 1996 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014522605806731187, + "loss": 3.4152, + "step": 1997 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014519716885743177, + "loss": 3.2961, + "step": 1998 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014516827964755163, + "loss": 3.3462, + "step": 1999 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014513939043767153, + "loss": 3.1992, + "step": 2000 + }, + { + "epoch": 0.28, + "learning_rate": 0.00014511050122779142, + "loss": 3.4572, + "step": 2001 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014508161201791134, + "loss": 3.3327, + "step": 2002 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001450527228080312, + "loss": 3.4493, + "step": 2003 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001450238335981511, + "loss": 3.4782, + "step": 2004 + }, + { + "epoch": 0.29, + "learning_rate": 0.000144994944388271, + "loss": 3.5013, + "step": 2005 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001449660551783909, + "loss": 3.3044, + "step": 2006 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014493716596851076, + "loss": 3.65, + "step": 2007 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014490827675863065, + "loss": 3.4377, + "step": 2008 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014487938754875054, + "loss": 3.2571, + "step": 2009 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014485049833887044, + "loss": 3.4659, + "step": 2010 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014482160912899033, + "loss": 3.3477, + "step": 2011 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001447927199191102, + "loss": 3.4271, + "step": 2012 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014476383070923012, + "loss": 3.5187, + "step": 2013 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014473494149935, + "loss": 3.4195, + "step": 2014 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001447060522894699, + "loss": 3.4035, + "step": 2015 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014467716307958977, + "loss": 3.3835, + "step": 2016 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014464827386970967, + "loss": 3.5223, + "step": 2017 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014461938465982956, + "loss": 3.412, + "step": 2018 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014459049544994946, + "loss": 3.3719, + "step": 2019 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014456160624006935, + "loss": 3.5132, + "step": 2020 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014453271703018922, + "loss": 3.4407, + "step": 2021 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001445038278203091, + "loss": 3.5427, + "step": 2022 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014447493861042903, + "loss": 3.3198, + "step": 2023 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001444460494005489, + "loss": 3.4871, + "step": 2024 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001444171601906688, + "loss": 3.3314, + "step": 2025 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014438827098078868, + "loss": 3.4372, + "step": 2026 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014435938177090858, + "loss": 3.2933, + "step": 2027 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014433049256102847, + "loss": 3.4497, + "step": 2028 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014430160335114834, + "loss": 3.501, + "step": 2029 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014427271414126823, + "loss": 3.3996, + "step": 2030 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014424382493138813, + "loss": 3.5048, + "step": 2031 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014421493572150802, + "loss": 3.4347, + "step": 2032 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014418604651162791, + "loss": 3.2714, + "step": 2033 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001441571573017478, + "loss": 3.5157, + "step": 2034 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001441282680918677, + "loss": 3.4685, + "step": 2035 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001440993788819876, + "loss": 3.4565, + "step": 2036 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001440704896721075, + "loss": 3.4407, + "step": 2037 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014404160046222736, + "loss": 3.4024, + "step": 2038 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014401271125234725, + "loss": 3.4478, + "step": 2039 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014398382204246714, + "loss": 3.5271, + "step": 2040 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014395493283258704, + "loss": 3.5119, + "step": 2041 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014392604362270693, + "loss": 3.2467, + "step": 2042 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001438971544128268, + "loss": 3.504, + "step": 2043 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014386826520294672, + "loss": 3.4109, + "step": 2044 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001438393759930666, + "loss": 3.3956, + "step": 2045 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014381048678318648, + "loss": 3.4761, + "step": 2046 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014378159757330637, + "loss": 3.5453, + "step": 2047 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014375270836342627, + "loss": 3.317, + "step": 2048 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014372381915354616, + "loss": 3.4624, + "step": 2049 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014369492994366606, + "loss": 3.3344, + "step": 2050 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014366604073378592, + "loss": 3.3991, + "step": 2051 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014363715152390582, + "loss": 3.3717, + "step": 2052 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001436082623140257, + "loss": 3.4737, + "step": 2053 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001435793731041456, + "loss": 3.4253, + "step": 2054 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001435504838942655, + "loss": 3.25, + "step": 2055 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001435215946843854, + "loss": 3.4574, + "step": 2056 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014349270547450529, + "loss": 3.2754, + "step": 2057 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014346381626462518, + "loss": 3.391, + "step": 2058 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014343492705474507, + "loss": 3.5016, + "step": 2059 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014340603784486494, + "loss": 3.3742, + "step": 2060 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014337714863498483, + "loss": 3.3986, + "step": 2061 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014334825942510473, + "loss": 3.2683, + "step": 2062 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014331937021522462, + "loss": 3.3845, + "step": 2063 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014329048100534451, + "loss": 3.2227, + "step": 2064 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001432615917954644, + "loss": 3.3658, + "step": 2065 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001432327025855843, + "loss": 3.455, + "step": 2066 + }, + { + "epoch": 0.29, + "learning_rate": 0.0001432038133757042, + "loss": 3.3727, + "step": 2067 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014317492416582406, + "loss": 3.2789, + "step": 2068 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014314603495594396, + "loss": 3.4471, + "step": 2069 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014311714574606385, + "loss": 3.412, + "step": 2070 + }, + { + "epoch": 0.29, + "learning_rate": 0.00014308825653618374, + "loss": 3.573, + "step": 2071 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014305936732630364, + "loss": 3.3721, + "step": 2072 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001430304781164235, + "loss": 3.5033, + "step": 2073 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001430015889065434, + "loss": 3.4696, + "step": 2074 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001429726996966633, + "loss": 3.4221, + "step": 2075 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001429438104867832, + "loss": 3.5093, + "step": 2076 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014291492127690308, + "loss": 3.4657, + "step": 2077 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014288603206702297, + "loss": 3.3822, + "step": 2078 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014285714285714287, + "loss": 3.4654, + "step": 2079 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014282825364726276, + "loss": 3.3517, + "step": 2080 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014279936443738266, + "loss": 3.4709, + "step": 2081 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014277047522750252, + "loss": 3.4689, + "step": 2082 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014274158601762242, + "loss": 3.4866, + "step": 2083 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001427126968077423, + "loss": 3.266, + "step": 2084 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001426838075978622, + "loss": 3.4905, + "step": 2085 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001426549183879821, + "loss": 3.5003, + "step": 2086 + }, + { + "epoch": 0.3, + "learning_rate": 0.000142626029178102, + "loss": 3.4256, + "step": 2087 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014259713996822189, + "loss": 3.4049, + "step": 2088 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014256825075834178, + "loss": 3.5358, + "step": 2089 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014253936154846165, + "loss": 3.4838, + "step": 2090 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014251047233858154, + "loss": 3.3808, + "step": 2091 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014248158312870143, + "loss": 3.3242, + "step": 2092 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014245269391882133, + "loss": 3.336, + "step": 2093 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014242380470894122, + "loss": 3.3955, + "step": 2094 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001423949154990611, + "loss": 3.5222, + "step": 2095 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014236602628918098, + "loss": 3.4184, + "step": 2096 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001423371370793009, + "loss": 3.4204, + "step": 2097 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001423082478694208, + "loss": 3.4188, + "step": 2098 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014227935865954066, + "loss": 3.4482, + "step": 2099 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014225046944966056, + "loss": 3.4424, + "step": 2100 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014222158023978045, + "loss": 3.3663, + "step": 2101 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014219269102990034, + "loss": 3.3478, + "step": 2102 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014216380182002024, + "loss": 3.2278, + "step": 2103 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001421349126101401, + "loss": 3.4806, + "step": 2104 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014210602340026, + "loss": 3.5108, + "step": 2105 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001420771341903799, + "loss": 3.5468, + "step": 2106 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014204824498049981, + "loss": 3.4712, + "step": 2107 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014201935577061968, + "loss": 3.4026, + "step": 2108 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014199046656073957, + "loss": 3.4801, + "step": 2109 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014196157735085947, + "loss": 3.4062, + "step": 2110 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014193268814097936, + "loss": 3.4245, + "step": 2111 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014190379893109923, + "loss": 3.475, + "step": 2112 + }, + { + "epoch": 0.3, + "eval_loss": 3.580632209777832, + "eval_runtime": 471.7483, + "eval_samples_per_second": 43.428, + "eval_steps_per_second": 14.476, + "step": 2112 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014187490972121912, + "loss": 3.4145, + "step": 2113 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014184602051133902, + "loss": 3.3238, + "step": 2114 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001418171313014589, + "loss": 3.3204, + "step": 2115 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001417882420915788, + "loss": 3.2878, + "step": 2116 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014175935288169867, + "loss": 3.4137, + "step": 2117 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001417304636718186, + "loss": 3.4031, + "step": 2118 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014170157446193849, + "loss": 3.4523, + "step": 2119 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014167268525205838, + "loss": 3.3077, + "step": 2120 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014164379604217825, + "loss": 3.4158, + "step": 2121 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014161490683229814, + "loss": 3.3561, + "step": 2122 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014158601762241803, + "loss": 3.3478, + "step": 2123 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014155712841253793, + "loss": 3.4943, + "step": 2124 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014152823920265782, + "loss": 3.387, + "step": 2125 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001414993499927777, + "loss": 3.5522, + "step": 2126 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014147046078289758, + "loss": 3.4068, + "step": 2127 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001414415715730175, + "loss": 3.3601, + "step": 2128 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001414126823631374, + "loss": 3.4129, + "step": 2129 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014138379315325726, + "loss": 3.4063, + "step": 2130 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014135490394337716, + "loss": 3.2681, + "step": 2131 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014132601473349705, + "loss": 3.456, + "step": 2132 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014129712552361694, + "loss": 3.4014, + "step": 2133 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001412682363137368, + "loss": 3.2214, + "step": 2134 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001412393471038567, + "loss": 3.2428, + "step": 2135 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001412104578939766, + "loss": 3.3887, + "step": 2136 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001411815686840965, + "loss": 3.5157, + "step": 2137 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001411526794742164, + "loss": 3.4656, + "step": 2138 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014112379026433628, + "loss": 3.4659, + "step": 2139 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014109490105445617, + "loss": 3.4634, + "step": 2140 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014106601184457607, + "loss": 3.4335, + "step": 2141 + }, + { + "epoch": 0.3, + "learning_rate": 0.00014103712263469596, + "loss": 3.4129, + "step": 2142 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014100823342481583, + "loss": 3.38, + "step": 2143 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014097934421493572, + "loss": 3.5137, + "step": 2144 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014095045500505562, + "loss": 3.5617, + "step": 2145 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001409215657951755, + "loss": 3.395, + "step": 2146 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001408926765852954, + "loss": 3.5115, + "step": 2147 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014086378737541527, + "loss": 3.4942, + "step": 2148 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001408348981655352, + "loss": 3.5737, + "step": 2149 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014080600895565509, + "loss": 3.5475, + "step": 2150 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014077711974577495, + "loss": 3.3564, + "step": 2151 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014074823053589485, + "loss": 3.4725, + "step": 2152 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014071934132601474, + "loss": 3.407, + "step": 2153 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014069045211613463, + "loss": 3.5516, + "step": 2154 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014066156290625453, + "loss": 3.5385, + "step": 2155 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001406326736963744, + "loss": 3.35, + "step": 2156 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001406037844864943, + "loss": 3.3215, + "step": 2157 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014057489527661418, + "loss": 3.4591, + "step": 2158 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014054600606673408, + "loss": 3.4866, + "step": 2159 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014051711685685397, + "loss": 3.4184, + "step": 2160 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014048822764697386, + "loss": 3.3715, + "step": 2161 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014045933843709376, + "loss": 3.5308, + "step": 2162 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014043044922721365, + "loss": 3.3307, + "step": 2163 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014040156001733354, + "loss": 3.4918, + "step": 2164 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001403726708074534, + "loss": 3.2267, + "step": 2165 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001403437815975733, + "loss": 3.3426, + "step": 2166 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001403148923876932, + "loss": 3.3965, + "step": 2167 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001402860031778131, + "loss": 3.5155, + "step": 2168 + }, + { + "epoch": 0.31, + "learning_rate": 0.000140257113967933, + "loss": 3.3793, + "step": 2169 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014022822475805288, + "loss": 3.5449, + "step": 2170 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014019933554817277, + "loss": 3.4593, + "step": 2171 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014017044633829267, + "loss": 3.3046, + "step": 2172 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014014155712841254, + "loss": 3.1801, + "step": 2173 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014011266791853243, + "loss": 3.4525, + "step": 2174 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014008377870865232, + "loss": 3.4199, + "step": 2175 + }, + { + "epoch": 0.31, + "learning_rate": 0.00014005488949877222, + "loss": 3.5184, + "step": 2176 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001400260002888921, + "loss": 3.4598, + "step": 2177 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013999711107901198, + "loss": 3.4862, + "step": 2178 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013996822186913187, + "loss": 3.4429, + "step": 2179 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001399393326592518, + "loss": 3.5418, + "step": 2180 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013991044344937169, + "loss": 3.4268, + "step": 2181 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013988155423949155, + "loss": 3.2958, + "step": 2182 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013985266502961145, + "loss": 3.4215, + "step": 2183 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013982377581973134, + "loss": 3.3225, + "step": 2184 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013979488660985123, + "loss": 3.3378, + "step": 2185 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013976599739997113, + "loss": 3.4642, + "step": 2186 + }, + { + "epoch": 0.31, + "learning_rate": 0.000139737108190091, + "loss": 3.4571, + "step": 2187 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001397082189802109, + "loss": 3.3121, + "step": 2188 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013967932977033078, + "loss": 3.4845, + "step": 2189 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013965044056045068, + "loss": 3.3634, + "step": 2190 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013962155135057057, + "loss": 3.3096, + "step": 2191 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013959266214069046, + "loss": 3.3263, + "step": 2192 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013956377293081036, + "loss": 3.12, + "step": 2193 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013953488372093025, + "loss": 3.5795, + "step": 2194 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013950599451105012, + "loss": 3.4364, + "step": 2195 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013947710530117, + "loss": 3.3533, + "step": 2196 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001394482160912899, + "loss": 3.5455, + "step": 2197 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001394193268814098, + "loss": 3.3429, + "step": 2198 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001393904376715297, + "loss": 3.4561, + "step": 2199 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013936154846164956, + "loss": 3.2739, + "step": 2200 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013933265925176948, + "loss": 3.3866, + "step": 2201 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013930377004188937, + "loss": 3.369, + "step": 2202 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013927488083200927, + "loss": 3.4938, + "step": 2203 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013924599162212914, + "loss": 3.4248, + "step": 2204 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013921710241224903, + "loss": 3.4534, + "step": 2205 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013918821320236892, + "loss": 3.4099, + "step": 2206 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013915932399248882, + "loss": 3.4923, + "step": 2207 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001391304347826087, + "loss": 3.4412, + "step": 2208 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013910154557272858, + "loss": 3.4822, + "step": 2209 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013907265636284847, + "loss": 3.3436, + "step": 2210 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013904376715296836, + "loss": 3.3572, + "step": 2211 + }, + { + "epoch": 0.31, + "learning_rate": 0.00013901487794308829, + "loss": 3.3505, + "step": 2212 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013898598873320815, + "loss": 3.408, + "step": 2213 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013895709952332805, + "loss": 3.3081, + "step": 2214 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013892821031344794, + "loss": 3.3781, + "step": 2215 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013889932110356783, + "loss": 3.4703, + "step": 2216 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001388704318936877, + "loss": 3.6122, + "step": 2217 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001388415426838076, + "loss": 3.3728, + "step": 2218 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001388126534739275, + "loss": 3.5355, + "step": 2219 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013878376426404738, + "loss": 3.4331, + "step": 2220 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013875487505416728, + "loss": 3.377, + "step": 2221 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013872598584428717, + "loss": 3.5025, + "step": 2222 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013869709663440706, + "loss": 3.5106, + "step": 2223 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013866820742452696, + "loss": 3.5311, + "step": 2224 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013863931821464685, + "loss": 3.4397, + "step": 2225 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013861042900476672, + "loss": 3.3628, + "step": 2226 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001385815397948866, + "loss": 3.4772, + "step": 2227 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001385526505850065, + "loss": 3.5013, + "step": 2228 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001385237613751264, + "loss": 3.5671, + "step": 2229 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001384948721652463, + "loss": 3.498, + "step": 2230 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013846598295536616, + "loss": 3.3978, + "step": 2231 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013843709374548605, + "loss": 3.3056, + "step": 2232 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013840820453560597, + "loss": 3.3484, + "step": 2233 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013837931532572587, + "loss": 3.4505, + "step": 2234 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013835042611584574, + "loss": 3.4581, + "step": 2235 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013832153690596563, + "loss": 3.4996, + "step": 2236 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013829264769608552, + "loss": 3.2491, + "step": 2237 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013826375848620542, + "loss": 3.3702, + "step": 2238 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013823486927632528, + "loss": 3.4381, + "step": 2239 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013820598006644518, + "loss": 3.4453, + "step": 2240 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013817709085656507, + "loss": 3.4205, + "step": 2241 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013814820164668497, + "loss": 3.5151, + "step": 2242 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013811931243680486, + "loss": 3.4931, + "step": 2243 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013809042322692475, + "loss": 3.4466, + "step": 2244 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013806153401704465, + "loss": 3.3914, + "step": 2245 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013803264480716454, + "loss": 3.4542, + "step": 2246 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013800375559728443, + "loss": 3.3532, + "step": 2247 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001379748663874043, + "loss": 3.4223, + "step": 2248 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001379459771775242, + "loss": 3.3836, + "step": 2249 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001379170879676441, + "loss": 3.4089, + "step": 2250 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013788819875776398, + "loss": 3.4003, + "step": 2251 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013785930954788388, + "loss": 3.3875, + "step": 2252 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013783042033800374, + "loss": 3.512, + "step": 2253 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013780153112812366, + "loss": 3.3483, + "step": 2254 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013777264191824356, + "loss": 3.4381, + "step": 2255 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013774375270836345, + "loss": 3.4253, + "step": 2256 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013771486349848332, + "loss": 3.4555, + "step": 2257 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001376859742886032, + "loss": 3.4515, + "step": 2258 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001376570850787231, + "loss": 3.4872, + "step": 2259 + }, + { + "epoch": 0.32, + "learning_rate": 0.000137628195868843, + "loss": 3.2705, + "step": 2260 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013759930665896287, + "loss": 3.4221, + "step": 2261 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013757041744908276, + "loss": 3.3614, + "step": 2262 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013754152823920265, + "loss": 3.3684, + "step": 2263 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013751263902932258, + "loss": 3.4306, + "step": 2264 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013748374981944244, + "loss": 3.2772, + "step": 2265 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013745486060956234, + "loss": 3.4369, + "step": 2266 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013742597139968223, + "loss": 3.3247, + "step": 2267 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013739708218980212, + "loss": 3.5047, + "step": 2268 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013736819297992202, + "loss": 3.3833, + "step": 2269 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013733930377004188, + "loss": 3.2106, + "step": 2270 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013731041456016178, + "loss": 3.3727, + "step": 2271 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013728152535028167, + "loss": 3.3919, + "step": 2272 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013725263614040157, + "loss": 3.3544, + "step": 2273 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013722374693052146, + "loss": 3.3658, + "step": 2274 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013719485772064135, + "loss": 3.4047, + "step": 2275 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013716596851076125, + "loss": 3.3533, + "step": 2276 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013713707930088114, + "loss": 3.3707, + "step": 2277 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013710819009100103, + "loss": 3.2264, + "step": 2278 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001370793008811209, + "loss": 3.39, + "step": 2279 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001370504116712408, + "loss": 3.457, + "step": 2280 + }, + { + "epoch": 0.32, + "learning_rate": 0.0001370215224613607, + "loss": 3.2658, + "step": 2281 + }, + { + "epoch": 0.32, + "learning_rate": 0.00013699263325148058, + "loss": 3.3738, + "step": 2282 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013696374404160045, + "loss": 3.3054, + "step": 2283 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013693485483172034, + "loss": 3.4937, + "step": 2284 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013690596562184026, + "loss": 3.3848, + "step": 2285 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013687707641196016, + "loss": 3.5145, + "step": 2286 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013684818720208002, + "loss": 3.4478, + "step": 2287 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013681929799219992, + "loss": 3.5072, + "step": 2288 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001367904087823198, + "loss": 3.4196, + "step": 2289 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001367615195724397, + "loss": 3.3121, + "step": 2290 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001367326303625596, + "loss": 3.4513, + "step": 2291 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013670374115267947, + "loss": 3.4903, + "step": 2292 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013667485194279936, + "loss": 3.39, + "step": 2293 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013664596273291925, + "loss": 3.4293, + "step": 2294 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013661707352303915, + "loss": 3.3947, + "step": 2295 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013658818431315904, + "loss": 3.376, + "step": 2296 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013655929510327894, + "loss": 3.3409, + "step": 2297 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013653040589339883, + "loss": 3.2811, + "step": 2298 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013650151668351872, + "loss": 3.4879, + "step": 2299 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001364726274736386, + "loss": 3.5118, + "step": 2300 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013644373826375848, + "loss": 3.4129, + "step": 2301 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013641484905387838, + "loss": 3.4909, + "step": 2302 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013638595984399827, + "loss": 3.3344, + "step": 2303 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013635707063411817, + "loss": 3.2765, + "step": 2304 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013632818142423803, + "loss": 3.4694, + "step": 2305 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013629929221435795, + "loss": 3.3702, + "step": 2306 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013627040300447785, + "loss": 3.3879, + "step": 2307 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013624151379459774, + "loss": 3.4246, + "step": 2308 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001362126245847176, + "loss": 3.3766, + "step": 2309 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001361837353748375, + "loss": 3.4544, + "step": 2310 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001361548461649574, + "loss": 3.3527, + "step": 2311 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001361259569550773, + "loss": 3.3898, + "step": 2312 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013609706774519718, + "loss": 3.3793, + "step": 2313 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013606817853531705, + "loss": 3.3719, + "step": 2314 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013603928932543694, + "loss": 3.5055, + "step": 2315 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013601040011555684, + "loss": 3.4176, + "step": 2316 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013598151090567676, + "loss": 3.1551, + "step": 2317 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013595262169579662, + "loss": 3.4452, + "step": 2318 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013592373248591652, + "loss": 3.351, + "step": 2319 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001358948432760364, + "loss": 3.3997, + "step": 2320 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001358659540661563, + "loss": 3.2877, + "step": 2321 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013583706485627617, + "loss": 3.3394, + "step": 2322 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013580817564639607, + "loss": 3.5017, + "step": 2323 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013577928643651596, + "loss": 3.306, + "step": 2324 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013575039722663585, + "loss": 3.2135, + "step": 2325 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013572150801675575, + "loss": 3.1845, + "step": 2326 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013569261880687564, + "loss": 3.1219, + "step": 2327 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013566372959699554, + "loss": 3.4218, + "step": 2328 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013563484038711543, + "loss": 3.425, + "step": 2329 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013560595117723532, + "loss": 3.3029, + "step": 2330 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001355770619673552, + "loss": 3.3517, + "step": 2331 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013554817275747508, + "loss": 3.3517, + "step": 2332 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013551928354759498, + "loss": 3.444, + "step": 2333 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013549039433771487, + "loss": 3.3205, + "step": 2334 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013546150512783477, + "loss": 3.4313, + "step": 2335 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013543261591795463, + "loss": 3.4755, + "step": 2336 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013540372670807453, + "loss": 3.3698, + "step": 2337 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013537483749819445, + "loss": 3.5406, + "step": 2338 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013534594828831434, + "loss": 3.4552, + "step": 2339 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001353170590784342, + "loss": 3.3954, + "step": 2340 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001352881698685541, + "loss": 3.3485, + "step": 2341 + }, + { + "epoch": 0.33, + "learning_rate": 0.000135259280658674, + "loss": 3.4813, + "step": 2342 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001352303914487939, + "loss": 3.4158, + "step": 2343 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013520150223891376, + "loss": 3.416, + "step": 2344 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013517261302903365, + "loss": 3.4413, + "step": 2345 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013514372381915354, + "loss": 3.4087, + "step": 2346 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013511483460927344, + "loss": 3.4527, + "step": 2347 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013508594539939333, + "loss": 3.4639, + "step": 2348 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013505705618951322, + "loss": 3.2068, + "step": 2349 + }, + { + "epoch": 0.33, + "learning_rate": 0.00013502816697963312, + "loss": 3.5575, + "step": 2350 + }, + { + "epoch": 0.33, + "learning_rate": 0.000134999277769753, + "loss": 3.1795, + "step": 2351 + }, + { + "epoch": 0.33, + "learning_rate": 0.0001349703885598729, + "loss": 3.4778, + "step": 2352 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013494149934999277, + "loss": 3.4111, + "step": 2353 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013491261014011267, + "loss": 3.1266, + "step": 2354 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013488372093023256, + "loss": 3.3929, + "step": 2355 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013485483172035245, + "loss": 3.3329, + "step": 2356 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013482594251047235, + "loss": 3.4474, + "step": 2357 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013479705330059222, + "loss": 3.4609, + "step": 2358 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013476816409071214, + "loss": 3.3622, + "step": 2359 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013473927488083203, + "loss": 3.3481, + "step": 2360 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013471038567095192, + "loss": 3.4563, + "step": 2361 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001346814964610718, + "loss": 3.295, + "step": 2362 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013465260725119168, + "loss": 3.4981, + "step": 2363 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013462371804131158, + "loss": 3.4384, + "step": 2364 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013459482883143147, + "loss": 3.3174, + "step": 2365 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013456593962155134, + "loss": 3.431, + "step": 2366 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013453705041167123, + "loss": 3.4281, + "step": 2367 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013450816120179113, + "loss": 3.4862, + "step": 2368 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013447927199191105, + "loss": 3.4451, + "step": 2369 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013445038278203091, + "loss": 3.3016, + "step": 2370 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001344214935721508, + "loss": 3.3705, + "step": 2371 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001343926043622707, + "loss": 3.3258, + "step": 2372 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001343637151523906, + "loss": 3.3826, + "step": 2373 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001343348259425105, + "loss": 3.3718, + "step": 2374 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013430593673263036, + "loss": 3.4815, + "step": 2375 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013427704752275025, + "loss": 3.3265, + "step": 2376 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013424815831287014, + "loss": 3.4326, + "step": 2377 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013421926910299004, + "loss": 3.3151, + "step": 2378 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013419037989310993, + "loss": 3.324, + "step": 2379 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013416149068322983, + "loss": 3.2986, + "step": 2380 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013413260147334972, + "loss": 3.3388, + "step": 2381 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001341037122634696, + "loss": 3.2725, + "step": 2382 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001340748230535895, + "loss": 3.4595, + "step": 2383 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013404593384370937, + "loss": 3.4725, + "step": 2384 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013401704463382927, + "loss": 3.4385, + "step": 2385 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013398815542394916, + "loss": 3.3203, + "step": 2386 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013395926621406905, + "loss": 3.5133, + "step": 2387 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013393037700418892, + "loss": 3.4137, + "step": 2388 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013390148779430882, + "loss": 3.4752, + "step": 2389 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013387259858442874, + "loss": 3.3159, + "step": 2390 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013384370937454863, + "loss": 3.37, + "step": 2391 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001338148201646685, + "loss": 3.4084, + "step": 2392 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001337859309547884, + "loss": 3.4467, + "step": 2393 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013375704174490828, + "loss": 3.3266, + "step": 2394 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013372815253502818, + "loss": 3.2987, + "step": 2395 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013369926332514807, + "loss": 3.3875, + "step": 2396 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013367037411526794, + "loss": 3.417, + "step": 2397 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013364148490538783, + "loss": 3.4194, + "step": 2398 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013361259569550773, + "loss": 3.4125, + "step": 2399 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013358370648562765, + "loss": 3.3874, + "step": 2400 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013355481727574751, + "loss": 3.4618, + "step": 2401 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001335259280658674, + "loss": 3.4174, + "step": 2402 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001334970388559873, + "loss": 3.3945, + "step": 2403 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001334681496461072, + "loss": 3.4469, + "step": 2404 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001334392604362271, + "loss": 3.5016, + "step": 2405 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013341037122634696, + "loss": 3.4477, + "step": 2406 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013338148201646685, + "loss": 3.3503, + "step": 2407 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013335259280658674, + "loss": 3.3697, + "step": 2408 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013332370359670664, + "loss": 3.2187, + "step": 2409 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001332948143868265, + "loss": 3.3541, + "step": 2410 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013326592517694643, + "loss": 3.4176, + "step": 2411 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013323703596706632, + "loss": 3.3427, + "step": 2412 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001332081467571862, + "loss": 3.3391, + "step": 2413 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013317925754730608, + "loss": 3.392, + "step": 2414 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013315036833742597, + "loss": 3.3353, + "step": 2415 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013312147912754587, + "loss": 3.4673, + "step": 2416 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013309258991766576, + "loss": 3.315, + "step": 2417 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013306370070778565, + "loss": 3.1815, + "step": 2418 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013303481149790552, + "loss": 3.3533, + "step": 2419 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013300592228802542, + "loss": 3.2504, + "step": 2420 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013297703307814534, + "loss": 3.4406, + "step": 2421 + }, + { + "epoch": 0.34, + "learning_rate": 0.00013294814386826523, + "loss": 3.3511, + "step": 2422 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001329192546583851, + "loss": 3.4214, + "step": 2423 + }, + { + "epoch": 0.35, + "learning_rate": 0.000132890365448505, + "loss": 3.3204, + "step": 2424 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013286147623862488, + "loss": 3.4374, + "step": 2425 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013283258702874478, + "loss": 3.405, + "step": 2426 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013280369781886465, + "loss": 3.2408, + "step": 2427 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013277480860898454, + "loss": 3.3318, + "step": 2428 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013274591939910443, + "loss": 3.4731, + "step": 2429 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013271703018922433, + "loss": 3.4582, + "step": 2430 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013268814097934422, + "loss": 3.4412, + "step": 2431 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013265925176946411, + "loss": 3.3804, + "step": 2432 + }, + { + "epoch": 0.35, + "learning_rate": 0.000132630362559584, + "loss": 3.4339, + "step": 2433 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001326014733497039, + "loss": 3.2806, + "step": 2434 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001325725841398238, + "loss": 3.2909, + "step": 2435 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013254369492994366, + "loss": 3.4289, + "step": 2436 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013251480572006356, + "loss": 3.1086, + "step": 2437 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013248591651018345, + "loss": 3.4366, + "step": 2438 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013245702730030334, + "loss": 3.4952, + "step": 2439 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013242813809042324, + "loss": 3.2876, + "step": 2440 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001323992488805431, + "loss": 3.3862, + "step": 2441 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013237035967066303, + "loss": 3.3508, + "step": 2442 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013234147046078292, + "loss": 3.5075, + "step": 2443 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001323125812509028, + "loss": 3.0807, + "step": 2444 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013228369204102268, + "loss": 3.3371, + "step": 2445 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013225480283114257, + "loss": 3.3085, + "step": 2446 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013222591362126247, + "loss": 3.4499, + "step": 2447 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013219702441138236, + "loss": 3.474, + "step": 2448 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013216813520150223, + "loss": 3.4154, + "step": 2449 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013213924599162212, + "loss": 3.5276, + "step": 2450 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013211035678174202, + "loss": 3.432, + "step": 2451 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001320814675718619, + "loss": 3.3362, + "step": 2452 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001320525783619818, + "loss": 3.3559, + "step": 2453 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001320236891521017, + "loss": 3.2714, + "step": 2454 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001319947999422216, + "loss": 3.2934, + "step": 2455 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013196591073234148, + "loss": 3.3839, + "step": 2456 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013193702152246138, + "loss": 3.3512, + "step": 2457 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013190813231258125, + "loss": 3.2705, + "step": 2458 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013187924310270114, + "loss": 3.4098, + "step": 2459 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013185035389282103, + "loss": 3.2906, + "step": 2460 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013182146468294093, + "loss": 3.389, + "step": 2461 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013179257547306082, + "loss": 3.354, + "step": 2462 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013176368626318071, + "loss": 3.4475, + "step": 2463 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001317347970533006, + "loss": 3.2476, + "step": 2464 + }, + { + "epoch": 0.35, + "eval_loss": 3.5674850940704346, + "eval_runtime": 473.0302, + "eval_samples_per_second": 43.31, + "eval_steps_per_second": 14.437, + "step": 2464 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001317059078434205, + "loss": 3.4002, + "step": 2465 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001316770186335404, + "loss": 3.429, + "step": 2466 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013164812942366026, + "loss": 3.3282, + "step": 2467 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013161924021378016, + "loss": 3.1434, + "step": 2468 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013159035100390005, + "loss": 3.3893, + "step": 2469 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013156146179401994, + "loss": 3.3807, + "step": 2470 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001315325725841398, + "loss": 3.374, + "step": 2471 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001315036833742597, + "loss": 3.3385, + "step": 2472 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001314747941643796, + "loss": 3.3285, + "step": 2473 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013144590495449952, + "loss": 3.4922, + "step": 2474 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013141701574461939, + "loss": 3.4045, + "step": 2475 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013138812653473928, + "loss": 3.3272, + "step": 2476 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013135923732485917, + "loss": 3.373, + "step": 2477 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013133034811497907, + "loss": 3.3136, + "step": 2478 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013130145890509896, + "loss": 3.2604, + "step": 2479 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013127256969521883, + "loss": 3.4602, + "step": 2480 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013124368048533872, + "loss": 3.361, + "step": 2481 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013121479127545862, + "loss": 3.3301, + "step": 2482 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001311859020655785, + "loss": 3.3257, + "step": 2483 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001311570128556984, + "loss": 3.3816, + "step": 2484 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001311281236458183, + "loss": 3.4853, + "step": 2485 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001310992344359382, + "loss": 3.4588, + "step": 2486 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013107034522605809, + "loss": 3.369, + "step": 2487 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013104145601617798, + "loss": 3.3041, + "step": 2488 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013101256680629785, + "loss": 3.4367, + "step": 2489 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013098367759641774, + "loss": 3.2189, + "step": 2490 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013095478838653763, + "loss": 3.2695, + "step": 2491 + }, + { + "epoch": 0.35, + "learning_rate": 0.00013092589917665753, + "loss": 3.4266, + "step": 2492 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001308970099667774, + "loss": 3.4851, + "step": 2493 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001308681207568973, + "loss": 3.4867, + "step": 2494 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001308392315470172, + "loss": 3.5863, + "step": 2495 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001308103423371371, + "loss": 3.3697, + "step": 2496 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013078145312725697, + "loss": 3.4082, + "step": 2497 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013075256391737686, + "loss": 3.4489, + "step": 2498 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013072367470749676, + "loss": 3.4199, + "step": 2499 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013069478549761665, + "loss": 3.354, + "step": 2500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013066589628773654, + "loss": 3.4063, + "step": 2501 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001306370070778564, + "loss": 3.4643, + "step": 2502 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001306081178679763, + "loss": 3.3565, + "step": 2503 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001305792286580962, + "loss": 3.4586, + "step": 2504 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013055033944821612, + "loss": 3.4305, + "step": 2505 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013052145023833599, + "loss": 3.4861, + "step": 2506 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013049256102845588, + "loss": 3.2312, + "step": 2507 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013046367181857577, + "loss": 3.2431, + "step": 2508 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013043478260869567, + "loss": 3.5262, + "step": 2509 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013040589339881556, + "loss": 3.3597, + "step": 2510 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013037700418893543, + "loss": 3.3521, + "step": 2511 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013034811497905532, + "loss": 3.2393, + "step": 2512 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013031922576917522, + "loss": 3.494, + "step": 2513 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001302903365592951, + "loss": 3.5039, + "step": 2514 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013026144734941498, + "loss": 3.3433, + "step": 2515 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001302325581395349, + "loss": 3.4088, + "step": 2516 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001302036689296548, + "loss": 3.2848, + "step": 2517 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013017477971977469, + "loss": 3.4788, + "step": 2518 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013014589050989455, + "loss": 3.2091, + "step": 2519 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013011700130001445, + "loss": 3.3265, + "step": 2520 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013008811209013434, + "loss": 3.4179, + "step": 2521 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013005922288025423, + "loss": 3.4583, + "step": 2522 + }, + { + "epoch": 0.36, + "learning_rate": 0.00013003033367037413, + "loss": 3.4588, + "step": 2523 + }, + { + "epoch": 0.36, + "learning_rate": 0.000130001444460494, + "loss": 3.3187, + "step": 2524 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001299725552506139, + "loss": 3.3585, + "step": 2525 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001299436660407338, + "loss": 3.2796, + "step": 2526 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001299147768308537, + "loss": 3.3668, + "step": 2527 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012988588762097357, + "loss": 3.2533, + "step": 2528 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012985699841109346, + "loss": 3.5402, + "step": 2529 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012982810920121336, + "loss": 3.336, + "step": 2530 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012979921999133325, + "loss": 3.4623, + "step": 2531 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012977033078145314, + "loss": 3.4201, + "step": 2532 + }, + { + "epoch": 0.36, + "learning_rate": 0.000129741441571573, + "loss": 3.3559, + "step": 2533 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001297125523616929, + "loss": 3.3933, + "step": 2534 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001296836631518128, + "loss": 3.4174, + "step": 2535 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001296547739419327, + "loss": 3.3951, + "step": 2536 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001296258847320526, + "loss": 3.2896, + "step": 2537 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012959699552217248, + "loss": 3.4864, + "step": 2538 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012956810631229237, + "loss": 3.4116, + "step": 2539 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012953921710241227, + "loss": 3.4263, + "step": 2540 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012951032789253213, + "loss": 3.4341, + "step": 2541 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012948143868265203, + "loss": 3.2924, + "step": 2542 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012945254947277192, + "loss": 3.3333, + "step": 2543 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012942366026289182, + "loss": 3.3644, + "step": 2544 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001293947710530117, + "loss": 3.3937, + "step": 2545 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012936588184313158, + "loss": 3.3669, + "step": 2546 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001293369926332515, + "loss": 3.4506, + "step": 2547 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001293081034233714, + "loss": 3.4005, + "step": 2548 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012927921421349129, + "loss": 3.347, + "step": 2549 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012925032500361115, + "loss": 3.3494, + "step": 2550 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012922143579373105, + "loss": 3.3931, + "step": 2551 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012919254658385094, + "loss": 3.4486, + "step": 2552 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012916365737397083, + "loss": 3.3955, + "step": 2553 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001291347681640907, + "loss": 3.323, + "step": 2554 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001291058789542106, + "loss": 3.4641, + "step": 2555 + }, + { + "epoch": 0.36, + "learning_rate": 0.0001290769897443305, + "loss": 3.4741, + "step": 2556 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012904810053445038, + "loss": 3.3527, + "step": 2557 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012901921132457028, + "loss": 3.4107, + "step": 2558 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012899032211469017, + "loss": 3.3844, + "step": 2559 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012896143290481006, + "loss": 3.3398, + "step": 2560 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012893254369492996, + "loss": 3.3155, + "step": 2561 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012890365448504985, + "loss": 3.4171, + "step": 2562 + }, + { + "epoch": 0.36, + "learning_rate": 0.00012887476527516972, + "loss": 3.3677, + "step": 2563 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001288458760652896, + "loss": 3.1637, + "step": 2564 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001288169868554095, + "loss": 3.3501, + "step": 2565 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001287880976455294, + "loss": 3.2347, + "step": 2566 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001287592084356493, + "loss": 3.3296, + "step": 2567 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001287303192257692, + "loss": 3.4267, + "step": 2568 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012870143001588908, + "loss": 3.3299, + "step": 2569 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012867254080600897, + "loss": 3.3744, + "step": 2570 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012864365159612887, + "loss": 3.4453, + "step": 2571 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012861476238624873, + "loss": 3.4061, + "step": 2572 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012858587317636863, + "loss": 3.3093, + "step": 2573 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012855698396648852, + "loss": 3.1693, + "step": 2574 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012852809475660842, + "loss": 3.1817, + "step": 2575 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012849920554672828, + "loss": 3.4069, + "step": 2576 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012847031633684818, + "loss": 3.5068, + "step": 2577 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001284414271269681, + "loss": 3.3693, + "step": 2578 + }, + { + "epoch": 0.37, + "learning_rate": 0.000128412537917088, + "loss": 3.3957, + "step": 2579 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012838364870720786, + "loss": 3.36, + "step": 2580 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012835475949732775, + "loss": 3.3984, + "step": 2581 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012832587028744765, + "loss": 3.1863, + "step": 2582 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012829698107756754, + "loss": 3.3402, + "step": 2583 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012826809186768743, + "loss": 3.3369, + "step": 2584 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001282392026578073, + "loss": 3.3675, + "step": 2585 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001282103134479272, + "loss": 3.3442, + "step": 2586 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001281814242380471, + "loss": 3.4422, + "step": 2587 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012815253502816698, + "loss": 3.2933, + "step": 2588 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012812364581828688, + "loss": 3.4378, + "step": 2589 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012809475660840677, + "loss": 3.4258, + "step": 2590 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012806586739852666, + "loss": 3.2842, + "step": 2591 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012803697818864656, + "loss": 3.0408, + "step": 2592 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012800808897876645, + "loss": 3.2241, + "step": 2593 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012797919976888632, + "loss": 3.4057, + "step": 2594 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001279503105590062, + "loss": 3.3407, + "step": 2595 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001279214213491261, + "loss": 3.4498, + "step": 2596 + }, + { + "epoch": 0.37, + "learning_rate": 0.000127892532139246, + "loss": 3.4478, + "step": 2597 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012786364292936587, + "loss": 3.3925, + "step": 2598 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001278347537194858, + "loss": 3.3481, + "step": 2599 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012780586450960568, + "loss": 3.3213, + "step": 2600 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012777697529972557, + "loss": 3.3323, + "step": 2601 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012774808608984544, + "loss": 3.3765, + "step": 2602 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012771919687996534, + "loss": 3.3472, + "step": 2603 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012769030767008523, + "loss": 3.376, + "step": 2604 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012766141846020512, + "loss": 3.356, + "step": 2605 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012763252925032502, + "loss": 3.4621, + "step": 2606 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012760364004044488, + "loss": 3.3554, + "step": 2607 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012757475083056478, + "loss": 3.439, + "step": 2608 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012754586162068467, + "loss": 3.3964, + "step": 2609 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001275169724108046, + "loss": 3.2209, + "step": 2610 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012748808320092446, + "loss": 3.471, + "step": 2611 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012745919399104435, + "loss": 3.2721, + "step": 2612 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012743030478116425, + "loss": 3.3398, + "step": 2613 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012740141557128414, + "loss": 3.3427, + "step": 2614 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012737252636140403, + "loss": 3.4032, + "step": 2615 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001273436371515239, + "loss": 3.2084, + "step": 2616 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001273147479416438, + "loss": 3.348, + "step": 2617 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001272858587317637, + "loss": 3.2441, + "step": 2618 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012725696952188358, + "loss": 3.391, + "step": 2619 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012722808031200348, + "loss": 3.4738, + "step": 2620 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012719919110212337, + "loss": 3.4117, + "step": 2621 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012717030189224326, + "loss": 3.2362, + "step": 2622 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012714141268236316, + "loss": 3.3843, + "step": 2623 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012711252347248302, + "loss": 3.2797, + "step": 2624 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012708363426260292, + "loss": 3.3762, + "step": 2625 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001270547450527228, + "loss": 3.3659, + "step": 2626 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001270258558428427, + "loss": 3.2781, + "step": 2627 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001269969666329626, + "loss": 3.3275, + "step": 2628 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012696807742308247, + "loss": 3.3819, + "step": 2629 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012693918821320236, + "loss": 3.4597, + "step": 2630 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012691029900332228, + "loss": 3.4048, + "step": 2631 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012688140979344217, + "loss": 3.3594, + "step": 2632 + }, + { + "epoch": 0.37, + "learning_rate": 0.00012685252058356204, + "loss": 3.4678, + "step": 2633 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012682363137368194, + "loss": 3.1888, + "step": 2634 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012679474216380183, + "loss": 3.4054, + "step": 2635 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012676585295392172, + "loss": 3.2659, + "step": 2636 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012673696374404162, + "loss": 3.1928, + "step": 2637 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012670807453416148, + "loss": 3.3957, + "step": 2638 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012667918532428138, + "loss": 3.4169, + "step": 2639 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012665029611440127, + "loss": 3.4016, + "step": 2640 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012662140690452116, + "loss": 3.3982, + "step": 2641 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012659251769464106, + "loss": 3.3423, + "step": 2642 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012656362848476095, + "loss": 3.439, + "step": 2643 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012653473927488085, + "loss": 3.3031, + "step": 2644 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012650585006500074, + "loss": 3.4652, + "step": 2645 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001264769608551206, + "loss": 3.3652, + "step": 2646 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001264480716452405, + "loss": 3.3633, + "step": 2647 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001264191824353604, + "loss": 3.3795, + "step": 2648 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001263902932254803, + "loss": 3.5622, + "step": 2649 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012636140401560018, + "loss": 3.3977, + "step": 2650 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012633251480572005, + "loss": 3.4627, + "step": 2651 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012630362559583997, + "loss": 3.4903, + "step": 2652 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012627473638595986, + "loss": 3.4199, + "step": 2653 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012624584717607976, + "loss": 3.3015, + "step": 2654 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012621695796619962, + "loss": 3.2783, + "step": 2655 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012618806875631952, + "loss": 3.3574, + "step": 2656 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001261591795464394, + "loss": 3.3708, + "step": 2657 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001261302903365593, + "loss": 3.5374, + "step": 2658 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001261014011266792, + "loss": 3.4267, + "step": 2659 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012607251191679907, + "loss": 3.4563, + "step": 2660 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012604362270691896, + "loss": 3.3468, + "step": 2661 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012601473349703888, + "loss": 3.2677, + "step": 2662 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012598584428715875, + "loss": 3.3923, + "step": 2663 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012595695507727864, + "loss": 3.3761, + "step": 2664 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012592806586739854, + "loss": 3.4044, + "step": 2665 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012589917665751843, + "loss": 3.2343, + "step": 2666 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012587028744763832, + "loss": 3.4267, + "step": 2667 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001258413982377582, + "loss": 3.4803, + "step": 2668 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012581250902787808, + "loss": 3.4012, + "step": 2669 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012578361981799798, + "loss": 3.3552, + "step": 2670 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012575473060811787, + "loss": 3.204, + "step": 2671 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012572584139823777, + "loss": 3.3883, + "step": 2672 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012569695218835766, + "loss": 3.4008, + "step": 2673 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012566806297847755, + "loss": 3.3178, + "step": 2674 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012563917376859745, + "loss": 3.0956, + "step": 2675 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012561028455871734, + "loss": 3.325, + "step": 2676 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001255813953488372, + "loss": 3.417, + "step": 2677 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001255525061389571, + "loss": 3.4145, + "step": 2678 + }, + { + "epoch": 0.38, + "learning_rate": 0.000125523616929077, + "loss": 3.5485, + "step": 2679 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001254947277191969, + "loss": 3.293, + "step": 2680 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012546583850931676, + "loss": 3.4376, + "step": 2681 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012543694929943665, + "loss": 3.427, + "step": 2682 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012540806008955657, + "loss": 3.4608, + "step": 2683 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012537917087967646, + "loss": 3.4159, + "step": 2684 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012535028166979633, + "loss": 3.3954, + "step": 2685 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012532139245991622, + "loss": 3.4478, + "step": 2686 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012529250325003612, + "loss": 3.5245, + "step": 2687 + }, + { + "epoch": 0.38, + "learning_rate": 0.000125263614040156, + "loss": 3.43, + "step": 2688 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001252347248302759, + "loss": 3.5288, + "step": 2689 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012520583562039577, + "loss": 3.4635, + "step": 2690 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012517694641051567, + "loss": 3.2762, + "step": 2691 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012514805720063556, + "loss": 3.207, + "step": 2692 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012511916799075545, + "loss": 3.318, + "step": 2693 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012509027878087535, + "loss": 3.4903, + "step": 2694 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012506138957099524, + "loss": 3.4562, + "step": 2695 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012503250036111514, + "loss": 3.3621, + "step": 2696 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012500361115123503, + "loss": 3.3899, + "step": 2697 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012497472194135492, + "loss": 3.3823, + "step": 2698 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001249458327314748, + "loss": 3.4669, + "step": 2699 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012491694352159468, + "loss": 3.4761, + "step": 2700 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012488805431171458, + "loss": 3.3686, + "step": 2701 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012485916510183447, + "loss": 3.2844, + "step": 2702 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012483027589195434, + "loss": 3.4819, + "step": 2703 + }, + { + "epoch": 0.38, + "learning_rate": 0.00012480138668207426, + "loss": 3.3464, + "step": 2704 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012477249747219415, + "loss": 3.3565, + "step": 2705 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012474360826231405, + "loss": 3.6032, + "step": 2706 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001247147190524339, + "loss": 3.3503, + "step": 2707 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001246858298425538, + "loss": 3.462, + "step": 2708 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001246569406326737, + "loss": 3.4015, + "step": 2709 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001246280514227936, + "loss": 3.3831, + "step": 2710 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001245991622129135, + "loss": 3.4502, + "step": 2711 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012457027300303336, + "loss": 3.5121, + "step": 2712 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012454138379315325, + "loss": 3.3088, + "step": 2713 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012451249458327314, + "loss": 3.405, + "step": 2714 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012448360537339306, + "loss": 3.4379, + "step": 2715 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012445471616351293, + "loss": 3.4802, + "step": 2716 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012442582695363282, + "loss": 3.3756, + "step": 2717 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012439693774375272, + "loss": 3.3297, + "step": 2718 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001243680485338726, + "loss": 3.4115, + "step": 2719 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001243391593239925, + "loss": 3.256, + "step": 2720 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012431027011411237, + "loss": 3.3898, + "step": 2721 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012428138090423227, + "loss": 3.2545, + "step": 2722 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012425249169435216, + "loss": 3.2719, + "step": 2723 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012422360248447205, + "loss": 3.3735, + "step": 2724 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012419471327459195, + "loss": 3.3794, + "step": 2725 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012416582406471184, + "loss": 3.4381, + "step": 2726 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012413693485483174, + "loss": 3.3723, + "step": 2727 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012410804564495163, + "loss": 3.153, + "step": 2728 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001240791564350715, + "loss": 3.3809, + "step": 2729 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001240502672251914, + "loss": 3.3281, + "step": 2730 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012402137801531128, + "loss": 3.1046, + "step": 2731 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012399248880543118, + "loss": 3.3417, + "step": 2732 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012396359959555107, + "loss": 3.3571, + "step": 2733 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012393471038567094, + "loss": 3.493, + "step": 2734 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012390582117579083, + "loss": 3.3214, + "step": 2735 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012387693196591075, + "loss": 3.4362, + "step": 2736 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012384804275603065, + "loss": 3.4042, + "step": 2737 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012381915354615051, + "loss": 3.4319, + "step": 2738 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001237902643362704, + "loss": 3.4458, + "step": 2739 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001237613751263903, + "loss": 3.4421, + "step": 2740 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001237324859165102, + "loss": 3.3754, + "step": 2741 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001237035967066301, + "loss": 3.347, + "step": 2742 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012367470749674996, + "loss": 3.4606, + "step": 2743 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012364581828686985, + "loss": 3.3032, + "step": 2744 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012361692907698974, + "loss": 3.3964, + "step": 2745 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012358803986710964, + "loss": 3.355, + "step": 2746 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012355915065722953, + "loss": 3.3482, + "step": 2747 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012353026144734942, + "loss": 3.4613, + "step": 2748 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012350137223746932, + "loss": 3.3546, + "step": 2749 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001234724830275892, + "loss": 3.2237, + "step": 2750 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012344359381770908, + "loss": 3.3636, + "step": 2751 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012341470460782897, + "loss": 3.2352, + "step": 2752 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012338581539794887, + "loss": 3.4965, + "step": 2753 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012335692618806876, + "loss": 3.2916, + "step": 2754 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012332803697818865, + "loss": 3.3528, + "step": 2755 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012329914776830852, + "loss": 3.2192, + "step": 2756 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012327025855842844, + "loss": 3.3678, + "step": 2757 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012324136934854834, + "loss": 3.2173, + "step": 2758 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012321248013866823, + "loss": 3.2887, + "step": 2759 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001231835909287881, + "loss": 3.3319, + "step": 2760 + }, + { + "epoch": 0.39, + "learning_rate": 0.000123154701718908, + "loss": 3.4748, + "step": 2761 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012312581250902788, + "loss": 3.2698, + "step": 2762 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012309692329914778, + "loss": 3.4107, + "step": 2763 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012306803408926767, + "loss": 3.3461, + "step": 2764 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012303914487938754, + "loss": 3.2252, + "step": 2765 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012301025566950743, + "loss": 3.225, + "step": 2766 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012298136645962735, + "loss": 3.4453, + "step": 2767 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012295247724974722, + "loss": 3.4721, + "step": 2768 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012292358803986711, + "loss": 3.3088, + "step": 2769 + }, + { + "epoch": 0.39, + "learning_rate": 0.000122894698829987, + "loss": 3.4014, + "step": 2770 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001228658096201069, + "loss": 3.4508, + "step": 2771 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001228369204102268, + "loss": 3.4795, + "step": 2772 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012280803120034666, + "loss": 3.3166, + "step": 2773 + }, + { + "epoch": 0.39, + "learning_rate": 0.00012277914199046656, + "loss": 3.3391, + "step": 2774 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012275025278058645, + "loss": 3.2663, + "step": 2775 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012272136357070634, + "loss": 3.3781, + "step": 2776 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012269247436082624, + "loss": 3.3014, + "step": 2777 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012266358515094613, + "loss": 3.2865, + "step": 2778 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012263469594106602, + "loss": 3.2788, + "step": 2779 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012260580673118592, + "loss": 3.2579, + "step": 2780 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001225769175213058, + "loss": 3.3732, + "step": 2781 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012254802831142568, + "loss": 3.3919, + "step": 2782 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012251913910154557, + "loss": 3.3965, + "step": 2783 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012249024989166547, + "loss": 3.3287, + "step": 2784 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012246136068178536, + "loss": 3.4794, + "step": 2785 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012243247147190525, + "loss": 3.3119, + "step": 2786 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012240358226202512, + "loss": 3.2659, + "step": 2787 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012237469305214504, + "loss": 3.4235, + "step": 2788 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012234580384226494, + "loss": 3.4274, + "step": 2789 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001223169146323848, + "loss": 3.1281, + "step": 2790 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001222880254225047, + "loss": 3.4995, + "step": 2791 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001222591362126246, + "loss": 3.5272, + "step": 2792 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012223024700274448, + "loss": 3.2849, + "step": 2793 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012220135779286438, + "loss": 3.4098, + "step": 2794 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012217246858298424, + "loss": 3.42, + "step": 2795 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012214357937310414, + "loss": 3.2024, + "step": 2796 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012211469016322403, + "loss": 3.4517, + "step": 2797 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012208580095334395, + "loss": 3.2603, + "step": 2798 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012205691174346383, + "loss": 3.0616, + "step": 2799 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012202802253358371, + "loss": 3.447, + "step": 2800 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012199913332370361, + "loss": 3.2967, + "step": 2801 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001219702441138235, + "loss": 3.2181, + "step": 2802 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012194135490394338, + "loss": 3.2925, + "step": 2803 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012191246569406328, + "loss": 3.4124, + "step": 2804 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012188357648418316, + "loss": 3.2474, + "step": 2805 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012185468727430305, + "loss": 3.3938, + "step": 2806 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012182579806442293, + "loss": 3.3929, + "step": 2807 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012179690885454282, + "loss": 3.463, + "step": 2808 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012176801964466273, + "loss": 3.2457, + "step": 2809 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012173913043478263, + "loss": 3.3338, + "step": 2810 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001217102412249025, + "loss": 3.4464, + "step": 2811 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001216813520150224, + "loss": 3.3941, + "step": 2812 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012165246280514229, + "loss": 3.4187, + "step": 2813 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012162357359526217, + "loss": 3.4447, + "step": 2814 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012159468438538207, + "loss": 3.3622, + "step": 2815 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012156579517550195, + "loss": 3.2339, + "step": 2816 + }, + { + "epoch": 0.4, + "eval_loss": 3.5397603511810303, + "eval_runtime": 473.3374, + "eval_samples_per_second": 43.282, + "eval_steps_per_second": 14.427, + "step": 2816 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012153690596562184, + "loss": 3.2623, + "step": 2817 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012150801675574172, + "loss": 3.2769, + "step": 2818 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012147912754586164, + "loss": 3.2308, + "step": 2819 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012145023833598152, + "loss": 3.3513, + "step": 2820 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012142134912610142, + "loss": 3.3549, + "step": 2821 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001213924599162213, + "loss": 3.2788, + "step": 2822 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012136357070634119, + "loss": 3.5271, + "step": 2823 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012133468149646108, + "loss": 3.1716, + "step": 2824 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012130579228658096, + "loss": 3.3601, + "step": 2825 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012127690307670086, + "loss": 3.4085, + "step": 2826 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012124801386682074, + "loss": 3.2235, + "step": 2827 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012121912465694063, + "loss": 3.4152, + "step": 2828 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012119023544706051, + "loss": 3.291, + "step": 2829 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012116134623718043, + "loss": 3.322, + "step": 2830 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012113245702730031, + "loss": 3.3296, + "step": 2831 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012110356781742021, + "loss": 3.2322, + "step": 2832 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012107467860754009, + "loss": 3.4277, + "step": 2833 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012104578939765998, + "loss": 3.1975, + "step": 2834 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012101690018777988, + "loss": 3.3079, + "step": 2835 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012098801097789976, + "loss": 3.4171, + "step": 2836 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012095912176801965, + "loss": 3.206, + "step": 2837 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012093023255813953, + "loss": 3.3719, + "step": 2838 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012090134334825942, + "loss": 3.536, + "step": 2839 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012087245413837933, + "loss": 3.3528, + "step": 2840 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012084356492849923, + "loss": 3.3847, + "step": 2841 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001208146757186191, + "loss": 2.9773, + "step": 2842 + }, + { + "epoch": 0.4, + "learning_rate": 0.000120785786508739, + "loss": 3.3853, + "step": 2843 + }, + { + "epoch": 0.4, + "learning_rate": 0.00012075689729885888, + "loss": 3.3528, + "step": 2844 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012072800808897877, + "loss": 3.3483, + "step": 2845 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012069911887909865, + "loss": 3.4658, + "step": 2846 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012067022966921855, + "loss": 3.346, + "step": 2847 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012064134045933844, + "loss": 3.2132, + "step": 2848 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012061245124945832, + "loss": 3.4038, + "step": 2849 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012058356203957822, + "loss": 3.3602, + "step": 2850 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012055467282969812, + "loss": 3.4381, + "step": 2851 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012052578361981802, + "loss": 3.4591, + "step": 2852 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001204968944099379, + "loss": 3.3465, + "step": 2853 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012046800520005779, + "loss": 3.3748, + "step": 2854 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012043911599017767, + "loss": 3.3602, + "step": 2855 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012041022678029756, + "loss": 3.3046, + "step": 2856 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012038133757041745, + "loss": 3.4129, + "step": 2857 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012035244836053734, + "loss": 3.3221, + "step": 2858 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012032355915065723, + "loss": 3.351, + "step": 2859 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012029466994077711, + "loss": 3.4366, + "step": 2860 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012026578073089702, + "loss": 3.3561, + "step": 2861 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012023689152101691, + "loss": 3.4635, + "step": 2862 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012020800231113681, + "loss": 3.3082, + "step": 2863 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012017911310125669, + "loss": 3.4234, + "step": 2864 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012015022389137658, + "loss": 3.3442, + "step": 2865 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012012133468149646, + "loss": 3.3602, + "step": 2866 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012009244547161636, + "loss": 3.0126, + "step": 2867 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012006355626173624, + "loss": 3.2896, + "step": 2868 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012003466705185613, + "loss": 3.4567, + "step": 2869 + }, + { + "epoch": 0.41, + "learning_rate": 0.00012000577784197602, + "loss": 3.2361, + "step": 2870 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001199768886320959, + "loss": 3.4387, + "step": 2871 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011994799942221581, + "loss": 3.395, + "step": 2872 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001199191102123357, + "loss": 3.3643, + "step": 2873 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001198902210024556, + "loss": 3.4342, + "step": 2874 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011986133179257548, + "loss": 3.4494, + "step": 2875 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011983244258269537, + "loss": 3.3813, + "step": 2876 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011980355337281525, + "loss": 3.1537, + "step": 2877 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011977466416293515, + "loss": 3.4073, + "step": 2878 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011974577495305503, + "loss": 3.402, + "step": 2879 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011971688574317492, + "loss": 3.2713, + "step": 2880 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011968799653329482, + "loss": 3.454, + "step": 2881 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011965910732341472, + "loss": 3.2313, + "step": 2882 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001196302181135346, + "loss": 3.3605, + "step": 2883 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001196013289036545, + "loss": 3.3283, + "step": 2884 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011957243969377439, + "loss": 3.4797, + "step": 2885 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011954355048389427, + "loss": 3.4396, + "step": 2886 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011951466127401417, + "loss": 3.4563, + "step": 2887 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011948577206413405, + "loss": 3.4401, + "step": 2888 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011945688285425394, + "loss": 3.4105, + "step": 2889 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011942799364437382, + "loss": 3.2352, + "step": 2890 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011939910443449371, + "loss": 3.5398, + "step": 2891 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011937021522461361, + "loss": 3.2313, + "step": 2892 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011934132601473351, + "loss": 3.0719, + "step": 2893 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001193124368048534, + "loss": 3.3121, + "step": 2894 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011928354759497329, + "loss": 3.2686, + "step": 2895 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011925465838509318, + "loss": 3.3653, + "step": 2896 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011922576917521306, + "loss": 3.4363, + "step": 2897 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011919687996533296, + "loss": 3.525, + "step": 2898 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011916799075545284, + "loss": 3.3923, + "step": 2899 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011913910154557273, + "loss": 3.4238, + "step": 2900 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011911021233569261, + "loss": 3.2937, + "step": 2901 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001190813231258125, + "loss": 3.37, + "step": 2902 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011905243391593241, + "loss": 3.3827, + "step": 2903 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001190235447060523, + "loss": 3.4186, + "step": 2904 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011899465549617219, + "loss": 3.2927, + "step": 2905 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011896576628629208, + "loss": 3.2217, + "step": 2906 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011893687707641197, + "loss": 3.3386, + "step": 2907 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011890798786653185, + "loss": 3.3211, + "step": 2908 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011887909865665175, + "loss": 3.4936, + "step": 2909 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011885020944677163, + "loss": 3.3862, + "step": 2910 + }, + { + "epoch": 0.41, + "learning_rate": 0.00011882132023689152, + "loss": 3.3776, + "step": 2911 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001187924310270114, + "loss": 3.3151, + "step": 2912 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001187635418171313, + "loss": 3.1704, + "step": 2913 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001187346526072512, + "loss": 3.3042, + "step": 2914 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001187057633973711, + "loss": 3.0431, + "step": 2915 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011867687418749098, + "loss": 3.2402, + "step": 2916 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011864798497761087, + "loss": 3.3413, + "step": 2917 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011861909576773077, + "loss": 3.4283, + "step": 2918 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011859020655785065, + "loss": 3.2246, + "step": 2919 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011856131734797054, + "loss": 3.5293, + "step": 2920 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011853242813809042, + "loss": 3.309, + "step": 2921 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011850353892821031, + "loss": 3.4406, + "step": 2922 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001184746497183302, + "loss": 3.3076, + "step": 2923 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011844576050845011, + "loss": 3.3118, + "step": 2924 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011841687129857, + "loss": 3.3957, + "step": 2925 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011838798208868989, + "loss": 3.3812, + "step": 2926 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011835909287880977, + "loss": 3.3103, + "step": 2927 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011833020366892966, + "loss": 3.3207, + "step": 2928 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011830131445904956, + "loss": 3.3636, + "step": 2929 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011827242524916944, + "loss": 3.2148, + "step": 2930 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011824353603928933, + "loss": 3.4342, + "step": 2931 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011821464682940921, + "loss": 3.418, + "step": 2932 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001181857576195291, + "loss": 3.1815, + "step": 2933 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011815686840964899, + "loss": 3.3941, + "step": 2934 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001181279791997689, + "loss": 3.2534, + "step": 2935 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011809908998988879, + "loss": 3.2224, + "step": 2936 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011807020078000868, + "loss": 3.2472, + "step": 2937 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011804131157012856, + "loss": 3.3013, + "step": 2938 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011801242236024845, + "loss": 3.4542, + "step": 2939 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011798353315036835, + "loss": 3.3669, + "step": 2940 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011795464394048823, + "loss": 3.2054, + "step": 2941 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011792575473060812, + "loss": 3.4628, + "step": 2942 + }, + { + "epoch": 0.42, + "learning_rate": 0.000117896865520728, + "loss": 3.314, + "step": 2943 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001178679763108479, + "loss": 3.2845, + "step": 2944 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001178390871009678, + "loss": 3.3936, + "step": 2945 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001178101978910877, + "loss": 3.3622, + "step": 2946 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011778130868120758, + "loss": 3.4131, + "step": 2947 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011775241947132747, + "loss": 3.2771, + "step": 2948 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011772353026144735, + "loss": 3.4104, + "step": 2949 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011769464105156725, + "loss": 3.4032, + "step": 2950 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011766575184168714, + "loss": 3.2302, + "step": 2951 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011763686263180702, + "loss": 3.3607, + "step": 2952 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011760797342192691, + "loss": 3.2546, + "step": 2953 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001175790842120468, + "loss": 3.2651, + "step": 2954 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011755019500216669, + "loss": 3.2851, + "step": 2955 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001175213057922866, + "loss": 3.4795, + "step": 2956 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011749241658240649, + "loss": 3.4549, + "step": 2957 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011746352737252637, + "loss": 3.3106, + "step": 2958 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011743463816264626, + "loss": 3.363, + "step": 2959 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011740574895276614, + "loss": 3.2841, + "step": 2960 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011737685974288604, + "loss": 3.4212, + "step": 2961 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011734797053300593, + "loss": 3.3983, + "step": 2962 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011731908132312581, + "loss": 3.2952, + "step": 2963 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001172901921132457, + "loss": 3.4007, + "step": 2964 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011726130290336559, + "loss": 3.3081, + "step": 2965 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011723241369348549, + "loss": 3.4531, + "step": 2966 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011720352448360539, + "loss": 3.3969, + "step": 2967 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011717463527372528, + "loss": 3.3213, + "step": 2968 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011714574606384516, + "loss": 3.3514, + "step": 2969 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011711685685396505, + "loss": 3.3203, + "step": 2970 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011708796764408493, + "loss": 3.3769, + "step": 2971 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011705907843420483, + "loss": 3.3929, + "step": 2972 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011703018922432471, + "loss": 3.4084, + "step": 2973 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001170013000144446, + "loss": 3.2764, + "step": 2974 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001169724108045645, + "loss": 3.3365, + "step": 2975 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001169435215946844, + "loss": 3.137, + "step": 2976 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011691463238480428, + "loss": 3.3314, + "step": 2977 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011688574317492418, + "loss": 3.2965, + "step": 2978 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011685685396504407, + "loss": 3.4489, + "step": 2979 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011682796475516395, + "loss": 3.2876, + "step": 2980 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011679907554528385, + "loss": 3.3736, + "step": 2981 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011677018633540373, + "loss": 3.4143, + "step": 2982 + }, + { + "epoch": 0.42, + "learning_rate": 0.00011674129712552362, + "loss": 3.3062, + "step": 2983 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001167124079156435, + "loss": 3.3008, + "step": 2984 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001166835187057634, + "loss": 3.5034, + "step": 2985 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011665462949588329, + "loss": 3.2881, + "step": 2986 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001166257402860032, + "loss": 3.3307, + "step": 2987 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011659685107612308, + "loss": 3.3547, + "step": 2988 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011656796186624297, + "loss": 3.1519, + "step": 2989 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011653907265636286, + "loss": 3.4225, + "step": 2990 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011651018344648274, + "loss": 3.3672, + "step": 2991 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011648129423660264, + "loss": 3.4003, + "step": 2992 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011645240502672252, + "loss": 3.4046, + "step": 2993 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011642351581684241, + "loss": 3.3142, + "step": 2994 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011639462660696229, + "loss": 3.3948, + "step": 2995 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011636573739708219, + "loss": 3.3539, + "step": 2996 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011633684818720209, + "loss": 3.4269, + "step": 2997 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011630795897732199, + "loss": 3.4374, + "step": 2998 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011627906976744187, + "loss": 3.212, + "step": 2999 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011625018055756176, + "loss": 3.3384, + "step": 3000 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011622129134768165, + "loss": 3.5069, + "step": 3001 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011619240213780153, + "loss": 3.2896, + "step": 3002 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011616351292792143, + "loss": 3.2173, + "step": 3003 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011613462371804131, + "loss": 3.2881, + "step": 3004 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001161057345081612, + "loss": 3.4238, + "step": 3005 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011607684529828108, + "loss": 3.3646, + "step": 3006 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011604795608840098, + "loss": 3.2412, + "step": 3007 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011601906687852088, + "loss": 3.3506, + "step": 3008 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011599017766864078, + "loss": 3.3297, + "step": 3009 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011596128845876066, + "loss": 3.3462, + "step": 3010 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011593239924888055, + "loss": 3.3547, + "step": 3011 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011590351003900045, + "loss": 3.4532, + "step": 3012 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011587462082912033, + "loss": 3.4124, + "step": 3013 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011584573161924022, + "loss": 3.3584, + "step": 3014 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001158168424093601, + "loss": 3.3526, + "step": 3015 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011578795319948, + "loss": 3.338, + "step": 3016 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011575906398959987, + "loss": 3.4493, + "step": 3017 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001157301747797198, + "loss": 3.3514, + "step": 3018 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011570128556983968, + "loss": 3.423, + "step": 3019 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011567239635995957, + "loss": 3.3087, + "step": 3020 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011564350715007945, + "loss": 3.3231, + "step": 3021 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011561461794019934, + "loss": 3.2946, + "step": 3022 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011558572873031924, + "loss": 3.3502, + "step": 3023 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011555683952043912, + "loss": 3.3376, + "step": 3024 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011552795031055901, + "loss": 3.4109, + "step": 3025 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011549906110067889, + "loss": 3.3582, + "step": 3026 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011547017189079879, + "loss": 3.2769, + "step": 3027 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011544128268091867, + "loss": 3.4471, + "step": 3028 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011541239347103859, + "loss": 3.3088, + "step": 3029 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011538350426115847, + "loss": 3.434, + "step": 3030 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011535461505127836, + "loss": 3.4577, + "step": 3031 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011532572584139824, + "loss": 3.4747, + "step": 3032 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011529683663151814, + "loss": 3.339, + "step": 3033 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011526794742163803, + "loss": 3.2112, + "step": 3034 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011523905821175791, + "loss": 3.2662, + "step": 3035 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001152101690018778, + "loss": 3.3228, + "step": 3036 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011518127979199768, + "loss": 3.4458, + "step": 3037 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011515239058211758, + "loss": 3.4888, + "step": 3038 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011512350137223748, + "loss": 3.4086, + "step": 3039 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011509461216235738, + "loss": 3.3899, + "step": 3040 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011506572295247726, + "loss": 3.2718, + "step": 3041 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011503683374259715, + "loss": 3.3835, + "step": 3042 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011500794453271703, + "loss": 3.3354, + "step": 3043 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011497905532283693, + "loss": 3.3969, + "step": 3044 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011495016611295682, + "loss": 3.2528, + "step": 3045 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001149212769030767, + "loss": 3.2295, + "step": 3046 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001148923876931966, + "loss": 3.4159, + "step": 3047 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011486349848331647, + "loss": 3.4099, + "step": 3048 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011483460927343637, + "loss": 3.3859, + "step": 3049 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011480572006355628, + "loss": 3.381, + "step": 3050 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011477683085367617, + "loss": 3.1571, + "step": 3051 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011474794164379605, + "loss": 3.3107, + "step": 3052 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011471905243391594, + "loss": 3.3559, + "step": 3053 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011469016322403582, + "loss": 3.3384, + "step": 3054 + }, + { + "epoch": 0.43, + "learning_rate": 0.00011466127401415572, + "loss": 3.2054, + "step": 3055 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011463238480427561, + "loss": 3.3614, + "step": 3056 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011460349559439549, + "loss": 3.418, + "step": 3057 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011457460638451539, + "loss": 3.2436, + "step": 3058 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011454571717463527, + "loss": 3.3524, + "step": 3059 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011451682796475517, + "loss": 3.5018, + "step": 3060 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011448793875487507, + "loss": 3.4198, + "step": 3061 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011445904954499496, + "loss": 3.183, + "step": 3062 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011443016033511484, + "loss": 3.4058, + "step": 3063 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011440127112523474, + "loss": 3.3522, + "step": 3064 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011437238191535462, + "loss": 3.2152, + "step": 3065 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011434349270547451, + "loss": 3.2856, + "step": 3066 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001143146034955944, + "loss": 3.3922, + "step": 3067 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011428571428571428, + "loss": 3.3337, + "step": 3068 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011425682507583418, + "loss": 3.3856, + "step": 3069 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011422793586595406, + "loss": 3.4281, + "step": 3070 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011419904665607396, + "loss": 3.2898, + "step": 3071 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011417015744619386, + "loss": 3.2965, + "step": 3072 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011414126823631375, + "loss": 3.3825, + "step": 3073 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011411237902643363, + "loss": 3.4231, + "step": 3074 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011408348981655353, + "loss": 3.4035, + "step": 3075 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011405460060667341, + "loss": 3.4066, + "step": 3076 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001140257113967933, + "loss": 3.2967, + "step": 3077 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001139968221869132, + "loss": 3.3218, + "step": 3078 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011396793297703307, + "loss": 3.461, + "step": 3079 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011393904376715297, + "loss": 3.4681, + "step": 3080 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011391015455727288, + "loss": 3.2937, + "step": 3081 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011388126534739276, + "loss": 3.3003, + "step": 3082 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011385237613751265, + "loss": 3.4808, + "step": 3083 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011382348692763254, + "loss": 3.3351, + "step": 3084 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011379459771775242, + "loss": 3.3666, + "step": 3085 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011376570850787232, + "loss": 3.3265, + "step": 3086 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001137368192979922, + "loss": 3.1792, + "step": 3087 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011370793008811209, + "loss": 3.3736, + "step": 3088 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011367904087823199, + "loss": 3.2193, + "step": 3089 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011365015166835187, + "loss": 3.3988, + "step": 3090 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011362126245847176, + "loss": 3.3794, + "step": 3091 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011359237324859167, + "loss": 3.2979, + "step": 3092 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011356348403871155, + "loss": 3.3949, + "step": 3093 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011353459482883144, + "loss": 3.3369, + "step": 3094 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011350570561895134, + "loss": 3.2751, + "step": 3095 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011347681640907122, + "loss": 3.4321, + "step": 3096 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011344792719919111, + "loss": 3.3877, + "step": 3097 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011341903798931099, + "loss": 3.3482, + "step": 3098 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011339014877943088, + "loss": 3.2839, + "step": 3099 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011336125956955076, + "loss": 3.3243, + "step": 3100 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011333237035967066, + "loss": 3.4563, + "step": 3101 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011330348114979057, + "loss": 3.4088, + "step": 3102 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011327459193991046, + "loss": 3.281, + "step": 3103 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011324570273003034, + "loss": 3.2996, + "step": 3104 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011321681352015023, + "loss": 3.1804, + "step": 3105 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011318792431027013, + "loss": 3.3312, + "step": 3106 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011315903510039001, + "loss": 3.4377, + "step": 3107 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001131301458905099, + "loss": 3.4095, + "step": 3108 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011310125668062978, + "loss": 3.3624, + "step": 3109 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011307236747074968, + "loss": 3.3228, + "step": 3110 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011304347826086956, + "loss": 3.43, + "step": 3111 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011301458905098945, + "loss": 3.3281, + "step": 3112 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011298569984110936, + "loss": 3.243, + "step": 3113 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011295681063122925, + "loss": 3.4537, + "step": 3114 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011292792142134913, + "loss": 3.393, + "step": 3115 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011289903221146902, + "loss": 3.3423, + "step": 3116 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011287014300158892, + "loss": 3.4138, + "step": 3117 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001128412537917088, + "loss": 3.2931, + "step": 3118 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011281236458182869, + "loss": 3.4065, + "step": 3119 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011278347537194857, + "loss": 3.2166, + "step": 3120 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011275458616206847, + "loss": 3.3086, + "step": 3121 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011272569695218835, + "loss": 3.3501, + "step": 3122 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011269680774230827, + "loss": 3.5366, + "step": 3123 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011266791853242815, + "loss": 3.4074, + "step": 3124 + }, + { + "epoch": 0.44, + "learning_rate": 0.00011263902932254804, + "loss": 3.1839, + "step": 3125 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011261014011266792, + "loss": 3.3956, + "step": 3126 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011258125090278782, + "loss": 3.3401, + "step": 3127 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011255236169290771, + "loss": 3.3493, + "step": 3128 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011252347248302759, + "loss": 3.38, + "step": 3129 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011249458327314748, + "loss": 3.2987, + "step": 3130 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011246569406326736, + "loss": 3.2619, + "step": 3131 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011243680485338726, + "loss": 3.3791, + "step": 3132 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011240791564350714, + "loss": 3.401, + "step": 3133 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011237902643362706, + "loss": 3.1383, + "step": 3134 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011235013722374694, + "loss": 3.3987, + "step": 3135 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011232124801386683, + "loss": 3.248, + "step": 3136 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011229235880398671, + "loss": 3.2962, + "step": 3137 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011226346959410661, + "loss": 3.2754, + "step": 3138 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001122345803842265, + "loss": 3.4079, + "step": 3139 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011220569117434638, + "loss": 3.4831, + "step": 3140 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011217680196446628, + "loss": 3.4135, + "step": 3141 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011214791275458616, + "loss": 3.3875, + "step": 3142 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011211902354470605, + "loss": 3.2953, + "step": 3143 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011209013433482596, + "loss": 3.4627, + "step": 3144 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011206124512494585, + "loss": 3.2988, + "step": 3145 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011203235591506573, + "loss": 3.2434, + "step": 3146 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011200346670518562, + "loss": 3.4888, + "step": 3147 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001119745774953055, + "loss": 3.3249, + "step": 3148 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001119456882854254, + "loss": 3.4025, + "step": 3149 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011191679907554529, + "loss": 3.4046, + "step": 3150 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011188790986566517, + "loss": 3.2726, + "step": 3151 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011185902065578507, + "loss": 3.3567, + "step": 3152 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011183013144590495, + "loss": 3.3017, + "step": 3153 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011180124223602484, + "loss": 3.3534, + "step": 3154 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011177235302614475, + "loss": 3.3808, + "step": 3155 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011174346381626464, + "loss": 3.2337, + "step": 3156 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011171457460638452, + "loss": 3.3304, + "step": 3157 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011168568539650442, + "loss": 3.2466, + "step": 3158 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001116567961866243, + "loss": 3.4152, + "step": 3159 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011162790697674419, + "loss": 3.2942, + "step": 3160 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011159901776686408, + "loss": 3.3444, + "step": 3161 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011157012855698396, + "loss": 3.3307, + "step": 3162 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011154123934710386, + "loss": 3.3356, + "step": 3163 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011151235013722374, + "loss": 3.4839, + "step": 3164 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011148346092734366, + "loss": 3.4511, + "step": 3165 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011145457171746354, + "loss": 3.3769, + "step": 3166 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011142568250758343, + "loss": 3.2655, + "step": 3167 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011139679329770331, + "loss": 3.3711, + "step": 3168 + }, + { + "epoch": 0.45, + "eval_loss": 3.5232768058776855, + "eval_runtime": 472.4363, + "eval_samples_per_second": 43.365, + "eval_steps_per_second": 14.455, + "step": 3168 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011136790408782321, + "loss": 3.407, + "step": 3169 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011133901487794309, + "loss": 3.4157, + "step": 3170 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011131012566806298, + "loss": 3.2203, + "step": 3171 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011128123645818288, + "loss": 3.3327, + "step": 3172 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011125234724830276, + "loss": 3.2629, + "step": 3173 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011122345803842265, + "loss": 3.3861, + "step": 3174 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011119456882854256, + "loss": 3.3472, + "step": 3175 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011116567961866244, + "loss": 3.4091, + "step": 3176 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011113679040878233, + "loss": 3.2157, + "step": 3177 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011110790119890222, + "loss": 3.3241, + "step": 3178 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001110790119890221, + "loss": 3.3768, + "step": 3179 + }, + { + "epoch": 0.45, + "learning_rate": 0.000111050122779142, + "loss": 3.3307, + "step": 3180 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011102123356926188, + "loss": 3.4113, + "step": 3181 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011099234435938177, + "loss": 3.2805, + "step": 3182 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011096345514950167, + "loss": 3.4032, + "step": 3183 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011093456593962155, + "loss": 3.0521, + "step": 3184 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011090567672974144, + "loss": 3.2905, + "step": 3185 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011087678751986135, + "loss": 3.2763, + "step": 3186 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011084789830998123, + "loss": 3.2978, + "step": 3187 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011081900910010112, + "loss": 3.5079, + "step": 3188 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011079011989022102, + "loss": 3.3663, + "step": 3189 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001107612306803409, + "loss": 3.3789, + "step": 3190 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011073234147046079, + "loss": 3.3341, + "step": 3191 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011070345226058067, + "loss": 3.3016, + "step": 3192 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011067456305070056, + "loss": 3.3406, + "step": 3193 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011064567384082046, + "loss": 3.239, + "step": 3194 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011061678463094034, + "loss": 3.3135, + "step": 3195 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011058789542106025, + "loss": 3.4249, + "step": 3196 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011055900621118014, + "loss": 3.3227, + "step": 3197 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011053011700130002, + "loss": 3.2321, + "step": 3198 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011050122779141991, + "loss": 3.3633, + "step": 3199 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011047233858153981, + "loss": 3.1829, + "step": 3200 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011044344937165969, + "loss": 3.4205, + "step": 3201 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011041456016177958, + "loss": 3.3592, + "step": 3202 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011038567095189946, + "loss": 3.295, + "step": 3203 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011035678174201936, + "loss": 3.4284, + "step": 3204 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011032789253213925, + "loss": 3.2576, + "step": 3205 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011029900332225913, + "loss": 3.2283, + "step": 3206 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011027011411237904, + "loss": 3.3789, + "step": 3207 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011024122490249893, + "loss": 3.3567, + "step": 3208 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011021233569261881, + "loss": 3.2874, + "step": 3209 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001101834464827387, + "loss": 3.2771, + "step": 3210 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001101545572728586, + "loss": 3.4531, + "step": 3211 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011012566806297848, + "loss": 3.4304, + "step": 3212 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011009677885309837, + "loss": 3.3653, + "step": 3213 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011006788964321825, + "loss": 3.198, + "step": 3214 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011003900043333815, + "loss": 3.4138, + "step": 3215 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011001011122345804, + "loss": 3.3546, + "step": 3216 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010998122201357795, + "loss": 3.361, + "step": 3217 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010995233280369783, + "loss": 3.4561, + "step": 3218 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010992344359381772, + "loss": 3.3528, + "step": 3219 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001098945543839376, + "loss": 3.2822, + "step": 3220 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001098656651740575, + "loss": 3.138, + "step": 3221 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010983677596417739, + "loss": 3.3132, + "step": 3222 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010980788675429727, + "loss": 3.1742, + "step": 3223 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010977899754441716, + "loss": 3.4067, + "step": 3224 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010975010833453704, + "loss": 3.121, + "step": 3225 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010972121912465694, + "loss": 3.2398, + "step": 3226 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010969232991477683, + "loss": 3.3237, + "step": 3227 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010966344070489674, + "loss": 3.2556, + "step": 3228 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010963455149501662, + "loss": 3.3521, + "step": 3229 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010960566228513651, + "loss": 3.2995, + "step": 3230 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001095767730752564, + "loss": 3.2207, + "step": 3231 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010954788386537629, + "loss": 3.2197, + "step": 3232 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010951899465549618, + "loss": 3.2293, + "step": 3233 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010949010544561606, + "loss": 3.2608, + "step": 3234 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010946121623573596, + "loss": 3.0432, + "step": 3235 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010943232702585584, + "loss": 3.3144, + "step": 3236 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010940343781597573, + "loss": 3.304, + "step": 3237 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010937454860609564, + "loss": 3.3748, + "step": 3238 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010934565939621553, + "loss": 3.3572, + "step": 3239 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010931677018633541, + "loss": 3.3201, + "step": 3240 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001092878809764553, + "loss": 3.3465, + "step": 3241 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010925899176657519, + "loss": 3.4183, + "step": 3242 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010923010255669508, + "loss": 3.2054, + "step": 3243 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010920121334681497, + "loss": 3.3875, + "step": 3244 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010917232413693485, + "loss": 3.2239, + "step": 3245 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010914343492705475, + "loss": 3.4065, + "step": 3246 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010911454571717463, + "loss": 3.3298, + "step": 3247 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010908565650729452, + "loss": 3.503, + "step": 3248 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010905676729741443, + "loss": 3.1974, + "step": 3249 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010902787808753432, + "loss": 3.2414, + "step": 3250 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001089989888776542, + "loss": 3.2385, + "step": 3251 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001089700996677741, + "loss": 3.3298, + "step": 3252 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010894121045789398, + "loss": 3.3801, + "step": 3253 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010891232124801387, + "loss": 3.1857, + "step": 3254 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010888343203813376, + "loss": 3.2238, + "step": 3255 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010885454282825365, + "loss": 3.3985, + "step": 3256 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010882565361837354, + "loss": 3.4367, + "step": 3257 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010879676440849342, + "loss": 3.287, + "step": 3258 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010876787519861334, + "loss": 3.4034, + "step": 3259 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010873898598873322, + "loss": 3.3362, + "step": 3260 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010871009677885311, + "loss": 3.3303, + "step": 3261 + }, + { + "epoch": 0.46, + "learning_rate": 0.000108681207568973, + "loss": 3.3775, + "step": 3262 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010865231835909289, + "loss": 3.3738, + "step": 3263 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010862342914921277, + "loss": 3.2712, + "step": 3264 + }, + { + "epoch": 0.46, + "learning_rate": 0.00010859453993933266, + "loss": 3.3352, + "step": 3265 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010856565072945256, + "loss": 3.3474, + "step": 3266 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010853676151957244, + "loss": 3.2761, + "step": 3267 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010850787230969233, + "loss": 3.2135, + "step": 3268 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010847898309981221, + "loss": 3.3481, + "step": 3269 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010845009388993213, + "loss": 3.3173, + "step": 3270 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010842120468005201, + "loss": 3.2898, + "step": 3271 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001083923154701719, + "loss": 3.2506, + "step": 3272 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010836342626029179, + "loss": 3.2026, + "step": 3273 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010833453705041168, + "loss": 3.3621, + "step": 3274 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010830564784053156, + "loss": 3.2253, + "step": 3275 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010827675863065145, + "loss": 3.3631, + "step": 3276 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010824786942077135, + "loss": 3.4034, + "step": 3277 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010821898021089123, + "loss": 3.3907, + "step": 3278 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010819009100101112, + "loss": 3.3238, + "step": 3279 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010816120179113103, + "loss": 3.4008, + "step": 3280 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010813231258125092, + "loss": 3.1676, + "step": 3281 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001081034233713708, + "loss": 3.1822, + "step": 3282 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001080745341614907, + "loss": 3.2965, + "step": 3283 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010804564495161058, + "loss": 3.406, + "step": 3284 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010801675574173047, + "loss": 3.2503, + "step": 3285 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010798786653185035, + "loss": 3.3469, + "step": 3286 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010795897732197025, + "loss": 3.4601, + "step": 3287 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010793008811209014, + "loss": 3.3476, + "step": 3288 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010790119890221002, + "loss": 3.1634, + "step": 3289 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010787230969232991, + "loss": 3.2858, + "step": 3290 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010784342048244982, + "loss": 3.3034, + "step": 3291 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010781453127256971, + "loss": 3.2368, + "step": 3292 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001077856420626896, + "loss": 3.2122, + "step": 3293 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010775675285280949, + "loss": 3.2798, + "step": 3294 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010772786364292937, + "loss": 3.2974, + "step": 3295 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010769897443304926, + "loss": 3.3698, + "step": 3296 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010767008522316914, + "loss": 3.3504, + "step": 3297 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010764119601328904, + "loss": 3.1445, + "step": 3298 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010761230680340893, + "loss": 3.3549, + "step": 3299 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010758341759352881, + "loss": 3.1997, + "step": 3300 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010755452838364872, + "loss": 3.2635, + "step": 3301 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010752563917376861, + "loss": 3.3632, + "step": 3302 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010749674996388849, + "loss": 3.2498, + "step": 3303 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010746786075400839, + "loss": 3.2627, + "step": 3304 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010743897154412828, + "loss": 3.3686, + "step": 3305 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010741008233424816, + "loss": 3.2816, + "step": 3306 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010738119312436805, + "loss": 3.2955, + "step": 3307 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010735230391448793, + "loss": 3.3908, + "step": 3308 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010732341470460783, + "loss": 3.282, + "step": 3309 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010729452549472772, + "loss": 3.3469, + "step": 3310 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001072656362848476, + "loss": 3.2559, + "step": 3311 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010723674707496751, + "loss": 3.3632, + "step": 3312 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001072078578650874, + "loss": 3.3206, + "step": 3313 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010717896865520728, + "loss": 3.3854, + "step": 3314 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010715007944532718, + "loss": 3.2409, + "step": 3315 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010712119023544707, + "loss": 3.3704, + "step": 3316 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010709230102556695, + "loss": 3.202, + "step": 3317 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010706341181568685, + "loss": 3.3145, + "step": 3318 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010703452260580673, + "loss": 3.2447, + "step": 3319 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010700563339592662, + "loss": 3.4438, + "step": 3320 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010697674418604651, + "loss": 3.2226, + "step": 3321 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010694785497616642, + "loss": 3.3073, + "step": 3322 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001069189657662863, + "loss": 3.31, + "step": 3323 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001068900765564062, + "loss": 3.1201, + "step": 3324 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010686118734652608, + "loss": 3.4242, + "step": 3325 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010683229813664597, + "loss": 3.2957, + "step": 3326 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010680340892676586, + "loss": 3.3738, + "step": 3327 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010677451971688574, + "loss": 3.1818, + "step": 3328 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010674563050700564, + "loss": 3.3446, + "step": 3329 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010671674129712552, + "loss": 3.3844, + "step": 3330 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010668785208724541, + "loss": 3.3296, + "step": 3331 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001066589628773653, + "loss": 3.3716, + "step": 3332 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010663007366748521, + "loss": 3.1275, + "step": 3333 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010660118445760509, + "loss": 3.0919, + "step": 3334 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010657229524772499, + "loss": 3.3065, + "step": 3335 + }, + { + "epoch": 0.47, + "learning_rate": 0.00010654340603784487, + "loss": 3.3737, + "step": 3336 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010651451682796476, + "loss": 3.3247, + "step": 3337 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010648562761808465, + "loss": 3.0945, + "step": 3338 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010645673840820453, + "loss": 3.3309, + "step": 3339 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010642784919832443, + "loss": 3.3925, + "step": 3340 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010639895998844431, + "loss": 3.3906, + "step": 3341 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001063700707785642, + "loss": 3.3281, + "step": 3342 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010634118156868411, + "loss": 3.3451, + "step": 3343 + }, + { + "epoch": 0.48, + "learning_rate": 0.000106312292358804, + "loss": 3.4958, + "step": 3344 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010628340314892388, + "loss": 3.3795, + "step": 3345 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010625451393904378, + "loss": 3.1946, + "step": 3346 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010622562472916366, + "loss": 3.2881, + "step": 3347 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010619673551928355, + "loss": 3.3135, + "step": 3348 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010616784630940345, + "loss": 3.3223, + "step": 3349 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010613895709952333, + "loss": 3.4084, + "step": 3350 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010611006788964322, + "loss": 3.1788, + "step": 3351 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001060811786797631, + "loss": 3.2679, + "step": 3352 + }, + { + "epoch": 0.48, + "learning_rate": 0.000106052289469883, + "loss": 3.2749, + "step": 3353 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001060234002600029, + "loss": 3.2444, + "step": 3354 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001059945110501228, + "loss": 3.3425, + "step": 3355 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010596562184024268, + "loss": 3.2667, + "step": 3356 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010593673263036257, + "loss": 3.4639, + "step": 3357 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010590784342048245, + "loss": 3.283, + "step": 3358 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010587895421060234, + "loss": 3.3782, + "step": 3359 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010585006500072224, + "loss": 3.3153, + "step": 3360 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010582117579084212, + "loss": 3.3003, + "step": 3361 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010579228658096201, + "loss": 3.3667, + "step": 3362 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010576339737108189, + "loss": 3.1026, + "step": 3363 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010573450816120181, + "loss": 3.3164, + "step": 3364 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010570561895132169, + "loss": 3.3183, + "step": 3365 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010567672974144159, + "loss": 3.2863, + "step": 3366 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010564784053156147, + "loss": 3.4557, + "step": 3367 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010561895132168136, + "loss": 3.3675, + "step": 3368 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010559006211180124, + "loss": 3.2719, + "step": 3369 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010556117290192113, + "loss": 3.3039, + "step": 3370 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010553228369204103, + "loss": 3.3442, + "step": 3371 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010550339448216091, + "loss": 3.3411, + "step": 3372 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001054745052722808, + "loss": 3.2531, + "step": 3373 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010544561606240071, + "loss": 3.3079, + "step": 3374 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001054167268525206, + "loss": 3.2034, + "step": 3375 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010538783764264048, + "loss": 3.1568, + "step": 3376 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010535894843276038, + "loss": 3.3041, + "step": 3377 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010533005922288026, + "loss": 3.2839, + "step": 3378 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010530117001300015, + "loss": 3.2965, + "step": 3379 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010527228080312003, + "loss": 3.2, + "step": 3380 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010524339159323993, + "loss": 3.2672, + "step": 3381 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010521450238335982, + "loss": 3.3248, + "step": 3382 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001051856131734797, + "loss": 3.4735, + "step": 3383 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001051567239635996, + "loss": 3.4084, + "step": 3384 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001051278347537195, + "loss": 3.1859, + "step": 3385 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001050989455438394, + "loss": 3.2975, + "step": 3386 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010507005633395928, + "loss": 3.4183, + "step": 3387 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010504116712407917, + "loss": 3.276, + "step": 3388 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010501227791419905, + "loss": 3.2373, + "step": 3389 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010498338870431894, + "loss": 3.3277, + "step": 3390 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010495449949443882, + "loss": 3.2967, + "step": 3391 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010492561028455872, + "loss": 3.476, + "step": 3392 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010489672107467861, + "loss": 3.1755, + "step": 3393 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010486783186479849, + "loss": 3.3237, + "step": 3394 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001048389426549184, + "loss": 3.3224, + "step": 3395 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010481005344503829, + "loss": 3.212, + "step": 3396 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010478116423515819, + "loss": 3.2954, + "step": 3397 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010475227502527807, + "loss": 3.4549, + "step": 3398 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010472338581539796, + "loss": 3.2889, + "step": 3399 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010469449660551784, + "loss": 3.3868, + "step": 3400 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010466560739563773, + "loss": 3.3967, + "step": 3401 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010463671818575762, + "loss": 3.2687, + "step": 3402 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010460782897587751, + "loss": 3.2324, + "step": 3403 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001045789397659974, + "loss": 3.3779, + "step": 3404 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010455005055611728, + "loss": 3.2455, + "step": 3405 + }, + { + "epoch": 0.48, + "learning_rate": 0.00010452116134623719, + "loss": 3.3532, + "step": 3406 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010449227213635708, + "loss": 3.2354, + "step": 3407 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010446338292647698, + "loss": 3.3761, + "step": 3408 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010443449371659686, + "loss": 3.2817, + "step": 3409 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010440560450671675, + "loss": 3.2361, + "step": 3410 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010437671529683663, + "loss": 3.2874, + "step": 3411 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010434782608695653, + "loss": 3.3086, + "step": 3412 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001043189368770764, + "loss": 3.3358, + "step": 3413 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001042900476671963, + "loss": 3.2998, + "step": 3414 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001042611584573162, + "loss": 3.3185, + "step": 3415 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001042322692474361, + "loss": 3.2077, + "step": 3416 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010420338003755598, + "loss": 3.3911, + "step": 3417 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010417449082767588, + "loss": 3.3192, + "step": 3418 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010414560161779577, + "loss": 3.3712, + "step": 3419 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010411671240791565, + "loss": 3.2438, + "step": 3420 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010408782319803554, + "loss": 3.3327, + "step": 3421 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010405893398815542, + "loss": 3.2294, + "step": 3422 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010403004477827532, + "loss": 3.4021, + "step": 3423 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001040011555683952, + "loss": 3.3835, + "step": 3424 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010397226635851509, + "loss": 3.4587, + "step": 3425 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010394337714863499, + "loss": 3.4684, + "step": 3426 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010391448793875489, + "loss": 3.3067, + "step": 3427 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010388559872887477, + "loss": 3.1844, + "step": 3428 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010385670951899467, + "loss": 3.3517, + "step": 3429 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010382782030911456, + "loss": 3.3095, + "step": 3430 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010379893109923444, + "loss": 3.3435, + "step": 3431 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010377004188935433, + "loss": 3.2317, + "step": 3432 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010374115267947422, + "loss": 3.2868, + "step": 3433 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010371226346959411, + "loss": 3.3199, + "step": 3434 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010368337425971399, + "loss": 3.167, + "step": 3435 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010365448504983388, + "loss": 3.3248, + "step": 3436 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010362559583995379, + "loss": 3.262, + "step": 3437 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010359670663007368, + "loss": 3.3651, + "step": 3438 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010356781742019356, + "loss": 3.3711, + "step": 3439 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010353892821031346, + "loss": 3.2943, + "step": 3440 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010351003900043334, + "loss": 3.2476, + "step": 3441 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010348114979055323, + "loss": 3.1206, + "step": 3442 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010345226058067313, + "loss": 3.3442, + "step": 3443 + }, + { + "epoch": 0.49, + "learning_rate": 0.000103423371370793, + "loss": 3.378, + "step": 3444 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001033944821609129, + "loss": 3.4165, + "step": 3445 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010336559295103278, + "loss": 3.3018, + "step": 3446 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010333670374115267, + "loss": 3.4229, + "step": 3447 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010330781453127258, + "loss": 3.3003, + "step": 3448 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010327892532139248, + "loss": 3.2404, + "step": 3449 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010325003611151236, + "loss": 3.361, + "step": 3450 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010322114690163225, + "loss": 3.2063, + "step": 3451 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010319225769175213, + "loss": 3.3266, + "step": 3452 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010316336848187202, + "loss": 3.3469, + "step": 3453 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010313447927199192, + "loss": 3.2305, + "step": 3454 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001031055900621118, + "loss": 3.2308, + "step": 3455 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010307670085223169, + "loss": 3.3362, + "step": 3456 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010304781164235157, + "loss": 3.3026, + "step": 3457 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010301892243247149, + "loss": 3.2693, + "step": 3458 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010299003322259137, + "loss": 3.36, + "step": 3459 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010296114401271127, + "loss": 3.1219, + "step": 3460 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010293225480283115, + "loss": 3.2911, + "step": 3461 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010290336559295104, + "loss": 3.4447, + "step": 3462 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010287447638307092, + "loss": 3.2468, + "step": 3463 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010284558717319082, + "loss": 3.3301, + "step": 3464 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010281669796331071, + "loss": 3.2482, + "step": 3465 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010278780875343059, + "loss": 3.4332, + "step": 3466 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010275891954355048, + "loss": 3.3375, + "step": 3467 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010273003033367036, + "loss": 3.2605, + "step": 3468 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010270114112379028, + "loss": 3.3278, + "step": 3469 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010267225191391016, + "loss": 3.1607, + "step": 3470 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010264336270403006, + "loss": 3.1191, + "step": 3471 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010261447349414994, + "loss": 3.3582, + "step": 3472 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010258558428426983, + "loss": 3.3878, + "step": 3473 + }, + { + "epoch": 0.49, + "learning_rate": 0.00010255669507438971, + "loss": 3.3283, + "step": 3474 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001025278058645096, + "loss": 3.3213, + "step": 3475 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001024989166546295, + "loss": 3.2, + "step": 3476 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010247002744474938, + "loss": 3.3431, + "step": 3477 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010244113823486927, + "loss": 3.3376, + "step": 3478 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010241224902498918, + "loss": 3.3633, + "step": 3479 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010238335981510908, + "loss": 3.3458, + "step": 3480 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010235447060522896, + "loss": 3.3621, + "step": 3481 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010232558139534885, + "loss": 3.3093, + "step": 3482 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010229669218546873, + "loss": 3.2172, + "step": 3483 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010226780297558862, + "loss": 3.3148, + "step": 3484 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001022389137657085, + "loss": 3.2161, + "step": 3485 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001022100245558284, + "loss": 3.3887, + "step": 3486 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010218113534594829, + "loss": 3.2185, + "step": 3487 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010215224613606817, + "loss": 3.2962, + "step": 3488 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010212335692618807, + "loss": 3.4699, + "step": 3489 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010209446771630797, + "loss": 3.3173, + "step": 3490 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010206557850642787, + "loss": 3.3064, + "step": 3491 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010203668929654775, + "loss": 3.2422, + "step": 3492 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010200780008666764, + "loss": 3.454, + "step": 3493 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010197891087678752, + "loss": 3.3527, + "step": 3494 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010195002166690742, + "loss": 3.2701, + "step": 3495 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001019211324570273, + "loss": 3.3632, + "step": 3496 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010189224324714719, + "loss": 3.2785, + "step": 3497 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010186335403726708, + "loss": 3.2947, + "step": 3498 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010183446482738696, + "loss": 3.31, + "step": 3499 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010180557561750687, + "loss": 3.2879, + "step": 3500 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010177668640762676, + "loss": 3.3353, + "step": 3501 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010174779719774666, + "loss": 3.2496, + "step": 3502 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010171890798786654, + "loss": 3.3229, + "step": 3503 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010169001877798643, + "loss": 3.2976, + "step": 3504 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010166112956810631, + "loss": 3.4325, + "step": 3505 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010163224035822621, + "loss": 3.1573, + "step": 3506 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010160335114834609, + "loss": 3.1567, + "step": 3507 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010157446193846598, + "loss": 3.2892, + "step": 3508 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010154557272858587, + "loss": 3.3654, + "step": 3509 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010151668351870576, + "loss": 3.2534, + "step": 3510 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010148779430882566, + "loss": 3.1937, + "step": 3511 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010145890509894556, + "loss": 3.2221, + "step": 3512 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010143001588906545, + "loss": 3.145, + "step": 3513 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010140112667918533, + "loss": 3.1539, + "step": 3514 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010137223746930522, + "loss": 3.34, + "step": 3515 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001013433482594251, + "loss": 3.4149, + "step": 3516 + }, + { + "epoch": 0.5, + "learning_rate": 0.000101314459049545, + "loss": 3.3189, + "step": 3517 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010128556983966488, + "loss": 3.2401, + "step": 3518 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010125668062978477, + "loss": 3.2709, + "step": 3519 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010122779141990467, + "loss": 3.4195, + "step": 3520 + }, + { + "epoch": 0.5, + "eval_loss": 3.5005860328674316, + "eval_runtime": 472.0927, + "eval_samples_per_second": 43.396, + "eval_steps_per_second": 14.465, + "step": 3520 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010119890221002457, + "loss": 3.2958, + "step": 3521 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010117001300014445, + "loss": 3.4184, + "step": 3522 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010114112379026435, + "loss": 3.2575, + "step": 3523 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010111223458038424, + "loss": 3.3084, + "step": 3524 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010108334537050412, + "loss": 3.1578, + "step": 3525 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010105445616062402, + "loss": 3.3057, + "step": 3526 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001010255669507439, + "loss": 3.3618, + "step": 3527 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010099667774086379, + "loss": 3.4165, + "step": 3528 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010096778853098367, + "loss": 3.0307, + "step": 3529 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010093889932110356, + "loss": 3.4035, + "step": 3530 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010091001011122346, + "loss": 3.2687, + "step": 3531 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010088112090134337, + "loss": 3.2818, + "step": 3532 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010085223169146325, + "loss": 3.4536, + "step": 3533 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010082334248158314, + "loss": 3.2161, + "step": 3534 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010079445327170303, + "loss": 3.1778, + "step": 3535 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010076556406182291, + "loss": 3.2902, + "step": 3536 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010073667485194281, + "loss": 3.3542, + "step": 3537 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010070778564206269, + "loss": 3.3223, + "step": 3538 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010067889643218258, + "loss": 3.2997, + "step": 3539 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010065000722230246, + "loss": 3.297, + "step": 3540 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010062111801242236, + "loss": 3.2426, + "step": 3541 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010059222880254226, + "loss": 3.3119, + "step": 3542 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010056333959266216, + "loss": 3.3011, + "step": 3543 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010053445038278204, + "loss": 3.4242, + "step": 3544 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010050556117290193, + "loss": 3.3547, + "step": 3545 + }, + { + "epoch": 0.5, + "learning_rate": 0.00010047667196302182, + "loss": 3.1823, + "step": 3546 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001004477827531417, + "loss": 3.3671, + "step": 3547 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001004188935432616, + "loss": 3.2729, + "step": 3548 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010039000433338148, + "loss": 3.2731, + "step": 3549 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010036111512350137, + "loss": 3.3776, + "step": 3550 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010033222591362125, + "loss": 3.3522, + "step": 3551 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010030333670374115, + "loss": 3.2845, + "step": 3552 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010027444749386105, + "loss": 3.2711, + "step": 3553 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010024555828398095, + "loss": 3.3528, + "step": 3554 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010021666907410083, + "loss": 3.3548, + "step": 3555 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010018777986422072, + "loss": 3.1627, + "step": 3556 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010015889065434062, + "loss": 3.2985, + "step": 3557 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001001300014444605, + "loss": 3.4028, + "step": 3558 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010010111223458039, + "loss": 3.318, + "step": 3559 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010007222302470027, + "loss": 3.2277, + "step": 3560 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010004333381482016, + "loss": 3.3456, + "step": 3561 + }, + { + "epoch": 0.51, + "learning_rate": 0.00010001444460494004, + "loss": 3.2859, + "step": 3562 + }, + { + "epoch": 0.51, + "learning_rate": 9.998555539505995e-05, + "loss": 3.3805, + "step": 3563 + }, + { + "epoch": 0.51, + "learning_rate": 9.995666618517983e-05, + "loss": 3.2169, + "step": 3564 + }, + { + "epoch": 0.51, + "learning_rate": 9.992777697529973e-05, + "loss": 3.2084, + "step": 3565 + }, + { + "epoch": 0.51, + "learning_rate": 9.989888776541962e-05, + "loss": 3.4749, + "step": 3566 + }, + { + "epoch": 0.51, + "learning_rate": 9.986999855553951e-05, + "loss": 3.2897, + "step": 3567 + }, + { + "epoch": 0.51, + "learning_rate": 9.98411093456594e-05, + "loss": 3.3012, + "step": 3568 + }, + { + "epoch": 0.51, + "learning_rate": 9.981222013577929e-05, + "loss": 3.3644, + "step": 3569 + }, + { + "epoch": 0.51, + "learning_rate": 9.978333092589918e-05, + "loss": 3.2685, + "step": 3570 + }, + { + "epoch": 0.51, + "learning_rate": 9.975444171601908e-05, + "loss": 3.4293, + "step": 3571 + }, + { + "epoch": 0.51, + "learning_rate": 9.972555250613897e-05, + "loss": 3.4324, + "step": 3572 + }, + { + "epoch": 0.51, + "learning_rate": 9.969666329625885e-05, + "loss": 3.3411, + "step": 3573 + }, + { + "epoch": 0.51, + "learning_rate": 9.966777408637874e-05, + "loss": 3.3643, + "step": 3574 + }, + { + "epoch": 0.51, + "learning_rate": 9.963888487649862e-05, + "loss": 3.2441, + "step": 3575 + }, + { + "epoch": 0.51, + "learning_rate": 9.960999566661853e-05, + "loss": 3.3386, + "step": 3576 + }, + { + "epoch": 0.51, + "learning_rate": 9.958110645673841e-05, + "loss": 3.3044, + "step": 3577 + }, + { + "epoch": 0.51, + "learning_rate": 9.95522172468583e-05, + "loss": 3.2719, + "step": 3578 + }, + { + "epoch": 0.51, + "learning_rate": 9.952332803697819e-05, + "loss": 3.4724, + "step": 3579 + }, + { + "epoch": 0.51, + "learning_rate": 9.949443882709808e-05, + "loss": 3.4531, + "step": 3580 + }, + { + "epoch": 0.51, + "learning_rate": 9.946554961721797e-05, + "loss": 3.2413, + "step": 3581 + }, + { + "epoch": 0.51, + "learning_rate": 9.943666040733787e-05, + "loss": 3.3331, + "step": 3582 + }, + { + "epoch": 0.51, + "learning_rate": 9.940777119745776e-05, + "loss": 3.2825, + "step": 3583 + }, + { + "epoch": 0.51, + "learning_rate": 9.937888198757764e-05, + "loss": 3.1648, + "step": 3584 + }, + { + "epoch": 0.51, + "learning_rate": 9.934999277769753e-05, + "loss": 3.3269, + "step": 3585 + }, + { + "epoch": 0.51, + "learning_rate": 9.932110356781741e-05, + "loss": 3.2855, + "step": 3586 + }, + { + "epoch": 0.51, + "learning_rate": 9.929221435793732e-05, + "loss": 3.2763, + "step": 3587 + }, + { + "epoch": 0.51, + "learning_rate": 9.92633251480572e-05, + "loss": 3.3442, + "step": 3588 + }, + { + "epoch": 0.51, + "learning_rate": 9.92344359381771e-05, + "loss": 3.3143, + "step": 3589 + }, + { + "epoch": 0.51, + "learning_rate": 9.920554672829698e-05, + "loss": 3.1068, + "step": 3590 + }, + { + "epoch": 0.51, + "learning_rate": 9.917665751841687e-05, + "loss": 3.3014, + "step": 3591 + }, + { + "epoch": 0.51, + "learning_rate": 9.914776830853676e-05, + "loss": 3.2842, + "step": 3592 + }, + { + "epoch": 0.51, + "learning_rate": 9.911887909865666e-05, + "loss": 3.3375, + "step": 3593 + }, + { + "epoch": 0.51, + "learning_rate": 9.908998988877655e-05, + "loss": 3.2577, + "step": 3594 + }, + { + "epoch": 0.51, + "learning_rate": 9.906110067889643e-05, + "loss": 3.3795, + "step": 3595 + }, + { + "epoch": 0.51, + "learning_rate": 9.903221146901633e-05, + "loss": 3.3381, + "step": 3596 + }, + { + "epoch": 0.51, + "learning_rate": 9.900332225913622e-05, + "loss": 3.305, + "step": 3597 + }, + { + "epoch": 0.51, + "learning_rate": 9.897443304925611e-05, + "loss": 3.3266, + "step": 3598 + }, + { + "epoch": 0.51, + "learning_rate": 9.8945543839376e-05, + "loss": 3.3164, + "step": 3599 + }, + { + "epoch": 0.51, + "learning_rate": 9.891665462949589e-05, + "loss": 3.4941, + "step": 3600 + }, + { + "epoch": 0.51, + "learning_rate": 9.888776541961577e-05, + "loss": 3.3784, + "step": 3601 + }, + { + "epoch": 0.51, + "learning_rate": 9.885887620973568e-05, + "loss": 3.3169, + "step": 3602 + }, + { + "epoch": 0.51, + "learning_rate": 9.882998699985556e-05, + "loss": 3.3696, + "step": 3603 + }, + { + "epoch": 0.51, + "learning_rate": 9.880109778997545e-05, + "loss": 3.2779, + "step": 3604 + }, + { + "epoch": 0.51, + "learning_rate": 9.877220858009534e-05, + "loss": 3.4196, + "step": 3605 + }, + { + "epoch": 0.51, + "learning_rate": 9.874331937021522e-05, + "loss": 3.364, + "step": 3606 + }, + { + "epoch": 0.51, + "learning_rate": 9.871443016033512e-05, + "loss": 3.365, + "step": 3607 + }, + { + "epoch": 0.51, + "learning_rate": 9.868554095045501e-05, + "loss": 3.2851, + "step": 3608 + }, + { + "epoch": 0.51, + "learning_rate": 9.86566517405749e-05, + "loss": 3.4338, + "step": 3609 + }, + { + "epoch": 0.51, + "learning_rate": 9.862776253069479e-05, + "loss": 3.3938, + "step": 3610 + }, + { + "epoch": 0.51, + "learning_rate": 9.859887332081468e-05, + "loss": 3.1677, + "step": 3611 + }, + { + "epoch": 0.51, + "learning_rate": 9.856998411093456e-05, + "loss": 3.2969, + "step": 3612 + }, + { + "epoch": 0.51, + "learning_rate": 9.854109490105447e-05, + "loss": 3.1708, + "step": 3613 + }, + { + "epoch": 0.51, + "learning_rate": 9.851220569117435e-05, + "loss": 3.3359, + "step": 3614 + }, + { + "epoch": 0.51, + "learning_rate": 9.848331648129424e-05, + "loss": 3.3044, + "step": 3615 + }, + { + "epoch": 0.51, + "learning_rate": 9.845442727141413e-05, + "loss": 3.3036, + "step": 3616 + }, + { + "epoch": 0.51, + "learning_rate": 9.842553806153401e-05, + "loss": 3.1884, + "step": 3617 + }, + { + "epoch": 0.52, + "learning_rate": 9.839664885165392e-05, + "loss": 3.3224, + "step": 3618 + }, + { + "epoch": 0.52, + "learning_rate": 9.83677596417738e-05, + "loss": 3.3364, + "step": 3619 + }, + { + "epoch": 0.52, + "learning_rate": 9.83388704318937e-05, + "loss": 3.1456, + "step": 3620 + }, + { + "epoch": 0.52, + "learning_rate": 9.830998122201358e-05, + "loss": 3.3708, + "step": 3621 + }, + { + "epoch": 0.52, + "learning_rate": 9.828109201213347e-05, + "loss": 3.351, + "step": 3622 + }, + { + "epoch": 0.52, + "learning_rate": 9.825220280225336e-05, + "loss": 3.4185, + "step": 3623 + }, + { + "epoch": 0.52, + "learning_rate": 9.822331359237326e-05, + "loss": 3.2952, + "step": 3624 + }, + { + "epoch": 0.52, + "learning_rate": 9.819442438249314e-05, + "loss": 3.3222, + "step": 3625 + }, + { + "epoch": 0.52, + "learning_rate": 9.816553517261303e-05, + "loss": 3.2495, + "step": 3626 + }, + { + "epoch": 0.52, + "learning_rate": 9.813664596273293e-05, + "loss": 3.3842, + "step": 3627 + }, + { + "epoch": 0.52, + "learning_rate": 9.810775675285282e-05, + "loss": 3.3091, + "step": 3628 + }, + { + "epoch": 0.52, + "learning_rate": 9.807886754297271e-05, + "loss": 3.1676, + "step": 3629 + }, + { + "epoch": 0.52, + "learning_rate": 9.80499783330926e-05, + "loss": 3.2684, + "step": 3630 + }, + { + "epoch": 0.52, + "learning_rate": 9.802108912321249e-05, + "loss": 3.2345, + "step": 3631 + }, + { + "epoch": 0.52, + "learning_rate": 9.799219991333237e-05, + "loss": 3.2182, + "step": 3632 + }, + { + "epoch": 0.52, + "learning_rate": 9.796331070345226e-05, + "loss": 3.3009, + "step": 3633 + }, + { + "epoch": 0.52, + "learning_rate": 9.793442149357216e-05, + "loss": 3.1447, + "step": 3634 + }, + { + "epoch": 0.52, + "learning_rate": 9.790553228369205e-05, + "loss": 3.1446, + "step": 3635 + }, + { + "epoch": 0.52, + "learning_rate": 9.787664307381193e-05, + "loss": 3.2587, + "step": 3636 + }, + { + "epoch": 0.52, + "learning_rate": 9.784775386393182e-05, + "loss": 3.3582, + "step": 3637 + }, + { + "epoch": 0.52, + "learning_rate": 9.781886465405172e-05, + "loss": 3.3572, + "step": 3638 + }, + { + "epoch": 0.52, + "learning_rate": 9.778997544417161e-05, + "loss": 3.3, + "step": 3639 + }, + { + "epoch": 0.52, + "learning_rate": 9.77610862342915e-05, + "loss": 3.276, + "step": 3640 + }, + { + "epoch": 0.52, + "learning_rate": 9.773219702441139e-05, + "loss": 3.1791, + "step": 3641 + }, + { + "epoch": 0.52, + "learning_rate": 9.770330781453128e-05, + "loss": 3.3582, + "step": 3642 + }, + { + "epoch": 0.52, + "learning_rate": 9.767441860465116e-05, + "loss": 3.4701, + "step": 3643 + }, + { + "epoch": 0.52, + "learning_rate": 9.764552939477107e-05, + "loss": 3.4079, + "step": 3644 + }, + { + "epoch": 0.52, + "learning_rate": 9.761664018489095e-05, + "loss": 3.1479, + "step": 3645 + }, + { + "epoch": 0.52, + "learning_rate": 9.758775097501084e-05, + "loss": 3.3389, + "step": 3646 + }, + { + "epoch": 0.52, + "learning_rate": 9.755886176513072e-05, + "loss": 3.2733, + "step": 3647 + }, + { + "epoch": 0.52, + "learning_rate": 9.752997255525062e-05, + "loss": 3.3105, + "step": 3648 + }, + { + "epoch": 0.52, + "learning_rate": 9.750108334537051e-05, + "loss": 3.3759, + "step": 3649 + }, + { + "epoch": 0.52, + "learning_rate": 9.74721941354904e-05, + "loss": 3.1932, + "step": 3650 + }, + { + "epoch": 0.52, + "learning_rate": 9.74433049256103e-05, + "loss": 3.2846, + "step": 3651 + }, + { + "epoch": 0.52, + "learning_rate": 9.741441571573018e-05, + "loss": 3.3393, + "step": 3652 + }, + { + "epoch": 0.52, + "learning_rate": 9.738552650585007e-05, + "loss": 3.062, + "step": 3653 + }, + { + "epoch": 0.52, + "learning_rate": 9.735663729596995e-05, + "loss": 3.3248, + "step": 3654 + }, + { + "epoch": 0.52, + "learning_rate": 9.732774808608986e-05, + "loss": 3.3914, + "step": 3655 + }, + { + "epoch": 0.52, + "learning_rate": 9.729885887620974e-05, + "loss": 3.3856, + "step": 3656 + }, + { + "epoch": 0.52, + "learning_rate": 9.726996966632963e-05, + "loss": 3.2255, + "step": 3657 + }, + { + "epoch": 0.52, + "learning_rate": 9.724108045644951e-05, + "loss": 3.2713, + "step": 3658 + }, + { + "epoch": 0.52, + "learning_rate": 9.72121912465694e-05, + "loss": 3.2563, + "step": 3659 + }, + { + "epoch": 0.52, + "learning_rate": 9.71833020366893e-05, + "loss": 3.351, + "step": 3660 + }, + { + "epoch": 0.52, + "learning_rate": 9.71544128268092e-05, + "loss": 3.2914, + "step": 3661 + }, + { + "epoch": 0.52, + "learning_rate": 9.712552361692909e-05, + "loss": 3.2493, + "step": 3662 + }, + { + "epoch": 0.52, + "learning_rate": 9.709663440704897e-05, + "loss": 3.2843, + "step": 3663 + }, + { + "epoch": 0.52, + "learning_rate": 9.706774519716886e-05, + "loss": 3.3055, + "step": 3664 + }, + { + "epoch": 0.52, + "learning_rate": 9.703885598728876e-05, + "loss": 3.3493, + "step": 3665 + }, + { + "epoch": 0.52, + "learning_rate": 9.700996677740865e-05, + "loss": 3.4551, + "step": 3666 + }, + { + "epoch": 0.52, + "learning_rate": 9.698107756752853e-05, + "loss": 3.2609, + "step": 3667 + }, + { + "epoch": 0.52, + "learning_rate": 9.695218835764842e-05, + "loss": 3.2567, + "step": 3668 + }, + { + "epoch": 0.52, + "learning_rate": 9.69232991477683e-05, + "loss": 3.3386, + "step": 3669 + }, + { + "epoch": 0.52, + "learning_rate": 9.689440993788821e-05, + "loss": 3.343, + "step": 3670 + }, + { + "epoch": 0.52, + "learning_rate": 9.686552072800809e-05, + "loss": 3.2134, + "step": 3671 + }, + { + "epoch": 0.52, + "learning_rate": 9.683663151812799e-05, + "loss": 3.4653, + "step": 3672 + }, + { + "epoch": 0.52, + "learning_rate": 9.680774230824788e-05, + "loss": 3.305, + "step": 3673 + }, + { + "epoch": 0.52, + "learning_rate": 9.677885309836776e-05, + "loss": 3.2433, + "step": 3674 + }, + { + "epoch": 0.52, + "learning_rate": 9.674996388848765e-05, + "loss": 3.3959, + "step": 3675 + }, + { + "epoch": 0.52, + "learning_rate": 9.672107467860755e-05, + "loss": 3.3096, + "step": 3676 + }, + { + "epoch": 0.52, + "learning_rate": 9.669218546872744e-05, + "loss": 3.2794, + "step": 3677 + }, + { + "epoch": 0.52, + "learning_rate": 9.666329625884732e-05, + "loss": 3.2811, + "step": 3678 + }, + { + "epoch": 0.52, + "learning_rate": 9.663440704896722e-05, + "loss": 3.2929, + "step": 3679 + }, + { + "epoch": 0.52, + "learning_rate": 9.66055178390871e-05, + "loss": 3.1585, + "step": 3680 + }, + { + "epoch": 0.52, + "learning_rate": 9.6576628629207e-05, + "loss": 3.2812, + "step": 3681 + }, + { + "epoch": 0.52, + "learning_rate": 9.654773941932688e-05, + "loss": 3.192, + "step": 3682 + }, + { + "epoch": 0.52, + "learning_rate": 9.651885020944678e-05, + "loss": 3.3528, + "step": 3683 + }, + { + "epoch": 0.52, + "learning_rate": 9.648996099956667e-05, + "loss": 3.041, + "step": 3684 + }, + { + "epoch": 0.52, + "learning_rate": 9.646107178968655e-05, + "loss": 3.3006, + "step": 3685 + }, + { + "epoch": 0.52, + "learning_rate": 9.643218257980645e-05, + "loss": 3.4209, + "step": 3686 + }, + { + "epoch": 0.52, + "learning_rate": 9.640329336992634e-05, + "loss": 3.3817, + "step": 3687 + }, + { + "epoch": 0.53, + "learning_rate": 9.637440416004623e-05, + "loss": 3.3264, + "step": 3688 + }, + { + "epoch": 0.53, + "learning_rate": 9.634551495016611e-05, + "loss": 3.2459, + "step": 3689 + }, + { + "epoch": 0.53, + "learning_rate": 9.6316625740286e-05, + "loss": 3.3551, + "step": 3690 + }, + { + "epoch": 0.53, + "learning_rate": 9.62877365304059e-05, + "loss": 3.2263, + "step": 3691 + }, + { + "epoch": 0.53, + "learning_rate": 9.62588473205258e-05, + "loss": 3.1857, + "step": 3692 + }, + { + "epoch": 0.53, + "learning_rate": 9.622995811064567e-05, + "loss": 3.3901, + "step": 3693 + }, + { + "epoch": 0.53, + "learning_rate": 9.620106890076557e-05, + "loss": 3.348, + "step": 3694 + }, + { + "epoch": 0.53, + "learning_rate": 9.617217969088545e-05, + "loss": 3.2752, + "step": 3695 + }, + { + "epoch": 0.53, + "learning_rate": 9.614329048100534e-05, + "loss": 3.3257, + "step": 3696 + }, + { + "epoch": 0.53, + "learning_rate": 9.611440127112524e-05, + "loss": 3.2138, + "step": 3697 + }, + { + "epoch": 0.53, + "learning_rate": 9.608551206124513e-05, + "loss": 3.2224, + "step": 3698 + }, + { + "epoch": 0.53, + "learning_rate": 9.605662285136502e-05, + "loss": 3.3144, + "step": 3699 + }, + { + "epoch": 0.53, + "learning_rate": 9.60277336414849e-05, + "loss": 2.9823, + "step": 3700 + }, + { + "epoch": 0.53, + "learning_rate": 9.59988444316048e-05, + "loss": 3.3617, + "step": 3701 + }, + { + "epoch": 0.53, + "learning_rate": 9.596995522172469e-05, + "loss": 3.1095, + "step": 3702 + }, + { + "epoch": 0.53, + "learning_rate": 9.594106601184459e-05, + "loss": 3.3501, + "step": 3703 + }, + { + "epoch": 0.53, + "learning_rate": 9.591217680196447e-05, + "loss": 3.3588, + "step": 3704 + }, + { + "epoch": 0.53, + "learning_rate": 9.588328759208436e-05, + "loss": 3.2918, + "step": 3705 + }, + { + "epoch": 0.53, + "learning_rate": 9.585439838220424e-05, + "loss": 3.2578, + "step": 3706 + }, + { + "epoch": 0.53, + "learning_rate": 9.582550917232415e-05, + "loss": 3.2849, + "step": 3707 + }, + { + "epoch": 0.53, + "learning_rate": 9.579661996244403e-05, + "loss": 3.3586, + "step": 3708 + }, + { + "epoch": 0.53, + "learning_rate": 9.576773075256392e-05, + "loss": 3.0917, + "step": 3709 + }, + { + "epoch": 0.53, + "learning_rate": 9.573884154268382e-05, + "loss": 3.2508, + "step": 3710 + }, + { + "epoch": 0.53, + "learning_rate": 9.57099523328037e-05, + "loss": 3.3001, + "step": 3711 + }, + { + "epoch": 0.53, + "learning_rate": 9.56810631229236e-05, + "loss": 3.2744, + "step": 3712 + }, + { + "epoch": 0.53, + "learning_rate": 9.565217391304348e-05, + "loss": 3.283, + "step": 3713 + }, + { + "epoch": 0.53, + "learning_rate": 9.562328470316338e-05, + "loss": 3.2927, + "step": 3714 + }, + { + "epoch": 0.53, + "learning_rate": 9.559439549328326e-05, + "loss": 3.3497, + "step": 3715 + }, + { + "epoch": 0.53, + "learning_rate": 9.556550628340315e-05, + "loss": 3.1693, + "step": 3716 + }, + { + "epoch": 0.53, + "learning_rate": 9.553661707352305e-05, + "loss": 3.2967, + "step": 3717 + }, + { + "epoch": 0.53, + "learning_rate": 9.550772786364294e-05, + "loss": 3.2895, + "step": 3718 + }, + { + "epoch": 0.53, + "learning_rate": 9.547883865376282e-05, + "loss": 3.2079, + "step": 3719 + }, + { + "epoch": 0.53, + "learning_rate": 9.544994944388271e-05, + "loss": 3.3238, + "step": 3720 + }, + { + "epoch": 0.53, + "learning_rate": 9.542106023400261e-05, + "loss": 3.2215, + "step": 3721 + }, + { + "epoch": 0.53, + "learning_rate": 9.539217102412249e-05, + "loss": 3.2758, + "step": 3722 + }, + { + "epoch": 0.53, + "learning_rate": 9.53632818142424e-05, + "loss": 3.4565, + "step": 3723 + }, + { + "epoch": 0.53, + "learning_rate": 9.533439260436227e-05, + "loss": 3.1476, + "step": 3724 + }, + { + "epoch": 0.53, + "learning_rate": 9.530550339448217e-05, + "loss": 3.1799, + "step": 3725 + }, + { + "epoch": 0.53, + "learning_rate": 9.527661418460205e-05, + "loss": 3.2619, + "step": 3726 + }, + { + "epoch": 0.53, + "learning_rate": 9.524772497472194e-05, + "loss": 3.3384, + "step": 3727 + }, + { + "epoch": 0.53, + "learning_rate": 9.521883576484184e-05, + "loss": 3.3392, + "step": 3728 + }, + { + "epoch": 0.53, + "learning_rate": 9.518994655496173e-05, + "loss": 3.2635, + "step": 3729 + }, + { + "epoch": 0.53, + "learning_rate": 9.516105734508161e-05, + "loss": 3.2973, + "step": 3730 + }, + { + "epoch": 0.53, + "learning_rate": 9.51321681352015e-05, + "loss": 3.1182, + "step": 3731 + }, + { + "epoch": 0.53, + "learning_rate": 9.51032789253214e-05, + "loss": 3.366, + "step": 3732 + }, + { + "epoch": 0.53, + "learning_rate": 9.507438971544129e-05, + "loss": 3.2735, + "step": 3733 + }, + { + "epoch": 0.53, + "learning_rate": 9.504550050556119e-05, + "loss": 3.1093, + "step": 3734 + }, + { + "epoch": 0.53, + "learning_rate": 9.501661129568107e-05, + "loss": 3.2478, + "step": 3735 + }, + { + "epoch": 0.53, + "learning_rate": 9.498772208580096e-05, + "loss": 3.2874, + "step": 3736 + }, + { + "epoch": 0.53, + "learning_rate": 9.495883287592084e-05, + "loss": 3.1797, + "step": 3737 + }, + { + "epoch": 0.53, + "learning_rate": 9.492994366604075e-05, + "loss": 3.3518, + "step": 3738 + }, + { + "epoch": 0.53, + "learning_rate": 9.490105445616063e-05, + "loss": 3.3398, + "step": 3739 + }, + { + "epoch": 0.53, + "learning_rate": 9.487216524628052e-05, + "loss": 3.093, + "step": 3740 + }, + { + "epoch": 0.53, + "learning_rate": 9.48432760364004e-05, + "loss": 3.443, + "step": 3741 + }, + { + "epoch": 0.53, + "learning_rate": 9.48143868265203e-05, + "loss": 3.2541, + "step": 3742 + }, + { + "epoch": 0.53, + "learning_rate": 9.478549761664019e-05, + "loss": 3.2678, + "step": 3743 + }, + { + "epoch": 0.53, + "learning_rate": 9.475660840676008e-05, + "loss": 3.3353, + "step": 3744 + }, + { + "epoch": 0.53, + "learning_rate": 9.472771919687998e-05, + "loss": 3.2779, + "step": 3745 + }, + { + "epoch": 0.53, + "learning_rate": 9.469882998699986e-05, + "loss": 3.386, + "step": 3746 + }, + { + "epoch": 0.53, + "learning_rate": 9.466994077711975e-05, + "loss": 3.1789, + "step": 3747 + }, + { + "epoch": 0.53, + "learning_rate": 9.464105156723963e-05, + "loss": 3.2896, + "step": 3748 + }, + { + "epoch": 0.53, + "learning_rate": 9.461216235735954e-05, + "loss": 3.3565, + "step": 3749 + }, + { + "epoch": 0.53, + "learning_rate": 9.458327314747942e-05, + "loss": 3.1587, + "step": 3750 + }, + { + "epoch": 0.53, + "learning_rate": 9.455438393759931e-05, + "loss": 3.3827, + "step": 3751 + }, + { + "epoch": 0.53, + "learning_rate": 9.45254947277192e-05, + "loss": 3.3155, + "step": 3752 + }, + { + "epoch": 0.53, + "learning_rate": 9.449660551783909e-05, + "loss": 3.2712, + "step": 3753 + }, + { + "epoch": 0.53, + "learning_rate": 9.446771630795898e-05, + "loss": 3.2463, + "step": 3754 + }, + { + "epoch": 0.53, + "learning_rate": 9.443882709807888e-05, + "loss": 3.1915, + "step": 3755 + }, + { + "epoch": 0.53, + "learning_rate": 9.440993788819877e-05, + "loss": 3.1901, + "step": 3756 + }, + { + "epoch": 0.53, + "learning_rate": 9.438104867831865e-05, + "loss": 3.2365, + "step": 3757 + }, + { + "epoch": 0.54, + "learning_rate": 9.435215946843854e-05, + "loss": 3.3428, + "step": 3758 + }, + { + "epoch": 0.54, + "learning_rate": 9.432327025855844e-05, + "loss": 3.2661, + "step": 3759 + }, + { + "epoch": 0.54, + "learning_rate": 9.429438104867833e-05, + "loss": 3.3797, + "step": 3760 + }, + { + "epoch": 0.54, + "learning_rate": 9.426549183879821e-05, + "loss": 3.4258, + "step": 3761 + }, + { + "epoch": 0.54, + "learning_rate": 9.42366026289181e-05, + "loss": 3.2706, + "step": 3762 + }, + { + "epoch": 0.54, + "learning_rate": 9.420771341903798e-05, + "loss": 3.2997, + "step": 3763 + }, + { + "epoch": 0.54, + "learning_rate": 9.417882420915788e-05, + "loss": 3.3774, + "step": 3764 + }, + { + "epoch": 0.54, + "learning_rate": 9.414993499927777e-05, + "loss": 3.2793, + "step": 3765 + }, + { + "epoch": 0.54, + "learning_rate": 9.412104578939767e-05, + "loss": 3.2433, + "step": 3766 + }, + { + "epoch": 0.54, + "learning_rate": 9.409215657951756e-05, + "loss": 3.328, + "step": 3767 + }, + { + "epoch": 0.54, + "learning_rate": 9.406326736963744e-05, + "loss": 3.2594, + "step": 3768 + }, + { + "epoch": 0.54, + "learning_rate": 9.403437815975733e-05, + "loss": 3.3453, + "step": 3769 + }, + { + "epoch": 0.54, + "learning_rate": 9.400548894987723e-05, + "loss": 3.2623, + "step": 3770 + }, + { + "epoch": 0.54, + "learning_rate": 9.397659973999712e-05, + "loss": 3.3762, + "step": 3771 + }, + { + "epoch": 0.54, + "learning_rate": 9.3947710530117e-05, + "loss": 3.2156, + "step": 3772 + }, + { + "epoch": 0.54, + "learning_rate": 9.39188213202369e-05, + "loss": 3.2412, + "step": 3773 + }, + { + "epoch": 0.54, + "learning_rate": 9.388993211035678e-05, + "loss": 3.3074, + "step": 3774 + }, + { + "epoch": 0.54, + "learning_rate": 9.386104290047668e-05, + "loss": 3.2955, + "step": 3775 + }, + { + "epoch": 0.54, + "learning_rate": 9.383215369059656e-05, + "loss": 3.3387, + "step": 3776 + }, + { + "epoch": 0.54, + "learning_rate": 9.380326448071646e-05, + "loss": 3.2532, + "step": 3777 + }, + { + "epoch": 0.54, + "learning_rate": 9.377437527083635e-05, + "loss": 3.4034, + "step": 3778 + }, + { + "epoch": 0.54, + "learning_rate": 9.374548606095623e-05, + "loss": 3.2695, + "step": 3779 + }, + { + "epoch": 0.54, + "learning_rate": 9.371659685107614e-05, + "loss": 3.3823, + "step": 3780 + }, + { + "epoch": 0.54, + "learning_rate": 9.368770764119602e-05, + "loss": 3.3463, + "step": 3781 + }, + { + "epoch": 0.54, + "learning_rate": 9.365881843131591e-05, + "loss": 3.2318, + "step": 3782 + }, + { + "epoch": 0.54, + "learning_rate": 9.36299292214358e-05, + "loss": 3.1585, + "step": 3783 + }, + { + "epoch": 0.54, + "learning_rate": 9.360104001155569e-05, + "loss": 3.2317, + "step": 3784 + }, + { + "epoch": 0.54, + "learning_rate": 9.357215080167557e-05, + "loss": 3.2546, + "step": 3785 + }, + { + "epoch": 0.54, + "learning_rate": 9.354326159179548e-05, + "loss": 3.3546, + "step": 3786 + }, + { + "epoch": 0.54, + "learning_rate": 9.351437238191536e-05, + "loss": 3.291, + "step": 3787 + }, + { + "epoch": 0.54, + "learning_rate": 9.348548317203525e-05, + "loss": 3.2058, + "step": 3788 + }, + { + "epoch": 0.54, + "learning_rate": 9.345659396215514e-05, + "loss": 3.3634, + "step": 3789 + }, + { + "epoch": 0.54, + "learning_rate": 9.342770475227502e-05, + "loss": 3.2855, + "step": 3790 + }, + { + "epoch": 0.54, + "learning_rate": 9.339881554239493e-05, + "loss": 3.3437, + "step": 3791 + }, + { + "epoch": 0.54, + "learning_rate": 9.336992633251481e-05, + "loss": 3.3228, + "step": 3792 + }, + { + "epoch": 0.54, + "learning_rate": 9.33410371226347e-05, + "loss": 3.4097, + "step": 3793 + }, + { + "epoch": 0.54, + "learning_rate": 9.331214791275459e-05, + "loss": 3.2167, + "step": 3794 + }, + { + "epoch": 0.54, + "learning_rate": 9.328325870287448e-05, + "loss": 3.2635, + "step": 3795 + }, + { + "epoch": 0.54, + "learning_rate": 9.325436949299437e-05, + "loss": 3.1602, + "step": 3796 + }, + { + "epoch": 0.54, + "learning_rate": 9.322548028311427e-05, + "loss": 3.2665, + "step": 3797 + }, + { + "epoch": 0.54, + "learning_rate": 9.319659107323415e-05, + "loss": 3.2843, + "step": 3798 + }, + { + "epoch": 0.54, + "learning_rate": 9.316770186335404e-05, + "loss": 3.2841, + "step": 3799 + }, + { + "epoch": 0.54, + "learning_rate": 9.313881265347393e-05, + "loss": 3.0637, + "step": 3800 + }, + { + "epoch": 0.54, + "learning_rate": 9.310992344359383e-05, + "loss": 3.301, + "step": 3801 + }, + { + "epoch": 0.54, + "learning_rate": 9.308103423371371e-05, + "loss": 3.3319, + "step": 3802 + }, + { + "epoch": 0.54, + "learning_rate": 9.30521450238336e-05, + "loss": 3.1172, + "step": 3803 + }, + { + "epoch": 0.54, + "learning_rate": 9.30232558139535e-05, + "loss": 3.3949, + "step": 3804 + }, + { + "epoch": 0.54, + "learning_rate": 9.299436660407338e-05, + "loss": 3.3156, + "step": 3805 + }, + { + "epoch": 0.54, + "learning_rate": 9.296547739419327e-05, + "loss": 3.2566, + "step": 3806 + }, + { + "epoch": 0.54, + "learning_rate": 9.293658818431316e-05, + "loss": 3.171, + "step": 3807 + }, + { + "epoch": 0.54, + "learning_rate": 9.290769897443306e-05, + "loss": 3.1897, + "step": 3808 + }, + { + "epoch": 0.54, + "learning_rate": 9.287880976455294e-05, + "loss": 3.3065, + "step": 3809 + }, + { + "epoch": 0.54, + "learning_rate": 9.284992055467283e-05, + "loss": 3.3067, + "step": 3810 + }, + { + "epoch": 0.54, + "learning_rate": 9.282103134479273e-05, + "loss": 3.2692, + "step": 3811 + }, + { + "epoch": 0.54, + "learning_rate": 9.279214213491262e-05, + "loss": 3.1924, + "step": 3812 + }, + { + "epoch": 0.54, + "learning_rate": 9.27632529250325e-05, + "loss": 3.241, + "step": 3813 + }, + { + "epoch": 0.54, + "learning_rate": 9.27343637151524e-05, + "loss": 3.3199, + "step": 3814 + }, + { + "epoch": 0.54, + "learning_rate": 9.270547450527229e-05, + "loss": 3.2505, + "step": 3815 + }, + { + "epoch": 0.54, + "learning_rate": 9.267658529539217e-05, + "loss": 3.3046, + "step": 3816 + }, + { + "epoch": 0.54, + "learning_rate": 9.264769608551208e-05, + "loss": 3.2151, + "step": 3817 + }, + { + "epoch": 0.54, + "learning_rate": 9.261880687563196e-05, + "loss": 3.2005, + "step": 3818 + }, + { + "epoch": 0.54, + "learning_rate": 9.258991766575185e-05, + "loss": 3.3655, + "step": 3819 + }, + { + "epoch": 0.54, + "learning_rate": 9.256102845587173e-05, + "loss": 3.1541, + "step": 3820 + }, + { + "epoch": 0.54, + "learning_rate": 9.253213924599162e-05, + "loss": 3.2828, + "step": 3821 + }, + { + "epoch": 0.54, + "learning_rate": 9.250325003611152e-05, + "loss": 3.3187, + "step": 3822 + }, + { + "epoch": 0.54, + "learning_rate": 9.247436082623141e-05, + "loss": 3.2536, + "step": 3823 + }, + { + "epoch": 0.54, + "learning_rate": 9.244547161635129e-05, + "loss": 3.3031, + "step": 3824 + }, + { + "epoch": 0.54, + "learning_rate": 9.241658240647119e-05, + "loss": 3.1891, + "step": 3825 + }, + { + "epoch": 0.54, + "learning_rate": 9.238769319659108e-05, + "loss": 3.0456, + "step": 3826 + }, + { + "epoch": 0.54, + "learning_rate": 9.235880398671097e-05, + "loss": 3.3972, + "step": 3827 + }, + { + "epoch": 0.55, + "learning_rate": 9.232991477683087e-05, + "loss": 3.3103, + "step": 3828 + }, + { + "epoch": 0.55, + "learning_rate": 9.230102556695075e-05, + "loss": 3.0148, + "step": 3829 + }, + { + "epoch": 0.55, + "learning_rate": 9.227213635707064e-05, + "loss": 3.2918, + "step": 3830 + }, + { + "epoch": 0.55, + "learning_rate": 9.224324714719052e-05, + "loss": 3.3179, + "step": 3831 + }, + { + "epoch": 0.55, + "learning_rate": 9.221435793731041e-05, + "loss": 3.3337, + "step": 3832 + }, + { + "epoch": 0.55, + "learning_rate": 9.218546872743031e-05, + "loss": 3.3814, + "step": 3833 + }, + { + "epoch": 0.55, + "learning_rate": 9.21565795175502e-05, + "loss": 3.3344, + "step": 3834 + }, + { + "epoch": 0.55, + "learning_rate": 9.212769030767008e-05, + "loss": 3.1587, + "step": 3835 + }, + { + "epoch": 0.55, + "learning_rate": 9.209880109778998e-05, + "loss": 3.2824, + "step": 3836 + }, + { + "epoch": 0.55, + "learning_rate": 9.206991188790987e-05, + "loss": 3.2514, + "step": 3837 + }, + { + "epoch": 0.55, + "learning_rate": 9.204102267802976e-05, + "loss": 3.245, + "step": 3838 + }, + { + "epoch": 0.55, + "learning_rate": 9.201213346814966e-05, + "loss": 3.1114, + "step": 3839 + }, + { + "epoch": 0.55, + "learning_rate": 9.198324425826954e-05, + "loss": 3.1667, + "step": 3840 + }, + { + "epoch": 0.55, + "learning_rate": 9.195435504838943e-05, + "loss": 3.2927, + "step": 3841 + }, + { + "epoch": 0.55, + "learning_rate": 9.192546583850931e-05, + "loss": 3.1644, + "step": 3842 + }, + { + "epoch": 0.55, + "learning_rate": 9.189657662862922e-05, + "loss": 3.3846, + "step": 3843 + }, + { + "epoch": 0.55, + "learning_rate": 9.18676874187491e-05, + "loss": 3.3639, + "step": 3844 + }, + { + "epoch": 0.55, + "learning_rate": 9.1838798208869e-05, + "loss": 3.2076, + "step": 3845 + }, + { + "epoch": 0.55, + "learning_rate": 9.180990899898887e-05, + "loss": 3.2439, + "step": 3846 + }, + { + "epoch": 0.55, + "learning_rate": 9.178101978910877e-05, + "loss": 3.2574, + "step": 3847 + }, + { + "epoch": 0.55, + "learning_rate": 9.175213057922866e-05, + "loss": 3.3944, + "step": 3848 + }, + { + "epoch": 0.55, + "learning_rate": 9.172324136934856e-05, + "loss": 3.3432, + "step": 3849 + }, + { + "epoch": 0.55, + "learning_rate": 9.169435215946845e-05, + "loss": 3.451, + "step": 3850 + }, + { + "epoch": 0.55, + "learning_rate": 9.166546294958833e-05, + "loss": 3.4377, + "step": 3851 + }, + { + "epoch": 0.55, + "learning_rate": 9.163657373970822e-05, + "loss": 3.2292, + "step": 3852 + }, + { + "epoch": 0.55, + "learning_rate": 9.16076845298281e-05, + "loss": 3.2505, + "step": 3853 + }, + { + "epoch": 0.55, + "learning_rate": 9.157879531994801e-05, + "loss": 3.1625, + "step": 3854 + }, + { + "epoch": 0.55, + "learning_rate": 9.154990611006789e-05, + "loss": 3.3417, + "step": 3855 + }, + { + "epoch": 0.55, + "learning_rate": 9.152101690018779e-05, + "loss": 3.3231, + "step": 3856 + }, + { + "epoch": 0.55, + "learning_rate": 9.149212769030767e-05, + "loss": 3.4253, + "step": 3857 + }, + { + "epoch": 0.55, + "learning_rate": 9.146323848042756e-05, + "loss": 3.2481, + "step": 3858 + }, + { + "epoch": 0.55, + "learning_rate": 9.143434927054745e-05, + "loss": 3.2928, + "step": 3859 + }, + { + "epoch": 0.55, + "learning_rate": 9.140546006066735e-05, + "loss": 3.3022, + "step": 3860 + }, + { + "epoch": 0.55, + "learning_rate": 9.137657085078724e-05, + "loss": 3.2664, + "step": 3861 + }, + { + "epoch": 0.55, + "learning_rate": 9.134768164090712e-05, + "loss": 3.3661, + "step": 3862 + }, + { + "epoch": 0.55, + "learning_rate": 9.131879243102702e-05, + "loss": 3.2142, + "step": 3863 + }, + { + "epoch": 0.55, + "learning_rate": 9.128990322114691e-05, + "loss": 3.0478, + "step": 3864 + }, + { + "epoch": 0.55, + "learning_rate": 9.12610140112668e-05, + "loss": 3.285, + "step": 3865 + }, + { + "epoch": 0.55, + "learning_rate": 9.123212480138668e-05, + "loss": 3.1, + "step": 3866 + }, + { + "epoch": 0.55, + "learning_rate": 9.120323559150658e-05, + "loss": 3.2835, + "step": 3867 + }, + { + "epoch": 0.55, + "learning_rate": 9.117434638162646e-05, + "loss": 3.3589, + "step": 3868 + }, + { + "epoch": 0.55, + "learning_rate": 9.114545717174636e-05, + "loss": 3.2728, + "step": 3869 + }, + { + "epoch": 0.55, + "learning_rate": 9.111656796186624e-05, + "loss": 3.2323, + "step": 3870 + }, + { + "epoch": 0.55, + "learning_rate": 9.108767875198614e-05, + "loss": 3.2699, + "step": 3871 + }, + { + "epoch": 0.55, + "learning_rate": 9.105878954210603e-05, + "loss": 3.1105, + "step": 3872 + }, + { + "epoch": 0.55, + "eval_loss": 3.4803874492645264, + "eval_runtime": 471.7317, + "eval_samples_per_second": 43.429, + "eval_steps_per_second": 14.476, + "step": 3872 + }, + { + "epoch": 0.55, + "learning_rate": 9.102990033222591e-05, + "loss": 3.3334, + "step": 3873 + }, + { + "epoch": 0.55, + "learning_rate": 9.10010111223458e-05, + "loss": 3.2822, + "step": 3874 + }, + { + "epoch": 0.55, + "learning_rate": 9.09721219124657e-05, + "loss": 3.1631, + "step": 3875 + }, + { + "epoch": 0.55, + "learning_rate": 9.09432327025856e-05, + "loss": 3.2771, + "step": 3876 + }, + { + "epoch": 0.55, + "learning_rate": 9.091434349270547e-05, + "loss": 3.2999, + "step": 3877 + }, + { + "epoch": 0.55, + "learning_rate": 9.088545428282537e-05, + "loss": 3.19, + "step": 3878 + }, + { + "epoch": 0.55, + "learning_rate": 9.085656507294525e-05, + "loss": 3.405, + "step": 3879 + }, + { + "epoch": 0.55, + "learning_rate": 9.082767586306516e-05, + "loss": 3.3027, + "step": 3880 + }, + { + "epoch": 0.55, + "learning_rate": 9.079878665318504e-05, + "loss": 3.1167, + "step": 3881 + }, + { + "epoch": 0.55, + "learning_rate": 9.076989744330493e-05, + "loss": 3.2915, + "step": 3882 + }, + { + "epoch": 0.55, + "learning_rate": 9.074100823342482e-05, + "loss": 3.2832, + "step": 3883 + }, + { + "epoch": 0.55, + "learning_rate": 9.07121190235447e-05, + "loss": 3.032, + "step": 3884 + }, + { + "epoch": 0.55, + "learning_rate": 9.068322981366461e-05, + "loss": 3.1557, + "step": 3885 + }, + { + "epoch": 0.55, + "learning_rate": 9.065434060378449e-05, + "loss": 3.199, + "step": 3886 + }, + { + "epoch": 0.55, + "learning_rate": 9.062545139390439e-05, + "loss": 3.2869, + "step": 3887 + }, + { + "epoch": 0.55, + "learning_rate": 9.059656218402427e-05, + "loss": 3.3465, + "step": 3888 + }, + { + "epoch": 0.55, + "learning_rate": 9.056767297414416e-05, + "loss": 3.2342, + "step": 3889 + }, + { + "epoch": 0.55, + "learning_rate": 9.053878376426405e-05, + "loss": 3.2444, + "step": 3890 + }, + { + "epoch": 0.55, + "learning_rate": 9.050989455438395e-05, + "loss": 3.198, + "step": 3891 + }, + { + "epoch": 0.55, + "learning_rate": 9.048100534450383e-05, + "loss": 3.2915, + "step": 3892 + }, + { + "epoch": 0.55, + "learning_rate": 9.045211613462372e-05, + "loss": 3.181, + "step": 3893 + }, + { + "epoch": 0.55, + "learning_rate": 9.042322692474362e-05, + "loss": 3.3077, + "step": 3894 + }, + { + "epoch": 0.55, + "learning_rate": 9.03943377148635e-05, + "loss": 3.298, + "step": 3895 + }, + { + "epoch": 0.55, + "learning_rate": 9.03654485049834e-05, + "loss": 3.032, + "step": 3896 + }, + { + "epoch": 0.55, + "learning_rate": 9.033655929510328e-05, + "loss": 3.2134, + "step": 3897 + }, + { + "epoch": 0.55, + "learning_rate": 9.030767008522318e-05, + "loss": 3.2998, + "step": 3898 + }, + { + "epoch": 0.56, + "learning_rate": 9.027878087534306e-05, + "loss": 3.1998, + "step": 3899 + }, + { + "epoch": 0.56, + "learning_rate": 9.024989166546295e-05, + "loss": 3.1195, + "step": 3900 + }, + { + "epoch": 0.56, + "learning_rate": 9.022100245558285e-05, + "loss": 3.2613, + "step": 3901 + }, + { + "epoch": 0.56, + "learning_rate": 9.019211324570274e-05, + "loss": 3.364, + "step": 3902 + }, + { + "epoch": 0.56, + "learning_rate": 9.016322403582262e-05, + "loss": 3.2895, + "step": 3903 + }, + { + "epoch": 0.56, + "learning_rate": 9.013433482594251e-05, + "loss": 3.2736, + "step": 3904 + }, + { + "epoch": 0.56, + "learning_rate": 9.01054456160624e-05, + "loss": 3.3313, + "step": 3905 + }, + { + "epoch": 0.56, + "learning_rate": 9.00765564061823e-05, + "loss": 3.2997, + "step": 3906 + }, + { + "epoch": 0.56, + "learning_rate": 9.00476671963022e-05, + "loss": 3.3214, + "step": 3907 + }, + { + "epoch": 0.56, + "learning_rate": 9.001877798642207e-05, + "loss": 3.3161, + "step": 3908 + }, + { + "epoch": 0.56, + "learning_rate": 8.998988877654197e-05, + "loss": 3.3021, + "step": 3909 + }, + { + "epoch": 0.56, + "learning_rate": 8.996099956666185e-05, + "loss": 3.3702, + "step": 3910 + }, + { + "epoch": 0.56, + "learning_rate": 8.993211035678176e-05, + "loss": 3.2741, + "step": 3911 + }, + { + "epoch": 0.56, + "learning_rate": 8.990322114690164e-05, + "loss": 3.289, + "step": 3912 + }, + { + "epoch": 0.56, + "learning_rate": 8.987433193702153e-05, + "loss": 3.3696, + "step": 3913 + }, + { + "epoch": 0.56, + "learning_rate": 8.984544272714141e-05, + "loss": 3.3681, + "step": 3914 + }, + { + "epoch": 0.56, + "learning_rate": 8.98165535172613e-05, + "loss": 3.2758, + "step": 3915 + }, + { + "epoch": 0.56, + "learning_rate": 8.97876643073812e-05, + "loss": 3.2286, + "step": 3916 + }, + { + "epoch": 0.56, + "learning_rate": 8.975877509750109e-05, + "loss": 3.2878, + "step": 3917 + }, + { + "epoch": 0.56, + "learning_rate": 8.972988588762099e-05, + "loss": 3.4119, + "step": 3918 + }, + { + "epoch": 0.56, + "learning_rate": 8.970099667774087e-05, + "loss": 3.2123, + "step": 3919 + }, + { + "epoch": 0.56, + "learning_rate": 8.967210746786076e-05, + "loss": 3.0235, + "step": 3920 + }, + { + "epoch": 0.56, + "learning_rate": 8.964321825798064e-05, + "loss": 3.3914, + "step": 3921 + }, + { + "epoch": 0.56, + "learning_rate": 8.961432904810055e-05, + "loss": 3.1288, + "step": 3922 + }, + { + "epoch": 0.56, + "learning_rate": 8.958543983822043e-05, + "loss": 3.238, + "step": 3923 + }, + { + "epoch": 0.56, + "learning_rate": 8.955655062834032e-05, + "loss": 3.1957, + "step": 3924 + }, + { + "epoch": 0.56, + "learning_rate": 8.95276614184602e-05, + "loss": 3.249, + "step": 3925 + }, + { + "epoch": 0.56, + "learning_rate": 8.94987722085801e-05, + "loss": 3.181, + "step": 3926 + }, + { + "epoch": 0.56, + "learning_rate": 8.946988299869999e-05, + "loss": 3.281, + "step": 3927 + }, + { + "epoch": 0.56, + "learning_rate": 8.944099378881988e-05, + "loss": 3.2452, + "step": 3928 + }, + { + "epoch": 0.56, + "learning_rate": 8.941210457893976e-05, + "loss": 3.2608, + "step": 3929 + }, + { + "epoch": 0.56, + "learning_rate": 8.938321536905966e-05, + "loss": 3.3254, + "step": 3930 + }, + { + "epoch": 0.56, + "learning_rate": 8.935432615917955e-05, + "loss": 3.3185, + "step": 3931 + }, + { + "epoch": 0.56, + "learning_rate": 8.932543694929945e-05, + "loss": 3.2227, + "step": 3932 + }, + { + "epoch": 0.56, + "learning_rate": 8.929654773941934e-05, + "loss": 3.2378, + "step": 3933 + }, + { + "epoch": 0.56, + "learning_rate": 8.926765852953922e-05, + "loss": 3.3324, + "step": 3934 + }, + { + "epoch": 0.56, + "learning_rate": 8.923876931965911e-05, + "loss": 3.4224, + "step": 3935 + }, + { + "epoch": 0.56, + "learning_rate": 8.9209880109779e-05, + "loss": 3.2827, + "step": 3936 + }, + { + "epoch": 0.56, + "learning_rate": 8.91809908998989e-05, + "loss": 3.2855, + "step": 3937 + }, + { + "epoch": 0.56, + "learning_rate": 8.915210169001878e-05, + "loss": 3.0395, + "step": 3938 + }, + { + "epoch": 0.56, + "learning_rate": 8.912321248013867e-05, + "loss": 3.3829, + "step": 3939 + }, + { + "epoch": 0.56, + "learning_rate": 8.909432327025856e-05, + "loss": 3.2371, + "step": 3940 + }, + { + "epoch": 0.56, + "learning_rate": 8.906543406037845e-05, + "loss": 3.2505, + "step": 3941 + }, + { + "epoch": 0.56, + "learning_rate": 8.903654485049834e-05, + "loss": 3.3156, + "step": 3942 + }, + { + "epoch": 0.56, + "learning_rate": 8.900765564061824e-05, + "loss": 3.0712, + "step": 3943 + }, + { + "epoch": 0.56, + "learning_rate": 8.897876643073813e-05, + "loss": 3.3497, + "step": 3944 + }, + { + "epoch": 0.56, + "learning_rate": 8.894987722085801e-05, + "loss": 3.2546, + "step": 3945 + }, + { + "epoch": 0.56, + "learning_rate": 8.89209880109779e-05, + "loss": 3.153, + "step": 3946 + }, + { + "epoch": 0.56, + "learning_rate": 8.889209880109778e-05, + "loss": 3.0865, + "step": 3947 + }, + { + "epoch": 0.56, + "learning_rate": 8.886320959121769e-05, + "loss": 3.2698, + "step": 3948 + }, + { + "epoch": 0.56, + "learning_rate": 8.883432038133757e-05, + "loss": 3.2959, + "step": 3949 + }, + { + "epoch": 0.56, + "learning_rate": 8.880543117145747e-05, + "loss": 3.2559, + "step": 3950 + }, + { + "epoch": 0.56, + "learning_rate": 8.877654196157735e-05, + "loss": 3.3895, + "step": 3951 + }, + { + "epoch": 0.56, + "learning_rate": 8.874765275169724e-05, + "loss": 3.3017, + "step": 3952 + }, + { + "epoch": 0.56, + "learning_rate": 8.871876354181713e-05, + "loss": 3.1077, + "step": 3953 + }, + { + "epoch": 0.56, + "learning_rate": 8.868987433193703e-05, + "loss": 3.1145, + "step": 3954 + }, + { + "epoch": 0.56, + "learning_rate": 8.866098512205692e-05, + "loss": 3.0593, + "step": 3955 + }, + { + "epoch": 0.56, + "learning_rate": 8.86320959121768e-05, + "loss": 3.1752, + "step": 3956 + }, + { + "epoch": 0.56, + "learning_rate": 8.86032067022967e-05, + "loss": 3.1846, + "step": 3957 + }, + { + "epoch": 0.56, + "learning_rate": 8.857431749241659e-05, + "loss": 3.3596, + "step": 3958 + }, + { + "epoch": 0.56, + "learning_rate": 8.854542828253648e-05, + "loss": 3.2812, + "step": 3959 + }, + { + "epoch": 0.56, + "learning_rate": 8.851653907265636e-05, + "loss": 3.1094, + "step": 3960 + }, + { + "epoch": 0.56, + "learning_rate": 8.848764986277626e-05, + "loss": 3.2295, + "step": 3961 + }, + { + "epoch": 0.56, + "learning_rate": 8.845876065289614e-05, + "loss": 3.2622, + "step": 3962 + }, + { + "epoch": 0.56, + "learning_rate": 8.842987144301603e-05, + "loss": 3.0867, + "step": 3963 + }, + { + "epoch": 0.56, + "learning_rate": 8.840098223313593e-05, + "loss": 3.2774, + "step": 3964 + }, + { + "epoch": 0.56, + "learning_rate": 8.837209302325582e-05, + "loss": 3.3415, + "step": 3965 + }, + { + "epoch": 0.56, + "learning_rate": 8.834320381337571e-05, + "loss": 3.2803, + "step": 3966 + }, + { + "epoch": 0.56, + "learning_rate": 8.83143146034956e-05, + "loss": 3.3382, + "step": 3967 + }, + { + "epoch": 0.56, + "learning_rate": 8.828542539361549e-05, + "loss": 3.2908, + "step": 3968 + }, + { + "epoch": 0.57, + "learning_rate": 8.825653618373538e-05, + "loss": 3.2303, + "step": 3969 + }, + { + "epoch": 0.57, + "learning_rate": 8.822764697385528e-05, + "loss": 3.2081, + "step": 3970 + }, + { + "epoch": 0.57, + "learning_rate": 8.819875776397516e-05, + "loss": 3.1726, + "step": 3971 + }, + { + "epoch": 0.57, + "learning_rate": 8.816986855409505e-05, + "loss": 3.2753, + "step": 3972 + }, + { + "epoch": 0.57, + "learning_rate": 8.814097934421493e-05, + "loss": 3.1992, + "step": 3973 + }, + { + "epoch": 0.57, + "learning_rate": 8.811209013433484e-05, + "loss": 3.1367, + "step": 3974 + }, + { + "epoch": 0.57, + "learning_rate": 8.808320092445472e-05, + "loss": 3.3159, + "step": 3975 + }, + { + "epoch": 0.57, + "learning_rate": 8.805431171457461e-05, + "loss": 3.3195, + "step": 3976 + }, + { + "epoch": 0.57, + "learning_rate": 8.80254225046945e-05, + "loss": 3.3893, + "step": 3977 + }, + { + "epoch": 0.57, + "learning_rate": 8.799653329481438e-05, + "loss": 3.2885, + "step": 3978 + }, + { + "epoch": 0.57, + "learning_rate": 8.796764408493429e-05, + "loss": 3.2368, + "step": 3979 + }, + { + "epoch": 0.57, + "learning_rate": 8.793875487505417e-05, + "loss": 3.2786, + "step": 3980 + }, + { + "epoch": 0.57, + "learning_rate": 8.790986566517407e-05, + "loss": 3.1629, + "step": 3981 + }, + { + "epoch": 0.57, + "learning_rate": 8.788097645529395e-05, + "loss": 3.1387, + "step": 3982 + }, + { + "epoch": 0.57, + "learning_rate": 8.785208724541384e-05, + "loss": 3.2488, + "step": 3983 + }, + { + "epoch": 0.57, + "learning_rate": 8.782319803553372e-05, + "loss": 3.2105, + "step": 3984 + }, + { + "epoch": 0.57, + "learning_rate": 8.779430882565363e-05, + "loss": 3.2135, + "step": 3985 + }, + { + "epoch": 0.57, + "learning_rate": 8.776541961577351e-05, + "loss": 3.1729, + "step": 3986 + }, + { + "epoch": 0.57, + "learning_rate": 8.77365304058934e-05, + "loss": 3.286, + "step": 3987 + }, + { + "epoch": 0.57, + "learning_rate": 8.77076411960133e-05, + "loss": 3.224, + "step": 3988 + }, + { + "epoch": 0.57, + "learning_rate": 8.767875198613318e-05, + "loss": 3.298, + "step": 3989 + }, + { + "epoch": 0.57, + "learning_rate": 8.764986277625308e-05, + "loss": 3.1093, + "step": 3990 + }, + { + "epoch": 0.57, + "learning_rate": 8.762097356637296e-05, + "loss": 3.1439, + "step": 3991 + }, + { + "epoch": 0.57, + "learning_rate": 8.759208435649286e-05, + "loss": 3.3523, + "step": 3992 + }, + { + "epoch": 0.57, + "learning_rate": 8.756319514661274e-05, + "loss": 3.172, + "step": 3993 + }, + { + "epoch": 0.57, + "learning_rate": 8.753430593673263e-05, + "loss": 3.3619, + "step": 3994 + }, + { + "epoch": 0.57, + "learning_rate": 8.750541672685253e-05, + "loss": 3.366, + "step": 3995 + }, + { + "epoch": 0.57, + "learning_rate": 8.747652751697242e-05, + "loss": 3.3856, + "step": 3996 + }, + { + "epoch": 0.57, + "learning_rate": 8.74476383070923e-05, + "loss": 3.2243, + "step": 3997 + }, + { + "epoch": 0.57, + "learning_rate": 8.74187490972122e-05, + "loss": 3.2988, + "step": 3998 + }, + { + "epoch": 0.57, + "learning_rate": 8.738985988733209e-05, + "loss": 3.3289, + "step": 3999 + }, + { + "epoch": 0.57, + "learning_rate": 8.736097067745198e-05, + "loss": 3.259, + "step": 4000 + }, + { + "epoch": 0.57, + "learning_rate": 8.733208146757188e-05, + "loss": 3.258, + "step": 4001 + }, + { + "epoch": 0.57, + "learning_rate": 8.730319225769176e-05, + "loss": 3.406, + "step": 4002 + }, + { + "epoch": 0.57, + "learning_rate": 8.727430304781165e-05, + "loss": 3.3033, + "step": 4003 + }, + { + "epoch": 0.57, + "learning_rate": 8.724541383793153e-05, + "loss": 3.2368, + "step": 4004 + }, + { + "epoch": 0.57, + "learning_rate": 8.721652462805142e-05, + "loss": 3.0177, + "step": 4005 + }, + { + "epoch": 0.57, + "learning_rate": 8.718763541817132e-05, + "loss": 3.2402, + "step": 4006 + }, + { + "epoch": 0.57, + "learning_rate": 8.715874620829121e-05, + "loss": 3.1792, + "step": 4007 + }, + { + "epoch": 0.57, + "learning_rate": 8.712985699841109e-05, + "loss": 3.092, + "step": 4008 + }, + { + "epoch": 0.57, + "learning_rate": 8.710096778853099e-05, + "loss": 3.124, + "step": 4009 + }, + { + "epoch": 0.57, + "learning_rate": 8.707207857865088e-05, + "loss": 3.3448, + "step": 4010 + }, + { + "epoch": 0.57, + "learning_rate": 8.704318936877077e-05, + "loss": 3.331, + "step": 4011 + }, + { + "epoch": 0.57, + "learning_rate": 8.701430015889067e-05, + "loss": 3.339, + "step": 4012 + }, + { + "epoch": 0.57, + "learning_rate": 8.698541094901055e-05, + "loss": 3.1541, + "step": 4013 + }, + { + "epoch": 0.57, + "learning_rate": 8.695652173913044e-05, + "loss": 3.3561, + "step": 4014 + }, + { + "epoch": 0.57, + "learning_rate": 8.692763252925032e-05, + "loss": 3.1285, + "step": 4015 + }, + { + "epoch": 0.57, + "learning_rate": 8.689874331937023e-05, + "loss": 3.4157, + "step": 4016 + }, + { + "epoch": 0.57, + "learning_rate": 8.686985410949011e-05, + "loss": 3.2565, + "step": 4017 + }, + { + "epoch": 0.57, + "learning_rate": 8.684096489961e-05, + "loss": 3.2633, + "step": 4018 + }, + { + "epoch": 0.57, + "learning_rate": 8.681207568972988e-05, + "loss": 3.2736, + "step": 4019 + }, + { + "epoch": 0.57, + "learning_rate": 8.678318647984978e-05, + "loss": 3.2032, + "step": 4020 + }, + { + "epoch": 0.57, + "learning_rate": 8.675429726996967e-05, + "loss": 3.2755, + "step": 4021 + }, + { + "epoch": 0.57, + "learning_rate": 8.672540806008956e-05, + "loss": 3.0297, + "step": 4022 + }, + { + "epoch": 0.57, + "learning_rate": 8.669651885020946e-05, + "loss": 3.1787, + "step": 4023 + }, + { + "epoch": 0.57, + "learning_rate": 8.666762964032934e-05, + "loss": 3.2807, + "step": 4024 + }, + { + "epoch": 0.57, + "learning_rate": 8.663874043044923e-05, + "loss": 3.252, + "step": 4025 + }, + { + "epoch": 0.57, + "learning_rate": 8.660985122056913e-05, + "loss": 3.2084, + "step": 4026 + }, + { + "epoch": 0.57, + "learning_rate": 8.658096201068902e-05, + "loss": 3.2354, + "step": 4027 + }, + { + "epoch": 0.57, + "learning_rate": 8.65520728008089e-05, + "loss": 3.2249, + "step": 4028 + }, + { + "epoch": 0.57, + "learning_rate": 8.65231835909288e-05, + "loss": 3.2624, + "step": 4029 + }, + { + "epoch": 0.57, + "learning_rate": 8.649429438104867e-05, + "loss": 3.2647, + "step": 4030 + }, + { + "epoch": 0.57, + "learning_rate": 8.646540517116857e-05, + "loss": 3.3059, + "step": 4031 + }, + { + "epoch": 0.57, + "learning_rate": 8.643651596128846e-05, + "loss": 3.3355, + "step": 4032 + }, + { + "epoch": 0.57, + "learning_rate": 8.640762675140836e-05, + "loss": 3.2986, + "step": 4033 + }, + { + "epoch": 0.57, + "learning_rate": 8.637873754152825e-05, + "loss": 3.4018, + "step": 4034 + }, + { + "epoch": 0.57, + "learning_rate": 8.634984833164813e-05, + "loss": 3.2099, + "step": 4035 + }, + { + "epoch": 0.57, + "learning_rate": 8.632095912176802e-05, + "loss": 3.2506, + "step": 4036 + }, + { + "epoch": 0.57, + "learning_rate": 8.629206991188792e-05, + "loss": 3.0363, + "step": 4037 + }, + { + "epoch": 0.57, + "learning_rate": 8.626318070200781e-05, + "loss": 3.3057, + "step": 4038 + }, + { + "epoch": 0.58, + "learning_rate": 8.623429149212769e-05, + "loss": 3.174, + "step": 4039 + }, + { + "epoch": 0.58, + "learning_rate": 8.620540228224759e-05, + "loss": 3.1564, + "step": 4040 + }, + { + "epoch": 0.58, + "learning_rate": 8.617651307236747e-05, + "loss": 3.2195, + "step": 4041 + }, + { + "epoch": 0.58, + "learning_rate": 8.614762386248737e-05, + "loss": 3.3382, + "step": 4042 + }, + { + "epoch": 0.58, + "learning_rate": 8.611873465260725e-05, + "loss": 3.2327, + "step": 4043 + }, + { + "epoch": 0.58, + "learning_rate": 8.608984544272715e-05, + "loss": 3.2639, + "step": 4044 + }, + { + "epoch": 0.58, + "learning_rate": 8.606095623284704e-05, + "loss": 3.3016, + "step": 4045 + }, + { + "epoch": 0.58, + "learning_rate": 8.603206702296692e-05, + "loss": 3.2926, + "step": 4046 + }, + { + "epoch": 0.58, + "learning_rate": 8.600317781308681e-05, + "loss": 3.277, + "step": 4047 + }, + { + "epoch": 0.58, + "learning_rate": 8.597428860320671e-05, + "loss": 3.0697, + "step": 4048 + }, + { + "epoch": 0.58, + "learning_rate": 8.59453993933266e-05, + "loss": 3.2062, + "step": 4049 + }, + { + "epoch": 0.58, + "learning_rate": 8.591651018344648e-05, + "loss": 3.2135, + "step": 4050 + }, + { + "epoch": 0.58, + "learning_rate": 8.588762097356638e-05, + "loss": 3.3941, + "step": 4051 + }, + { + "epoch": 0.58, + "learning_rate": 8.585873176368626e-05, + "loss": 3.2611, + "step": 4052 + }, + { + "epoch": 0.58, + "learning_rate": 8.582984255380616e-05, + "loss": 3.1288, + "step": 4053 + }, + { + "epoch": 0.58, + "learning_rate": 8.580095334392604e-05, + "loss": 3.1586, + "step": 4054 + }, + { + "epoch": 0.58, + "learning_rate": 8.577206413404594e-05, + "loss": 3.2621, + "step": 4055 + }, + { + "epoch": 0.58, + "learning_rate": 8.574317492416582e-05, + "loss": 3.1419, + "step": 4056 + }, + { + "epoch": 0.58, + "learning_rate": 8.571428571428571e-05, + "loss": 3.2366, + "step": 4057 + }, + { + "epoch": 0.58, + "learning_rate": 8.56853965044056e-05, + "loss": 3.0991, + "step": 4058 + }, + { + "epoch": 0.58, + "learning_rate": 8.56565072945255e-05, + "loss": 3.2461, + "step": 4059 + }, + { + "epoch": 0.58, + "learning_rate": 8.56276180846454e-05, + "loss": 3.157, + "step": 4060 + }, + { + "epoch": 0.58, + "learning_rate": 8.559872887476527e-05, + "loss": 3.279, + "step": 4061 + }, + { + "epoch": 0.58, + "learning_rate": 8.556983966488517e-05, + "loss": 3.3706, + "step": 4062 + }, + { + "epoch": 0.58, + "learning_rate": 8.554095045500506e-05, + "loss": 3.2767, + "step": 4063 + }, + { + "epoch": 0.58, + "learning_rate": 8.551206124512496e-05, + "loss": 3.2843, + "step": 4064 + }, + { + "epoch": 0.58, + "learning_rate": 8.548317203524484e-05, + "loss": 3.2912, + "step": 4065 + }, + { + "epoch": 0.58, + "learning_rate": 8.545428282536473e-05, + "loss": 3.3588, + "step": 4066 + }, + { + "epoch": 0.58, + "learning_rate": 8.542539361548461e-05, + "loss": 3.1223, + "step": 4067 + }, + { + "epoch": 0.58, + "learning_rate": 8.539650440560452e-05, + "loss": 3.1939, + "step": 4068 + }, + { + "epoch": 0.58, + "learning_rate": 8.53676151957244e-05, + "loss": 3.2885, + "step": 4069 + }, + { + "epoch": 0.58, + "learning_rate": 8.533872598584429e-05, + "loss": 3.1524, + "step": 4070 + }, + { + "epoch": 0.58, + "learning_rate": 8.530983677596419e-05, + "loss": 3.2319, + "step": 4071 + }, + { + "epoch": 0.58, + "learning_rate": 8.528094756608407e-05, + "loss": 3.2579, + "step": 4072 + }, + { + "epoch": 0.58, + "learning_rate": 8.525205835620396e-05, + "loss": 3.2752, + "step": 4073 + }, + { + "epoch": 0.58, + "learning_rate": 8.522316914632385e-05, + "loss": 3.1005, + "step": 4074 + }, + { + "epoch": 0.58, + "learning_rate": 8.519427993644375e-05, + "loss": 3.2904, + "step": 4075 + }, + { + "epoch": 0.58, + "learning_rate": 8.516539072656363e-05, + "loss": 3.2652, + "step": 4076 + }, + { + "epoch": 0.58, + "learning_rate": 8.513650151668352e-05, + "loss": 3.3645, + "step": 4077 + }, + { + "epoch": 0.58, + "learning_rate": 8.51076123068034e-05, + "loss": 3.3647, + "step": 4078 + }, + { + "epoch": 0.58, + "learning_rate": 8.507872309692331e-05, + "loss": 3.197, + "step": 4079 + }, + { + "epoch": 0.58, + "learning_rate": 8.504983388704319e-05, + "loss": 3.2224, + "step": 4080 + }, + { + "epoch": 0.58, + "learning_rate": 8.502094467716308e-05, + "loss": 3.2377, + "step": 4081 + }, + { + "epoch": 0.58, + "learning_rate": 8.499205546728298e-05, + "loss": 3.2946, + "step": 4082 + }, + { + "epoch": 0.58, + "learning_rate": 8.496316625740286e-05, + "loss": 3.2442, + "step": 4083 + }, + { + "epoch": 0.58, + "learning_rate": 8.493427704752276e-05, + "loss": 3.2016, + "step": 4084 + }, + { + "epoch": 0.58, + "learning_rate": 8.490538783764264e-05, + "loss": 3.2061, + "step": 4085 + }, + { + "epoch": 0.58, + "learning_rate": 8.487649862776254e-05, + "loss": 3.3251, + "step": 4086 + }, + { + "epoch": 0.58, + "learning_rate": 8.484760941788242e-05, + "loss": 3.3625, + "step": 4087 + }, + { + "epoch": 0.58, + "learning_rate": 8.481872020800231e-05, + "loss": 3.0163, + "step": 4088 + }, + { + "epoch": 0.58, + "learning_rate": 8.47898309981222e-05, + "loss": 3.1424, + "step": 4089 + }, + { + "epoch": 0.58, + "learning_rate": 8.47609417882421e-05, + "loss": 3.3728, + "step": 4090 + }, + { + "epoch": 0.58, + "learning_rate": 8.473205257836198e-05, + "loss": 3.2705, + "step": 4091 + }, + { + "epoch": 0.58, + "learning_rate": 8.470316336848187e-05, + "loss": 3.3873, + "step": 4092 + }, + { + "epoch": 0.58, + "learning_rate": 8.467427415860177e-05, + "loss": 3.3183, + "step": 4093 + }, + { + "epoch": 0.58, + "learning_rate": 8.464538494872165e-05, + "loss": 3.2233, + "step": 4094 + }, + { + "epoch": 0.58, + "learning_rate": 8.461649573884156e-05, + "loss": 3.2937, + "step": 4095 + }, + { + "epoch": 0.58, + "learning_rate": 8.458760652896144e-05, + "loss": 3.3, + "step": 4096 + }, + { + "epoch": 0.58, + "learning_rate": 8.455871731908133e-05, + "loss": 3.148, + "step": 4097 + }, + { + "epoch": 0.58, + "learning_rate": 8.452982810920121e-05, + "loss": 3.2463, + "step": 4098 + }, + { + "epoch": 0.58, + "learning_rate": 8.45009388993211e-05, + "loss": 3.2626, + "step": 4099 + }, + { + "epoch": 0.58, + "learning_rate": 8.4472049689441e-05, + "loss": 3.2149, + "step": 4100 + }, + { + "epoch": 0.58, + "learning_rate": 8.444316047956089e-05, + "loss": 3.2738, + "step": 4101 + }, + { + "epoch": 0.58, + "learning_rate": 8.441427126968077e-05, + "loss": 3.258, + "step": 4102 + }, + { + "epoch": 0.58, + "learning_rate": 8.438538205980067e-05, + "loss": 3.2738, + "step": 4103 + }, + { + "epoch": 0.58, + "learning_rate": 8.435649284992056e-05, + "loss": 3.14, + "step": 4104 + }, + { + "epoch": 0.58, + "learning_rate": 8.432760364004045e-05, + "loss": 3.409, + "step": 4105 + }, + { + "epoch": 0.58, + "learning_rate": 8.429871443016035e-05, + "loss": 3.2291, + "step": 4106 + }, + { + "epoch": 0.58, + "learning_rate": 8.426982522028023e-05, + "loss": 3.0735, + "step": 4107 + }, + { + "epoch": 0.58, + "learning_rate": 8.424093601040012e-05, + "loss": 3.3047, + "step": 4108 + }, + { + "epoch": 0.59, + "learning_rate": 8.421204680052e-05, + "loss": 3.0758, + "step": 4109 + }, + { + "epoch": 0.59, + "learning_rate": 8.418315759063991e-05, + "loss": 3.1981, + "step": 4110 + }, + { + "epoch": 0.59, + "learning_rate": 8.415426838075979e-05, + "loss": 3.2431, + "step": 4111 + }, + { + "epoch": 0.59, + "learning_rate": 8.412537917087968e-05, + "loss": 3.2672, + "step": 4112 + }, + { + "epoch": 0.59, + "learning_rate": 8.409648996099956e-05, + "loss": 3.1203, + "step": 4113 + }, + { + "epoch": 0.59, + "learning_rate": 8.406760075111946e-05, + "loss": 3.1629, + "step": 4114 + }, + { + "epoch": 0.59, + "learning_rate": 8.403871154123935e-05, + "loss": 3.2416, + "step": 4115 + }, + { + "epoch": 0.59, + "learning_rate": 8.400982233135925e-05, + "loss": 3.074, + "step": 4116 + }, + { + "epoch": 0.59, + "learning_rate": 8.398093312147914e-05, + "loss": 3.3175, + "step": 4117 + }, + { + "epoch": 0.59, + "learning_rate": 8.395204391159902e-05, + "loss": 3.2744, + "step": 4118 + }, + { + "epoch": 0.59, + "learning_rate": 8.392315470171891e-05, + "loss": 3.289, + "step": 4119 + }, + { + "epoch": 0.59, + "learning_rate": 8.389426549183879e-05, + "loss": 3.2475, + "step": 4120 + }, + { + "epoch": 0.59, + "learning_rate": 8.38653762819587e-05, + "loss": 3.3102, + "step": 4121 + }, + { + "epoch": 0.59, + "learning_rate": 8.383648707207858e-05, + "loss": 3.3482, + "step": 4122 + }, + { + "epoch": 0.59, + "learning_rate": 8.380759786219847e-05, + "loss": 3.2501, + "step": 4123 + }, + { + "epoch": 0.59, + "learning_rate": 8.377870865231835e-05, + "loss": 3.0875, + "step": 4124 + }, + { + "epoch": 0.59, + "learning_rate": 8.374981944243825e-05, + "loss": 3.3484, + "step": 4125 + }, + { + "epoch": 0.59, + "learning_rate": 8.372093023255814e-05, + "loss": 3.3203, + "step": 4126 + }, + { + "epoch": 0.59, + "learning_rate": 8.369204102267804e-05, + "loss": 2.9373, + "step": 4127 + }, + { + "epoch": 0.59, + "learning_rate": 8.366315181279793e-05, + "loss": 3.3178, + "step": 4128 + }, + { + "epoch": 0.59, + "learning_rate": 8.363426260291781e-05, + "loss": 3.1093, + "step": 4129 + }, + { + "epoch": 0.59, + "learning_rate": 8.36053733930377e-05, + "loss": 3.1304, + "step": 4130 + }, + { + "epoch": 0.59, + "learning_rate": 8.35764841831576e-05, + "loss": 3.1829, + "step": 4131 + }, + { + "epoch": 0.59, + "learning_rate": 8.354759497327749e-05, + "loss": 3.3268, + "step": 4132 + }, + { + "epoch": 0.59, + "learning_rate": 8.351870576339737e-05, + "loss": 3.2223, + "step": 4133 + }, + { + "epoch": 0.59, + "learning_rate": 8.348981655351727e-05, + "loss": 3.1491, + "step": 4134 + }, + { + "epoch": 0.59, + "learning_rate": 8.346092734363715e-05, + "loss": 3.3163, + "step": 4135 + }, + { + "epoch": 0.59, + "learning_rate": 8.343203813375705e-05, + "loss": 3.1891, + "step": 4136 + }, + { + "epoch": 0.59, + "learning_rate": 8.340314892387693e-05, + "loss": 3.2958, + "step": 4137 + }, + { + "epoch": 0.59, + "learning_rate": 8.337425971399683e-05, + "loss": 3.2614, + "step": 4138 + }, + { + "epoch": 0.59, + "learning_rate": 8.334537050411672e-05, + "loss": 3.1945, + "step": 4139 + }, + { + "epoch": 0.59, + "learning_rate": 8.33164812942366e-05, + "loss": 3.2473, + "step": 4140 + }, + { + "epoch": 0.59, + "learning_rate": 8.32875920843565e-05, + "loss": 3.3148, + "step": 4141 + }, + { + "epoch": 0.59, + "learning_rate": 8.325870287447639e-05, + "loss": 3.2322, + "step": 4142 + }, + { + "epoch": 0.59, + "learning_rate": 8.322981366459628e-05, + "loss": 3.1535, + "step": 4143 + }, + { + "epoch": 0.59, + "learning_rate": 8.320092445471616e-05, + "loss": 3.348, + "step": 4144 + }, + { + "epoch": 0.59, + "learning_rate": 8.317203524483606e-05, + "loss": 3.3239, + "step": 4145 + }, + { + "epoch": 0.59, + "learning_rate": 8.314314603495594e-05, + "loss": 3.2792, + "step": 4146 + }, + { + "epoch": 0.59, + "learning_rate": 8.311425682507585e-05, + "loss": 3.249, + "step": 4147 + }, + { + "epoch": 0.59, + "learning_rate": 8.308536761519573e-05, + "loss": 3.2805, + "step": 4148 + }, + { + "epoch": 0.59, + "learning_rate": 8.305647840531562e-05, + "loss": 3.2893, + "step": 4149 + }, + { + "epoch": 0.59, + "learning_rate": 8.302758919543551e-05, + "loss": 3.2423, + "step": 4150 + }, + { + "epoch": 0.59, + "learning_rate": 8.29986999855554e-05, + "loss": 3.2868, + "step": 4151 + }, + { + "epoch": 0.59, + "learning_rate": 8.29698107756753e-05, + "loss": 3.3046, + "step": 4152 + }, + { + "epoch": 0.59, + "learning_rate": 8.294092156579518e-05, + "loss": 3.0475, + "step": 4153 + }, + { + "epoch": 0.59, + "learning_rate": 8.291203235591507e-05, + "loss": 3.4062, + "step": 4154 + }, + { + "epoch": 0.59, + "learning_rate": 8.288314314603496e-05, + "loss": 3.2152, + "step": 4155 + }, + { + "epoch": 0.59, + "learning_rate": 8.285425393615485e-05, + "loss": 3.3383, + "step": 4156 + }, + { + "epoch": 0.59, + "learning_rate": 8.282536472627474e-05, + "loss": 3.2178, + "step": 4157 + }, + { + "epoch": 0.59, + "learning_rate": 8.279647551639464e-05, + "loss": 3.221, + "step": 4158 + }, + { + "epoch": 0.59, + "learning_rate": 8.276758630651452e-05, + "loss": 3.2245, + "step": 4159 + }, + { + "epoch": 0.59, + "learning_rate": 8.273869709663441e-05, + "loss": 3.2269, + "step": 4160 + }, + { + "epoch": 0.59, + "learning_rate": 8.27098078867543e-05, + "loss": 3.2183, + "step": 4161 + }, + { + "epoch": 0.59, + "learning_rate": 8.268091867687418e-05, + "loss": 3.1523, + "step": 4162 + }, + { + "epoch": 0.59, + "learning_rate": 8.265202946699408e-05, + "loss": 3.1758, + "step": 4163 + }, + { + "epoch": 0.59, + "learning_rate": 8.262314025711397e-05, + "loss": 3.2164, + "step": 4164 + }, + { + "epoch": 0.59, + "learning_rate": 8.259425104723387e-05, + "loss": 3.2022, + "step": 4165 + }, + { + "epoch": 0.59, + "learning_rate": 8.256536183735375e-05, + "loss": 3.2189, + "step": 4166 + }, + { + "epoch": 0.59, + "learning_rate": 8.253647262747364e-05, + "loss": 3.2521, + "step": 4167 + }, + { + "epoch": 0.59, + "learning_rate": 8.250758341759353e-05, + "loss": 3.2237, + "step": 4168 + }, + { + "epoch": 0.59, + "learning_rate": 8.247869420771343e-05, + "loss": 3.2787, + "step": 4169 + }, + { + "epoch": 0.59, + "learning_rate": 8.244980499783331e-05, + "loss": 3.1623, + "step": 4170 + }, + { + "epoch": 0.59, + "learning_rate": 8.24209157879532e-05, + "loss": 3.2421, + "step": 4171 + }, + { + "epoch": 0.59, + "learning_rate": 8.23920265780731e-05, + "loss": 3.239, + "step": 4172 + }, + { + "epoch": 0.59, + "learning_rate": 8.236313736819299e-05, + "loss": 3.2289, + "step": 4173 + }, + { + "epoch": 0.59, + "learning_rate": 8.233424815831287e-05, + "loss": 3.2067, + "step": 4174 + }, + { + "epoch": 0.59, + "learning_rate": 8.230535894843276e-05, + "loss": 3.1906, + "step": 4175 + }, + { + "epoch": 0.59, + "learning_rate": 8.227646973855266e-05, + "loss": 3.3086, + "step": 4176 + }, + { + "epoch": 0.59, + "learning_rate": 8.224758052867254e-05, + "loss": 3.205, + "step": 4177 + }, + { + "epoch": 0.59, + "learning_rate": 8.221869131879245e-05, + "loss": 3.1586, + "step": 4178 + }, + { + "epoch": 0.6, + "learning_rate": 8.218980210891233e-05, + "loss": 3.2744, + "step": 4179 + }, + { + "epoch": 0.6, + "learning_rate": 8.216091289903222e-05, + "loss": 3.1454, + "step": 4180 + }, + { + "epoch": 0.6, + "learning_rate": 8.21320236891521e-05, + "loss": 3.2559, + "step": 4181 + }, + { + "epoch": 0.6, + "learning_rate": 8.2103134479272e-05, + "loss": 3.2432, + "step": 4182 + }, + { + "epoch": 0.6, + "learning_rate": 8.207424526939189e-05, + "loss": 3.2066, + "step": 4183 + }, + { + "epoch": 0.6, + "learning_rate": 8.204535605951178e-05, + "loss": 3.1526, + "step": 4184 + }, + { + "epoch": 0.6, + "learning_rate": 8.201646684963166e-05, + "loss": 3.1424, + "step": 4185 + }, + { + "epoch": 0.6, + "learning_rate": 8.198757763975156e-05, + "loss": 3.1921, + "step": 4186 + }, + { + "epoch": 0.6, + "learning_rate": 8.195868842987145e-05, + "loss": 3.1171, + "step": 4187 + }, + { + "epoch": 0.6, + "learning_rate": 8.192979921999133e-05, + "loss": 3.1883, + "step": 4188 + }, + { + "epoch": 0.6, + "learning_rate": 8.190091001011124e-05, + "loss": 3.3401, + "step": 4189 + }, + { + "epoch": 0.6, + "learning_rate": 8.187202080023112e-05, + "loss": 3.1864, + "step": 4190 + }, + { + "epoch": 0.6, + "learning_rate": 8.184313159035101e-05, + "loss": 3.259, + "step": 4191 + }, + { + "epoch": 0.6, + "learning_rate": 8.181424238047089e-05, + "loss": 3.1787, + "step": 4192 + }, + { + "epoch": 0.6, + "learning_rate": 8.178535317059078e-05, + "loss": 3.1158, + "step": 4193 + }, + { + "epoch": 0.6, + "learning_rate": 8.175646396071068e-05, + "loss": 3.2656, + "step": 4194 + }, + { + "epoch": 0.6, + "learning_rate": 8.172757475083057e-05, + "loss": 3.2065, + "step": 4195 + }, + { + "epoch": 0.6, + "learning_rate": 8.169868554095045e-05, + "loss": 3.2744, + "step": 4196 + }, + { + "epoch": 0.6, + "learning_rate": 8.166979633107035e-05, + "loss": 3.0871, + "step": 4197 + }, + { + "epoch": 0.6, + "learning_rate": 8.164090712119024e-05, + "loss": 3.3143, + "step": 4198 + }, + { + "epoch": 0.6, + "learning_rate": 8.161201791131013e-05, + "loss": 3.1667, + "step": 4199 + }, + { + "epoch": 0.6, + "learning_rate": 8.158312870143003e-05, + "loss": 3.2188, + "step": 4200 + }, + { + "epoch": 0.6, + "learning_rate": 8.155423949154991e-05, + "loss": 3.1783, + "step": 4201 + }, + { + "epoch": 0.6, + "learning_rate": 8.15253502816698e-05, + "loss": 3.2553, + "step": 4202 + }, + { + "epoch": 0.6, + "learning_rate": 8.149646107178968e-05, + "loss": 3.3402, + "step": 4203 + }, + { + "epoch": 0.6, + "learning_rate": 8.146757186190958e-05, + "loss": 3.2964, + "step": 4204 + }, + { + "epoch": 0.6, + "learning_rate": 8.143868265202947e-05, + "loss": 3.258, + "step": 4205 + }, + { + "epoch": 0.6, + "learning_rate": 8.140979344214936e-05, + "loss": 3.1899, + "step": 4206 + }, + { + "epoch": 0.6, + "learning_rate": 8.138090423226924e-05, + "loss": 3.2875, + "step": 4207 + }, + { + "epoch": 0.6, + "learning_rate": 8.135201502238914e-05, + "loss": 3.2679, + "step": 4208 + }, + { + "epoch": 0.6, + "learning_rate": 8.132312581250903e-05, + "loss": 3.0585, + "step": 4209 + }, + { + "epoch": 0.6, + "learning_rate": 8.129423660262893e-05, + "loss": 3.2907, + "step": 4210 + }, + { + "epoch": 0.6, + "learning_rate": 8.126534739274882e-05, + "loss": 3.1095, + "step": 4211 + }, + { + "epoch": 0.6, + "learning_rate": 8.12364581828687e-05, + "loss": 3.2634, + "step": 4212 + }, + { + "epoch": 0.6, + "learning_rate": 8.12075689729886e-05, + "loss": 3.3216, + "step": 4213 + }, + { + "epoch": 0.6, + "learning_rate": 8.117867976310847e-05, + "loss": 3.2927, + "step": 4214 + }, + { + "epoch": 0.6, + "learning_rate": 8.114979055322838e-05, + "loss": 3.2301, + "step": 4215 + }, + { + "epoch": 0.6, + "learning_rate": 8.112090134334826e-05, + "loss": 3.2072, + "step": 4216 + }, + { + "epoch": 0.6, + "learning_rate": 8.109201213346816e-05, + "loss": 3.283, + "step": 4217 + }, + { + "epoch": 0.6, + "learning_rate": 8.106312292358804e-05, + "loss": 3.1787, + "step": 4218 + }, + { + "epoch": 0.6, + "learning_rate": 8.103423371370793e-05, + "loss": 3.2416, + "step": 4219 + }, + { + "epoch": 0.6, + "learning_rate": 8.100534450382782e-05, + "loss": 3.0936, + "step": 4220 + }, + { + "epoch": 0.6, + "learning_rate": 8.097645529394772e-05, + "loss": 3.2907, + "step": 4221 + }, + { + "epoch": 0.6, + "learning_rate": 8.094756608406761e-05, + "loss": 3.1733, + "step": 4222 + }, + { + "epoch": 0.6, + "learning_rate": 8.091867687418749e-05, + "loss": 3.381, + "step": 4223 + }, + { + "epoch": 0.6, + "learning_rate": 8.088978766430739e-05, + "loss": 3.2477, + "step": 4224 + }, + { + "epoch": 0.6, + "eval_loss": 3.454590082168579, + "eval_runtime": 471.9004, + "eval_samples_per_second": 43.414, + "eval_steps_per_second": 14.471, + "step": 4224 + }, + { + "epoch": 0.6, + "learning_rate": 8.086089845442728e-05, + "loss": 3.1189, + "step": 4225 + }, + { + "epoch": 0.6, + "learning_rate": 8.083200924454717e-05, + "loss": 3.2583, + "step": 4226 + }, + { + "epoch": 0.6, + "learning_rate": 8.080312003466705e-05, + "loss": 3.3358, + "step": 4227 + }, + { + "epoch": 0.6, + "learning_rate": 8.077423082478695e-05, + "loss": 3.2218, + "step": 4228 + }, + { + "epoch": 0.6, + "learning_rate": 8.074534161490683e-05, + "loss": 3.2638, + "step": 4229 + }, + { + "epoch": 0.6, + "learning_rate": 8.071645240502672e-05, + "loss": 3.1001, + "step": 4230 + }, + { + "epoch": 0.6, + "learning_rate": 8.068756319514661e-05, + "loss": 3.1898, + "step": 4231 + }, + { + "epoch": 0.6, + "learning_rate": 8.065867398526651e-05, + "loss": 3.2099, + "step": 4232 + }, + { + "epoch": 0.6, + "learning_rate": 8.06297847753864e-05, + "loss": 3.2063, + "step": 4233 + }, + { + "epoch": 0.6, + "learning_rate": 8.060089556550628e-05, + "loss": 3.223, + "step": 4234 + }, + { + "epoch": 0.6, + "learning_rate": 8.057200635562618e-05, + "loss": 3.2809, + "step": 4235 + }, + { + "epoch": 0.6, + "learning_rate": 8.054311714574607e-05, + "loss": 3.2364, + "step": 4236 + }, + { + "epoch": 0.6, + "learning_rate": 8.051422793586596e-05, + "loss": 3.132, + "step": 4237 + }, + { + "epoch": 0.6, + "learning_rate": 8.048533872598584e-05, + "loss": 3.1899, + "step": 4238 + }, + { + "epoch": 0.6, + "learning_rate": 8.045644951610574e-05, + "loss": 3.2914, + "step": 4239 + }, + { + "epoch": 0.6, + "learning_rate": 8.042756030622562e-05, + "loss": 3.1883, + "step": 4240 + }, + { + "epoch": 0.6, + "learning_rate": 8.039867109634553e-05, + "loss": 3.1069, + "step": 4241 + }, + { + "epoch": 0.6, + "learning_rate": 8.03697818864654e-05, + "loss": 3.2493, + "step": 4242 + }, + { + "epoch": 0.6, + "learning_rate": 8.03408926765853e-05, + "loss": 3.1019, + "step": 4243 + }, + { + "epoch": 0.6, + "learning_rate": 8.03120034667052e-05, + "loss": 3.4186, + "step": 4244 + }, + { + "epoch": 0.6, + "learning_rate": 8.028311425682507e-05, + "loss": 2.9726, + "step": 4245 + }, + { + "epoch": 0.6, + "learning_rate": 8.025422504694498e-05, + "loss": 3.1249, + "step": 4246 + }, + { + "epoch": 0.6, + "learning_rate": 8.022533583706486e-05, + "loss": 3.0704, + "step": 4247 + }, + { + "epoch": 0.6, + "learning_rate": 8.019644662718476e-05, + "loss": 3.1305, + "step": 4248 + }, + { + "epoch": 0.6, + "learning_rate": 8.016755741730464e-05, + "loss": 3.2742, + "step": 4249 + }, + { + "epoch": 0.61, + "learning_rate": 8.013866820742453e-05, + "loss": 3.2609, + "step": 4250 + }, + { + "epoch": 0.61, + "learning_rate": 8.010977899754441e-05, + "loss": 3.2674, + "step": 4251 + }, + { + "epoch": 0.61, + "learning_rate": 8.008088978766432e-05, + "loss": 3.274, + "step": 4252 + }, + { + "epoch": 0.61, + "learning_rate": 8.00520005777842e-05, + "loss": 3.1981, + "step": 4253 + }, + { + "epoch": 0.61, + "learning_rate": 8.002311136790409e-05, + "loss": 3.1098, + "step": 4254 + }, + { + "epoch": 0.61, + "learning_rate": 7.999422215802399e-05, + "loss": 3.3066, + "step": 4255 + }, + { + "epoch": 0.61, + "learning_rate": 7.996533294814387e-05, + "loss": 3.2974, + "step": 4256 + }, + { + "epoch": 0.61, + "learning_rate": 7.993644373826377e-05, + "loss": 3.2273, + "step": 4257 + }, + { + "epoch": 0.61, + "learning_rate": 7.990755452838365e-05, + "loss": 3.3769, + "step": 4258 + }, + { + "epoch": 0.61, + "learning_rate": 7.987866531850355e-05, + "loss": 3.2165, + "step": 4259 + }, + { + "epoch": 0.61, + "learning_rate": 7.984977610862343e-05, + "loss": 3.2988, + "step": 4260 + }, + { + "epoch": 0.61, + "learning_rate": 7.982088689874332e-05, + "loss": 3.1474, + "step": 4261 + }, + { + "epoch": 0.61, + "learning_rate": 7.979199768886321e-05, + "loss": 3.1661, + "step": 4262 + }, + { + "epoch": 0.61, + "learning_rate": 7.976310847898311e-05, + "loss": 2.9669, + "step": 4263 + }, + { + "epoch": 0.61, + "learning_rate": 7.973421926910299e-05, + "loss": 3.1976, + "step": 4264 + }, + { + "epoch": 0.61, + "learning_rate": 7.970533005922288e-05, + "loss": 3.2131, + "step": 4265 + }, + { + "epoch": 0.61, + "learning_rate": 7.967644084934278e-05, + "loss": 3.2106, + "step": 4266 + }, + { + "epoch": 0.61, + "learning_rate": 7.964755163946267e-05, + "loss": 3.2483, + "step": 4267 + }, + { + "epoch": 0.61, + "learning_rate": 7.961866242958256e-05, + "loss": 3.3123, + "step": 4268 + }, + { + "epoch": 0.61, + "learning_rate": 7.958977321970244e-05, + "loss": 3.2434, + "step": 4269 + }, + { + "epoch": 0.61, + "learning_rate": 7.956088400982234e-05, + "loss": 3.2349, + "step": 4270 + }, + { + "epoch": 0.61, + "learning_rate": 7.953199479994222e-05, + "loss": 3.2684, + "step": 4271 + }, + { + "epoch": 0.61, + "learning_rate": 7.950310559006211e-05, + "loss": 3.3313, + "step": 4272 + }, + { + "epoch": 0.61, + "learning_rate": 7.9474216380182e-05, + "loss": 3.1969, + "step": 4273 + }, + { + "epoch": 0.61, + "learning_rate": 7.94453271703019e-05, + "loss": 3.1348, + "step": 4274 + }, + { + "epoch": 0.61, + "learning_rate": 7.941643796042178e-05, + "loss": 3.153, + "step": 4275 + }, + { + "epoch": 0.61, + "learning_rate": 7.938754875054167e-05, + "loss": 3.3107, + "step": 4276 + }, + { + "epoch": 0.61, + "learning_rate": 7.935865954066157e-05, + "loss": 2.9463, + "step": 4277 + }, + { + "epoch": 0.61, + "learning_rate": 7.932977033078146e-05, + "loss": 3.2237, + "step": 4278 + }, + { + "epoch": 0.61, + "learning_rate": 7.930088112090136e-05, + "loss": 3.2952, + "step": 4279 + }, + { + "epoch": 0.61, + "learning_rate": 7.927199191102124e-05, + "loss": 3.112, + "step": 4280 + }, + { + "epoch": 0.61, + "learning_rate": 7.924310270114113e-05, + "loss": 3.2988, + "step": 4281 + }, + { + "epoch": 0.61, + "learning_rate": 7.921421349126101e-05, + "loss": 3.2512, + "step": 4282 + }, + { + "epoch": 0.61, + "learning_rate": 7.918532428138092e-05, + "loss": 3.35, + "step": 4283 + }, + { + "epoch": 0.61, + "learning_rate": 7.91564350715008e-05, + "loss": 3.2903, + "step": 4284 + }, + { + "epoch": 0.61, + "learning_rate": 7.912754586162069e-05, + "loss": 3.2621, + "step": 4285 + }, + { + "epoch": 0.61, + "learning_rate": 7.909865665174057e-05, + "loss": 3.3227, + "step": 4286 + }, + { + "epoch": 0.61, + "learning_rate": 7.906976744186047e-05, + "loss": 3.1815, + "step": 4287 + }, + { + "epoch": 0.61, + "learning_rate": 7.904087823198036e-05, + "loss": 3.2774, + "step": 4288 + }, + { + "epoch": 0.61, + "learning_rate": 7.901198902210025e-05, + "loss": 3.2155, + "step": 4289 + }, + { + "epoch": 0.61, + "learning_rate": 7.898309981222013e-05, + "loss": 3.3794, + "step": 4290 + }, + { + "epoch": 0.61, + "learning_rate": 7.895421060234003e-05, + "loss": 3.351, + "step": 4291 + }, + { + "epoch": 0.61, + "learning_rate": 7.892532139245992e-05, + "loss": 3.1811, + "step": 4292 + }, + { + "epoch": 0.61, + "learning_rate": 7.88964321825798e-05, + "loss": 3.2645, + "step": 4293 + }, + { + "epoch": 0.61, + "learning_rate": 7.886754297269971e-05, + "loss": 3.2618, + "step": 4294 + }, + { + "epoch": 0.61, + "learning_rate": 7.883865376281959e-05, + "loss": 3.2929, + "step": 4295 + }, + { + "epoch": 0.61, + "learning_rate": 7.880976455293948e-05, + "loss": 3.3232, + "step": 4296 + }, + { + "epoch": 0.61, + "learning_rate": 7.878087534305936e-05, + "loss": 3.2274, + "step": 4297 + }, + { + "epoch": 0.61, + "learning_rate": 7.875198613317926e-05, + "loss": 3.2881, + "step": 4298 + }, + { + "epoch": 0.61, + "learning_rate": 7.872309692329915e-05, + "loss": 3.2647, + "step": 4299 + }, + { + "epoch": 0.61, + "learning_rate": 7.869420771341904e-05, + "loss": 3.3605, + "step": 4300 + }, + { + "epoch": 0.61, + "learning_rate": 7.866531850353893e-05, + "loss": 3.2639, + "step": 4301 + }, + { + "epoch": 0.61, + "learning_rate": 7.863642929365882e-05, + "loss": 3.3084, + "step": 4302 + }, + { + "epoch": 0.61, + "learning_rate": 7.860754008377871e-05, + "loss": 3.0701, + "step": 4303 + }, + { + "epoch": 0.61, + "learning_rate": 7.85786508738986e-05, + "loss": 3.2976, + "step": 4304 + }, + { + "epoch": 0.61, + "learning_rate": 7.85497616640185e-05, + "loss": 3.1141, + "step": 4305 + }, + { + "epoch": 0.61, + "learning_rate": 7.852087245413838e-05, + "loss": 3.0212, + "step": 4306 + }, + { + "epoch": 0.61, + "learning_rate": 7.849198324425827e-05, + "loss": 3.327, + "step": 4307 + }, + { + "epoch": 0.61, + "learning_rate": 7.846309403437815e-05, + "loss": 3.2982, + "step": 4308 + }, + { + "epoch": 0.61, + "learning_rate": 7.843420482449806e-05, + "loss": 3.1753, + "step": 4309 + }, + { + "epoch": 0.61, + "learning_rate": 7.840531561461794e-05, + "loss": 3.0708, + "step": 4310 + }, + { + "epoch": 0.61, + "learning_rate": 7.837642640473784e-05, + "loss": 3.3137, + "step": 4311 + }, + { + "epoch": 0.61, + "learning_rate": 7.834753719485772e-05, + "loss": 3.1944, + "step": 4312 + }, + { + "epoch": 0.61, + "learning_rate": 7.831864798497761e-05, + "loss": 3.2517, + "step": 4313 + }, + { + "epoch": 0.61, + "learning_rate": 7.82897587750975e-05, + "loss": 3.2313, + "step": 4314 + }, + { + "epoch": 0.61, + "learning_rate": 7.82608695652174e-05, + "loss": 3.1304, + "step": 4315 + }, + { + "epoch": 0.61, + "learning_rate": 7.823198035533729e-05, + "loss": 3.2009, + "step": 4316 + }, + { + "epoch": 0.61, + "learning_rate": 7.820309114545717e-05, + "loss": 3.3346, + "step": 4317 + }, + { + "epoch": 0.61, + "learning_rate": 7.817420193557707e-05, + "loss": 3.2339, + "step": 4318 + }, + { + "epoch": 0.61, + "learning_rate": 7.814531272569695e-05, + "loss": 3.3033, + "step": 4319 + }, + { + "epoch": 0.62, + "learning_rate": 7.811642351581685e-05, + "loss": 3.2246, + "step": 4320 + }, + { + "epoch": 0.62, + "learning_rate": 7.808753430593673e-05, + "loss": 3.3153, + "step": 4321 + }, + { + "epoch": 0.62, + "learning_rate": 7.805864509605663e-05, + "loss": 3.2447, + "step": 4322 + }, + { + "epoch": 0.62, + "learning_rate": 7.802975588617651e-05, + "loss": 3.2197, + "step": 4323 + }, + { + "epoch": 0.62, + "learning_rate": 7.80008666762964e-05, + "loss": 3.257, + "step": 4324 + }, + { + "epoch": 0.62, + "learning_rate": 7.79719774664163e-05, + "loss": 3.1468, + "step": 4325 + }, + { + "epoch": 0.62, + "learning_rate": 7.794308825653619e-05, + "loss": 3.2364, + "step": 4326 + }, + { + "epoch": 0.62, + "learning_rate": 7.791419904665608e-05, + "loss": 3.3317, + "step": 4327 + }, + { + "epoch": 0.62, + "learning_rate": 7.788530983677596e-05, + "loss": 3.3097, + "step": 4328 + }, + { + "epoch": 0.62, + "learning_rate": 7.785642062689586e-05, + "loss": 3.265, + "step": 4329 + }, + { + "epoch": 0.62, + "learning_rate": 7.782753141701575e-05, + "loss": 3.2576, + "step": 4330 + }, + { + "epoch": 0.62, + "learning_rate": 7.779864220713564e-05, + "loss": 3.2598, + "step": 4331 + }, + { + "epoch": 0.62, + "learning_rate": 7.776975299725553e-05, + "loss": 3.2786, + "step": 4332 + }, + { + "epoch": 0.62, + "learning_rate": 7.774086378737542e-05, + "loss": 3.1964, + "step": 4333 + }, + { + "epoch": 0.62, + "learning_rate": 7.77119745774953e-05, + "loss": 3.3129, + "step": 4334 + }, + { + "epoch": 0.62, + "learning_rate": 7.76830853676152e-05, + "loss": 3.333, + "step": 4335 + }, + { + "epoch": 0.62, + "learning_rate": 7.765419615773509e-05, + "loss": 3.2189, + "step": 4336 + }, + { + "epoch": 0.62, + "learning_rate": 7.762530694785498e-05, + "loss": 3.2821, + "step": 4337 + }, + { + "epoch": 0.62, + "learning_rate": 7.759641773797487e-05, + "loss": 3.2736, + "step": 4338 + }, + { + "epoch": 0.62, + "learning_rate": 7.756752852809475e-05, + "loss": 3.1567, + "step": 4339 + }, + { + "epoch": 0.62, + "learning_rate": 7.753863931821465e-05, + "loss": 3.1936, + "step": 4340 + }, + { + "epoch": 0.62, + "learning_rate": 7.750975010833454e-05, + "loss": 3.2622, + "step": 4341 + }, + { + "epoch": 0.62, + "learning_rate": 7.748086089845444e-05, + "loss": 3.3282, + "step": 4342 + }, + { + "epoch": 0.62, + "learning_rate": 7.745197168857432e-05, + "loss": 3.2657, + "step": 4343 + }, + { + "epoch": 0.62, + "learning_rate": 7.742308247869421e-05, + "loss": 3.1931, + "step": 4344 + }, + { + "epoch": 0.62, + "learning_rate": 7.739419326881409e-05, + "loss": 3.3269, + "step": 4345 + }, + { + "epoch": 0.62, + "learning_rate": 7.7365304058934e-05, + "loss": 3.2184, + "step": 4346 + }, + { + "epoch": 0.62, + "learning_rate": 7.733641484905388e-05, + "loss": 3.1989, + "step": 4347 + }, + { + "epoch": 0.62, + "learning_rate": 7.730752563917377e-05, + "loss": 3.1841, + "step": 4348 + }, + { + "epoch": 0.62, + "learning_rate": 7.727863642929367e-05, + "loss": 3.155, + "step": 4349 + }, + { + "epoch": 0.62, + "learning_rate": 7.724974721941355e-05, + "loss": 3.2665, + "step": 4350 + }, + { + "epoch": 0.62, + "learning_rate": 7.722085800953345e-05, + "loss": 3.2832, + "step": 4351 + }, + { + "epoch": 0.62, + "learning_rate": 7.719196879965333e-05, + "loss": 3.127, + "step": 4352 + }, + { + "epoch": 0.62, + "learning_rate": 7.716307958977323e-05, + "loss": 3.2496, + "step": 4353 + }, + { + "epoch": 0.62, + "learning_rate": 7.713419037989311e-05, + "loss": 3.0619, + "step": 4354 + }, + { + "epoch": 0.62, + "learning_rate": 7.7105301170013e-05, + "loss": 3.2216, + "step": 4355 + }, + { + "epoch": 0.62, + "learning_rate": 7.70764119601329e-05, + "loss": 3.1777, + "step": 4356 + }, + { + "epoch": 0.62, + "learning_rate": 7.704752275025279e-05, + "loss": 3.2618, + "step": 4357 + }, + { + "epoch": 0.62, + "learning_rate": 7.701863354037267e-05, + "loss": 3.328, + "step": 4358 + }, + { + "epoch": 0.62, + "learning_rate": 7.698974433049256e-05, + "loss": 3.1858, + "step": 4359 + }, + { + "epoch": 0.62, + "learning_rate": 7.696085512061246e-05, + "loss": 3.3507, + "step": 4360 + }, + { + "epoch": 0.62, + "learning_rate": 7.693196591073234e-05, + "loss": 3.1983, + "step": 4361 + }, + { + "epoch": 0.62, + "learning_rate": 7.690307670085225e-05, + "loss": 3.0417, + "step": 4362 + }, + { + "epoch": 0.62, + "learning_rate": 7.687418749097213e-05, + "loss": 3.1231, + "step": 4363 + }, + { + "epoch": 0.62, + "learning_rate": 7.684529828109202e-05, + "loss": 3.1928, + "step": 4364 + }, + { + "epoch": 0.62, + "learning_rate": 7.68164090712119e-05, + "loss": 3.1603, + "step": 4365 + }, + { + "epoch": 0.62, + "learning_rate": 7.67875198613318e-05, + "loss": 3.1971, + "step": 4366 + }, + { + "epoch": 0.62, + "learning_rate": 7.675863065145169e-05, + "loss": 3.3221, + "step": 4367 + }, + { + "epoch": 0.62, + "learning_rate": 7.672974144157158e-05, + "loss": 3.2545, + "step": 4368 + }, + { + "epoch": 0.62, + "learning_rate": 7.670085223169146e-05, + "loss": 3.2316, + "step": 4369 + }, + { + "epoch": 0.62, + "learning_rate": 7.667196302181136e-05, + "loss": 3.2825, + "step": 4370 + }, + { + "epoch": 0.62, + "learning_rate": 7.664307381193125e-05, + "loss": 3.1697, + "step": 4371 + }, + { + "epoch": 0.62, + "learning_rate": 7.661418460205114e-05, + "loss": 3.3155, + "step": 4372 + }, + { + "epoch": 0.62, + "learning_rate": 7.658529539217104e-05, + "loss": 3.2699, + "step": 4373 + }, + { + "epoch": 0.62, + "learning_rate": 7.655640618229092e-05, + "loss": 3.1791, + "step": 4374 + }, + { + "epoch": 0.62, + "learning_rate": 7.652751697241081e-05, + "loss": 3.2443, + "step": 4375 + }, + { + "epoch": 0.62, + "learning_rate": 7.649862776253069e-05, + "loss": 3.1897, + "step": 4376 + }, + { + "epoch": 0.62, + "learning_rate": 7.64697385526506e-05, + "loss": 3.1354, + "step": 4377 + }, + { + "epoch": 0.62, + "learning_rate": 7.644084934277048e-05, + "loss": 3.2865, + "step": 4378 + }, + { + "epoch": 0.62, + "learning_rate": 7.641196013289037e-05, + "loss": 3.1777, + "step": 4379 + }, + { + "epoch": 0.62, + "learning_rate": 7.638307092301025e-05, + "loss": 3.1771, + "step": 4380 + }, + { + "epoch": 0.62, + "learning_rate": 7.635418171313015e-05, + "loss": 3.1502, + "step": 4381 + }, + { + "epoch": 0.62, + "learning_rate": 7.632529250325004e-05, + "loss": 3.0661, + "step": 4382 + }, + { + "epoch": 0.62, + "learning_rate": 7.629640329336993e-05, + "loss": 3.1649, + "step": 4383 + }, + { + "epoch": 0.62, + "learning_rate": 7.626751408348983e-05, + "loss": 3.2292, + "step": 4384 + }, + { + "epoch": 0.62, + "learning_rate": 7.623862487360971e-05, + "loss": 3.0547, + "step": 4385 + }, + { + "epoch": 0.62, + "learning_rate": 7.62097356637296e-05, + "loss": 3.2867, + "step": 4386 + }, + { + "epoch": 0.62, + "learning_rate": 7.618084645384948e-05, + "loss": 3.2806, + "step": 4387 + }, + { + "epoch": 0.62, + "learning_rate": 7.615195724396939e-05, + "loss": 3.1973, + "step": 4388 + }, + { + "epoch": 0.62, + "learning_rate": 7.612306803408927e-05, + "loss": 3.1027, + "step": 4389 + }, + { + "epoch": 0.63, + "learning_rate": 7.609417882420916e-05, + "loss": 3.3486, + "step": 4390 + }, + { + "epoch": 0.63, + "learning_rate": 7.606528961432904e-05, + "loss": 3.2801, + "step": 4391 + }, + { + "epoch": 0.63, + "learning_rate": 7.603640040444894e-05, + "loss": 3.1514, + "step": 4392 + }, + { + "epoch": 0.63, + "learning_rate": 7.600751119456883e-05, + "loss": 3.1717, + "step": 4393 + }, + { + "epoch": 0.63, + "learning_rate": 7.597862198468873e-05, + "loss": 3.2219, + "step": 4394 + }, + { + "epoch": 0.63, + "learning_rate": 7.594973277480862e-05, + "loss": 3.2382, + "step": 4395 + }, + { + "epoch": 0.63, + "learning_rate": 7.59208435649285e-05, + "loss": 3.0522, + "step": 4396 + }, + { + "epoch": 0.63, + "learning_rate": 7.58919543550484e-05, + "loss": 3.1792, + "step": 4397 + }, + { + "epoch": 0.63, + "learning_rate": 7.586306514516829e-05, + "loss": 3.2079, + "step": 4398 + }, + { + "epoch": 0.63, + "learning_rate": 7.583417593528818e-05, + "loss": 3.2015, + "step": 4399 + }, + { + "epoch": 0.63, + "learning_rate": 7.580528672540806e-05, + "loss": 3.1522, + "step": 4400 + }, + { + "epoch": 0.63, + "learning_rate": 7.577639751552796e-05, + "loss": 3.2472, + "step": 4401 + }, + { + "epoch": 0.63, + "learning_rate": 7.574750830564784e-05, + "loss": 3.2378, + "step": 4402 + }, + { + "epoch": 0.63, + "learning_rate": 7.571861909576773e-05, + "loss": 3.2979, + "step": 4403 + }, + { + "epoch": 0.63, + "learning_rate": 7.568972988588762e-05, + "loss": 3.1266, + "step": 4404 + }, + { + "epoch": 0.63, + "learning_rate": 7.566084067600752e-05, + "loss": 2.9579, + "step": 4405 + }, + { + "epoch": 0.63, + "learning_rate": 7.563195146612741e-05, + "loss": 3.2063, + "step": 4406 + }, + { + "epoch": 0.63, + "learning_rate": 7.560306225624729e-05, + "loss": 3.2874, + "step": 4407 + }, + { + "epoch": 0.63, + "learning_rate": 7.557417304636718e-05, + "loss": 3.2408, + "step": 4408 + }, + { + "epoch": 0.63, + "learning_rate": 7.554528383648708e-05, + "loss": 3.2621, + "step": 4409 + }, + { + "epoch": 0.63, + "learning_rate": 7.551639462660697e-05, + "loss": 3.0716, + "step": 4410 + }, + { + "epoch": 0.63, + "learning_rate": 7.548750541672685e-05, + "loss": 3.2889, + "step": 4411 + }, + { + "epoch": 0.63, + "learning_rate": 7.545861620684675e-05, + "loss": 3.2795, + "step": 4412 + }, + { + "epoch": 0.63, + "learning_rate": 7.542972699696663e-05, + "loss": 3.088, + "step": 4413 + }, + { + "epoch": 0.63, + "learning_rate": 7.540083778708653e-05, + "loss": 3.2983, + "step": 4414 + }, + { + "epoch": 0.63, + "learning_rate": 7.537194857720641e-05, + "loss": 3.19, + "step": 4415 + }, + { + "epoch": 0.63, + "learning_rate": 7.534305936732631e-05, + "loss": 3.1916, + "step": 4416 + }, + { + "epoch": 0.63, + "learning_rate": 7.53141701574462e-05, + "loss": 3.298, + "step": 4417 + }, + { + "epoch": 0.63, + "learning_rate": 7.528528094756608e-05, + "loss": 3.1589, + "step": 4418 + }, + { + "epoch": 0.63, + "learning_rate": 7.525639173768598e-05, + "loss": 3.306, + "step": 4419 + }, + { + "epoch": 0.63, + "learning_rate": 7.522750252780587e-05, + "loss": 3.2359, + "step": 4420 + }, + { + "epoch": 0.63, + "learning_rate": 7.519861331792576e-05, + "loss": 3.278, + "step": 4421 + }, + { + "epoch": 0.63, + "learning_rate": 7.516972410804564e-05, + "loss": 3.3765, + "step": 4422 + }, + { + "epoch": 0.63, + "learning_rate": 7.514083489816554e-05, + "loss": 3.2689, + "step": 4423 + }, + { + "epoch": 0.63, + "learning_rate": 7.511194568828543e-05, + "loss": 3.3587, + "step": 4424 + }, + { + "epoch": 0.63, + "learning_rate": 7.508305647840533e-05, + "loss": 3.3364, + "step": 4425 + }, + { + "epoch": 0.63, + "learning_rate": 7.50541672685252e-05, + "loss": 3.2131, + "step": 4426 + }, + { + "epoch": 0.63, + "learning_rate": 7.50252780586451e-05, + "loss": 3.2932, + "step": 4427 + }, + { + "epoch": 0.63, + "learning_rate": 7.499638884876498e-05, + "loss": 3.2665, + "step": 4428 + }, + { + "epoch": 0.63, + "learning_rate": 7.496749963888487e-05, + "loss": 3.381, + "step": 4429 + }, + { + "epoch": 0.63, + "learning_rate": 7.493861042900477e-05, + "loss": 3.2373, + "step": 4430 + }, + { + "epoch": 0.63, + "learning_rate": 7.490972121912466e-05, + "loss": 3.1545, + "step": 4431 + }, + { + "epoch": 0.63, + "learning_rate": 7.488083200924456e-05, + "loss": 3.2665, + "step": 4432 + }, + { + "epoch": 0.63, + "learning_rate": 7.485194279936444e-05, + "loss": 3.3637, + "step": 4433 + }, + { + "epoch": 0.63, + "learning_rate": 7.482305358948433e-05, + "loss": 3.3338, + "step": 4434 + }, + { + "epoch": 0.63, + "learning_rate": 7.479416437960422e-05, + "loss": 3.117, + "step": 4435 + }, + { + "epoch": 0.63, + "learning_rate": 7.476527516972412e-05, + "loss": 2.8719, + "step": 4436 + }, + { + "epoch": 0.63, + "learning_rate": 7.4736385959844e-05, + "loss": 3.2194, + "step": 4437 + }, + { + "epoch": 0.63, + "learning_rate": 7.470749674996389e-05, + "loss": 3.3096, + "step": 4438 + }, + { + "epoch": 0.63, + "learning_rate": 7.467860754008377e-05, + "loss": 3.3919, + "step": 4439 + }, + { + "epoch": 0.63, + "learning_rate": 7.464971833020368e-05, + "loss": 3.2348, + "step": 4440 + }, + { + "epoch": 0.63, + "learning_rate": 7.462082912032356e-05, + "loss": 3.2321, + "step": 4441 + }, + { + "epoch": 0.63, + "learning_rate": 7.459193991044345e-05, + "loss": 3.0917, + "step": 4442 + }, + { + "epoch": 0.63, + "learning_rate": 7.456305070056335e-05, + "loss": 3.1843, + "step": 4443 + }, + { + "epoch": 0.63, + "learning_rate": 7.453416149068323e-05, + "loss": 3.1699, + "step": 4444 + }, + { + "epoch": 0.63, + "learning_rate": 7.450527228080313e-05, + "loss": 3.2327, + "step": 4445 + }, + { + "epoch": 0.63, + "learning_rate": 7.447638307092301e-05, + "loss": 3.2365, + "step": 4446 + }, + { + "epoch": 0.63, + "learning_rate": 7.444749386104291e-05, + "loss": 3.1183, + "step": 4447 + }, + { + "epoch": 0.63, + "learning_rate": 7.441860465116279e-05, + "loss": 3.2592, + "step": 4448 + }, + { + "epoch": 0.63, + "learning_rate": 7.438971544128268e-05, + "loss": 3.095, + "step": 4449 + }, + { + "epoch": 0.63, + "learning_rate": 7.436082623140256e-05, + "loss": 3.2686, + "step": 4450 + }, + { + "epoch": 0.63, + "learning_rate": 7.433193702152247e-05, + "loss": 3.2056, + "step": 4451 + }, + { + "epoch": 0.63, + "learning_rate": 7.430304781164235e-05, + "loss": 3.2602, + "step": 4452 + }, + { + "epoch": 0.63, + "learning_rate": 7.427415860176224e-05, + "loss": 3.1665, + "step": 4453 + }, + { + "epoch": 0.63, + "learning_rate": 7.424526939188214e-05, + "loss": 3.0854, + "step": 4454 + }, + { + "epoch": 0.63, + "learning_rate": 7.421638018200202e-05, + "loss": 3.0639, + "step": 4455 + }, + { + "epoch": 0.63, + "learning_rate": 7.418749097212193e-05, + "loss": 3.1661, + "step": 4456 + }, + { + "epoch": 0.63, + "learning_rate": 7.41586017622418e-05, + "loss": 3.1701, + "step": 4457 + }, + { + "epoch": 0.63, + "learning_rate": 7.41297125523617e-05, + "loss": 3.1404, + "step": 4458 + }, + { + "epoch": 0.63, + "learning_rate": 7.410082334248158e-05, + "loss": 3.1746, + "step": 4459 + }, + { + "epoch": 0.64, + "learning_rate": 7.407193413260147e-05, + "loss": 3.2634, + "step": 4460 + }, + { + "epoch": 0.64, + "learning_rate": 7.404304492272137e-05, + "loss": 3.058, + "step": 4461 + }, + { + "epoch": 0.64, + "learning_rate": 7.401415571284126e-05, + "loss": 3.1933, + "step": 4462 + }, + { + "epoch": 0.64, + "learning_rate": 7.398526650296114e-05, + "loss": 3.2946, + "step": 4463 + }, + { + "epoch": 0.64, + "learning_rate": 7.395637729308104e-05, + "loss": 3.3623, + "step": 4464 + }, + { + "epoch": 0.64, + "learning_rate": 7.392748808320093e-05, + "loss": 3.1468, + "step": 4465 + }, + { + "epoch": 0.64, + "learning_rate": 7.389859887332082e-05, + "loss": 3.2427, + "step": 4466 + }, + { + "epoch": 0.64, + "learning_rate": 7.386970966344072e-05, + "loss": 3.2678, + "step": 4467 + }, + { + "epoch": 0.64, + "learning_rate": 7.38408204535606e-05, + "loss": 3.2054, + "step": 4468 + }, + { + "epoch": 0.64, + "learning_rate": 7.381193124368049e-05, + "loss": 3.1882, + "step": 4469 + }, + { + "epoch": 0.64, + "learning_rate": 7.378304203380037e-05, + "loss": 3.1352, + "step": 4470 + }, + { + "epoch": 0.64, + "learning_rate": 7.375415282392027e-05, + "loss": 3.2657, + "step": 4471 + }, + { + "epoch": 0.64, + "learning_rate": 7.372526361404016e-05, + "loss": 3.2018, + "step": 4472 + }, + { + "epoch": 0.64, + "learning_rate": 7.369637440416005e-05, + "loss": 3.1582, + "step": 4473 + }, + { + "epoch": 0.64, + "learning_rate": 7.366748519427993e-05, + "loss": 3.2181, + "step": 4474 + }, + { + "epoch": 0.64, + "learning_rate": 7.363859598439983e-05, + "loss": 3.1828, + "step": 4475 + }, + { + "epoch": 0.64, + "learning_rate": 7.360970677451972e-05, + "loss": 3.3757, + "step": 4476 + }, + { + "epoch": 0.64, + "learning_rate": 7.358081756463961e-05, + "loss": 3.2835, + "step": 4477 + }, + { + "epoch": 0.64, + "learning_rate": 7.355192835475951e-05, + "loss": 3.2192, + "step": 4478 + }, + { + "epoch": 0.64, + "learning_rate": 7.352303914487939e-05, + "loss": 3.1814, + "step": 4479 + }, + { + "epoch": 0.64, + "learning_rate": 7.349414993499928e-05, + "loss": 3.0554, + "step": 4480 + }, + { + "epoch": 0.64, + "learning_rate": 7.346526072511916e-05, + "loss": 3.3356, + "step": 4481 + }, + { + "epoch": 0.64, + "learning_rate": 7.343637151523907e-05, + "loss": 3.2975, + "step": 4482 + }, + { + "epoch": 0.64, + "learning_rate": 7.340748230535895e-05, + "loss": 3.392, + "step": 4483 + }, + { + "epoch": 0.64, + "learning_rate": 7.337859309547884e-05, + "loss": 3.2069, + "step": 4484 + }, + { + "epoch": 0.64, + "learning_rate": 7.334970388559872e-05, + "loss": 3.2134, + "step": 4485 + }, + { + "epoch": 0.64, + "learning_rate": 7.332081467571862e-05, + "loss": 3.2042, + "step": 4486 + }, + { + "epoch": 0.64, + "learning_rate": 7.329192546583851e-05, + "loss": 3.3986, + "step": 4487 + }, + { + "epoch": 0.64, + "learning_rate": 7.32630362559584e-05, + "loss": 3.3561, + "step": 4488 + }, + { + "epoch": 0.64, + "learning_rate": 7.32341470460783e-05, + "loss": 3.2048, + "step": 4489 + }, + { + "epoch": 0.64, + "learning_rate": 7.320525783619818e-05, + "loss": 3.2647, + "step": 4490 + }, + { + "epoch": 0.64, + "learning_rate": 7.317636862631807e-05, + "loss": 3.2076, + "step": 4491 + }, + { + "epoch": 0.64, + "learning_rate": 7.314747941643795e-05, + "loss": 3.2791, + "step": 4492 + }, + { + "epoch": 0.64, + "learning_rate": 7.311859020655786e-05, + "loss": 3.2383, + "step": 4493 + }, + { + "epoch": 0.64, + "learning_rate": 7.308970099667774e-05, + "loss": 3.27, + "step": 4494 + }, + { + "epoch": 0.64, + "learning_rate": 7.306081178679764e-05, + "loss": 3.1158, + "step": 4495 + }, + { + "epoch": 0.64, + "learning_rate": 7.303192257691752e-05, + "loss": 3.2556, + "step": 4496 + }, + { + "epoch": 0.64, + "learning_rate": 7.300303336703741e-05, + "loss": 3.281, + "step": 4497 + }, + { + "epoch": 0.64, + "learning_rate": 7.29741441571573e-05, + "loss": 3.1698, + "step": 4498 + }, + { + "epoch": 0.64, + "learning_rate": 7.29452549472772e-05, + "loss": 3.2967, + "step": 4499 + }, + { + "epoch": 0.64, + "learning_rate": 7.291636573739709e-05, + "loss": 3.2783, + "step": 4500 + }, + { + "epoch": 0.64, + "learning_rate": 7.288747652751697e-05, + "loss": 3.315, + "step": 4501 + }, + { + "epoch": 0.64, + "learning_rate": 7.285858731763687e-05, + "loss": 3.2573, + "step": 4502 + }, + { + "epoch": 0.64, + "learning_rate": 7.282969810775676e-05, + "loss": 3.1854, + "step": 4503 + }, + { + "epoch": 0.64, + "learning_rate": 7.280080889787665e-05, + "loss": 3.235, + "step": 4504 + }, + { + "epoch": 0.64, + "learning_rate": 7.277191968799653e-05, + "loss": 3.2233, + "step": 4505 + }, + { + "epoch": 0.64, + "learning_rate": 7.274303047811643e-05, + "loss": 3.2246, + "step": 4506 + }, + { + "epoch": 0.64, + "learning_rate": 7.271414126823631e-05, + "loss": 3.2204, + "step": 4507 + }, + { + "epoch": 0.64, + "learning_rate": 7.268525205835622e-05, + "loss": 3.2522, + "step": 4508 + }, + { + "epoch": 0.64, + "learning_rate": 7.26563628484761e-05, + "loss": 3.0833, + "step": 4509 + }, + { + "epoch": 0.64, + "learning_rate": 7.262747363859599e-05, + "loss": 3.07, + "step": 4510 + }, + { + "epoch": 0.64, + "learning_rate": 7.259858442871588e-05, + "loss": 3.1149, + "step": 4511 + }, + { + "epoch": 0.64, + "learning_rate": 7.256969521883576e-05, + "loss": 3.2049, + "step": 4512 + }, + { + "epoch": 0.64, + "learning_rate": 7.254080600895567e-05, + "loss": 3.3168, + "step": 4513 + }, + { + "epoch": 0.64, + "learning_rate": 7.251191679907555e-05, + "loss": 3.2935, + "step": 4514 + }, + { + "epoch": 0.64, + "learning_rate": 7.248302758919544e-05, + "loss": 3.1564, + "step": 4515 + }, + { + "epoch": 0.64, + "learning_rate": 7.245413837931533e-05, + "loss": 3.2737, + "step": 4516 + }, + { + "epoch": 0.64, + "learning_rate": 7.242524916943522e-05, + "loss": 3.1057, + "step": 4517 + }, + { + "epoch": 0.64, + "learning_rate": 7.23963599595551e-05, + "loss": 3.2497, + "step": 4518 + }, + { + "epoch": 0.64, + "learning_rate": 7.2367470749675e-05, + "loss": 3.2421, + "step": 4519 + }, + { + "epoch": 0.64, + "learning_rate": 7.233858153979489e-05, + "loss": 3.2509, + "step": 4520 + }, + { + "epoch": 0.64, + "learning_rate": 7.230969232991478e-05, + "loss": 3.2018, + "step": 4521 + }, + { + "epoch": 0.64, + "learning_rate": 7.228080312003467e-05, + "loss": 3.1461, + "step": 4522 + }, + { + "epoch": 0.64, + "learning_rate": 7.225191391015455e-05, + "loss": 3.1516, + "step": 4523 + }, + { + "epoch": 0.64, + "learning_rate": 7.222302470027445e-05, + "loss": 3.2691, + "step": 4524 + }, + { + "epoch": 0.64, + "learning_rate": 7.219413549039434e-05, + "loss": 3.4174, + "step": 4525 + }, + { + "epoch": 0.64, + "learning_rate": 7.216524628051424e-05, + "loss": 3.2661, + "step": 4526 + }, + { + "epoch": 0.64, + "learning_rate": 7.213635707063412e-05, + "loss": 3.1742, + "step": 4527 + }, + { + "epoch": 0.64, + "learning_rate": 7.210746786075401e-05, + "loss": 3.2276, + "step": 4528 + }, + { + "epoch": 0.64, + "learning_rate": 7.20785786508739e-05, + "loss": 3.2369, + "step": 4529 + }, + { + "epoch": 0.64, + "learning_rate": 7.20496894409938e-05, + "loss": 3.2119, + "step": 4530 + }, + { + "epoch": 0.65, + "learning_rate": 7.202080023111368e-05, + "loss": 3.2396, + "step": 4531 + }, + { + "epoch": 0.65, + "learning_rate": 7.199191102123357e-05, + "loss": 3.1502, + "step": 4532 + }, + { + "epoch": 0.65, + "learning_rate": 7.196302181135347e-05, + "loss": 3.1763, + "step": 4533 + }, + { + "epoch": 0.65, + "learning_rate": 7.193413260147336e-05, + "loss": 3.1303, + "step": 4534 + }, + { + "epoch": 0.65, + "learning_rate": 7.190524339159324e-05, + "loss": 3.0942, + "step": 4535 + }, + { + "epoch": 0.65, + "learning_rate": 7.187635418171313e-05, + "loss": 3.1065, + "step": 4536 + }, + { + "epoch": 0.65, + "learning_rate": 7.184746497183303e-05, + "loss": 3.2077, + "step": 4537 + }, + { + "epoch": 0.65, + "learning_rate": 7.181857576195291e-05, + "loss": 3.2625, + "step": 4538 + }, + { + "epoch": 0.65, + "learning_rate": 7.17896865520728e-05, + "loss": 3.237, + "step": 4539 + }, + { + "epoch": 0.65, + "learning_rate": 7.17607973421927e-05, + "loss": 3.3133, + "step": 4540 + }, + { + "epoch": 0.65, + "learning_rate": 7.173190813231259e-05, + "loss": 3.2684, + "step": 4541 + }, + { + "epoch": 0.65, + "learning_rate": 7.170301892243247e-05, + "loss": 3.2643, + "step": 4542 + }, + { + "epoch": 0.65, + "learning_rate": 7.167412971255236e-05, + "loss": 3.2464, + "step": 4543 + }, + { + "epoch": 0.65, + "learning_rate": 7.164524050267226e-05, + "loss": 3.2603, + "step": 4544 + }, + { + "epoch": 0.65, + "learning_rate": 7.161635129279215e-05, + "loss": 3.1629, + "step": 4545 + }, + { + "epoch": 0.65, + "learning_rate": 7.158746208291203e-05, + "loss": 3.2386, + "step": 4546 + }, + { + "epoch": 0.65, + "learning_rate": 7.155857287303193e-05, + "loss": 3.322, + "step": 4547 + }, + { + "epoch": 0.65, + "learning_rate": 7.152968366315182e-05, + "loss": 2.9695, + "step": 4548 + }, + { + "epoch": 0.65, + "learning_rate": 7.15007944532717e-05, + "loss": 3.1846, + "step": 4549 + }, + { + "epoch": 0.65, + "learning_rate": 7.14719052433916e-05, + "loss": 3.2918, + "step": 4550 + }, + { + "epoch": 0.65, + "learning_rate": 7.144301603351149e-05, + "loss": 3.2516, + "step": 4551 + }, + { + "epoch": 0.65, + "learning_rate": 7.141412682363138e-05, + "loss": 3.2238, + "step": 4552 + }, + { + "epoch": 0.65, + "learning_rate": 7.138523761375126e-05, + "loss": 3.2354, + "step": 4553 + }, + { + "epoch": 0.65, + "learning_rate": 7.135634840387115e-05, + "loss": 3.1473, + "step": 4554 + }, + { + "epoch": 0.65, + "learning_rate": 7.132745919399105e-05, + "loss": 3.3592, + "step": 4555 + }, + { + "epoch": 0.65, + "learning_rate": 7.129856998411094e-05, + "loss": 3.2755, + "step": 4556 + }, + { + "epoch": 0.65, + "learning_rate": 7.126968077423082e-05, + "loss": 3.1127, + "step": 4557 + }, + { + "epoch": 0.65, + "learning_rate": 7.124079156435072e-05, + "loss": 3.1792, + "step": 4558 + }, + { + "epoch": 0.65, + "learning_rate": 7.121190235447061e-05, + "loss": 3.3286, + "step": 4559 + }, + { + "epoch": 0.65, + "learning_rate": 7.118301314459049e-05, + "loss": 3.2957, + "step": 4560 + }, + { + "epoch": 0.65, + "learning_rate": 7.11541239347104e-05, + "loss": 3.025, + "step": 4561 + }, + { + "epoch": 0.65, + "learning_rate": 7.112523472483028e-05, + "loss": 3.189, + "step": 4562 + }, + { + "epoch": 0.65, + "learning_rate": 7.109634551495017e-05, + "loss": 3.1633, + "step": 4563 + }, + { + "epoch": 0.65, + "learning_rate": 7.106745630507005e-05, + "loss": 3.0469, + "step": 4564 + }, + { + "epoch": 0.65, + "learning_rate": 7.103856709518995e-05, + "loss": 3.3885, + "step": 4565 + }, + { + "epoch": 0.65, + "learning_rate": 7.100967788530984e-05, + "loss": 3.293, + "step": 4566 + }, + { + "epoch": 0.65, + "learning_rate": 7.098078867542973e-05, + "loss": 3.1842, + "step": 4567 + }, + { + "epoch": 0.65, + "learning_rate": 7.095189946554961e-05, + "loss": 3.2436, + "step": 4568 + }, + { + "epoch": 0.65, + "learning_rate": 7.092301025566951e-05, + "loss": 3.2228, + "step": 4569 + }, + { + "epoch": 0.65, + "learning_rate": 7.08941210457894e-05, + "loss": 3.3918, + "step": 4570 + }, + { + "epoch": 0.65, + "learning_rate": 7.08652318359093e-05, + "loss": 3.0825, + "step": 4571 + }, + { + "epoch": 0.65, + "learning_rate": 7.083634262602919e-05, + "loss": 3.2249, + "step": 4572 + }, + { + "epoch": 0.65, + "learning_rate": 7.080745341614907e-05, + "loss": 3.0199, + "step": 4573 + }, + { + "epoch": 0.65, + "learning_rate": 7.077856420626896e-05, + "loss": 3.2107, + "step": 4574 + }, + { + "epoch": 0.65, + "learning_rate": 7.074967499638884e-05, + "loss": 3.1974, + "step": 4575 + }, + { + "epoch": 0.65, + "learning_rate": 7.072078578650875e-05, + "loss": 3.2753, + "step": 4576 + }, + { + "epoch": 0.65, + "eval_loss": 3.4311070442199707, + "eval_runtime": 471.4356, + "eval_samples_per_second": 43.457, + "eval_steps_per_second": 14.486, + "step": 4576 + }, + { + "epoch": 0.65, + "learning_rate": 7.069189657662863e-05, + "loss": 3.0329, + "step": 4577 + }, + { + "epoch": 0.65, + "learning_rate": 7.066300736674853e-05, + "loss": 3.3126, + "step": 4578 + }, + { + "epoch": 0.65, + "learning_rate": 7.06341181568684e-05, + "loss": 3.1901, + "step": 4579 + }, + { + "epoch": 0.65, + "learning_rate": 7.06052289469883e-05, + "loss": 3.163, + "step": 4580 + }, + { + "epoch": 0.65, + "learning_rate": 7.05763397371082e-05, + "loss": 3.1173, + "step": 4581 + }, + { + "epoch": 0.65, + "learning_rate": 7.054745052722809e-05, + "loss": 3.244, + "step": 4582 + }, + { + "epoch": 0.65, + "learning_rate": 7.051856131734798e-05, + "loss": 3.2602, + "step": 4583 + }, + { + "epoch": 0.65, + "learning_rate": 7.048967210746786e-05, + "loss": 2.991, + "step": 4584 + }, + { + "epoch": 0.65, + "learning_rate": 7.046078289758776e-05, + "loss": 3.2214, + "step": 4585 + }, + { + "epoch": 0.65, + "learning_rate": 7.043189368770764e-05, + "loss": 3.2427, + "step": 4586 + }, + { + "epoch": 0.65, + "learning_rate": 7.040300447782754e-05, + "loss": 3.2009, + "step": 4587 + }, + { + "epoch": 0.65, + "learning_rate": 7.037411526794742e-05, + "loss": 3.191, + "step": 4588 + }, + { + "epoch": 0.65, + "learning_rate": 7.034522605806732e-05, + "loss": 3.2651, + "step": 4589 + }, + { + "epoch": 0.65, + "learning_rate": 7.03163368481872e-05, + "loss": 3.3027, + "step": 4590 + }, + { + "epoch": 0.65, + "learning_rate": 7.028744763830709e-05, + "loss": 3.2212, + "step": 4591 + }, + { + "epoch": 0.65, + "learning_rate": 7.025855842842698e-05, + "loss": 3.2133, + "step": 4592 + }, + { + "epoch": 0.65, + "learning_rate": 7.022966921854688e-05, + "loss": 3.1425, + "step": 4593 + }, + { + "epoch": 0.65, + "learning_rate": 7.020078000866677e-05, + "loss": 3.2199, + "step": 4594 + }, + { + "epoch": 0.65, + "learning_rate": 7.017189079878665e-05, + "loss": 3.2051, + "step": 4595 + }, + { + "epoch": 0.65, + "learning_rate": 7.014300158890655e-05, + "loss": 3.1071, + "step": 4596 + }, + { + "epoch": 0.65, + "learning_rate": 7.011411237902644e-05, + "loss": 3.2108, + "step": 4597 + }, + { + "epoch": 0.65, + "learning_rate": 7.008522316914633e-05, + "loss": 3.1735, + "step": 4598 + }, + { + "epoch": 0.65, + "learning_rate": 7.005633395926621e-05, + "loss": 3.1795, + "step": 4599 + }, + { + "epoch": 0.65, + "learning_rate": 7.002744474938611e-05, + "loss": 3.1864, + "step": 4600 + }, + { + "epoch": 0.66, + "learning_rate": 6.999855553950599e-05, + "loss": 3.3092, + "step": 4601 + }, + { + "epoch": 0.66, + "learning_rate": 6.99696663296259e-05, + "loss": 3.1383, + "step": 4602 + }, + { + "epoch": 0.66, + "learning_rate": 6.994077711974578e-05, + "loss": 3.1475, + "step": 4603 + }, + { + "epoch": 0.66, + "learning_rate": 6.991188790986567e-05, + "loss": 3.2344, + "step": 4604 + }, + { + "epoch": 0.66, + "learning_rate": 6.988299869998556e-05, + "loss": 3.1652, + "step": 4605 + }, + { + "epoch": 0.66, + "learning_rate": 6.985410949010544e-05, + "loss": 3.2111, + "step": 4606 + }, + { + "epoch": 0.66, + "learning_rate": 6.982522028022534e-05, + "loss": 3.224, + "step": 4607 + }, + { + "epoch": 0.66, + "learning_rate": 6.979633107034523e-05, + "loss": 3.1232, + "step": 4608 + }, + { + "epoch": 0.66, + "learning_rate": 6.976744186046513e-05, + "loss": 3.2263, + "step": 4609 + }, + { + "epoch": 0.66, + "learning_rate": 6.9738552650585e-05, + "loss": 3.1037, + "step": 4610 + }, + { + "epoch": 0.66, + "learning_rate": 6.97096634407049e-05, + "loss": 3.2389, + "step": 4611 + }, + { + "epoch": 0.66, + "learning_rate": 6.968077423082478e-05, + "loss": 3.185, + "step": 4612 + }, + { + "epoch": 0.66, + "learning_rate": 6.965188502094469e-05, + "loss": 3.2433, + "step": 4613 + }, + { + "epoch": 0.66, + "learning_rate": 6.962299581106457e-05, + "loss": 3.2258, + "step": 4614 + }, + { + "epoch": 0.66, + "learning_rate": 6.959410660118446e-05, + "loss": 3.1148, + "step": 4615 + }, + { + "epoch": 0.66, + "learning_rate": 6.956521739130436e-05, + "loss": 3.2667, + "step": 4616 + }, + { + "epoch": 0.66, + "learning_rate": 6.953632818142424e-05, + "loss": 3.1186, + "step": 4617 + }, + { + "epoch": 0.66, + "learning_rate": 6.950743897154414e-05, + "loss": 3.085, + "step": 4618 + }, + { + "epoch": 0.66, + "learning_rate": 6.947854976166402e-05, + "loss": 3.1934, + "step": 4619 + }, + { + "epoch": 0.66, + "learning_rate": 6.944966055178392e-05, + "loss": 3.0886, + "step": 4620 + }, + { + "epoch": 0.66, + "learning_rate": 6.94207713419038e-05, + "loss": 3.1788, + "step": 4621 + }, + { + "epoch": 0.66, + "learning_rate": 6.939188213202369e-05, + "loss": 3.2761, + "step": 4622 + }, + { + "epoch": 0.66, + "learning_rate": 6.936299292214358e-05, + "loss": 3.237, + "step": 4623 + }, + { + "epoch": 0.66, + "learning_rate": 6.933410371226348e-05, + "loss": 3.1859, + "step": 4624 + }, + { + "epoch": 0.66, + "learning_rate": 6.930521450238336e-05, + "loss": 3.1281, + "step": 4625 + }, + { + "epoch": 0.66, + "learning_rate": 6.927632529250325e-05, + "loss": 3.1964, + "step": 4626 + }, + { + "epoch": 0.66, + "learning_rate": 6.924743608262315e-05, + "loss": 3.3033, + "step": 4627 + }, + { + "epoch": 0.66, + "learning_rate": 6.921854687274303e-05, + "loss": 3.1437, + "step": 4628 + }, + { + "epoch": 0.66, + "learning_rate": 6.918965766286293e-05, + "loss": 3.1905, + "step": 4629 + }, + { + "epoch": 0.66, + "learning_rate": 6.916076845298281e-05, + "loss": 3.2168, + "step": 4630 + }, + { + "epoch": 0.66, + "learning_rate": 6.913187924310271e-05, + "loss": 3.0697, + "step": 4631 + }, + { + "epoch": 0.66, + "learning_rate": 6.910299003322259e-05, + "loss": 3.2662, + "step": 4632 + }, + { + "epoch": 0.66, + "learning_rate": 6.907410082334248e-05, + "loss": 3.0232, + "step": 4633 + }, + { + "epoch": 0.66, + "learning_rate": 6.904521161346238e-05, + "loss": 3.0126, + "step": 4634 + }, + { + "epoch": 0.66, + "learning_rate": 6.901632240358227e-05, + "loss": 3.2893, + "step": 4635 + }, + { + "epoch": 0.66, + "learning_rate": 6.898743319370215e-05, + "loss": 3.0733, + "step": 4636 + }, + { + "epoch": 0.66, + "learning_rate": 6.895854398382204e-05, + "loss": 3.175, + "step": 4637 + }, + { + "epoch": 0.66, + "learning_rate": 6.892965477394194e-05, + "loss": 3.0857, + "step": 4638 + }, + { + "epoch": 0.66, + "learning_rate": 6.890076556406183e-05, + "loss": 3.2243, + "step": 4639 + }, + { + "epoch": 0.66, + "learning_rate": 6.887187635418173e-05, + "loss": 3.2138, + "step": 4640 + }, + { + "epoch": 0.66, + "learning_rate": 6.88429871443016e-05, + "loss": 3.2269, + "step": 4641 + }, + { + "epoch": 0.66, + "learning_rate": 6.88140979344215e-05, + "loss": 3.1455, + "step": 4642 + }, + { + "epoch": 0.66, + "learning_rate": 6.878520872454138e-05, + "loss": 3.24, + "step": 4643 + }, + { + "epoch": 0.66, + "learning_rate": 6.875631951466129e-05, + "loss": 3.0995, + "step": 4644 + }, + { + "epoch": 0.66, + "learning_rate": 6.872743030478117e-05, + "loss": 3.1545, + "step": 4645 + }, + { + "epoch": 0.66, + "learning_rate": 6.869854109490106e-05, + "loss": 3.2984, + "step": 4646 + }, + { + "epoch": 0.66, + "learning_rate": 6.866965188502094e-05, + "loss": 3.1757, + "step": 4647 + }, + { + "epoch": 0.66, + "learning_rate": 6.864076267514084e-05, + "loss": 3.2271, + "step": 4648 + }, + { + "epoch": 0.66, + "learning_rate": 6.861187346526073e-05, + "loss": 3.2733, + "step": 4649 + }, + { + "epoch": 0.66, + "learning_rate": 6.858298425538062e-05, + "loss": 3.1991, + "step": 4650 + }, + { + "epoch": 0.66, + "learning_rate": 6.855409504550052e-05, + "loss": 3.2941, + "step": 4651 + }, + { + "epoch": 0.66, + "learning_rate": 6.85252058356204e-05, + "loss": 3.1293, + "step": 4652 + }, + { + "epoch": 0.66, + "learning_rate": 6.849631662574029e-05, + "loss": 3.1675, + "step": 4653 + }, + { + "epoch": 0.66, + "learning_rate": 6.846742741586017e-05, + "loss": 3.1558, + "step": 4654 + }, + { + "epoch": 0.66, + "learning_rate": 6.843853820598008e-05, + "loss": 3.1124, + "step": 4655 + }, + { + "epoch": 0.66, + "learning_rate": 6.840964899609996e-05, + "loss": 2.9734, + "step": 4656 + }, + { + "epoch": 0.66, + "learning_rate": 6.838075978621985e-05, + "loss": 3.2422, + "step": 4657 + }, + { + "epoch": 0.66, + "learning_rate": 6.835187057633973e-05, + "loss": 3.0933, + "step": 4658 + }, + { + "epoch": 0.66, + "learning_rate": 6.832298136645963e-05, + "loss": 3.1473, + "step": 4659 + }, + { + "epoch": 0.66, + "learning_rate": 6.829409215657952e-05, + "loss": 3.221, + "step": 4660 + }, + { + "epoch": 0.66, + "learning_rate": 6.826520294669941e-05, + "loss": 3.2445, + "step": 4661 + }, + { + "epoch": 0.66, + "learning_rate": 6.82363137368193e-05, + "loss": 3.1867, + "step": 4662 + }, + { + "epoch": 0.66, + "learning_rate": 6.820742452693919e-05, + "loss": 3.2059, + "step": 4663 + }, + { + "epoch": 0.66, + "learning_rate": 6.817853531705908e-05, + "loss": 3.1624, + "step": 4664 + }, + { + "epoch": 0.66, + "learning_rate": 6.814964610717898e-05, + "loss": 3.0782, + "step": 4665 + }, + { + "epoch": 0.66, + "learning_rate": 6.812075689729887e-05, + "loss": 3.2645, + "step": 4666 + }, + { + "epoch": 0.66, + "learning_rate": 6.809186768741875e-05, + "loss": 3.1695, + "step": 4667 + }, + { + "epoch": 0.66, + "learning_rate": 6.806297847753864e-05, + "loss": 3.2342, + "step": 4668 + }, + { + "epoch": 0.66, + "learning_rate": 6.803408926765852e-05, + "loss": 3.159, + "step": 4669 + }, + { + "epoch": 0.66, + "learning_rate": 6.800520005777842e-05, + "loss": 3.1207, + "step": 4670 + }, + { + "epoch": 0.67, + "learning_rate": 6.797631084789831e-05, + "loss": 3.1898, + "step": 4671 + }, + { + "epoch": 0.67, + "learning_rate": 6.79474216380182e-05, + "loss": 3.2039, + "step": 4672 + }, + { + "epoch": 0.67, + "learning_rate": 6.791853242813809e-05, + "loss": 3.0811, + "step": 4673 + }, + { + "epoch": 0.67, + "learning_rate": 6.788964321825798e-05, + "loss": 3.2256, + "step": 4674 + }, + { + "epoch": 0.67, + "learning_rate": 6.786075400837787e-05, + "loss": 3.1116, + "step": 4675 + }, + { + "epoch": 0.67, + "learning_rate": 6.783186479849777e-05, + "loss": 3.0967, + "step": 4676 + }, + { + "epoch": 0.67, + "learning_rate": 6.780297558861766e-05, + "loss": 3.2738, + "step": 4677 + }, + { + "epoch": 0.67, + "learning_rate": 6.777408637873754e-05, + "loss": 3.2311, + "step": 4678 + }, + { + "epoch": 0.67, + "learning_rate": 6.774519716885744e-05, + "loss": 3.3502, + "step": 4679 + }, + { + "epoch": 0.67, + "learning_rate": 6.771630795897732e-05, + "loss": 3.2233, + "step": 4680 + }, + { + "epoch": 0.67, + "learning_rate": 6.768741874909722e-05, + "loss": 3.283, + "step": 4681 + }, + { + "epoch": 0.67, + "learning_rate": 6.76585295392171e-05, + "loss": 3.2997, + "step": 4682 + }, + { + "epoch": 0.67, + "learning_rate": 6.7629640329337e-05, + "loss": 3.1227, + "step": 4683 + }, + { + "epoch": 0.67, + "learning_rate": 6.760075111945688e-05, + "loss": 3.256, + "step": 4684 + }, + { + "epoch": 0.67, + "learning_rate": 6.757186190957677e-05, + "loss": 3.1657, + "step": 4685 + }, + { + "epoch": 0.67, + "learning_rate": 6.754297269969667e-05, + "loss": 3.2956, + "step": 4686 + }, + { + "epoch": 0.67, + "learning_rate": 6.751408348981656e-05, + "loss": 3.1841, + "step": 4687 + }, + { + "epoch": 0.67, + "learning_rate": 6.748519427993645e-05, + "loss": 3.2518, + "step": 4688 + }, + { + "epoch": 0.67, + "learning_rate": 6.745630507005633e-05, + "loss": 3.0815, + "step": 4689 + }, + { + "epoch": 0.67, + "learning_rate": 6.742741586017623e-05, + "loss": 3.1521, + "step": 4690 + }, + { + "epoch": 0.67, + "learning_rate": 6.739852665029611e-05, + "loss": 3.2677, + "step": 4691 + }, + { + "epoch": 0.67, + "learning_rate": 6.736963744041601e-05, + "loss": 3.1445, + "step": 4692 + }, + { + "epoch": 0.67, + "learning_rate": 6.73407482305359e-05, + "loss": 3.217, + "step": 4693 + }, + { + "epoch": 0.67, + "learning_rate": 6.731185902065579e-05, + "loss": 3.1792, + "step": 4694 + }, + { + "epoch": 0.67, + "learning_rate": 6.728296981077567e-05, + "loss": 2.9674, + "step": 4695 + }, + { + "epoch": 0.67, + "learning_rate": 6.725408060089556e-05, + "loss": 3.1301, + "step": 4696 + }, + { + "epoch": 0.67, + "learning_rate": 6.722519139101546e-05, + "loss": 3.2711, + "step": 4697 + }, + { + "epoch": 0.67, + "learning_rate": 6.719630218113535e-05, + "loss": 3.3011, + "step": 4698 + }, + { + "epoch": 0.67, + "learning_rate": 6.716741297125524e-05, + "loss": 3.1289, + "step": 4699 + }, + { + "epoch": 0.67, + "learning_rate": 6.713852376137512e-05, + "loss": 3.322, + "step": 4700 + }, + { + "epoch": 0.67, + "learning_rate": 6.710963455149502e-05, + "loss": 3.2275, + "step": 4701 + }, + { + "epoch": 0.67, + "learning_rate": 6.708074534161491e-05, + "loss": 3.2683, + "step": 4702 + }, + { + "epoch": 0.67, + "learning_rate": 6.70518561317348e-05, + "loss": 3.3123, + "step": 4703 + }, + { + "epoch": 0.67, + "learning_rate": 6.702296692185469e-05, + "loss": 3.1394, + "step": 4704 + }, + { + "epoch": 0.67, + "learning_rate": 6.699407771197458e-05, + "loss": 3.1832, + "step": 4705 + }, + { + "epoch": 0.67, + "learning_rate": 6.696518850209446e-05, + "loss": 3.2946, + "step": 4706 + }, + { + "epoch": 0.67, + "learning_rate": 6.693629929221437e-05, + "loss": 3.1702, + "step": 4707 + }, + { + "epoch": 0.67, + "learning_rate": 6.690741008233425e-05, + "loss": 3.2187, + "step": 4708 + }, + { + "epoch": 0.67, + "learning_rate": 6.687852087245414e-05, + "loss": 3.2411, + "step": 4709 + }, + { + "epoch": 0.67, + "learning_rate": 6.684963166257404e-05, + "loss": 3.2604, + "step": 4710 + }, + { + "epoch": 0.67, + "learning_rate": 6.682074245269392e-05, + "loss": 3.261, + "step": 4711 + }, + { + "epoch": 0.67, + "learning_rate": 6.679185324281382e-05, + "loss": 3.3329, + "step": 4712 + }, + { + "epoch": 0.67, + "learning_rate": 6.67629640329337e-05, + "loss": 3.1801, + "step": 4713 + }, + { + "epoch": 0.67, + "learning_rate": 6.67340748230536e-05, + "loss": 3.0755, + "step": 4714 + }, + { + "epoch": 0.67, + "learning_rate": 6.670518561317348e-05, + "loss": 3.2324, + "step": 4715 + }, + { + "epoch": 0.67, + "learning_rate": 6.667629640329337e-05, + "loss": 3.2004, + "step": 4716 + }, + { + "epoch": 0.67, + "learning_rate": 6.664740719341325e-05, + "loss": 3.0358, + "step": 4717 + }, + { + "epoch": 0.67, + "learning_rate": 6.661851798353316e-05, + "loss": 3.093, + "step": 4718 + }, + { + "epoch": 0.67, + "learning_rate": 6.658962877365304e-05, + "loss": 3.2078, + "step": 4719 + }, + { + "epoch": 0.67, + "learning_rate": 6.656073956377293e-05, + "loss": 3.3535, + "step": 4720 + }, + { + "epoch": 0.67, + "learning_rate": 6.653185035389283e-05, + "loss": 3.2595, + "step": 4721 + }, + { + "epoch": 0.67, + "learning_rate": 6.650296114401271e-05, + "loss": 3.1418, + "step": 4722 + }, + { + "epoch": 0.67, + "learning_rate": 6.647407193413262e-05, + "loss": 2.9937, + "step": 4723 + }, + { + "epoch": 0.67, + "learning_rate": 6.64451827242525e-05, + "loss": 3.2423, + "step": 4724 + }, + { + "epoch": 0.67, + "learning_rate": 6.641629351437239e-05, + "loss": 3.3218, + "step": 4725 + }, + { + "epoch": 0.67, + "learning_rate": 6.638740430449227e-05, + "loss": 3.1807, + "step": 4726 + }, + { + "epoch": 0.67, + "learning_rate": 6.635851509461216e-05, + "loss": 3.1824, + "step": 4727 + }, + { + "epoch": 0.67, + "learning_rate": 6.632962588473206e-05, + "loss": 3.1208, + "step": 4728 + }, + { + "epoch": 0.67, + "learning_rate": 6.630073667485195e-05, + "loss": 3.2159, + "step": 4729 + }, + { + "epoch": 0.67, + "learning_rate": 6.627184746497183e-05, + "loss": 3.1804, + "step": 4730 + }, + { + "epoch": 0.67, + "learning_rate": 6.624295825509173e-05, + "loss": 3.1377, + "step": 4731 + }, + { + "epoch": 0.67, + "learning_rate": 6.621406904521162e-05, + "loss": 3.2806, + "step": 4732 + }, + { + "epoch": 0.67, + "learning_rate": 6.618517983533151e-05, + "loss": 3.2248, + "step": 4733 + }, + { + "epoch": 0.67, + "learning_rate": 6.61562906254514e-05, + "loss": 3.1957, + "step": 4734 + }, + { + "epoch": 0.67, + "learning_rate": 6.612740141557129e-05, + "loss": 3.1038, + "step": 4735 + }, + { + "epoch": 0.67, + "learning_rate": 6.609851220569118e-05, + "loss": 3.2546, + "step": 4736 + }, + { + "epoch": 0.67, + "learning_rate": 6.606962299581106e-05, + "loss": 3.1971, + "step": 4737 + }, + { + "epoch": 0.67, + "learning_rate": 6.604073378593095e-05, + "loss": 3.2071, + "step": 4738 + }, + { + "epoch": 0.67, + "learning_rate": 6.601184457605085e-05, + "loss": 3.3514, + "step": 4739 + }, + { + "epoch": 0.67, + "learning_rate": 6.598295536617074e-05, + "loss": 3.2246, + "step": 4740 + }, + { + "epoch": 0.68, + "learning_rate": 6.595406615629062e-05, + "loss": 3.0967, + "step": 4741 + }, + { + "epoch": 0.68, + "learning_rate": 6.592517694641052e-05, + "loss": 3.1957, + "step": 4742 + }, + { + "epoch": 0.68, + "learning_rate": 6.589628773653041e-05, + "loss": 3.242, + "step": 4743 + }, + { + "epoch": 0.68, + "learning_rate": 6.58673985266503e-05, + "loss": 3.1726, + "step": 4744 + }, + { + "epoch": 0.68, + "learning_rate": 6.58385093167702e-05, + "loss": 3.1347, + "step": 4745 + }, + { + "epoch": 0.68, + "learning_rate": 6.580962010689008e-05, + "loss": 3.0802, + "step": 4746 + }, + { + "epoch": 0.68, + "learning_rate": 6.578073089700997e-05, + "loss": 3.2203, + "step": 4747 + }, + { + "epoch": 0.68, + "learning_rate": 6.575184168712985e-05, + "loss": 3.27, + "step": 4748 + }, + { + "epoch": 0.68, + "learning_rate": 6.572295247724976e-05, + "loss": 3.3669, + "step": 4749 + }, + { + "epoch": 0.68, + "learning_rate": 6.569406326736964e-05, + "loss": 3.0405, + "step": 4750 + }, + { + "epoch": 0.68, + "learning_rate": 6.566517405748953e-05, + "loss": 3.1361, + "step": 4751 + }, + { + "epoch": 0.68, + "learning_rate": 6.563628484760941e-05, + "loss": 3.2701, + "step": 4752 + }, + { + "epoch": 0.68, + "learning_rate": 6.560739563772931e-05, + "loss": 3.2333, + "step": 4753 + }, + { + "epoch": 0.68, + "learning_rate": 6.55785064278492e-05, + "loss": 3.1564, + "step": 4754 + }, + { + "epoch": 0.68, + "learning_rate": 6.55496172179691e-05, + "loss": 3.0823, + "step": 4755 + }, + { + "epoch": 0.68, + "learning_rate": 6.552072800808899e-05, + "loss": 3.1326, + "step": 4756 + }, + { + "epoch": 0.68, + "learning_rate": 6.549183879820887e-05, + "loss": 3.0293, + "step": 4757 + }, + { + "epoch": 0.68, + "learning_rate": 6.546294958832876e-05, + "loss": 3.1572, + "step": 4758 + }, + { + "epoch": 0.68, + "learning_rate": 6.543406037844864e-05, + "loss": 3.1817, + "step": 4759 + }, + { + "epoch": 0.68, + "learning_rate": 6.540517116856855e-05, + "loss": 3.2333, + "step": 4760 + }, + { + "epoch": 0.68, + "learning_rate": 6.537628195868843e-05, + "loss": 3.2094, + "step": 4761 + }, + { + "epoch": 0.68, + "learning_rate": 6.534739274880833e-05, + "loss": 3.1592, + "step": 4762 + }, + { + "epoch": 0.68, + "learning_rate": 6.53185035389282e-05, + "loss": 3.084, + "step": 4763 + }, + { + "epoch": 0.68, + "learning_rate": 6.52896143290481e-05, + "loss": 3.2225, + "step": 4764 + }, + { + "epoch": 0.68, + "learning_rate": 6.526072511916799e-05, + "loss": 3.0779, + "step": 4765 + }, + { + "epoch": 0.68, + "learning_rate": 6.523183590928789e-05, + "loss": 3.1048, + "step": 4766 + }, + { + "epoch": 0.68, + "learning_rate": 6.520294669940778e-05, + "loss": 3.2311, + "step": 4767 + }, + { + "epoch": 0.68, + "learning_rate": 6.517405748952766e-05, + "loss": 3.187, + "step": 4768 + }, + { + "epoch": 0.68, + "learning_rate": 6.514516827964755e-05, + "loss": 3.1343, + "step": 4769 + }, + { + "epoch": 0.68, + "learning_rate": 6.511627906976745e-05, + "loss": 3.1523, + "step": 4770 + }, + { + "epoch": 0.68, + "learning_rate": 6.508738985988734e-05, + "loss": 3.0041, + "step": 4771 + }, + { + "epoch": 0.68, + "learning_rate": 6.505850065000722e-05, + "loss": 3.1949, + "step": 4772 + }, + { + "epoch": 0.68, + "learning_rate": 6.502961144012712e-05, + "loss": 3.1414, + "step": 4773 + }, + { + "epoch": 0.68, + "learning_rate": 6.5000722230247e-05, + "loss": 3.0958, + "step": 4774 + }, + { + "epoch": 0.68, + "learning_rate": 6.49718330203669e-05, + "loss": 3.2459, + "step": 4775 + }, + { + "epoch": 0.68, + "learning_rate": 6.494294381048678e-05, + "loss": 3.1617, + "step": 4776 + }, + { + "epoch": 0.68, + "learning_rate": 6.491405460060668e-05, + "loss": 3.1454, + "step": 4777 + }, + { + "epoch": 0.68, + "learning_rate": 6.488516539072657e-05, + "loss": 3.1938, + "step": 4778 + }, + { + "epoch": 0.68, + "learning_rate": 6.485627618084645e-05, + "loss": 3.3808, + "step": 4779 + }, + { + "epoch": 0.68, + "learning_rate": 6.482738697096635e-05, + "loss": 3.0696, + "step": 4780 + }, + { + "epoch": 0.68, + "learning_rate": 6.479849776108624e-05, + "loss": 3.2487, + "step": 4781 + }, + { + "epoch": 0.68, + "learning_rate": 6.476960855120613e-05, + "loss": 3.3306, + "step": 4782 + }, + { + "epoch": 0.68, + "learning_rate": 6.474071934132601e-05, + "loss": 3.1838, + "step": 4783 + }, + { + "epoch": 0.68, + "learning_rate": 6.471183013144591e-05, + "loss": 3.1287, + "step": 4784 + }, + { + "epoch": 0.68, + "learning_rate": 6.468294092156579e-05, + "loss": 3.1744, + "step": 4785 + }, + { + "epoch": 0.68, + "learning_rate": 6.46540517116857e-05, + "loss": 3.2149, + "step": 4786 + }, + { + "epoch": 0.68, + "learning_rate": 6.462516250180558e-05, + "loss": 3.0864, + "step": 4787 + }, + { + "epoch": 0.68, + "learning_rate": 6.459627329192547e-05, + "loss": 3.2708, + "step": 4788 + }, + { + "epoch": 0.68, + "learning_rate": 6.456738408204535e-05, + "loss": 3.0833, + "step": 4789 + }, + { + "epoch": 0.68, + "learning_rate": 6.453849487216524e-05, + "loss": 3.1123, + "step": 4790 + }, + { + "epoch": 0.68, + "learning_rate": 6.450960566228514e-05, + "loss": 3.1003, + "step": 4791 + }, + { + "epoch": 0.68, + "learning_rate": 6.448071645240503e-05, + "loss": 3.1161, + "step": 4792 + }, + { + "epoch": 0.68, + "learning_rate": 6.445182724252493e-05, + "loss": 3.1413, + "step": 4793 + }, + { + "epoch": 0.68, + "learning_rate": 6.44229380326448e-05, + "loss": 3.1732, + "step": 4794 + }, + { + "epoch": 0.68, + "learning_rate": 6.43940488227647e-05, + "loss": 3.1983, + "step": 4795 + }, + { + "epoch": 0.68, + "learning_rate": 6.43651596128846e-05, + "loss": 3.1057, + "step": 4796 + }, + { + "epoch": 0.68, + "learning_rate": 6.433627040300449e-05, + "loss": 3.2081, + "step": 4797 + }, + { + "epoch": 0.68, + "learning_rate": 6.430738119312437e-05, + "loss": 3.2113, + "step": 4798 + }, + { + "epoch": 0.68, + "learning_rate": 6.427849198324426e-05, + "loss": 3.3364, + "step": 4799 + }, + { + "epoch": 0.68, + "learning_rate": 6.424960277336414e-05, + "loss": 3.321, + "step": 4800 + }, + { + "epoch": 0.68, + "learning_rate": 6.422071356348405e-05, + "loss": 3.0925, + "step": 4801 + }, + { + "epoch": 0.68, + "learning_rate": 6.419182435360393e-05, + "loss": 3.1668, + "step": 4802 + }, + { + "epoch": 0.68, + "learning_rate": 6.416293514372382e-05, + "loss": 3.1238, + "step": 4803 + }, + { + "epoch": 0.68, + "learning_rate": 6.413404593384372e-05, + "loss": 3.1487, + "step": 4804 + }, + { + "epoch": 0.68, + "learning_rate": 6.41051567239636e-05, + "loss": 2.9601, + "step": 4805 + }, + { + "epoch": 0.68, + "learning_rate": 6.407626751408349e-05, + "loss": 3.0914, + "step": 4806 + }, + { + "epoch": 0.68, + "learning_rate": 6.404737830420338e-05, + "loss": 3.3274, + "step": 4807 + }, + { + "epoch": 0.68, + "learning_rate": 6.401848909432328e-05, + "loss": 3.3154, + "step": 4808 + }, + { + "epoch": 0.68, + "learning_rate": 6.398959988444316e-05, + "loss": 3.0696, + "step": 4809 + }, + { + "epoch": 0.68, + "learning_rate": 6.396071067456305e-05, + "loss": 3.0216, + "step": 4810 + }, + { + "epoch": 0.68, + "learning_rate": 6.393182146468293e-05, + "loss": 3.2504, + "step": 4811 + }, + { + "epoch": 0.69, + "learning_rate": 6.390293225480284e-05, + "loss": 3.142, + "step": 4812 + }, + { + "epoch": 0.69, + "learning_rate": 6.387404304492272e-05, + "loss": 3.1038, + "step": 4813 + }, + { + "epoch": 0.69, + "learning_rate": 6.384515383504261e-05, + "loss": 3.1674, + "step": 4814 + }, + { + "epoch": 0.69, + "learning_rate": 6.381626462516251e-05, + "loss": 3.2564, + "step": 4815 + }, + { + "epoch": 0.69, + "learning_rate": 6.378737541528239e-05, + "loss": 3.144, + "step": 4816 + }, + { + "epoch": 0.69, + "learning_rate": 6.37584862054023e-05, + "loss": 3.2916, + "step": 4817 + }, + { + "epoch": 0.69, + "learning_rate": 6.372959699552218e-05, + "loss": 3.3567, + "step": 4818 + }, + { + "epoch": 0.69, + "learning_rate": 6.370070778564207e-05, + "loss": 3.1593, + "step": 4819 + }, + { + "epoch": 0.69, + "learning_rate": 6.367181857576195e-05, + "loss": 3.218, + "step": 4820 + }, + { + "epoch": 0.69, + "learning_rate": 6.364292936588184e-05, + "loss": 3.347, + "step": 4821 + }, + { + "epoch": 0.69, + "learning_rate": 6.361404015600174e-05, + "loss": 3.205, + "step": 4822 + }, + { + "epoch": 0.69, + "learning_rate": 6.358515094612163e-05, + "loss": 3.1968, + "step": 4823 + }, + { + "epoch": 0.69, + "learning_rate": 6.355626173624151e-05, + "loss": 3.1964, + "step": 4824 + }, + { + "epoch": 0.69, + "learning_rate": 6.35273725263614e-05, + "loss": 3.2347, + "step": 4825 + }, + { + "epoch": 0.69, + "learning_rate": 6.34984833164813e-05, + "loss": 3.3071, + "step": 4826 + }, + { + "epoch": 0.69, + "learning_rate": 6.346959410660118e-05, + "loss": 3.2471, + "step": 4827 + }, + { + "epoch": 0.69, + "learning_rate": 6.344070489672109e-05, + "loss": 3.2021, + "step": 4828 + }, + { + "epoch": 0.69, + "learning_rate": 6.341181568684097e-05, + "loss": 3.1955, + "step": 4829 + }, + { + "epoch": 0.69, + "learning_rate": 6.338292647696086e-05, + "loss": 3.0071, + "step": 4830 + }, + { + "epoch": 0.69, + "learning_rate": 6.335403726708074e-05, + "loss": 3.2419, + "step": 4831 + }, + { + "epoch": 0.69, + "learning_rate": 6.332514805720064e-05, + "loss": 3.0888, + "step": 4832 + }, + { + "epoch": 0.69, + "learning_rate": 6.329625884732053e-05, + "loss": 3.1945, + "step": 4833 + }, + { + "epoch": 0.69, + "learning_rate": 6.326736963744042e-05, + "loss": 3.0264, + "step": 4834 + }, + { + "epoch": 0.69, + "learning_rate": 6.32384804275603e-05, + "loss": 3.1711, + "step": 4835 + }, + { + "epoch": 0.69, + "learning_rate": 6.32095912176802e-05, + "loss": 3.1518, + "step": 4836 + }, + { + "epoch": 0.69, + "learning_rate": 6.318070200780009e-05, + "loss": 3.0763, + "step": 4837 + }, + { + "epoch": 0.69, + "learning_rate": 6.315181279791998e-05, + "loss": 3.1948, + "step": 4838 + }, + { + "epoch": 0.69, + "learning_rate": 6.312292358803988e-05, + "loss": 3.2302, + "step": 4839 + }, + { + "epoch": 0.69, + "learning_rate": 6.309403437815976e-05, + "loss": 3.1315, + "step": 4840 + }, + { + "epoch": 0.69, + "learning_rate": 6.306514516827965e-05, + "loss": 3.0718, + "step": 4841 + }, + { + "epoch": 0.69, + "learning_rate": 6.303625595839953e-05, + "loss": 3.2503, + "step": 4842 + }, + { + "epoch": 0.69, + "learning_rate": 6.300736674851944e-05, + "loss": 3.1226, + "step": 4843 + }, + { + "epoch": 0.69, + "learning_rate": 6.297847753863932e-05, + "loss": 3.2664, + "step": 4844 + }, + { + "epoch": 0.69, + "learning_rate": 6.294958832875921e-05, + "loss": 2.9803, + "step": 4845 + }, + { + "epoch": 0.69, + "learning_rate": 6.29206991188791e-05, + "loss": 3.2725, + "step": 4846 + }, + { + "epoch": 0.69, + "learning_rate": 6.289180990899899e-05, + "loss": 3.1647, + "step": 4847 + }, + { + "epoch": 0.69, + "learning_rate": 6.286292069911888e-05, + "loss": 3.164, + "step": 4848 + }, + { + "epoch": 0.69, + "learning_rate": 6.283403148923878e-05, + "loss": 3.2146, + "step": 4849 + }, + { + "epoch": 0.69, + "learning_rate": 6.280514227935867e-05, + "loss": 3.2211, + "step": 4850 + }, + { + "epoch": 0.69, + "learning_rate": 6.277625306947855e-05, + "loss": 3.0367, + "step": 4851 + }, + { + "epoch": 0.69, + "learning_rate": 6.274736385959844e-05, + "loss": 3.1545, + "step": 4852 + }, + { + "epoch": 0.69, + "learning_rate": 6.271847464971832e-05, + "loss": 3.185, + "step": 4853 + }, + { + "epoch": 0.69, + "learning_rate": 6.268958543983823e-05, + "loss": 3.0758, + "step": 4854 + }, + { + "epoch": 0.69, + "learning_rate": 6.266069622995811e-05, + "loss": 3.1946, + "step": 4855 + }, + { + "epoch": 0.69, + "learning_rate": 6.2631807020078e-05, + "loss": 3.0128, + "step": 4856 + }, + { + "epoch": 0.69, + "learning_rate": 6.260291781019789e-05, + "loss": 3.2722, + "step": 4857 + }, + { + "epoch": 0.69, + "learning_rate": 6.257402860031778e-05, + "loss": 3.0703, + "step": 4858 + }, + { + "epoch": 0.69, + "learning_rate": 6.254513939043767e-05, + "loss": 3.3005, + "step": 4859 + }, + { + "epoch": 0.69, + "learning_rate": 6.251625018055757e-05, + "loss": 3.2755, + "step": 4860 + }, + { + "epoch": 0.69, + "learning_rate": 6.248736097067746e-05, + "loss": 3.1733, + "step": 4861 + }, + { + "epoch": 0.69, + "learning_rate": 6.245847176079734e-05, + "loss": 3.0191, + "step": 4862 + }, + { + "epoch": 0.69, + "learning_rate": 6.242958255091724e-05, + "loss": 3.1909, + "step": 4863 + }, + { + "epoch": 0.69, + "learning_rate": 6.240069334103713e-05, + "loss": 3.344, + "step": 4864 + }, + { + "epoch": 0.69, + "learning_rate": 6.237180413115702e-05, + "loss": 3.2325, + "step": 4865 + }, + { + "epoch": 0.69, + "learning_rate": 6.23429149212769e-05, + "loss": 3.1482, + "step": 4866 + }, + { + "epoch": 0.69, + "learning_rate": 6.23140257113968e-05, + "loss": 3.2666, + "step": 4867 + }, + { + "epoch": 0.69, + "learning_rate": 6.228513650151668e-05, + "loss": 3.1017, + "step": 4868 + }, + { + "epoch": 0.69, + "learning_rate": 6.225624729163657e-05, + "loss": 3.0212, + "step": 4869 + }, + { + "epoch": 0.69, + "learning_rate": 6.222735808175647e-05, + "loss": 3.141, + "step": 4870 + }, + { + "epoch": 0.69, + "learning_rate": 6.219846887187636e-05, + "loss": 3.1939, + "step": 4871 + }, + { + "epoch": 0.69, + "learning_rate": 6.216957966199625e-05, + "loss": 3.1901, + "step": 4872 + }, + { + "epoch": 0.69, + "learning_rate": 6.214069045211613e-05, + "loss": 2.9811, + "step": 4873 + }, + { + "epoch": 0.69, + "learning_rate": 6.211180124223603e-05, + "loss": 3.1416, + "step": 4874 + }, + { + "epoch": 0.69, + "learning_rate": 6.208291203235592e-05, + "loss": 3.2035, + "step": 4875 + }, + { + "epoch": 0.69, + "learning_rate": 6.205402282247581e-05, + "loss": 3.3768, + "step": 4876 + }, + { + "epoch": 0.69, + "learning_rate": 6.20251336125957e-05, + "loss": 3.1864, + "step": 4877 + }, + { + "epoch": 0.69, + "learning_rate": 6.199624440271559e-05, + "loss": 3.2768, + "step": 4878 + }, + { + "epoch": 0.69, + "learning_rate": 6.196735519283547e-05, + "loss": 3.2096, + "step": 4879 + }, + { + "epoch": 0.69, + "learning_rate": 6.193846598295538e-05, + "loss": 3.2389, + "step": 4880 + }, + { + "epoch": 0.69, + "learning_rate": 6.190957677307526e-05, + "loss": 3.1871, + "step": 4881 + }, + { + "epoch": 0.7, + "learning_rate": 6.188068756319515e-05, + "loss": 3.1195, + "step": 4882 + }, + { + "epoch": 0.7, + "learning_rate": 6.185179835331504e-05, + "loss": 3.1172, + "step": 4883 + }, + { + "epoch": 0.7, + "learning_rate": 6.182290914343492e-05, + "loss": 3.2083, + "step": 4884 + }, + { + "epoch": 0.7, + "learning_rate": 6.179401993355482e-05, + "loss": 3.2845, + "step": 4885 + }, + { + "epoch": 0.7, + "learning_rate": 6.176513072367471e-05, + "loss": 3.2313, + "step": 4886 + }, + { + "epoch": 0.7, + "learning_rate": 6.17362415137946e-05, + "loss": 3.2976, + "step": 4887 + }, + { + "epoch": 0.7, + "learning_rate": 6.170735230391449e-05, + "loss": 3.2979, + "step": 4888 + }, + { + "epoch": 0.7, + "learning_rate": 6.167846309403438e-05, + "loss": 3.2395, + "step": 4889 + }, + { + "epoch": 0.7, + "learning_rate": 6.164957388415426e-05, + "loss": 3.2283, + "step": 4890 + }, + { + "epoch": 0.7, + "learning_rate": 6.162068467427417e-05, + "loss": 3.1972, + "step": 4891 + }, + { + "epoch": 0.7, + "learning_rate": 6.159179546439405e-05, + "loss": 3.2107, + "step": 4892 + }, + { + "epoch": 0.7, + "learning_rate": 6.156290625451394e-05, + "loss": 3.1887, + "step": 4893 + }, + { + "epoch": 0.7, + "learning_rate": 6.153401704463384e-05, + "loss": 3.0887, + "step": 4894 + }, + { + "epoch": 0.7, + "learning_rate": 6.150512783475372e-05, + "loss": 3.2432, + "step": 4895 + }, + { + "epoch": 0.7, + "learning_rate": 6.147623862487361e-05, + "loss": 3.0324, + "step": 4896 + }, + { + "epoch": 0.7, + "learning_rate": 6.14473494149935e-05, + "loss": 3.1536, + "step": 4897 + }, + { + "epoch": 0.7, + "learning_rate": 6.14184602051134e-05, + "loss": 3.0445, + "step": 4898 + }, + { + "epoch": 0.7, + "learning_rate": 6.138957099523328e-05, + "loss": 3.224, + "step": 4899 + }, + { + "epoch": 0.7, + "learning_rate": 6.136068178535317e-05, + "loss": 3.297, + "step": 4900 + }, + { + "epoch": 0.7, + "learning_rate": 6.133179257547307e-05, + "loss": 3.2922, + "step": 4901 + }, + { + "epoch": 0.7, + "learning_rate": 6.130290336559296e-05, + "loss": 3.2604, + "step": 4902 + }, + { + "epoch": 0.7, + "learning_rate": 6.127401415571284e-05, + "loss": 3.1391, + "step": 4903 + }, + { + "epoch": 0.7, + "learning_rate": 6.124512494583273e-05, + "loss": 3.2075, + "step": 4904 + }, + { + "epoch": 0.7, + "learning_rate": 6.121623573595263e-05, + "loss": 3.2132, + "step": 4905 + }, + { + "epoch": 0.7, + "learning_rate": 6.118734652607252e-05, + "loss": 3.0788, + "step": 4906 + }, + { + "epoch": 0.7, + "learning_rate": 6.11584573161924e-05, + "loss": 3.1895, + "step": 4907 + }, + { + "epoch": 0.7, + "learning_rate": 6.11295681063123e-05, + "loss": 3.2684, + "step": 4908 + }, + { + "epoch": 0.7, + "learning_rate": 6.110067889643219e-05, + "loss": 3.1677, + "step": 4909 + }, + { + "epoch": 0.7, + "learning_rate": 6.107178968655207e-05, + "loss": 3.1967, + "step": 4910 + }, + { + "epoch": 0.7, + "learning_rate": 6.104290047667198e-05, + "loss": 3.1881, + "step": 4911 + }, + { + "epoch": 0.7, + "learning_rate": 6.101401126679186e-05, + "loss": 3.2648, + "step": 4912 + }, + { + "epoch": 0.7, + "learning_rate": 6.098512205691175e-05, + "loss": 3.2341, + "step": 4913 + }, + { + "epoch": 0.7, + "learning_rate": 6.095623284703164e-05, + "loss": 3.1041, + "step": 4914 + }, + { + "epoch": 0.7, + "learning_rate": 6.0927343637151525e-05, + "loss": 3.1151, + "step": 4915 + }, + { + "epoch": 0.7, + "learning_rate": 6.089845442727141e-05, + "loss": 3.178, + "step": 4916 + }, + { + "epoch": 0.7, + "learning_rate": 6.086956521739131e-05, + "loss": 3.0908, + "step": 4917 + }, + { + "epoch": 0.7, + "learning_rate": 6.08406760075112e-05, + "loss": 3.1385, + "step": 4918 + }, + { + "epoch": 0.7, + "learning_rate": 6.081178679763109e-05, + "loss": 3.1282, + "step": 4919 + }, + { + "epoch": 0.7, + "learning_rate": 6.0782897587750974e-05, + "loss": 3.2666, + "step": 4920 + }, + { + "epoch": 0.7, + "learning_rate": 6.075400837787086e-05, + "loss": 3.1628, + "step": 4921 + }, + { + "epoch": 0.7, + "learning_rate": 6.072511916799076e-05, + "loss": 3.1016, + "step": 4922 + }, + { + "epoch": 0.7, + "learning_rate": 6.069622995811065e-05, + "loss": 3.1877, + "step": 4923 + }, + { + "epoch": 0.7, + "learning_rate": 6.066734074823054e-05, + "loss": 3.241, + "step": 4924 + }, + { + "epoch": 0.7, + "learning_rate": 6.063845153835043e-05, + "loss": 3.2473, + "step": 4925 + }, + { + "epoch": 0.7, + "learning_rate": 6.0609562328470316e-05, + "loss": 3.3245, + "step": 4926 + }, + { + "epoch": 0.7, + "learning_rate": 6.058067311859022e-05, + "loss": 3.0849, + "step": 4927 + }, + { + "epoch": 0.7, + "learning_rate": 6.0551783908710104e-05, + "loss": 3.1867, + "step": 4928 + }, + { + "epoch": 0.7, + "eval_loss": 3.405430316925049, + "eval_runtime": 471.5274, + "eval_samples_per_second": 43.448, + "eval_steps_per_second": 14.483, + "step": 4928 + }, + { + "epoch": 0.7, + "learning_rate": 6.052289469882999e-05, + "loss": 3.2446, + "step": 4929 + }, + { + "epoch": 0.7, + "learning_rate": 6.049400548894988e-05, + "loss": 3.0791, + "step": 4930 + }, + { + "epoch": 0.7, + "learning_rate": 6.0465116279069765e-05, + "loss": 3.1938, + "step": 4931 + }, + { + "epoch": 0.7, + "learning_rate": 6.0436227069189666e-05, + "loss": 3.1848, + "step": 4932 + }, + { + "epoch": 0.7, + "learning_rate": 6.040733785930955e-05, + "loss": 3.236, + "step": 4933 + }, + { + "epoch": 0.7, + "learning_rate": 6.037844864942944e-05, + "loss": 3.0853, + "step": 4934 + }, + { + "epoch": 0.7, + "learning_rate": 6.034955943954933e-05, + "loss": 3.2123, + "step": 4935 + }, + { + "epoch": 0.7, + "learning_rate": 6.032067022966922e-05, + "loss": 3.2857, + "step": 4936 + }, + { + "epoch": 0.7, + "learning_rate": 6.029178101978911e-05, + "loss": 3.1235, + "step": 4937 + }, + { + "epoch": 0.7, + "learning_rate": 6.026289180990901e-05, + "loss": 2.9182, + "step": 4938 + }, + { + "epoch": 0.7, + "learning_rate": 6.0234002600028895e-05, + "loss": 3.1968, + "step": 4939 + }, + { + "epoch": 0.7, + "learning_rate": 6.020511339014878e-05, + "loss": 3.1346, + "step": 4940 + }, + { + "epoch": 0.7, + "learning_rate": 6.017622418026867e-05, + "loss": 3.0872, + "step": 4941 + }, + { + "epoch": 0.7, + "learning_rate": 6.0147334970388557e-05, + "loss": 3.1449, + "step": 4942 + }, + { + "epoch": 0.7, + "learning_rate": 6.011844576050846e-05, + "loss": 3.015, + "step": 4943 + }, + { + "epoch": 0.7, + "learning_rate": 6.0089556550628344e-05, + "loss": 3.2121, + "step": 4944 + }, + { + "epoch": 0.7, + "learning_rate": 6.006066734074823e-05, + "loss": 3.1967, + "step": 4945 + }, + { + "epoch": 0.7, + "learning_rate": 6.003177813086812e-05, + "loss": 3.0941, + "step": 4946 + }, + { + "epoch": 0.7, + "learning_rate": 6.000288892098801e-05, + "loss": 3.1809, + "step": 4947 + }, + { + "epoch": 0.7, + "learning_rate": 5.9973999711107906e-05, + "loss": 2.9571, + "step": 4948 + }, + { + "epoch": 0.7, + "learning_rate": 5.99451105012278e-05, + "loss": 3.1754, + "step": 4949 + }, + { + "epoch": 0.7, + "learning_rate": 5.991622129134769e-05, + "loss": 3.143, + "step": 4950 + }, + { + "epoch": 0.7, + "learning_rate": 5.9887332081467574e-05, + "loss": 3.0815, + "step": 4951 + }, + { + "epoch": 0.71, + "learning_rate": 5.985844287158746e-05, + "loss": 3.2158, + "step": 4952 + }, + { + "epoch": 0.71, + "learning_rate": 5.982955366170736e-05, + "loss": 3.1671, + "step": 4953 + }, + { + "epoch": 0.71, + "learning_rate": 5.980066445182725e-05, + "loss": 3.2071, + "step": 4954 + }, + { + "epoch": 0.71, + "learning_rate": 5.9771775241947136e-05, + "loss": 3.1664, + "step": 4955 + }, + { + "epoch": 0.71, + "learning_rate": 5.974288603206702e-05, + "loss": 3.1808, + "step": 4956 + }, + { + "epoch": 0.71, + "learning_rate": 5.971399682218691e-05, + "loss": 3.1447, + "step": 4957 + }, + { + "epoch": 0.71, + "learning_rate": 5.9685107612306804e-05, + "loss": 3.2338, + "step": 4958 + }, + { + "epoch": 0.71, + "learning_rate": 5.96562184024267e-05, + "loss": 3.191, + "step": 4959 + }, + { + "epoch": 0.71, + "learning_rate": 5.962732919254659e-05, + "loss": 3.1847, + "step": 4960 + }, + { + "epoch": 0.71, + "learning_rate": 5.959843998266648e-05, + "loss": 3.018, + "step": 4961 + }, + { + "epoch": 0.71, + "learning_rate": 5.9569550772786365e-05, + "loss": 3.0103, + "step": 4962 + }, + { + "epoch": 0.71, + "learning_rate": 5.954066156290625e-05, + "loss": 3.1028, + "step": 4963 + }, + { + "epoch": 0.71, + "learning_rate": 5.951177235302615e-05, + "loss": 3.2361, + "step": 4964 + }, + { + "epoch": 0.71, + "learning_rate": 5.948288314314604e-05, + "loss": 3.2101, + "step": 4965 + }, + { + "epoch": 0.71, + "learning_rate": 5.945399393326593e-05, + "loss": 3.3008, + "step": 4966 + }, + { + "epoch": 0.71, + "learning_rate": 5.9425104723385814e-05, + "loss": 3.0315, + "step": 4967 + }, + { + "epoch": 0.71, + "learning_rate": 5.93962155135057e-05, + "loss": 3.2395, + "step": 4968 + }, + { + "epoch": 0.71, + "learning_rate": 5.93673263036256e-05, + "loss": 3.1923, + "step": 4969 + }, + { + "epoch": 0.71, + "learning_rate": 5.933843709374549e-05, + "loss": 3.185, + "step": 4970 + }, + { + "epoch": 0.71, + "learning_rate": 5.930954788386538e-05, + "loss": 2.9979, + "step": 4971 + }, + { + "epoch": 0.71, + "learning_rate": 5.928065867398527e-05, + "loss": 3.169, + "step": 4972 + }, + { + "epoch": 0.71, + "learning_rate": 5.925176946410516e-05, + "loss": 3.1904, + "step": 4973 + }, + { + "epoch": 0.71, + "learning_rate": 5.922288025422506e-05, + "loss": 3.1637, + "step": 4974 + }, + { + "epoch": 0.71, + "learning_rate": 5.9193991044344944e-05, + "loss": 2.9797, + "step": 4975 + }, + { + "epoch": 0.71, + "learning_rate": 5.916510183446483e-05, + "loss": 3.1532, + "step": 4976 + }, + { + "epoch": 0.71, + "learning_rate": 5.913621262458472e-05, + "loss": 3.1193, + "step": 4977 + }, + { + "epoch": 0.71, + "learning_rate": 5.9107323414704606e-05, + "loss": 3.1299, + "step": 4978 + }, + { + "epoch": 0.71, + "learning_rate": 5.907843420482449e-05, + "loss": 3.0867, + "step": 4979 + }, + { + "epoch": 0.71, + "learning_rate": 5.904954499494439e-05, + "loss": 3.1889, + "step": 4980 + }, + { + "epoch": 0.71, + "learning_rate": 5.902065578506428e-05, + "loss": 3.192, + "step": 4981 + }, + { + "epoch": 0.71, + "learning_rate": 5.8991766575184174e-05, + "loss": 3.2168, + "step": 4982 + }, + { + "epoch": 0.71, + "learning_rate": 5.896287736530406e-05, + "loss": 3.2185, + "step": 4983 + }, + { + "epoch": 0.71, + "learning_rate": 5.893398815542395e-05, + "loss": 3.176, + "step": 4984 + }, + { + "epoch": 0.71, + "learning_rate": 5.890509894554385e-05, + "loss": 3.1294, + "step": 4985 + }, + { + "epoch": 0.71, + "learning_rate": 5.8876209735663736e-05, + "loss": 3.1345, + "step": 4986 + }, + { + "epoch": 0.71, + "learning_rate": 5.884732052578362e-05, + "loss": 3.3109, + "step": 4987 + }, + { + "epoch": 0.71, + "learning_rate": 5.881843131590351e-05, + "loss": 3.0086, + "step": 4988 + }, + { + "epoch": 0.71, + "learning_rate": 5.87895421060234e-05, + "loss": 3.2247, + "step": 4989 + }, + { + "epoch": 0.71, + "learning_rate": 5.87606528961433e-05, + "loss": 3.1655, + "step": 4990 + }, + { + "epoch": 0.71, + "learning_rate": 5.8731763686263185e-05, + "loss": 3.1094, + "step": 4991 + }, + { + "epoch": 0.71, + "learning_rate": 5.870287447638307e-05, + "loss": 3.1659, + "step": 4992 + }, + { + "epoch": 0.71, + "learning_rate": 5.8673985266502965e-05, + "loss": 3.1532, + "step": 4993 + }, + { + "epoch": 0.71, + "learning_rate": 5.864509605662285e-05, + "loss": 3.2451, + "step": 4994 + }, + { + "epoch": 0.71, + "learning_rate": 5.8616206846742746e-05, + "loss": 3.1007, + "step": 4995 + }, + { + "epoch": 0.71, + "learning_rate": 5.858731763686264e-05, + "loss": 3.1713, + "step": 4996 + }, + { + "epoch": 0.71, + "learning_rate": 5.855842842698253e-05, + "loss": 3.1186, + "step": 4997 + }, + { + "epoch": 0.71, + "learning_rate": 5.8529539217102414e-05, + "loss": 3.1076, + "step": 4998 + }, + { + "epoch": 0.71, + "learning_rate": 5.85006500072223e-05, + "loss": 3.2508, + "step": 4999 + }, + { + "epoch": 0.71, + "learning_rate": 5.84717607973422e-05, + "loss": 3.0121, + "step": 5000 + }, + { + "epoch": 0.71, + "learning_rate": 5.844287158746209e-05, + "loss": 3.1222, + "step": 5001 + }, + { + "epoch": 0.71, + "learning_rate": 5.8413982377581976e-05, + "loss": 3.19, + "step": 5002 + }, + { + "epoch": 0.71, + "learning_rate": 5.838509316770186e-05, + "loss": 3.1305, + "step": 5003 + }, + { + "epoch": 0.71, + "learning_rate": 5.835620395782175e-05, + "loss": 3.1201, + "step": 5004 + }, + { + "epoch": 0.71, + "learning_rate": 5.8327314747941644e-05, + "loss": 3.2659, + "step": 5005 + }, + { + "epoch": 0.71, + "learning_rate": 5.829842553806154e-05, + "loss": 3.1431, + "step": 5006 + }, + { + "epoch": 0.71, + "learning_rate": 5.826953632818143e-05, + "loss": 3.2168, + "step": 5007 + }, + { + "epoch": 0.71, + "learning_rate": 5.824064711830132e-05, + "loss": 3.1828, + "step": 5008 + }, + { + "epoch": 0.71, + "learning_rate": 5.8211757908421206e-05, + "loss": 3.1976, + "step": 5009 + }, + { + "epoch": 0.71, + "learning_rate": 5.818286869854109e-05, + "loss": 3.0626, + "step": 5010 + }, + { + "epoch": 0.71, + "learning_rate": 5.815397948866099e-05, + "loss": 3.1023, + "step": 5011 + }, + { + "epoch": 0.71, + "learning_rate": 5.812509027878088e-05, + "loss": 3.1348, + "step": 5012 + }, + { + "epoch": 0.71, + "learning_rate": 5.809620106890077e-05, + "loss": 3.2077, + "step": 5013 + }, + { + "epoch": 0.71, + "learning_rate": 5.8067311859020655e-05, + "loss": 3.312, + "step": 5014 + }, + { + "epoch": 0.71, + "learning_rate": 5.803842264914054e-05, + "loss": 3.3103, + "step": 5015 + }, + { + "epoch": 0.71, + "learning_rate": 5.800953343926044e-05, + "loss": 3.1398, + "step": 5016 + }, + { + "epoch": 0.71, + "learning_rate": 5.798064422938033e-05, + "loss": 3.1509, + "step": 5017 + }, + { + "epoch": 0.71, + "learning_rate": 5.795175501950022e-05, + "loss": 3.2007, + "step": 5018 + }, + { + "epoch": 0.71, + "learning_rate": 5.792286580962011e-05, + "loss": 3.2066, + "step": 5019 + }, + { + "epoch": 0.71, + "learning_rate": 5.789397659974e-05, + "loss": 3.1849, + "step": 5020 + }, + { + "epoch": 0.71, + "learning_rate": 5.78650873898599e-05, + "loss": 3.2473, + "step": 5021 + }, + { + "epoch": 0.72, + "learning_rate": 5.7836198179979785e-05, + "loss": 3.1957, + "step": 5022 + }, + { + "epoch": 0.72, + "learning_rate": 5.780730897009967e-05, + "loss": 3.1588, + "step": 5023 + }, + { + "epoch": 0.72, + "learning_rate": 5.777841976021956e-05, + "loss": 3.1026, + "step": 5024 + }, + { + "epoch": 0.72, + "learning_rate": 5.7749530550339446e-05, + "loss": 3.0227, + "step": 5025 + }, + { + "epoch": 0.72, + "learning_rate": 5.772064134045933e-05, + "loss": 3.1725, + "step": 5026 + }, + { + "epoch": 0.72, + "learning_rate": 5.7691752130579234e-05, + "loss": 3.3349, + "step": 5027 + }, + { + "epoch": 0.72, + "learning_rate": 5.766286292069912e-05, + "loss": 3.179, + "step": 5028 + }, + { + "epoch": 0.72, + "learning_rate": 5.7633973710819014e-05, + "loss": 3.2301, + "step": 5029 + }, + { + "epoch": 0.72, + "learning_rate": 5.76050845009389e-05, + "loss": 3.1841, + "step": 5030 + }, + { + "epoch": 0.72, + "learning_rate": 5.757619529105879e-05, + "loss": 3.1111, + "step": 5031 + }, + { + "epoch": 0.72, + "learning_rate": 5.754730608117869e-05, + "loss": 3.1174, + "step": 5032 + }, + { + "epoch": 0.72, + "learning_rate": 5.7518416871298576e-05, + "loss": 3.1633, + "step": 5033 + }, + { + "epoch": 0.72, + "learning_rate": 5.748952766141846e-05, + "loss": 2.9734, + "step": 5034 + }, + { + "epoch": 0.72, + "learning_rate": 5.746063845153835e-05, + "loss": 3.1083, + "step": 5035 + }, + { + "epoch": 0.72, + "learning_rate": 5.743174924165824e-05, + "loss": 3.211, + "step": 5036 + }, + { + "epoch": 0.72, + "learning_rate": 5.740286003177814e-05, + "loss": 3.1702, + "step": 5037 + }, + { + "epoch": 0.72, + "learning_rate": 5.7373970821898025e-05, + "loss": 3.2022, + "step": 5038 + }, + { + "epoch": 0.72, + "learning_rate": 5.734508161201791e-05, + "loss": 3.1054, + "step": 5039 + }, + { + "epoch": 0.72, + "learning_rate": 5.7316192402137806e-05, + "loss": 3.1327, + "step": 5040 + }, + { + "epoch": 0.72, + "learning_rate": 5.728730319225769e-05, + "loss": 3.0072, + "step": 5041 + }, + { + "epoch": 0.72, + "learning_rate": 5.725841398237759e-05, + "loss": 3.1727, + "step": 5042 + }, + { + "epoch": 0.72, + "learning_rate": 5.722952477249748e-05, + "loss": 3.247, + "step": 5043 + }, + { + "epoch": 0.72, + "learning_rate": 5.720063556261737e-05, + "loss": 3.1123, + "step": 5044 + }, + { + "epoch": 0.72, + "learning_rate": 5.7171746352737255e-05, + "loss": 3.1567, + "step": 5045 + }, + { + "epoch": 0.72, + "learning_rate": 5.714285714285714e-05, + "loss": 3.1468, + "step": 5046 + }, + { + "epoch": 0.72, + "learning_rate": 5.711396793297703e-05, + "loss": 3.1196, + "step": 5047 + }, + { + "epoch": 0.72, + "learning_rate": 5.708507872309693e-05, + "loss": 3.193, + "step": 5048 + }, + { + "epoch": 0.72, + "learning_rate": 5.7056189513216816e-05, + "loss": 3.0999, + "step": 5049 + }, + { + "epoch": 0.72, + "learning_rate": 5.7027300303336703e-05, + "loss": 3.2171, + "step": 5050 + }, + { + "epoch": 0.72, + "learning_rate": 5.69984110934566e-05, + "loss": 3.2542, + "step": 5051 + }, + { + "epoch": 0.72, + "learning_rate": 5.6969521883576484e-05, + "loss": 3.2912, + "step": 5052 + }, + { + "epoch": 0.72, + "learning_rate": 5.694063267369638e-05, + "loss": 3.1588, + "step": 5053 + }, + { + "epoch": 0.72, + "learning_rate": 5.691174346381627e-05, + "loss": 3.2516, + "step": 5054 + }, + { + "epoch": 0.72, + "learning_rate": 5.688285425393616e-05, + "loss": 3.1515, + "step": 5055 + }, + { + "epoch": 0.72, + "learning_rate": 5.6853965044056046e-05, + "loss": 3.2265, + "step": 5056 + }, + { + "epoch": 0.72, + "learning_rate": 5.682507583417593e-05, + "loss": 3.1729, + "step": 5057 + }, + { + "epoch": 0.72, + "learning_rate": 5.6796186624295834e-05, + "loss": 3.2611, + "step": 5058 + }, + { + "epoch": 0.72, + "learning_rate": 5.676729741441572e-05, + "loss": 3.2122, + "step": 5059 + }, + { + "epoch": 0.72, + "learning_rate": 5.673840820453561e-05, + "loss": 3.3132, + "step": 5060 + }, + { + "epoch": 0.72, + "learning_rate": 5.6709518994655495e-05, + "loss": 3.195, + "step": 5061 + }, + { + "epoch": 0.72, + "learning_rate": 5.668062978477538e-05, + "loss": 3.208, + "step": 5062 + }, + { + "epoch": 0.72, + "learning_rate": 5.665174057489528e-05, + "loss": 3.2916, + "step": 5063 + }, + { + "epoch": 0.72, + "learning_rate": 5.662285136501517e-05, + "loss": 3.1674, + "step": 5064 + }, + { + "epoch": 0.72, + "learning_rate": 5.6593962155135063e-05, + "loss": 3.1717, + "step": 5065 + }, + { + "epoch": 0.72, + "learning_rate": 5.656507294525495e-05, + "loss": 3.1739, + "step": 5066 + }, + { + "epoch": 0.72, + "learning_rate": 5.653618373537484e-05, + "loss": 3.0539, + "step": 5067 + }, + { + "epoch": 0.72, + "learning_rate": 5.6507294525494725e-05, + "loss": 3.1643, + "step": 5068 + }, + { + "epoch": 0.72, + "learning_rate": 5.6478405315614625e-05, + "loss": 3.1571, + "step": 5069 + }, + { + "epoch": 0.72, + "learning_rate": 5.644951610573451e-05, + "loss": 3.1703, + "step": 5070 + }, + { + "epoch": 0.72, + "learning_rate": 5.64206268958544e-05, + "loss": 3.1374, + "step": 5071 + }, + { + "epoch": 0.72, + "learning_rate": 5.6391737685974286e-05, + "loss": 3.1227, + "step": 5072 + }, + { + "epoch": 0.72, + "learning_rate": 5.636284847609417e-05, + "loss": 3.1776, + "step": 5073 + }, + { + "epoch": 0.72, + "learning_rate": 5.6333959266214074e-05, + "loss": 3.2057, + "step": 5074 + }, + { + "epoch": 0.72, + "learning_rate": 5.630507005633396e-05, + "loss": 3.2371, + "step": 5075 + }, + { + "epoch": 0.72, + "learning_rate": 5.6276180846453855e-05, + "loss": 3.2429, + "step": 5076 + }, + { + "epoch": 0.72, + "learning_rate": 5.624729163657374e-05, + "loss": 3.1389, + "step": 5077 + }, + { + "epoch": 0.72, + "learning_rate": 5.621840242669363e-05, + "loss": 3.2158, + "step": 5078 + }, + { + "epoch": 0.72, + "learning_rate": 5.618951321681353e-05, + "loss": 3.2406, + "step": 5079 + }, + { + "epoch": 0.72, + "learning_rate": 5.6160624006933417e-05, + "loss": 2.9815, + "step": 5080 + }, + { + "epoch": 0.72, + "learning_rate": 5.6131734797053304e-05, + "loss": 3.1771, + "step": 5081 + }, + { + "epoch": 0.72, + "learning_rate": 5.610284558717319e-05, + "loss": 3.0717, + "step": 5082 + }, + { + "epoch": 0.72, + "learning_rate": 5.607395637729308e-05, + "loss": 3.1711, + "step": 5083 + }, + { + "epoch": 0.72, + "learning_rate": 5.604506716741298e-05, + "loss": 3.1081, + "step": 5084 + }, + { + "epoch": 0.72, + "learning_rate": 5.6016177957532865e-05, + "loss": 3.1248, + "step": 5085 + }, + { + "epoch": 0.72, + "learning_rate": 5.598728874765275e-05, + "loss": 3.0333, + "step": 5086 + }, + { + "epoch": 0.72, + "learning_rate": 5.5958399537772646e-05, + "loss": 3.2139, + "step": 5087 + }, + { + "epoch": 0.72, + "learning_rate": 5.592951032789253e-05, + "loss": 3.2611, + "step": 5088 + }, + { + "epoch": 0.72, + "learning_rate": 5.590062111801242e-05, + "loss": 3.1232, + "step": 5089 + }, + { + "epoch": 0.72, + "learning_rate": 5.587173190813232e-05, + "loss": 3.109, + "step": 5090 + }, + { + "epoch": 0.72, + "learning_rate": 5.584284269825221e-05, + "loss": 3.1966, + "step": 5091 + }, + { + "epoch": 0.72, + "learning_rate": 5.5813953488372095e-05, + "loss": 3.1803, + "step": 5092 + }, + { + "epoch": 0.73, + "learning_rate": 5.578506427849198e-05, + "loss": 3.0442, + "step": 5093 + }, + { + "epoch": 0.73, + "learning_rate": 5.575617506861187e-05, + "loss": 3.1491, + "step": 5094 + }, + { + "epoch": 0.73, + "learning_rate": 5.572728585873177e-05, + "loss": 3.3084, + "step": 5095 + }, + { + "epoch": 0.73, + "learning_rate": 5.569839664885166e-05, + "loss": 3.2134, + "step": 5096 + }, + { + "epoch": 0.73, + "learning_rate": 5.5669507438971544e-05, + "loss": 3.219, + "step": 5097 + }, + { + "epoch": 0.73, + "learning_rate": 5.564061822909144e-05, + "loss": 2.9762, + "step": 5098 + }, + { + "epoch": 0.73, + "learning_rate": 5.5611729019211325e-05, + "loss": 2.9179, + "step": 5099 + }, + { + "epoch": 0.73, + "learning_rate": 5.558283980933122e-05, + "loss": 3.2013, + "step": 5100 + }, + { + "epoch": 0.73, + "learning_rate": 5.555395059945111e-05, + "loss": 2.9588, + "step": 5101 + }, + { + "epoch": 0.73, + "learning_rate": 5.5525061389571e-05, + "loss": 3.0973, + "step": 5102 + }, + { + "epoch": 0.73, + "learning_rate": 5.5496172179690886e-05, + "loss": 3.1533, + "step": 5103 + }, + { + "epoch": 0.73, + "learning_rate": 5.5467282969810774e-05, + "loss": 3.1741, + "step": 5104 + }, + { + "epoch": 0.73, + "learning_rate": 5.5438393759930674e-05, + "loss": 3.0876, + "step": 5105 + }, + { + "epoch": 0.73, + "learning_rate": 5.540950455005056e-05, + "loss": 3.2014, + "step": 5106 + }, + { + "epoch": 0.73, + "learning_rate": 5.538061534017045e-05, + "loss": 3.1576, + "step": 5107 + }, + { + "epoch": 0.73, + "learning_rate": 5.5351726130290335e-05, + "loss": 3.1465, + "step": 5108 + }, + { + "epoch": 0.73, + "learning_rate": 5.532283692041023e-05, + "loss": 3.2683, + "step": 5109 + }, + { + "epoch": 0.73, + "learning_rate": 5.529394771053012e-05, + "loss": 3.266, + "step": 5110 + }, + { + "epoch": 0.73, + "learning_rate": 5.526505850065001e-05, + "loss": 3.1291, + "step": 5111 + }, + { + "epoch": 0.73, + "learning_rate": 5.5236169290769904e-05, + "loss": 3.2675, + "step": 5112 + }, + { + "epoch": 0.73, + "learning_rate": 5.520728008088979e-05, + "loss": 3.2307, + "step": 5113 + }, + { + "epoch": 0.73, + "learning_rate": 5.517839087100968e-05, + "loss": 2.9991, + "step": 5114 + }, + { + "epoch": 0.73, + "learning_rate": 5.5149501661129565e-05, + "loss": 3.1703, + "step": 5115 + }, + { + "epoch": 0.73, + "learning_rate": 5.5120612451249466e-05, + "loss": 3.059, + "step": 5116 + }, + { + "epoch": 0.73, + "learning_rate": 5.509172324136935e-05, + "loss": 3.1885, + "step": 5117 + }, + { + "epoch": 0.73, + "learning_rate": 5.506283403148924e-05, + "loss": 3.2284, + "step": 5118 + }, + { + "epoch": 0.73, + "learning_rate": 5.503394482160913e-05, + "loss": 3.1545, + "step": 5119 + }, + { + "epoch": 0.73, + "learning_rate": 5.500505561172902e-05, + "loss": 3.1512, + "step": 5120 + }, + { + "epoch": 0.73, + "learning_rate": 5.4976166401848914e-05, + "loss": 3.2021, + "step": 5121 + }, + { + "epoch": 0.73, + "learning_rate": 5.49472771919688e-05, + "loss": 3.1952, + "step": 5122 + }, + { + "epoch": 0.73, + "learning_rate": 5.4918387982088695e-05, + "loss": 3.1533, + "step": 5123 + }, + { + "epoch": 0.73, + "learning_rate": 5.488949877220858e-05, + "loss": 3.2113, + "step": 5124 + }, + { + "epoch": 0.73, + "learning_rate": 5.486060956232847e-05, + "loss": 3.1937, + "step": 5125 + }, + { + "epoch": 0.73, + "learning_rate": 5.483172035244837e-05, + "loss": 3.1545, + "step": 5126 + }, + { + "epoch": 0.73, + "learning_rate": 5.480283114256826e-05, + "loss": 3.2336, + "step": 5127 + }, + { + "epoch": 0.73, + "learning_rate": 5.4773941932688144e-05, + "loss": 3.1997, + "step": 5128 + }, + { + "epoch": 0.73, + "learning_rate": 5.474505272280803e-05, + "loss": 3.0685, + "step": 5129 + }, + { + "epoch": 0.73, + "learning_rate": 5.471616351292792e-05, + "loss": 3.0653, + "step": 5130 + }, + { + "epoch": 0.73, + "learning_rate": 5.468727430304782e-05, + "loss": 3.2056, + "step": 5131 + }, + { + "epoch": 0.73, + "learning_rate": 5.4658385093167706e-05, + "loss": 3.2199, + "step": 5132 + }, + { + "epoch": 0.73, + "learning_rate": 5.462949588328759e-05, + "loss": 3.2525, + "step": 5133 + }, + { + "epoch": 0.73, + "learning_rate": 5.460060667340749e-05, + "loss": 3.2594, + "step": 5134 + }, + { + "epoch": 0.73, + "learning_rate": 5.4571717463527374e-05, + "loss": 3.2341, + "step": 5135 + }, + { + "epoch": 0.73, + "learning_rate": 5.454282825364726e-05, + "loss": 3.2136, + "step": 5136 + }, + { + "epoch": 0.73, + "learning_rate": 5.451393904376716e-05, + "loss": 3.1541, + "step": 5137 + }, + { + "epoch": 0.73, + "learning_rate": 5.448504983388705e-05, + "loss": 3.1913, + "step": 5138 + }, + { + "epoch": 0.73, + "learning_rate": 5.4456160624006935e-05, + "loss": 3.0299, + "step": 5139 + }, + { + "epoch": 0.73, + "learning_rate": 5.442727141412682e-05, + "loss": 3.1656, + "step": 5140 + }, + { + "epoch": 0.73, + "learning_rate": 5.439838220424671e-05, + "loss": 3.237, + "step": 5141 + }, + { + "epoch": 0.73, + "learning_rate": 5.436949299436661e-05, + "loss": 3.2268, + "step": 5142 + }, + { + "epoch": 0.73, + "learning_rate": 5.43406037844865e-05, + "loss": 3.1631, + "step": 5143 + }, + { + "epoch": 0.73, + "learning_rate": 5.4311714574606384e-05, + "loss": 3.1169, + "step": 5144 + }, + { + "epoch": 0.73, + "learning_rate": 5.428282536472628e-05, + "loss": 3.0628, + "step": 5145 + }, + { + "epoch": 0.73, + "learning_rate": 5.4253936154846165e-05, + "loss": 3.2385, + "step": 5146 + }, + { + "epoch": 0.73, + "learning_rate": 5.4225046944966066e-05, + "loss": 3.1183, + "step": 5147 + }, + { + "epoch": 0.73, + "learning_rate": 5.419615773508595e-05, + "loss": 3.0587, + "step": 5148 + }, + { + "epoch": 0.73, + "learning_rate": 5.416726852520584e-05, + "loss": 3.1037, + "step": 5149 + }, + { + "epoch": 0.73, + "learning_rate": 5.413837931532573e-05, + "loss": 3.0909, + "step": 5150 + }, + { + "epoch": 0.73, + "learning_rate": 5.4109490105445614e-05, + "loss": 2.9987, + "step": 5151 + }, + { + "epoch": 0.73, + "learning_rate": 5.4080600895565515e-05, + "loss": 3.2461, + "step": 5152 + }, + { + "epoch": 0.73, + "learning_rate": 5.40517116856854e-05, + "loss": 3.1846, + "step": 5153 + }, + { + "epoch": 0.73, + "learning_rate": 5.402282247580529e-05, + "loss": 3.1396, + "step": 5154 + }, + { + "epoch": 0.73, + "learning_rate": 5.3993933265925176e-05, + "loss": 3.2256, + "step": 5155 + }, + { + "epoch": 0.73, + "learning_rate": 5.396504405604507e-05, + "loss": 3.1149, + "step": 5156 + }, + { + "epoch": 0.73, + "learning_rate": 5.3936154846164957e-05, + "loss": 3.1215, + "step": 5157 + }, + { + "epoch": 0.73, + "learning_rate": 5.390726563628486e-05, + "loss": 3.1063, + "step": 5158 + }, + { + "epoch": 0.73, + "learning_rate": 5.3878376426404744e-05, + "loss": 3.1478, + "step": 5159 + }, + { + "epoch": 0.73, + "learning_rate": 5.384948721652463e-05, + "loss": 3.1244, + "step": 5160 + }, + { + "epoch": 0.73, + "learning_rate": 5.382059800664452e-05, + "loss": 3.1537, + "step": 5161 + }, + { + "epoch": 0.73, + "learning_rate": 5.3791708796764405e-05, + "loss": 3.1722, + "step": 5162 + }, + { + "epoch": 0.74, + "learning_rate": 5.3762819586884306e-05, + "loss": 3.1656, + "step": 5163 + }, + { + "epoch": 0.74, + "learning_rate": 5.373393037700419e-05, + "loss": 3.2235, + "step": 5164 + }, + { + "epoch": 0.74, + "learning_rate": 5.370504116712408e-05, + "loss": 3.2493, + "step": 5165 + }, + { + "epoch": 0.74, + "learning_rate": 5.367615195724397e-05, + "loss": 3.0734, + "step": 5166 + }, + { + "epoch": 0.74, + "learning_rate": 5.364726274736386e-05, + "loss": 3.2295, + "step": 5167 + }, + { + "epoch": 0.74, + "learning_rate": 5.3618373537483755e-05, + "loss": 3.1616, + "step": 5168 + }, + { + "epoch": 0.74, + "learning_rate": 5.358948432760364e-05, + "loss": 3.2689, + "step": 5169 + }, + { + "epoch": 0.74, + "learning_rate": 5.3560595117723536e-05, + "loss": 3.252, + "step": 5170 + }, + { + "epoch": 0.74, + "learning_rate": 5.353170590784342e-05, + "loss": 3.1139, + "step": 5171 + }, + { + "epoch": 0.74, + "learning_rate": 5.350281669796331e-05, + "loss": 3.2654, + "step": 5172 + }, + { + "epoch": 0.74, + "learning_rate": 5.347392748808321e-05, + "loss": 3.2094, + "step": 5173 + }, + { + "epoch": 0.74, + "learning_rate": 5.34450382782031e-05, + "loss": 3.0202, + "step": 5174 + }, + { + "epoch": 0.74, + "learning_rate": 5.3416149068322984e-05, + "loss": 3.2164, + "step": 5175 + }, + { + "epoch": 0.74, + "learning_rate": 5.338725985844287e-05, + "loss": 3.143, + "step": 5176 + }, + { + "epoch": 0.74, + "learning_rate": 5.335837064856276e-05, + "loss": 3.2617, + "step": 5177 + }, + { + "epoch": 0.74, + "learning_rate": 5.332948143868265e-05, + "loss": 3.0766, + "step": 5178 + }, + { + "epoch": 0.74, + "learning_rate": 5.3300592228802546e-05, + "loss": 3.2323, + "step": 5179 + }, + { + "epoch": 0.74, + "learning_rate": 5.327170301892243e-05, + "loss": 3.1394, + "step": 5180 + }, + { + "epoch": 0.74, + "learning_rate": 5.324281380904233e-05, + "loss": 3.1591, + "step": 5181 + }, + { + "epoch": 0.74, + "learning_rate": 5.3213924599162214e-05, + "loss": 3.0493, + "step": 5182 + }, + { + "epoch": 0.74, + "learning_rate": 5.31850353892821e-05, + "loss": 3.2181, + "step": 5183 + }, + { + "epoch": 0.74, + "learning_rate": 5.3156146179402e-05, + "loss": 3.153, + "step": 5184 + }, + { + "epoch": 0.74, + "learning_rate": 5.312725696952189e-05, + "loss": 3.163, + "step": 5185 + }, + { + "epoch": 0.74, + "learning_rate": 5.3098367759641776e-05, + "loss": 3.1937, + "step": 5186 + }, + { + "epoch": 0.74, + "learning_rate": 5.306947854976166e-05, + "loss": 3.1086, + "step": 5187 + }, + { + "epoch": 0.74, + "learning_rate": 5.304058933988155e-05, + "loss": 3.1899, + "step": 5188 + }, + { + "epoch": 0.74, + "learning_rate": 5.301170013000145e-05, + "loss": 2.8025, + "step": 5189 + }, + { + "epoch": 0.74, + "learning_rate": 5.298281092012134e-05, + "loss": 3.2051, + "step": 5190 + }, + { + "epoch": 0.74, + "learning_rate": 5.2953921710241225e-05, + "loss": 3.1696, + "step": 5191 + }, + { + "epoch": 0.74, + "learning_rate": 5.292503250036112e-05, + "loss": 2.8641, + "step": 5192 + }, + { + "epoch": 0.74, + "learning_rate": 5.2896143290481006e-05, + "loss": 3.0356, + "step": 5193 + }, + { + "epoch": 0.74, + "learning_rate": 5.2867254080600906e-05, + "loss": 2.9993, + "step": 5194 + }, + { + "epoch": 0.74, + "learning_rate": 5.283836487072079e-05, + "loss": 3.1589, + "step": 5195 + }, + { + "epoch": 0.74, + "learning_rate": 5.280947566084068e-05, + "loss": 2.8759, + "step": 5196 + }, + { + "epoch": 0.74, + "learning_rate": 5.278058645096057e-05, + "loss": 3.2366, + "step": 5197 + }, + { + "epoch": 0.74, + "learning_rate": 5.2751697241080454e-05, + "loss": 3.2196, + "step": 5198 + }, + { + "epoch": 0.74, + "learning_rate": 5.2722808031200355e-05, + "loss": 3.126, + "step": 5199 + }, + { + "epoch": 0.74, + "learning_rate": 5.269391882132024e-05, + "loss": 3.0893, + "step": 5200 + }, + { + "epoch": 0.74, + "learning_rate": 5.266502961144013e-05, + "loss": 3.2265, + "step": 5201 + }, + { + "epoch": 0.74, + "learning_rate": 5.2636140401560016e-05, + "loss": 3.0793, + "step": 5202 + }, + { + "epoch": 0.74, + "learning_rate": 5.260725119167991e-05, + "loss": 3.1028, + "step": 5203 + }, + { + "epoch": 0.74, + "learning_rate": 5.25783619817998e-05, + "loss": 3.12, + "step": 5204 + }, + { + "epoch": 0.74, + "learning_rate": 5.25494727719197e-05, + "loss": 3.0613, + "step": 5205 + }, + { + "epoch": 0.74, + "learning_rate": 5.2520583562039585e-05, + "loss": 3.0577, + "step": 5206 + }, + { + "epoch": 0.74, + "learning_rate": 5.249169435215947e-05, + "loss": 3.079, + "step": 5207 + }, + { + "epoch": 0.74, + "learning_rate": 5.246280514227936e-05, + "loss": 3.2207, + "step": 5208 + }, + { + "epoch": 0.74, + "learning_rate": 5.2433915932399246e-05, + "loss": 3.1076, + "step": 5209 + }, + { + "epoch": 0.74, + "learning_rate": 5.2405026722519146e-05, + "loss": 3.1991, + "step": 5210 + }, + { + "epoch": 0.74, + "learning_rate": 5.2376137512639033e-05, + "loss": 3.2086, + "step": 5211 + }, + { + "epoch": 0.74, + "learning_rate": 5.234724830275892e-05, + "loss": 2.8687, + "step": 5212 + }, + { + "epoch": 0.74, + "learning_rate": 5.231835909287881e-05, + "loss": 3.1321, + "step": 5213 + }, + { + "epoch": 0.74, + "learning_rate": 5.22894698829987e-05, + "loss": 3.2268, + "step": 5214 + }, + { + "epoch": 0.74, + "learning_rate": 5.2260580673118595e-05, + "loss": 3.2203, + "step": 5215 + }, + { + "epoch": 0.74, + "learning_rate": 5.223169146323849e-05, + "loss": 3.1536, + "step": 5216 + }, + { + "epoch": 0.74, + "learning_rate": 5.2202802253358376e-05, + "loss": 3.2663, + "step": 5217 + }, + { + "epoch": 0.74, + "learning_rate": 5.217391304347826e-05, + "loss": 3.2019, + "step": 5218 + }, + { + "epoch": 0.74, + "learning_rate": 5.214502383359815e-05, + "loss": 3.0641, + "step": 5219 + }, + { + "epoch": 0.74, + "learning_rate": 5.211613462371805e-05, + "loss": 3.0838, + "step": 5220 + }, + { + "epoch": 0.74, + "learning_rate": 5.208724541383794e-05, + "loss": 3.1684, + "step": 5221 + }, + { + "epoch": 0.74, + "learning_rate": 5.2058356203957825e-05, + "loss": 3.149, + "step": 5222 + }, + { + "epoch": 0.74, + "learning_rate": 5.202946699407771e-05, + "loss": 3.0806, + "step": 5223 + }, + { + "epoch": 0.74, + "learning_rate": 5.20005777841976e-05, + "loss": 3.2016, + "step": 5224 + }, + { + "epoch": 0.74, + "learning_rate": 5.197168857431749e-05, + "loss": 3.1248, + "step": 5225 + }, + { + "epoch": 0.74, + "learning_rate": 5.1942799364437387e-05, + "loss": 3.2296, + "step": 5226 + }, + { + "epoch": 0.74, + "learning_rate": 5.191391015455728e-05, + "loss": 3.1643, + "step": 5227 + }, + { + "epoch": 0.74, + "learning_rate": 5.188502094467717e-05, + "loss": 3.119, + "step": 5228 + }, + { + "epoch": 0.74, + "learning_rate": 5.1856131734797054e-05, + "loss": 3.0145, + "step": 5229 + }, + { + "epoch": 0.74, + "learning_rate": 5.182724252491694e-05, + "loss": 3.1054, + "step": 5230 + }, + { + "epoch": 0.74, + "learning_rate": 5.179835331503684e-05, + "loss": 2.9761, + "step": 5231 + }, + { + "epoch": 0.74, + "learning_rate": 5.176946410515673e-05, + "loss": 3.1658, + "step": 5232 + }, + { + "epoch": 0.75, + "learning_rate": 5.1740574895276616e-05, + "loss": 3.1663, + "step": 5233 + }, + { + "epoch": 0.75, + "learning_rate": 5.17116856853965e-05, + "loss": 3.1418, + "step": 5234 + }, + { + "epoch": 0.75, + "learning_rate": 5.168279647551639e-05, + "loss": 3.1757, + "step": 5235 + }, + { + "epoch": 0.75, + "learning_rate": 5.165390726563629e-05, + "loss": 3.2377, + "step": 5236 + }, + { + "epoch": 0.75, + "learning_rate": 5.162501805575618e-05, + "loss": 3.1921, + "step": 5237 + }, + { + "epoch": 0.75, + "learning_rate": 5.1596128845876065e-05, + "loss": 3.0378, + "step": 5238 + }, + { + "epoch": 0.75, + "learning_rate": 5.156723963599596e-05, + "loss": 3.1379, + "step": 5239 + }, + { + "epoch": 0.75, + "learning_rate": 5.1538350426115846e-05, + "loss": 3.1644, + "step": 5240 + }, + { + "epoch": 0.75, + "learning_rate": 5.1509461216235747e-05, + "loss": 3.212, + "step": 5241 + }, + { + "epoch": 0.75, + "learning_rate": 5.1480572006355634e-05, + "loss": 2.8968, + "step": 5242 + }, + { + "epoch": 0.75, + "learning_rate": 5.145168279647552e-05, + "loss": 3.2158, + "step": 5243 + }, + { + "epoch": 0.75, + "learning_rate": 5.142279358659541e-05, + "loss": 2.9899, + "step": 5244 + }, + { + "epoch": 0.75, + "learning_rate": 5.1393904376715295e-05, + "loss": 2.9403, + "step": 5245 + }, + { + "epoch": 0.75, + "learning_rate": 5.136501516683518e-05, + "loss": 3.2206, + "step": 5246 + }, + { + "epoch": 0.75, + "learning_rate": 5.133612595695508e-05, + "loss": 3.1934, + "step": 5247 + }, + { + "epoch": 0.75, + "learning_rate": 5.130723674707497e-05, + "loss": 3.1608, + "step": 5248 + }, + { + "epoch": 0.75, + "learning_rate": 5.1278347537194856e-05, + "loss": 3.0498, + "step": 5249 + }, + { + "epoch": 0.75, + "learning_rate": 5.124945832731475e-05, + "loss": 3.1471, + "step": 5250 + }, + { + "epoch": 0.75, + "learning_rate": 5.122056911743464e-05, + "loss": 3.0589, + "step": 5251 + }, + { + "epoch": 0.75, + "learning_rate": 5.119167990755454e-05, + "loss": 3.2602, + "step": 5252 + }, + { + "epoch": 0.75, + "learning_rate": 5.1162790697674425e-05, + "loss": 3.211, + "step": 5253 + }, + { + "epoch": 0.75, + "learning_rate": 5.113390148779431e-05, + "loss": 3.0975, + "step": 5254 + }, + { + "epoch": 0.75, + "learning_rate": 5.11050122779142e-05, + "loss": 3.1378, + "step": 5255 + }, + { + "epoch": 0.75, + "learning_rate": 5.1076123068034086e-05, + "loss": 3.0698, + "step": 5256 + }, + { + "epoch": 0.75, + "learning_rate": 5.104723385815399e-05, + "loss": 3.2347, + "step": 5257 + }, + { + "epoch": 0.75, + "learning_rate": 5.1018344648273874e-05, + "loss": 3.2977, + "step": 5258 + }, + { + "epoch": 0.75, + "learning_rate": 5.098945543839376e-05, + "loss": 3.1856, + "step": 5259 + }, + { + "epoch": 0.75, + "learning_rate": 5.096056622851365e-05, + "loss": 3.0422, + "step": 5260 + }, + { + "epoch": 0.75, + "learning_rate": 5.093167701863354e-05, + "loss": 3.1303, + "step": 5261 + }, + { + "epoch": 0.75, + "learning_rate": 5.0902787808753436e-05, + "loss": 3.1463, + "step": 5262 + }, + { + "epoch": 0.75, + "learning_rate": 5.087389859887333e-05, + "loss": 3.1041, + "step": 5263 + }, + { + "epoch": 0.75, + "learning_rate": 5.0845009388993216e-05, + "loss": 3.029, + "step": 5264 + }, + { + "epoch": 0.75, + "learning_rate": 5.0816120179113103e-05, + "loss": 3.1371, + "step": 5265 + }, + { + "epoch": 0.75, + "learning_rate": 5.078723096923299e-05, + "loss": 3.002, + "step": 5266 + }, + { + "epoch": 0.75, + "learning_rate": 5.075834175935288e-05, + "loss": 3.1604, + "step": 5267 + }, + { + "epoch": 0.75, + "learning_rate": 5.072945254947278e-05, + "loss": 3.1258, + "step": 5268 + }, + { + "epoch": 0.75, + "learning_rate": 5.0700563339592665e-05, + "loss": 3.0734, + "step": 5269 + }, + { + "epoch": 0.75, + "learning_rate": 5.067167412971255e-05, + "loss": 3.2201, + "step": 5270 + }, + { + "epoch": 0.75, + "learning_rate": 5.064278491983244e-05, + "loss": 3.1679, + "step": 5271 + }, + { + "epoch": 0.75, + "learning_rate": 5.061389570995233e-05, + "loss": 3.1449, + "step": 5272 + }, + { + "epoch": 0.75, + "learning_rate": 5.058500650007223e-05, + "loss": 3.2038, + "step": 5273 + }, + { + "epoch": 0.75, + "learning_rate": 5.055611729019212e-05, + "loss": 3.28, + "step": 5274 + }, + { + "epoch": 0.75, + "learning_rate": 5.052722808031201e-05, + "loss": 3.1376, + "step": 5275 + }, + { + "epoch": 0.75, + "learning_rate": 5.0498338870431895e-05, + "loss": 3.0618, + "step": 5276 + }, + { + "epoch": 0.75, + "learning_rate": 5.046944966055178e-05, + "loss": 3.1943, + "step": 5277 + }, + { + "epoch": 0.75, + "learning_rate": 5.044056045067168e-05, + "loss": 3.162, + "step": 5278 + }, + { + "epoch": 0.75, + "learning_rate": 5.041167124079157e-05, + "loss": 3.1661, + "step": 5279 + }, + { + "epoch": 0.75, + "learning_rate": 5.038278203091146e-05, + "loss": 3.2009, + "step": 5280 + }, + { + "epoch": 0.75, + "eval_loss": 3.3782970905303955, + "eval_runtime": 472.5543, + "eval_samples_per_second": 43.354, + "eval_steps_per_second": 14.451, + "step": 5280 + }, + { + "epoch": 0.75, + "learning_rate": 5.0353892821031344e-05, + "loss": 2.8974, + "step": 5281 + }, + { + "epoch": 0.75, + "learning_rate": 5.032500361115123e-05, + "loss": 3.1747, + "step": 5282 + }, + { + "epoch": 0.75, + "learning_rate": 5.029611440127113e-05, + "loss": 3.2308, + "step": 5283 + }, + { + "epoch": 0.75, + "learning_rate": 5.026722519139102e-05, + "loss": 3.224, + "step": 5284 + }, + { + "epoch": 0.75, + "learning_rate": 5.023833598151091e-05, + "loss": 3.0434, + "step": 5285 + }, + { + "epoch": 0.75, + "learning_rate": 5.02094467716308e-05, + "loss": 3.265, + "step": 5286 + }, + { + "epoch": 0.75, + "learning_rate": 5.0180557561750686e-05, + "loss": 3.176, + "step": 5287 + }, + { + "epoch": 0.75, + "learning_rate": 5.015166835187057e-05, + "loss": 3.1993, + "step": 5288 + }, + { + "epoch": 0.75, + "learning_rate": 5.0122779141990474e-05, + "loss": 3.1893, + "step": 5289 + }, + { + "epoch": 0.75, + "learning_rate": 5.009388993211036e-05, + "loss": 3.1407, + "step": 5290 + }, + { + "epoch": 0.75, + "learning_rate": 5.006500072223025e-05, + "loss": 3.1743, + "step": 5291 + }, + { + "epoch": 0.75, + "learning_rate": 5.0036111512350135e-05, + "loss": 3.0759, + "step": 5292 + }, + { + "epoch": 0.75, + "learning_rate": 5.000722230247002e-05, + "loss": 3.1738, + "step": 5293 + }, + { + "epoch": 0.75, + "learning_rate": 4.9978333092589916e-05, + "loss": 3.016, + "step": 5294 + }, + { + "epoch": 0.75, + "learning_rate": 4.994944388270981e-05, + "loss": 3.1617, + "step": 5295 + }, + { + "epoch": 0.75, + "learning_rate": 4.99205546728297e-05, + "loss": 3.0395, + "step": 5296 + }, + { + "epoch": 0.75, + "learning_rate": 4.989166546294959e-05, + "loss": 3.0232, + "step": 5297 + }, + { + "epoch": 0.75, + "learning_rate": 4.9862776253069485e-05, + "loss": 3.1016, + "step": 5298 + }, + { + "epoch": 0.75, + "learning_rate": 4.983388704318937e-05, + "loss": 3.1152, + "step": 5299 + }, + { + "epoch": 0.75, + "learning_rate": 4.9804997833309265e-05, + "loss": 3.0738, + "step": 5300 + }, + { + "epoch": 0.75, + "learning_rate": 4.977610862342915e-05, + "loss": 3.1971, + "step": 5301 + }, + { + "epoch": 0.75, + "learning_rate": 4.974721941354904e-05, + "loss": 3.1543, + "step": 5302 + }, + { + "epoch": 0.76, + "learning_rate": 4.971833020366893e-05, + "loss": 3.2111, + "step": 5303 + }, + { + "epoch": 0.76, + "learning_rate": 4.968944099378882e-05, + "loss": 3.056, + "step": 5304 + }, + { + "epoch": 0.76, + "learning_rate": 4.966055178390871e-05, + "loss": 3.0425, + "step": 5305 + }, + { + "epoch": 0.76, + "learning_rate": 4.96316625740286e-05, + "loss": 3.1703, + "step": 5306 + }, + { + "epoch": 0.76, + "learning_rate": 4.960277336414849e-05, + "loss": 3.1008, + "step": 5307 + }, + { + "epoch": 0.76, + "learning_rate": 4.957388415426838e-05, + "loss": 3.2465, + "step": 5308 + }, + { + "epoch": 0.76, + "learning_rate": 4.9544994944388276e-05, + "loss": 3.1674, + "step": 5309 + }, + { + "epoch": 0.76, + "learning_rate": 4.951610573450816e-05, + "loss": 3.0894, + "step": 5310 + }, + { + "epoch": 0.76, + "learning_rate": 4.948721652462806e-05, + "loss": 3.2196, + "step": 5311 + }, + { + "epoch": 0.76, + "learning_rate": 4.9458327314747944e-05, + "loss": 3.2325, + "step": 5312 + }, + { + "epoch": 0.76, + "learning_rate": 4.942943810486784e-05, + "loss": 3.2285, + "step": 5313 + }, + { + "epoch": 0.76, + "learning_rate": 4.9400548894987725e-05, + "loss": 3.1028, + "step": 5314 + }, + { + "epoch": 0.76, + "learning_rate": 4.937165968510761e-05, + "loss": 3.1758, + "step": 5315 + }, + { + "epoch": 0.76, + "learning_rate": 4.9342770475227506e-05, + "loss": 3.1665, + "step": 5316 + }, + { + "epoch": 0.76, + "learning_rate": 4.931388126534739e-05, + "loss": 3.0878, + "step": 5317 + }, + { + "epoch": 0.76, + "learning_rate": 4.928499205546728e-05, + "loss": 3.2068, + "step": 5318 + }, + { + "epoch": 0.76, + "learning_rate": 4.9256102845587174e-05, + "loss": 3.0782, + "step": 5319 + }, + { + "epoch": 0.76, + "learning_rate": 4.922721363570707e-05, + "loss": 3.2516, + "step": 5320 + }, + { + "epoch": 0.76, + "learning_rate": 4.919832442582696e-05, + "loss": 3.1966, + "step": 5321 + }, + { + "epoch": 0.76, + "learning_rate": 4.916943521594685e-05, + "loss": 3.1208, + "step": 5322 + }, + { + "epoch": 0.76, + "learning_rate": 4.9140546006066735e-05, + "loss": 2.9893, + "step": 5323 + }, + { + "epoch": 0.76, + "learning_rate": 4.911165679618663e-05, + "loss": 2.9886, + "step": 5324 + }, + { + "epoch": 0.76, + "learning_rate": 4.9082767586306516e-05, + "loss": 3.2782, + "step": 5325 + }, + { + "epoch": 0.76, + "learning_rate": 4.905387837642641e-05, + "loss": 2.8643, + "step": 5326 + }, + { + "epoch": 0.76, + "learning_rate": 4.90249891665463e-05, + "loss": 3.2197, + "step": 5327 + }, + { + "epoch": 0.76, + "learning_rate": 4.8996099956666184e-05, + "loss": 3.1602, + "step": 5328 + }, + { + "epoch": 0.76, + "learning_rate": 4.896721074678608e-05, + "loss": 3.2031, + "step": 5329 + }, + { + "epoch": 0.76, + "learning_rate": 4.8938321536905965e-05, + "loss": 3.152, + "step": 5330 + }, + { + "epoch": 0.76, + "learning_rate": 4.890943232702586e-05, + "loss": 3.1258, + "step": 5331 + }, + { + "epoch": 0.76, + "learning_rate": 4.888054311714575e-05, + "loss": 3.1495, + "step": 5332 + }, + { + "epoch": 0.76, + "learning_rate": 4.885165390726564e-05, + "loss": 3.0992, + "step": 5333 + }, + { + "epoch": 0.76, + "learning_rate": 4.8822764697385533e-05, + "loss": 3.0927, + "step": 5334 + }, + { + "epoch": 0.76, + "learning_rate": 4.879387548750542e-05, + "loss": 3.1619, + "step": 5335 + }, + { + "epoch": 0.76, + "learning_rate": 4.876498627762531e-05, + "loss": 3.1023, + "step": 5336 + }, + { + "epoch": 0.76, + "learning_rate": 4.87360970677452e-05, + "loss": 3.1128, + "step": 5337 + }, + { + "epoch": 0.76, + "learning_rate": 4.870720785786509e-05, + "loss": 3.104, + "step": 5338 + }, + { + "epoch": 0.76, + "learning_rate": 4.8678318647984976e-05, + "loss": 2.8925, + "step": 5339 + }, + { + "epoch": 0.76, + "learning_rate": 4.864942943810487e-05, + "loss": 3.1436, + "step": 5340 + }, + { + "epoch": 0.76, + "learning_rate": 4.8620540228224756e-05, + "loss": 3.2427, + "step": 5341 + }, + { + "epoch": 0.76, + "learning_rate": 4.859165101834465e-05, + "loss": 3.2214, + "step": 5342 + }, + { + "epoch": 0.76, + "learning_rate": 4.8562761808464544e-05, + "loss": 3.1747, + "step": 5343 + }, + { + "epoch": 0.76, + "learning_rate": 4.853387259858443e-05, + "loss": 3.2018, + "step": 5344 + }, + { + "epoch": 0.76, + "learning_rate": 4.8504983388704325e-05, + "loss": 3.1354, + "step": 5345 + }, + { + "epoch": 0.76, + "learning_rate": 4.847609417882421e-05, + "loss": 3.0884, + "step": 5346 + }, + { + "epoch": 0.76, + "learning_rate": 4.8447204968944106e-05, + "loss": 3.1204, + "step": 5347 + }, + { + "epoch": 0.76, + "learning_rate": 4.841831575906399e-05, + "loss": 3.0706, + "step": 5348 + }, + { + "epoch": 0.76, + "learning_rate": 4.838942654918388e-05, + "loss": 3.154, + "step": 5349 + }, + { + "epoch": 0.76, + "learning_rate": 4.8360537339303774e-05, + "loss": 3.1685, + "step": 5350 + }, + { + "epoch": 0.76, + "learning_rate": 4.833164812942366e-05, + "loss": 3.2801, + "step": 5351 + }, + { + "epoch": 0.76, + "learning_rate": 4.830275891954355e-05, + "loss": 3.0305, + "step": 5352 + }, + { + "epoch": 0.76, + "learning_rate": 4.827386970966344e-05, + "loss": 2.9677, + "step": 5353 + }, + { + "epoch": 0.76, + "learning_rate": 4.8244980499783335e-05, + "loss": 3.1084, + "step": 5354 + }, + { + "epoch": 0.76, + "learning_rate": 4.821609128990322e-05, + "loss": 2.9526, + "step": 5355 + }, + { + "epoch": 0.76, + "learning_rate": 4.8187202080023116e-05, + "loss": 3.0777, + "step": 5356 + }, + { + "epoch": 0.76, + "learning_rate": 4.8158312870143e-05, + "loss": 3.1239, + "step": 5357 + }, + { + "epoch": 0.76, + "learning_rate": 4.81294236602629e-05, + "loss": 3.1601, + "step": 5358 + }, + { + "epoch": 0.76, + "learning_rate": 4.8100534450382784e-05, + "loss": 3.2667, + "step": 5359 + }, + { + "epoch": 0.76, + "learning_rate": 4.807164524050267e-05, + "loss": 3.0284, + "step": 5360 + }, + { + "epoch": 0.76, + "learning_rate": 4.8042756030622565e-05, + "loss": 3.1923, + "step": 5361 + }, + { + "epoch": 0.76, + "learning_rate": 4.801386682074245e-05, + "loss": 3.1982, + "step": 5362 + }, + { + "epoch": 0.76, + "learning_rate": 4.7984977610862346e-05, + "loss": 3.2425, + "step": 5363 + }, + { + "epoch": 0.76, + "learning_rate": 4.795608840098223e-05, + "loss": 3.1461, + "step": 5364 + }, + { + "epoch": 0.76, + "learning_rate": 4.792719919110212e-05, + "loss": 3.1147, + "step": 5365 + }, + { + "epoch": 0.76, + "learning_rate": 4.7898309981222014e-05, + "loss": 3.2444, + "step": 5366 + }, + { + "epoch": 0.76, + "learning_rate": 4.786942077134191e-05, + "loss": 3.2765, + "step": 5367 + }, + { + "epoch": 0.76, + "learning_rate": 4.78405315614618e-05, + "loss": 3.2053, + "step": 5368 + }, + { + "epoch": 0.76, + "learning_rate": 4.781164235158169e-05, + "loss": 3.1532, + "step": 5369 + }, + { + "epoch": 0.76, + "learning_rate": 4.7782753141701576e-05, + "loss": 3.1784, + "step": 5370 + }, + { + "epoch": 0.76, + "learning_rate": 4.775386393182147e-05, + "loss": 3.062, + "step": 5371 + }, + { + "epoch": 0.76, + "learning_rate": 4.7724974721941357e-05, + "loss": 3.0357, + "step": 5372 + }, + { + "epoch": 0.77, + "learning_rate": 4.7696085512061244e-05, + "loss": 3.2979, + "step": 5373 + }, + { + "epoch": 0.77, + "learning_rate": 4.766719630218114e-05, + "loss": 3.0243, + "step": 5374 + }, + { + "epoch": 0.77, + "learning_rate": 4.7638307092301024e-05, + "loss": 3.2944, + "step": 5375 + }, + { + "epoch": 0.77, + "learning_rate": 4.760941788242092e-05, + "loss": 3.2016, + "step": 5376 + }, + { + "epoch": 0.77, + "learning_rate": 4.7580528672540805e-05, + "loss": 3.1149, + "step": 5377 + }, + { + "epoch": 0.77, + "learning_rate": 4.75516394626607e-05, + "loss": 3.2387, + "step": 5378 + }, + { + "epoch": 0.77, + "learning_rate": 4.752275025278059e-05, + "loss": 3.1331, + "step": 5379 + }, + { + "epoch": 0.77, + "learning_rate": 4.749386104290048e-05, + "loss": 3.1386, + "step": 5380 + }, + { + "epoch": 0.77, + "learning_rate": 4.7464971833020374e-05, + "loss": 3.1489, + "step": 5381 + }, + { + "epoch": 0.77, + "learning_rate": 4.743608262314026e-05, + "loss": 2.882, + "step": 5382 + }, + { + "epoch": 0.77, + "learning_rate": 4.740719341326015e-05, + "loss": 3.078, + "step": 5383 + }, + { + "epoch": 0.77, + "learning_rate": 4.737830420338004e-05, + "loss": 3.0792, + "step": 5384 + }, + { + "epoch": 0.77, + "learning_rate": 4.734941499349993e-05, + "loss": 3.0568, + "step": 5385 + }, + { + "epoch": 0.77, + "learning_rate": 4.7320525783619816e-05, + "loss": 3.1153, + "step": 5386 + }, + { + "epoch": 0.77, + "learning_rate": 4.729163657373971e-05, + "loss": 3.2472, + "step": 5387 + }, + { + "epoch": 0.77, + "learning_rate": 4.72627473638596e-05, + "loss": 3.1513, + "step": 5388 + }, + { + "epoch": 0.77, + "learning_rate": 4.723385815397949e-05, + "loss": 3.273, + "step": 5389 + }, + { + "epoch": 0.77, + "learning_rate": 4.7204968944099384e-05, + "loss": 3.1339, + "step": 5390 + }, + { + "epoch": 0.77, + "learning_rate": 4.717607973421927e-05, + "loss": 3.2117, + "step": 5391 + }, + { + "epoch": 0.77, + "learning_rate": 4.7147190524339165e-05, + "loss": 3.0859, + "step": 5392 + }, + { + "epoch": 0.77, + "learning_rate": 4.711830131445905e-05, + "loss": 3.0746, + "step": 5393 + }, + { + "epoch": 0.77, + "learning_rate": 4.708941210457894e-05, + "loss": 3.1658, + "step": 5394 + }, + { + "epoch": 0.77, + "learning_rate": 4.706052289469883e-05, + "loss": 3.1445, + "step": 5395 + }, + { + "epoch": 0.77, + "learning_rate": 4.703163368481872e-05, + "loss": 3.0719, + "step": 5396 + }, + { + "epoch": 0.77, + "learning_rate": 4.7002744474938614e-05, + "loss": 3.182, + "step": 5397 + }, + { + "epoch": 0.77, + "learning_rate": 4.69738552650585e-05, + "loss": 3.207, + "step": 5398 + }, + { + "epoch": 0.77, + "learning_rate": 4.694496605517839e-05, + "loss": 3.1019, + "step": 5399 + }, + { + "epoch": 0.77, + "learning_rate": 4.691607684529828e-05, + "loss": 3.1441, + "step": 5400 + }, + { + "epoch": 0.77, + "learning_rate": 4.6887187635418176e-05, + "loss": 3.1312, + "step": 5401 + }, + { + "epoch": 0.77, + "learning_rate": 4.685829842553807e-05, + "loss": 3.0884, + "step": 5402 + }, + { + "epoch": 0.77, + "learning_rate": 4.682940921565796e-05, + "loss": 3.0809, + "step": 5403 + }, + { + "epoch": 0.77, + "learning_rate": 4.6800520005777844e-05, + "loss": 2.9426, + "step": 5404 + }, + { + "epoch": 0.77, + "learning_rate": 4.677163079589774e-05, + "loss": 3.1012, + "step": 5405 + }, + { + "epoch": 0.77, + "learning_rate": 4.6742741586017625e-05, + "loss": 3.2305, + "step": 5406 + }, + { + "epoch": 0.77, + "learning_rate": 4.671385237613751e-05, + "loss": 3.169, + "step": 5407 + }, + { + "epoch": 0.77, + "learning_rate": 4.6684963166257406e-05, + "loss": 3.237, + "step": 5408 + }, + { + "epoch": 0.77, + "learning_rate": 4.665607395637729e-05, + "loss": 3.2113, + "step": 5409 + }, + { + "epoch": 0.77, + "learning_rate": 4.6627184746497186e-05, + "loss": 3.3055, + "step": 5410 + }, + { + "epoch": 0.77, + "learning_rate": 4.6598295536617073e-05, + "loss": 2.98, + "step": 5411 + }, + { + "epoch": 0.77, + "learning_rate": 4.656940632673697e-05, + "loss": 3.0582, + "step": 5412 + }, + { + "epoch": 0.77, + "learning_rate": 4.6540517116856854e-05, + "loss": 3.2416, + "step": 5413 + }, + { + "epoch": 0.77, + "learning_rate": 4.651162790697675e-05, + "loss": 3.1169, + "step": 5414 + }, + { + "epoch": 0.77, + "learning_rate": 4.6482738697096635e-05, + "loss": 3.0852, + "step": 5415 + }, + { + "epoch": 0.77, + "learning_rate": 4.645384948721653e-05, + "loss": 3.0634, + "step": 5416 + }, + { + "epoch": 0.77, + "learning_rate": 4.6424960277336416e-05, + "loss": 3.0914, + "step": 5417 + }, + { + "epoch": 0.77, + "learning_rate": 4.639607106745631e-05, + "loss": 3.1037, + "step": 5418 + }, + { + "epoch": 0.77, + "learning_rate": 4.63671818575762e-05, + "loss": 3.0749, + "step": 5419 + }, + { + "epoch": 0.77, + "learning_rate": 4.6338292647696084e-05, + "loss": 2.9571, + "step": 5420 + }, + { + "epoch": 0.77, + "learning_rate": 4.630940343781598e-05, + "loss": 3.0772, + "step": 5421 + }, + { + "epoch": 0.77, + "learning_rate": 4.6280514227935865e-05, + "loss": 3.1799, + "step": 5422 + }, + { + "epoch": 0.77, + "learning_rate": 4.625162501805576e-05, + "loss": 3.3737, + "step": 5423 + }, + { + "epoch": 0.77, + "learning_rate": 4.6222735808175646e-05, + "loss": 2.9617, + "step": 5424 + }, + { + "epoch": 0.77, + "learning_rate": 4.619384659829554e-05, + "loss": 3.2483, + "step": 5425 + }, + { + "epoch": 0.77, + "learning_rate": 4.6164957388415433e-05, + "loss": 3.2585, + "step": 5426 + }, + { + "epoch": 0.77, + "learning_rate": 4.613606817853532e-05, + "loss": 3.2683, + "step": 5427 + }, + { + "epoch": 0.77, + "learning_rate": 4.610717896865521e-05, + "loss": 3.1022, + "step": 5428 + }, + { + "epoch": 0.77, + "learning_rate": 4.60782897587751e-05, + "loss": 3.2188, + "step": 5429 + }, + { + "epoch": 0.77, + "learning_rate": 4.604940054889499e-05, + "loss": 3.1245, + "step": 5430 + }, + { + "epoch": 0.77, + "learning_rate": 4.602051133901488e-05, + "loss": 3.2285, + "step": 5431 + }, + { + "epoch": 0.77, + "learning_rate": 4.599162212913477e-05, + "loss": 3.1879, + "step": 5432 + }, + { + "epoch": 0.77, + "learning_rate": 4.5962732919254656e-05, + "loss": 3.0554, + "step": 5433 + }, + { + "epoch": 0.77, + "learning_rate": 4.593384370937455e-05, + "loss": 3.1055, + "step": 5434 + }, + { + "epoch": 0.77, + "learning_rate": 4.590495449949444e-05, + "loss": 3.2009, + "step": 5435 + }, + { + "epoch": 0.77, + "learning_rate": 4.587606528961433e-05, + "loss": 3.1747, + "step": 5436 + }, + { + "epoch": 0.77, + "learning_rate": 4.5847176079734225e-05, + "loss": 3.0973, + "step": 5437 + }, + { + "epoch": 0.77, + "learning_rate": 4.581828686985411e-05, + "loss": 3.0826, + "step": 5438 + }, + { + "epoch": 0.77, + "learning_rate": 4.5789397659974006e-05, + "loss": 3.0979, + "step": 5439 + }, + { + "epoch": 0.77, + "learning_rate": 4.576050845009389e-05, + "loss": 3.0841, + "step": 5440 + }, + { + "epoch": 0.77, + "learning_rate": 4.573161924021378e-05, + "loss": 2.9547, + "step": 5441 + }, + { + "epoch": 0.77, + "learning_rate": 4.5702730030333674e-05, + "loss": 3.1232, + "step": 5442 + }, + { + "epoch": 0.77, + "learning_rate": 4.567384082045356e-05, + "loss": 3.2677, + "step": 5443 + }, + { + "epoch": 0.78, + "learning_rate": 4.5644951610573454e-05, + "loss": 3.2299, + "step": 5444 + }, + { + "epoch": 0.78, + "learning_rate": 4.561606240069334e-05, + "loss": 3.1185, + "step": 5445 + }, + { + "epoch": 0.78, + "learning_rate": 4.558717319081323e-05, + "loss": 3.0334, + "step": 5446 + }, + { + "epoch": 0.78, + "learning_rate": 4.555828398093312e-05, + "loss": 3.1425, + "step": 5447 + }, + { + "epoch": 0.78, + "learning_rate": 4.5529394771053016e-05, + "loss": 3.058, + "step": 5448 + }, + { + "epoch": 0.78, + "learning_rate": 4.55005055611729e-05, + "loss": 3.0042, + "step": 5449 + }, + { + "epoch": 0.78, + "learning_rate": 4.54716163512928e-05, + "loss": 3.1667, + "step": 5450 + }, + { + "epoch": 0.78, + "learning_rate": 4.5442727141412684e-05, + "loss": 3.1671, + "step": 5451 + }, + { + "epoch": 0.78, + "learning_rate": 4.541383793153258e-05, + "loss": 3.0781, + "step": 5452 + }, + { + "epoch": 0.78, + "learning_rate": 4.5384948721652465e-05, + "loss": 3.0669, + "step": 5453 + }, + { + "epoch": 0.78, + "learning_rate": 4.535605951177235e-05, + "loss": 3.0942, + "step": 5454 + }, + { + "epoch": 0.78, + "learning_rate": 4.5327170301892246e-05, + "loss": 3.0422, + "step": 5455 + }, + { + "epoch": 0.78, + "learning_rate": 4.529828109201213e-05, + "loss": 3.1276, + "step": 5456 + }, + { + "epoch": 0.78, + "learning_rate": 4.526939188213203e-05, + "loss": 2.9173, + "step": 5457 + }, + { + "epoch": 0.78, + "learning_rate": 4.5240502672251914e-05, + "loss": 3.0792, + "step": 5458 + }, + { + "epoch": 0.78, + "learning_rate": 4.521161346237181e-05, + "loss": 3.1322, + "step": 5459 + }, + { + "epoch": 0.78, + "learning_rate": 4.51827242524917e-05, + "loss": 3.112, + "step": 5460 + }, + { + "epoch": 0.78, + "learning_rate": 4.515383504261159e-05, + "loss": 3.1221, + "step": 5461 + }, + { + "epoch": 0.78, + "learning_rate": 4.5124945832731476e-05, + "loss": 3.1167, + "step": 5462 + }, + { + "epoch": 0.78, + "learning_rate": 4.509605662285137e-05, + "loss": 3.2434, + "step": 5463 + }, + { + "epoch": 0.78, + "learning_rate": 4.5067167412971256e-05, + "loss": 3.0045, + "step": 5464 + }, + { + "epoch": 0.78, + "learning_rate": 4.503827820309115e-05, + "loss": 3.0463, + "step": 5465 + }, + { + "epoch": 0.78, + "learning_rate": 4.500938899321104e-05, + "loss": 2.8559, + "step": 5466 + }, + { + "epoch": 0.78, + "learning_rate": 4.4980499783330924e-05, + "loss": 3.0624, + "step": 5467 + }, + { + "epoch": 0.78, + "learning_rate": 4.495161057345082e-05, + "loss": 3.1129, + "step": 5468 + }, + { + "epoch": 0.78, + "learning_rate": 4.4922721363570705e-05, + "loss": 3.1786, + "step": 5469 + }, + { + "epoch": 0.78, + "learning_rate": 4.48938321536906e-05, + "loss": 2.9897, + "step": 5470 + }, + { + "epoch": 0.78, + "learning_rate": 4.486494294381049e-05, + "loss": 3.013, + "step": 5471 + }, + { + "epoch": 0.78, + "learning_rate": 4.483605373393038e-05, + "loss": 3.2182, + "step": 5472 + }, + { + "epoch": 0.78, + "learning_rate": 4.4807164524050274e-05, + "loss": 3.0045, + "step": 5473 + }, + { + "epoch": 0.78, + "learning_rate": 4.477827531417016e-05, + "loss": 2.9859, + "step": 5474 + }, + { + "epoch": 0.78, + "learning_rate": 4.474938610429005e-05, + "loss": 3.1631, + "step": 5475 + }, + { + "epoch": 0.78, + "learning_rate": 4.472049689440994e-05, + "loss": 3.3028, + "step": 5476 + }, + { + "epoch": 0.78, + "learning_rate": 4.469160768452983e-05, + "loss": 3.1968, + "step": 5477 + }, + { + "epoch": 0.78, + "learning_rate": 4.466271847464972e-05, + "loss": 3.1583, + "step": 5478 + }, + { + "epoch": 0.78, + "learning_rate": 4.463382926476961e-05, + "loss": 3.1995, + "step": 5479 + }, + { + "epoch": 0.78, + "learning_rate": 4.46049400548895e-05, + "loss": 3.1569, + "step": 5480 + }, + { + "epoch": 0.78, + "learning_rate": 4.457605084500939e-05, + "loss": 3.0926, + "step": 5481 + }, + { + "epoch": 0.78, + "learning_rate": 4.454716163512928e-05, + "loss": 3.1807, + "step": 5482 + }, + { + "epoch": 0.78, + "learning_rate": 4.451827242524917e-05, + "loss": 3.1208, + "step": 5483 + }, + { + "epoch": 0.78, + "learning_rate": 4.4489383215369065e-05, + "loss": 3.1641, + "step": 5484 + }, + { + "epoch": 0.78, + "learning_rate": 4.446049400548895e-05, + "loss": 3.1255, + "step": 5485 + }, + { + "epoch": 0.78, + "learning_rate": 4.4431604795608846e-05, + "loss": 3.2746, + "step": 5486 + }, + { + "epoch": 0.78, + "learning_rate": 4.440271558572873e-05, + "loss": 3.1542, + "step": 5487 + }, + { + "epoch": 0.78, + "learning_rate": 4.437382637584862e-05, + "loss": 2.9932, + "step": 5488 + }, + { + "epoch": 0.78, + "learning_rate": 4.4344937165968514e-05, + "loss": 3.2184, + "step": 5489 + }, + { + "epoch": 0.78, + "learning_rate": 4.43160479560884e-05, + "loss": 3.1889, + "step": 5490 + }, + { + "epoch": 0.78, + "learning_rate": 4.4287158746208295e-05, + "loss": 3.1113, + "step": 5491 + }, + { + "epoch": 0.78, + "learning_rate": 4.425826953632818e-05, + "loss": 3.135, + "step": 5492 + }, + { + "epoch": 0.78, + "learning_rate": 4.422938032644807e-05, + "loss": 3.2097, + "step": 5493 + }, + { + "epoch": 0.78, + "learning_rate": 4.420049111656796e-05, + "loss": 3.2467, + "step": 5494 + }, + { + "epoch": 0.78, + "learning_rate": 4.417160190668786e-05, + "loss": 3.2019, + "step": 5495 + }, + { + "epoch": 0.78, + "learning_rate": 4.4142712696807744e-05, + "loss": 3.1264, + "step": 5496 + }, + { + "epoch": 0.78, + "learning_rate": 4.411382348692764e-05, + "loss": 3.0359, + "step": 5497 + }, + { + "epoch": 0.78, + "learning_rate": 4.4084934277047525e-05, + "loss": 3.1811, + "step": 5498 + }, + { + "epoch": 0.78, + "learning_rate": 4.405604506716742e-05, + "loss": 3.1627, + "step": 5499 + }, + { + "epoch": 0.78, + "learning_rate": 4.4027155857287305e-05, + "loss": 3.0806, + "step": 5500 + }, + { + "epoch": 0.78, + "learning_rate": 4.399826664740719e-05, + "loss": 3.117, + "step": 5501 + }, + { + "epoch": 0.78, + "learning_rate": 4.3969377437527086e-05, + "loss": 3.1178, + "step": 5502 + }, + { + "epoch": 0.78, + "learning_rate": 4.394048822764697e-05, + "loss": 3.1311, + "step": 5503 + }, + { + "epoch": 0.78, + "learning_rate": 4.391159901776686e-05, + "loss": 3.1437, + "step": 5504 + }, + { + "epoch": 0.78, + "learning_rate": 4.3882709807886754e-05, + "loss": 3.1687, + "step": 5505 + }, + { + "epoch": 0.78, + "learning_rate": 4.385382059800665e-05, + "loss": 3.1019, + "step": 5506 + }, + { + "epoch": 0.78, + "learning_rate": 4.382493138812654e-05, + "loss": 3.0737, + "step": 5507 + }, + { + "epoch": 0.78, + "learning_rate": 4.379604217824643e-05, + "loss": 3.1824, + "step": 5508 + }, + { + "epoch": 0.78, + "learning_rate": 4.3767152968366316e-05, + "loss": 3.1009, + "step": 5509 + }, + { + "epoch": 0.78, + "learning_rate": 4.373826375848621e-05, + "loss": 3.096, + "step": 5510 + }, + { + "epoch": 0.78, + "learning_rate": 4.37093745486061e-05, + "loss": 3.1401, + "step": 5511 + }, + { + "epoch": 0.78, + "learning_rate": 4.368048533872599e-05, + "loss": 3.092, + "step": 5512 + }, + { + "epoch": 0.78, + "learning_rate": 4.365159612884588e-05, + "loss": 3.1106, + "step": 5513 + }, + { + "epoch": 0.79, + "learning_rate": 4.3622706918965765e-05, + "loss": 3.0776, + "step": 5514 + }, + { + "epoch": 0.79, + "learning_rate": 4.359381770908566e-05, + "loss": 2.9404, + "step": 5515 + }, + { + "epoch": 0.79, + "learning_rate": 4.3564928499205546e-05, + "loss": 3.2286, + "step": 5516 + }, + { + "epoch": 0.79, + "learning_rate": 4.353603928932544e-05, + "loss": 3.0828, + "step": 5517 + }, + { + "epoch": 0.79, + "learning_rate": 4.350715007944533e-05, + "loss": 2.9596, + "step": 5518 + }, + { + "epoch": 0.79, + "learning_rate": 4.347826086956522e-05, + "loss": 3.0728, + "step": 5519 + }, + { + "epoch": 0.79, + "learning_rate": 4.3449371659685114e-05, + "loss": 3.1399, + "step": 5520 + }, + { + "epoch": 0.79, + "learning_rate": 4.3420482449805e-05, + "loss": 3.0716, + "step": 5521 + }, + { + "epoch": 0.79, + "learning_rate": 4.339159323992489e-05, + "loss": 3.1167, + "step": 5522 + }, + { + "epoch": 0.79, + "learning_rate": 4.336270403004478e-05, + "loss": 3.0735, + "step": 5523 + }, + { + "epoch": 0.79, + "learning_rate": 4.333381482016467e-05, + "loss": 3.0712, + "step": 5524 + }, + { + "epoch": 0.79, + "learning_rate": 4.330492561028456e-05, + "loss": 3.264, + "step": 5525 + }, + { + "epoch": 0.79, + "learning_rate": 4.327603640040445e-05, + "loss": 3.0417, + "step": 5526 + }, + { + "epoch": 0.79, + "learning_rate": 4.324714719052434e-05, + "loss": 3.154, + "step": 5527 + }, + { + "epoch": 0.79, + "learning_rate": 4.321825798064423e-05, + "loss": 3.085, + "step": 5528 + }, + { + "epoch": 0.79, + "learning_rate": 4.3189368770764125e-05, + "loss": 3.2064, + "step": 5529 + }, + { + "epoch": 0.79, + "learning_rate": 4.316047956088401e-05, + "loss": 3.1593, + "step": 5530 + }, + { + "epoch": 0.79, + "learning_rate": 4.3131590351003906e-05, + "loss": 3.197, + "step": 5531 + }, + { + "epoch": 0.79, + "learning_rate": 4.310270114112379e-05, + "loss": 2.9919, + "step": 5532 + }, + { + "epoch": 0.79, + "learning_rate": 4.3073811931243686e-05, + "loss": 3.201, + "step": 5533 + }, + { + "epoch": 0.79, + "learning_rate": 4.3044922721363574e-05, + "loss": 2.9411, + "step": 5534 + }, + { + "epoch": 0.79, + "learning_rate": 4.301603351148346e-05, + "loss": 3.051, + "step": 5535 + }, + { + "epoch": 0.79, + "learning_rate": 4.2987144301603354e-05, + "loss": 3.1746, + "step": 5536 + }, + { + "epoch": 0.79, + "learning_rate": 4.295825509172324e-05, + "loss": 3.1132, + "step": 5537 + }, + { + "epoch": 0.79, + "learning_rate": 4.292936588184313e-05, + "loss": 3.1978, + "step": 5538 + }, + { + "epoch": 0.79, + "learning_rate": 4.290047667196302e-05, + "loss": 2.9316, + "step": 5539 + }, + { + "epoch": 0.79, + "learning_rate": 4.287158746208291e-05, + "loss": 3.1084, + "step": 5540 + }, + { + "epoch": 0.79, + "learning_rate": 4.28426982522028e-05, + "loss": 3.1946, + "step": 5541 + }, + { + "epoch": 0.79, + "learning_rate": 4.28138090423227e-05, + "loss": 3.1314, + "step": 5542 + }, + { + "epoch": 0.79, + "learning_rate": 4.2784919832442584e-05, + "loss": 2.9638, + "step": 5543 + }, + { + "epoch": 0.79, + "learning_rate": 4.275603062256248e-05, + "loss": 3.1317, + "step": 5544 + }, + { + "epoch": 0.79, + "learning_rate": 4.2727141412682365e-05, + "loss": 3.0273, + "step": 5545 + }, + { + "epoch": 0.79, + "learning_rate": 4.269825220280226e-05, + "loss": 3.1777, + "step": 5546 + }, + { + "epoch": 0.79, + "learning_rate": 4.2669362992922146e-05, + "loss": 3.1611, + "step": 5547 + }, + { + "epoch": 0.79, + "learning_rate": 4.264047378304203e-05, + "loss": 3.0826, + "step": 5548 + }, + { + "epoch": 0.79, + "learning_rate": 4.261158457316193e-05, + "loss": 3.1074, + "step": 5549 + }, + { + "epoch": 0.79, + "learning_rate": 4.2582695363281814e-05, + "loss": 3.1507, + "step": 5550 + }, + { + "epoch": 0.79, + "learning_rate": 4.25538061534017e-05, + "loss": 2.9495, + "step": 5551 + }, + { + "epoch": 0.79, + "learning_rate": 4.2524916943521595e-05, + "loss": 3.155, + "step": 5552 + }, + { + "epoch": 0.79, + "learning_rate": 4.249602773364149e-05, + "loss": 3.105, + "step": 5553 + }, + { + "epoch": 0.79, + "learning_rate": 4.246713852376138e-05, + "loss": 3.1968, + "step": 5554 + }, + { + "epoch": 0.79, + "learning_rate": 4.243824931388127e-05, + "loss": 3.0571, + "step": 5555 + }, + { + "epoch": 0.79, + "learning_rate": 4.2409360104001156e-05, + "loss": 3.086, + "step": 5556 + }, + { + "epoch": 0.79, + "learning_rate": 4.238047089412105e-05, + "loss": 3.1495, + "step": 5557 + }, + { + "epoch": 0.79, + "learning_rate": 4.235158168424094e-05, + "loss": 3.0802, + "step": 5558 + }, + { + "epoch": 0.79, + "learning_rate": 4.2322692474360824e-05, + "loss": 3.2062, + "step": 5559 + }, + { + "epoch": 0.79, + "learning_rate": 4.229380326448072e-05, + "loss": 2.9959, + "step": 5560 + }, + { + "epoch": 0.79, + "learning_rate": 4.2264914054600605e-05, + "loss": 3.0546, + "step": 5561 + }, + { + "epoch": 0.79, + "learning_rate": 4.22360248447205e-05, + "loss": 3.0392, + "step": 5562 + }, + { + "epoch": 0.79, + "learning_rate": 4.2207135634840386e-05, + "loss": 3.0652, + "step": 5563 + }, + { + "epoch": 0.79, + "learning_rate": 4.217824642496028e-05, + "loss": 2.9144, + "step": 5564 + }, + { + "epoch": 0.79, + "learning_rate": 4.2149357215080174e-05, + "loss": 2.9547, + "step": 5565 + }, + { + "epoch": 0.79, + "learning_rate": 4.212046800520006e-05, + "loss": 3.2341, + "step": 5566 + }, + { + "epoch": 0.79, + "learning_rate": 4.2091578795319955e-05, + "loss": 3.0326, + "step": 5567 + }, + { + "epoch": 0.79, + "learning_rate": 4.206268958543984e-05, + "loss": 3.1595, + "step": 5568 + }, + { + "epoch": 0.79, + "learning_rate": 4.203380037555973e-05, + "loss": 3.2042, + "step": 5569 + }, + { + "epoch": 0.79, + "learning_rate": 4.200491116567962e-05, + "loss": 3.252, + "step": 5570 + }, + { + "epoch": 0.79, + "learning_rate": 4.197602195579951e-05, + "loss": 3.0884, + "step": 5571 + }, + { + "epoch": 0.79, + "learning_rate": 4.1947132745919397e-05, + "loss": 3.1607, + "step": 5572 + }, + { + "epoch": 0.79, + "learning_rate": 4.191824353603929e-05, + "loss": 3.0982, + "step": 5573 + }, + { + "epoch": 0.79, + "learning_rate": 4.188935432615918e-05, + "loss": 3.1095, + "step": 5574 + }, + { + "epoch": 0.79, + "learning_rate": 4.186046511627907e-05, + "loss": 3.0324, + "step": 5575 + }, + { + "epoch": 0.79, + "learning_rate": 4.1831575906398965e-05, + "loss": 3.0945, + "step": 5576 + }, + { + "epoch": 0.79, + "learning_rate": 4.180268669651885e-05, + "loss": 3.096, + "step": 5577 + }, + { + "epoch": 0.79, + "learning_rate": 4.1773797486638746e-05, + "loss": 3.102, + "step": 5578 + }, + { + "epoch": 0.79, + "learning_rate": 4.174490827675863e-05, + "loss": 3.1228, + "step": 5579 + }, + { + "epoch": 0.79, + "learning_rate": 4.171601906687853e-05, + "loss": 3.1557, + "step": 5580 + }, + { + "epoch": 0.79, + "learning_rate": 4.1687129856998414e-05, + "loss": 3.12, + "step": 5581 + }, + { + "epoch": 0.79, + "learning_rate": 4.16582406471183e-05, + "loss": 3.3135, + "step": 5582 + }, + { + "epoch": 0.79, + "learning_rate": 4.1629351437238195e-05, + "loss": 3.0905, + "step": 5583 + }, + { + "epoch": 0.8, + "learning_rate": 4.160046222735808e-05, + "loss": 3.2271, + "step": 5584 + }, + { + "epoch": 0.8, + "learning_rate": 4.157157301747797e-05, + "loss": 3.1322, + "step": 5585 + }, + { + "epoch": 0.8, + "learning_rate": 4.154268380759786e-05, + "loss": 3.0577, + "step": 5586 + }, + { + "epoch": 0.8, + "learning_rate": 4.1513794597717757e-05, + "loss": 2.9445, + "step": 5587 + }, + { + "epoch": 0.8, + "learning_rate": 4.148490538783765e-05, + "loss": 3.1296, + "step": 5588 + }, + { + "epoch": 0.8, + "learning_rate": 4.145601617795754e-05, + "loss": 3.1376, + "step": 5589 + }, + { + "epoch": 0.8, + "learning_rate": 4.1427126968077424e-05, + "loss": 3.1108, + "step": 5590 + }, + { + "epoch": 0.8, + "learning_rate": 4.139823775819732e-05, + "loss": 3.1172, + "step": 5591 + }, + { + "epoch": 0.8, + "learning_rate": 4.1369348548317205e-05, + "loss": 3.1765, + "step": 5592 + }, + { + "epoch": 0.8, + "learning_rate": 4.134045933843709e-05, + "loss": 3.1219, + "step": 5593 + }, + { + "epoch": 0.8, + "learning_rate": 4.1311570128556986e-05, + "loss": 3.1936, + "step": 5594 + }, + { + "epoch": 0.8, + "learning_rate": 4.128268091867687e-05, + "loss": 3.1505, + "step": 5595 + }, + { + "epoch": 0.8, + "learning_rate": 4.125379170879677e-05, + "loss": 3.1478, + "step": 5596 + }, + { + "epoch": 0.8, + "learning_rate": 4.1224902498916654e-05, + "loss": 3.1023, + "step": 5597 + }, + { + "epoch": 0.8, + "learning_rate": 4.119601328903655e-05, + "loss": 3.0069, + "step": 5598 + }, + { + "epoch": 0.8, + "learning_rate": 4.1167124079156435e-05, + "loss": 3.054, + "step": 5599 + }, + { + "epoch": 0.8, + "learning_rate": 4.113823486927633e-05, + "loss": 2.9706, + "step": 5600 + }, + { + "epoch": 0.8, + "learning_rate": 4.110934565939622e-05, + "loss": 3.152, + "step": 5601 + }, + { + "epoch": 0.8, + "learning_rate": 4.108045644951611e-05, + "loss": 3.076, + "step": 5602 + }, + { + "epoch": 0.8, + "learning_rate": 4.1051567239636e-05, + "loss": 3.2783, + "step": 5603 + }, + { + "epoch": 0.8, + "learning_rate": 4.102267802975589e-05, + "loss": 3.0151, + "step": 5604 + }, + { + "epoch": 0.8, + "learning_rate": 4.099378881987578e-05, + "loss": 3.1927, + "step": 5605 + }, + { + "epoch": 0.8, + "learning_rate": 4.0964899609995665e-05, + "loss": 3.0818, + "step": 5606 + }, + { + "epoch": 0.8, + "learning_rate": 4.093601040011556e-05, + "loss": 3.0521, + "step": 5607 + }, + { + "epoch": 0.8, + "learning_rate": 4.0907121190235446e-05, + "loss": 3.1366, + "step": 5608 + }, + { + "epoch": 0.8, + "learning_rate": 4.087823198035534e-05, + "loss": 3.1397, + "step": 5609 + }, + { + "epoch": 0.8, + "learning_rate": 4.0849342770475226e-05, + "loss": 3.1935, + "step": 5610 + }, + { + "epoch": 0.8, + "learning_rate": 4.082045356059512e-05, + "loss": 3.1474, + "step": 5611 + }, + { + "epoch": 0.8, + "learning_rate": 4.0791564350715014e-05, + "loss": 3.0972, + "step": 5612 + }, + { + "epoch": 0.8, + "learning_rate": 4.07626751408349e-05, + "loss": 3.0839, + "step": 5613 + }, + { + "epoch": 0.8, + "learning_rate": 4.073378593095479e-05, + "loss": 3.0802, + "step": 5614 + }, + { + "epoch": 0.8, + "learning_rate": 4.070489672107468e-05, + "loss": 3.189, + "step": 5615 + }, + { + "epoch": 0.8, + "learning_rate": 4.067600751119457e-05, + "loss": 3.0959, + "step": 5616 + }, + { + "epoch": 0.8, + "learning_rate": 4.064711830131446e-05, + "loss": 3.2138, + "step": 5617 + }, + { + "epoch": 0.8, + "learning_rate": 4.061822909143435e-05, + "loss": 2.9459, + "step": 5618 + }, + { + "epoch": 0.8, + "learning_rate": 4.058933988155424e-05, + "loss": 3.188, + "step": 5619 + }, + { + "epoch": 0.8, + "learning_rate": 4.056045067167413e-05, + "loss": 3.2137, + "step": 5620 + }, + { + "epoch": 0.8, + "learning_rate": 4.053156146179402e-05, + "loss": 3.0826, + "step": 5621 + }, + { + "epoch": 0.8, + "learning_rate": 4.050267225191391e-05, + "loss": 3.2074, + "step": 5622 + }, + { + "epoch": 0.8, + "learning_rate": 4.0473783042033806e-05, + "loss": 3.1752, + "step": 5623 + }, + { + "epoch": 0.8, + "learning_rate": 4.044489383215369e-05, + "loss": 3.0614, + "step": 5624 + }, + { + "epoch": 0.8, + "learning_rate": 4.0416004622273586e-05, + "loss": 3.1443, + "step": 5625 + }, + { + "epoch": 0.8, + "learning_rate": 4.0387115412393473e-05, + "loss": 3.1138, + "step": 5626 + }, + { + "epoch": 0.8, + "learning_rate": 4.035822620251336e-05, + "loss": 3.0843, + "step": 5627 + }, + { + "epoch": 0.8, + "learning_rate": 4.0329336992633254e-05, + "loss": 3.1314, + "step": 5628 + }, + { + "epoch": 0.8, + "learning_rate": 4.030044778275314e-05, + "loss": 3.0496, + "step": 5629 + }, + { + "epoch": 0.8, + "learning_rate": 4.0271558572873035e-05, + "loss": 3.2638, + "step": 5630 + }, + { + "epoch": 0.8, + "learning_rate": 4.024266936299292e-05, + "loss": 3.176, + "step": 5631 + }, + { + "epoch": 0.8, + "learning_rate": 4.021378015311281e-05, + "loss": 3.1037, + "step": 5632 + }, + { + "epoch": 0.8, + "eval_loss": 3.35094952583313, + "eval_runtime": 472.6533, + "eval_samples_per_second": 43.345, + "eval_steps_per_second": 14.448, + "step": 5632 + }, + { + "epoch": 0.8, + "learning_rate": 4.01848909432327e-05, + "loss": 3.1948, + "step": 5633 + }, + { + "epoch": 0.8, + "learning_rate": 4.01560017333526e-05, + "loss": 3.1198, + "step": 5634 + }, + { + "epoch": 0.8, + "learning_rate": 4.012711252347249e-05, + "loss": 3.2102, + "step": 5635 + }, + { + "epoch": 0.8, + "learning_rate": 4.009822331359238e-05, + "loss": 3.1267, + "step": 5636 + }, + { + "epoch": 0.8, + "learning_rate": 4.0069334103712265e-05, + "loss": 3.1205, + "step": 5637 + }, + { + "epoch": 0.8, + "learning_rate": 4.004044489383216e-05, + "loss": 3.1644, + "step": 5638 + }, + { + "epoch": 0.8, + "learning_rate": 4.0011555683952046e-05, + "loss": 3.211, + "step": 5639 + }, + { + "epoch": 0.8, + "learning_rate": 3.998266647407193e-05, + "loss": 3.1573, + "step": 5640 + }, + { + "epoch": 0.8, + "learning_rate": 3.995377726419183e-05, + "loss": 3.0154, + "step": 5641 + }, + { + "epoch": 0.8, + "learning_rate": 3.9924888054311714e-05, + "loss": 2.8478, + "step": 5642 + }, + { + "epoch": 0.8, + "learning_rate": 3.989599884443161e-05, + "loss": 3.2054, + "step": 5643 + }, + { + "epoch": 0.8, + "learning_rate": 3.9867109634551495e-05, + "loss": 3.1459, + "step": 5644 + }, + { + "epoch": 0.8, + "learning_rate": 3.983822042467139e-05, + "loss": 3.2107, + "step": 5645 + }, + { + "epoch": 0.8, + "learning_rate": 3.980933121479128e-05, + "loss": 3.0071, + "step": 5646 + }, + { + "epoch": 0.8, + "learning_rate": 3.978044200491117e-05, + "loss": 3.0957, + "step": 5647 + }, + { + "epoch": 0.8, + "learning_rate": 3.9751552795031056e-05, + "loss": 3.1346, + "step": 5648 + }, + { + "epoch": 0.8, + "learning_rate": 3.972266358515095e-05, + "loss": 3.0385, + "step": 5649 + }, + { + "epoch": 0.8, + "learning_rate": 3.969377437527084e-05, + "loss": 2.986, + "step": 5650 + }, + { + "epoch": 0.8, + "learning_rate": 3.966488516539073e-05, + "loss": 3.1062, + "step": 5651 + }, + { + "epoch": 0.8, + "learning_rate": 3.963599595551062e-05, + "loss": 3.1744, + "step": 5652 + }, + { + "epoch": 0.8, + "learning_rate": 3.9607106745630505e-05, + "loss": 3.0897, + "step": 5653 + }, + { + "epoch": 0.81, + "learning_rate": 3.95782175357504e-05, + "loss": 3.2293, + "step": 5654 + }, + { + "epoch": 0.81, + "learning_rate": 3.9549328325870286e-05, + "loss": 3.1602, + "step": 5655 + }, + { + "epoch": 0.81, + "learning_rate": 3.952043911599018e-05, + "loss": 3.1055, + "step": 5656 + }, + { + "epoch": 0.81, + "learning_rate": 3.949154990611007e-05, + "loss": 2.9947, + "step": 5657 + }, + { + "epoch": 0.81, + "learning_rate": 3.946266069622996e-05, + "loss": 3.0883, + "step": 5658 + }, + { + "epoch": 0.81, + "learning_rate": 3.9433771486349854e-05, + "loss": 3.0944, + "step": 5659 + }, + { + "epoch": 0.81, + "learning_rate": 3.940488227646974e-05, + "loss": 3.1408, + "step": 5660 + }, + { + "epoch": 0.81, + "learning_rate": 3.937599306658963e-05, + "loss": 3.1542, + "step": 5661 + }, + { + "epoch": 0.81, + "learning_rate": 3.934710385670952e-05, + "loss": 3.1758, + "step": 5662 + }, + { + "epoch": 0.81, + "learning_rate": 3.931821464682941e-05, + "loss": 3.0643, + "step": 5663 + }, + { + "epoch": 0.81, + "learning_rate": 3.92893254369493e-05, + "loss": 3.2559, + "step": 5664 + }, + { + "epoch": 0.81, + "learning_rate": 3.926043622706919e-05, + "loss": 3.1238, + "step": 5665 + }, + { + "epoch": 0.81, + "learning_rate": 3.923154701718908e-05, + "loss": 3.1534, + "step": 5666 + }, + { + "epoch": 0.81, + "learning_rate": 3.920265780730897e-05, + "loss": 3.1156, + "step": 5667 + }, + { + "epoch": 0.81, + "learning_rate": 3.917376859742886e-05, + "loss": 3.1027, + "step": 5668 + }, + { + "epoch": 0.81, + "learning_rate": 3.914487938754875e-05, + "loss": 3.1216, + "step": 5669 + }, + { + "epoch": 0.81, + "learning_rate": 3.9115990177668646e-05, + "loss": 3.0997, + "step": 5670 + }, + { + "epoch": 0.81, + "learning_rate": 3.908710096778853e-05, + "loss": 3.183, + "step": 5671 + }, + { + "epoch": 0.81, + "learning_rate": 3.905821175790843e-05, + "loss": 3.1182, + "step": 5672 + }, + { + "epoch": 0.81, + "learning_rate": 3.9029322548028314e-05, + "loss": 3.1436, + "step": 5673 + }, + { + "epoch": 0.81, + "learning_rate": 3.90004333381482e-05, + "loss": 3.1885, + "step": 5674 + }, + { + "epoch": 0.81, + "learning_rate": 3.8971544128268095e-05, + "loss": 3.1091, + "step": 5675 + }, + { + "epoch": 0.81, + "learning_rate": 3.894265491838798e-05, + "loss": 3.0086, + "step": 5676 + }, + { + "epoch": 0.81, + "learning_rate": 3.8913765708507876e-05, + "loss": 3.1322, + "step": 5677 + }, + { + "epoch": 0.81, + "learning_rate": 3.888487649862776e-05, + "loss": 3.086, + "step": 5678 + }, + { + "epoch": 0.81, + "learning_rate": 3.885598728874765e-05, + "loss": 3.1215, + "step": 5679 + }, + { + "epoch": 0.81, + "learning_rate": 3.8827098078867544e-05, + "loss": 3.0616, + "step": 5680 + }, + { + "epoch": 0.81, + "learning_rate": 3.879820886898744e-05, + "loss": 3.3159, + "step": 5681 + }, + { + "epoch": 0.81, + "learning_rate": 3.8769319659107324e-05, + "loss": 3.1251, + "step": 5682 + }, + { + "epoch": 0.81, + "learning_rate": 3.874043044922722e-05, + "loss": 3.1339, + "step": 5683 + }, + { + "epoch": 0.81, + "learning_rate": 3.8711541239347105e-05, + "loss": 3.2388, + "step": 5684 + }, + { + "epoch": 0.81, + "learning_rate": 3.8682652029467e-05, + "loss": 3.132, + "step": 5685 + }, + { + "epoch": 0.81, + "learning_rate": 3.8653762819586886e-05, + "loss": 3.1474, + "step": 5686 + }, + { + "epoch": 0.81, + "learning_rate": 3.862487360970677e-05, + "loss": 3.1231, + "step": 5687 + }, + { + "epoch": 0.81, + "learning_rate": 3.859598439982667e-05, + "loss": 3.1913, + "step": 5688 + }, + { + "epoch": 0.81, + "learning_rate": 3.8567095189946554e-05, + "loss": 3.2096, + "step": 5689 + }, + { + "epoch": 0.81, + "learning_rate": 3.853820598006645e-05, + "loss": 3.144, + "step": 5690 + }, + { + "epoch": 0.81, + "learning_rate": 3.8509316770186335e-05, + "loss": 3.079, + "step": 5691 + }, + { + "epoch": 0.81, + "learning_rate": 3.848042756030623e-05, + "loss": 3.1755, + "step": 5692 + }, + { + "epoch": 0.81, + "learning_rate": 3.845153835042612e-05, + "loss": 3.1755, + "step": 5693 + }, + { + "epoch": 0.81, + "learning_rate": 3.842264914054601e-05, + "loss": 3.0532, + "step": 5694 + }, + { + "epoch": 0.81, + "learning_rate": 3.83937599306659e-05, + "loss": 3.1325, + "step": 5695 + }, + { + "epoch": 0.81, + "learning_rate": 3.836487072078579e-05, + "loss": 3.0894, + "step": 5696 + }, + { + "epoch": 0.81, + "learning_rate": 3.833598151090568e-05, + "loss": 2.9988, + "step": 5697 + }, + { + "epoch": 0.81, + "learning_rate": 3.830709230102557e-05, + "loss": 2.995, + "step": 5698 + }, + { + "epoch": 0.81, + "learning_rate": 3.827820309114546e-05, + "loss": 3.0924, + "step": 5699 + }, + { + "epoch": 0.81, + "learning_rate": 3.8249313881265345e-05, + "loss": 3.1804, + "step": 5700 + }, + { + "epoch": 0.81, + "learning_rate": 3.822042467138524e-05, + "loss": 3.1738, + "step": 5701 + }, + { + "epoch": 0.81, + "learning_rate": 3.8191535461505126e-05, + "loss": 2.9256, + "step": 5702 + }, + { + "epoch": 0.81, + "learning_rate": 3.816264625162502e-05, + "loss": 3.1161, + "step": 5703 + }, + { + "epoch": 0.81, + "learning_rate": 3.8133757041744914e-05, + "loss": 3.2177, + "step": 5704 + }, + { + "epoch": 0.81, + "learning_rate": 3.81048678318648e-05, + "loss": 2.9096, + "step": 5705 + }, + { + "epoch": 0.81, + "learning_rate": 3.8075978621984695e-05, + "loss": 3.1869, + "step": 5706 + }, + { + "epoch": 0.81, + "learning_rate": 3.804708941210458e-05, + "loss": 2.9959, + "step": 5707 + }, + { + "epoch": 0.81, + "learning_rate": 3.801820020222447e-05, + "loss": 3.15, + "step": 5708 + }, + { + "epoch": 0.81, + "learning_rate": 3.798931099234436e-05, + "loss": 3.203, + "step": 5709 + }, + { + "epoch": 0.81, + "learning_rate": 3.796042178246425e-05, + "loss": 3.1453, + "step": 5710 + }, + { + "epoch": 0.81, + "learning_rate": 3.7931532572584144e-05, + "loss": 3.0416, + "step": 5711 + }, + { + "epoch": 0.81, + "learning_rate": 3.790264336270403e-05, + "loss": 3.1197, + "step": 5712 + }, + { + "epoch": 0.81, + "learning_rate": 3.787375415282392e-05, + "loss": 3.0855, + "step": 5713 + }, + { + "epoch": 0.81, + "learning_rate": 3.784486494294381e-05, + "loss": 3.2529, + "step": 5714 + }, + { + "epoch": 0.81, + "learning_rate": 3.7815975733063705e-05, + "loss": 3.1082, + "step": 5715 + }, + { + "epoch": 0.81, + "learning_rate": 3.778708652318359e-05, + "loss": 3.0494, + "step": 5716 + }, + { + "epoch": 0.81, + "learning_rate": 3.7758197313303486e-05, + "loss": 3.1625, + "step": 5717 + }, + { + "epoch": 0.81, + "learning_rate": 3.772930810342337e-05, + "loss": 3.1332, + "step": 5718 + }, + { + "epoch": 0.81, + "learning_rate": 3.770041889354327e-05, + "loss": 2.9911, + "step": 5719 + }, + { + "epoch": 0.81, + "learning_rate": 3.7671529683663154e-05, + "loss": 3.1648, + "step": 5720 + }, + { + "epoch": 0.81, + "learning_rate": 3.764264047378304e-05, + "loss": 3.1478, + "step": 5721 + }, + { + "epoch": 0.81, + "learning_rate": 3.7613751263902935e-05, + "loss": 3.2288, + "step": 5722 + }, + { + "epoch": 0.81, + "learning_rate": 3.758486205402282e-05, + "loss": 3.0045, + "step": 5723 + }, + { + "epoch": 0.81, + "learning_rate": 3.7555972844142716e-05, + "loss": 3.1102, + "step": 5724 + }, + { + "epoch": 0.82, + "learning_rate": 3.75270836342626e-05, + "loss": 3.0512, + "step": 5725 + }, + { + "epoch": 0.82, + "learning_rate": 3.749819442438249e-05, + "loss": 3.0181, + "step": 5726 + }, + { + "epoch": 0.82, + "learning_rate": 3.7469305214502384e-05, + "loss": 3.0271, + "step": 5727 + }, + { + "epoch": 0.82, + "learning_rate": 3.744041600462228e-05, + "loss": 3.019, + "step": 5728 + }, + { + "epoch": 0.82, + "learning_rate": 3.7411526794742165e-05, + "loss": 3.1775, + "step": 5729 + }, + { + "epoch": 0.82, + "learning_rate": 3.738263758486206e-05, + "loss": 3.1779, + "step": 5730 + }, + { + "epoch": 0.82, + "learning_rate": 3.7353748374981946e-05, + "loss": 2.8991, + "step": 5731 + }, + { + "epoch": 0.82, + "learning_rate": 3.732485916510184e-05, + "loss": 3.1068, + "step": 5732 + }, + { + "epoch": 0.82, + "learning_rate": 3.7295969955221727e-05, + "loss": 2.9594, + "step": 5733 + }, + { + "epoch": 0.82, + "learning_rate": 3.7267080745341614e-05, + "loss": 3.0418, + "step": 5734 + }, + { + "epoch": 0.82, + "learning_rate": 3.723819153546151e-05, + "loss": 3.0018, + "step": 5735 + }, + { + "epoch": 0.82, + "learning_rate": 3.7209302325581394e-05, + "loss": 3.1529, + "step": 5736 + }, + { + "epoch": 0.82, + "learning_rate": 3.718041311570128e-05, + "loss": 3.0914, + "step": 5737 + }, + { + "epoch": 0.82, + "learning_rate": 3.7151523905821175e-05, + "loss": 3.1752, + "step": 5738 + }, + { + "epoch": 0.82, + "learning_rate": 3.712263469594107e-05, + "loss": 3.0554, + "step": 5739 + }, + { + "epoch": 0.82, + "learning_rate": 3.709374548606096e-05, + "loss": 3.147, + "step": 5740 + }, + { + "epoch": 0.82, + "learning_rate": 3.706485627618085e-05, + "loss": 3.1708, + "step": 5741 + }, + { + "epoch": 0.82, + "learning_rate": 3.703596706630074e-05, + "loss": 3.1171, + "step": 5742 + }, + { + "epoch": 0.82, + "learning_rate": 3.700707785642063e-05, + "loss": 3.1213, + "step": 5743 + }, + { + "epoch": 0.82, + "learning_rate": 3.697818864654052e-05, + "loss": 3.188, + "step": 5744 + }, + { + "epoch": 0.82, + "learning_rate": 3.694929943666041e-05, + "loss": 2.901, + "step": 5745 + }, + { + "epoch": 0.82, + "learning_rate": 3.69204102267803e-05, + "loss": 3.0635, + "step": 5746 + }, + { + "epoch": 0.82, + "learning_rate": 3.6891521016900186e-05, + "loss": 3.1305, + "step": 5747 + }, + { + "epoch": 0.82, + "learning_rate": 3.686263180702008e-05, + "loss": 3.0389, + "step": 5748 + }, + { + "epoch": 0.82, + "learning_rate": 3.683374259713997e-05, + "loss": 3.125, + "step": 5749 + }, + { + "epoch": 0.82, + "learning_rate": 3.680485338725986e-05, + "loss": 3.0956, + "step": 5750 + }, + { + "epoch": 0.82, + "learning_rate": 3.6775964177379754e-05, + "loss": 3.1115, + "step": 5751 + }, + { + "epoch": 0.82, + "learning_rate": 3.674707496749964e-05, + "loss": 3.0745, + "step": 5752 + }, + { + "epoch": 0.82, + "learning_rate": 3.6718185757619535e-05, + "loss": 3.2283, + "step": 5753 + }, + { + "epoch": 0.82, + "learning_rate": 3.668929654773942e-05, + "loss": 3.2078, + "step": 5754 + }, + { + "epoch": 0.82, + "learning_rate": 3.666040733785931e-05, + "loss": 3.0762, + "step": 5755 + }, + { + "epoch": 0.82, + "learning_rate": 3.66315181279792e-05, + "loss": 3.127, + "step": 5756 + }, + { + "epoch": 0.82, + "learning_rate": 3.660262891809909e-05, + "loss": 3.0701, + "step": 5757 + }, + { + "epoch": 0.82, + "learning_rate": 3.657373970821898e-05, + "loss": 3.0519, + "step": 5758 + }, + { + "epoch": 0.82, + "learning_rate": 3.654485049833887e-05, + "loss": 3.0686, + "step": 5759 + }, + { + "epoch": 0.82, + "learning_rate": 3.651596128845876e-05, + "loss": 3.1107, + "step": 5760 + }, + { + "epoch": 0.82, + "learning_rate": 3.648707207857865e-05, + "loss": 3.004, + "step": 5761 + }, + { + "epoch": 0.82, + "learning_rate": 3.6458182868698546e-05, + "loss": 2.9062, + "step": 5762 + }, + { + "epoch": 0.82, + "learning_rate": 3.642929365881843e-05, + "loss": 2.9686, + "step": 5763 + }, + { + "epoch": 0.82, + "learning_rate": 3.640040444893833e-05, + "loss": 3.1951, + "step": 5764 + }, + { + "epoch": 0.82, + "learning_rate": 3.6371515239058214e-05, + "loss": 3.1462, + "step": 5765 + }, + { + "epoch": 0.82, + "learning_rate": 3.634262602917811e-05, + "loss": 3.2414, + "step": 5766 + }, + { + "epoch": 0.82, + "learning_rate": 3.6313736819297995e-05, + "loss": 3.0106, + "step": 5767 + }, + { + "epoch": 0.82, + "learning_rate": 3.628484760941788e-05, + "loss": 3.2559, + "step": 5768 + }, + { + "epoch": 0.82, + "learning_rate": 3.6255958399537775e-05, + "loss": 3.1398, + "step": 5769 + }, + { + "epoch": 0.82, + "learning_rate": 3.622706918965766e-05, + "loss": 3.1913, + "step": 5770 + }, + { + "epoch": 0.82, + "learning_rate": 3.619817997977755e-05, + "loss": 3.045, + "step": 5771 + }, + { + "epoch": 0.82, + "learning_rate": 3.6169290769897443e-05, + "loss": 3.1046, + "step": 5772 + }, + { + "epoch": 0.82, + "learning_rate": 3.614040156001734e-05, + "loss": 3.1989, + "step": 5773 + }, + { + "epoch": 0.82, + "learning_rate": 3.6111512350137224e-05, + "loss": 3.2277, + "step": 5774 + }, + { + "epoch": 0.82, + "learning_rate": 3.608262314025712e-05, + "loss": 3.0579, + "step": 5775 + }, + { + "epoch": 0.82, + "learning_rate": 3.6053733930377005e-05, + "loss": 3.0565, + "step": 5776 + }, + { + "epoch": 0.82, + "learning_rate": 3.60248447204969e-05, + "loss": 3.2018, + "step": 5777 + }, + { + "epoch": 0.82, + "learning_rate": 3.5995955510616786e-05, + "loss": 3.2889, + "step": 5778 + }, + { + "epoch": 0.82, + "learning_rate": 3.596706630073668e-05, + "loss": 3.0778, + "step": 5779 + }, + { + "epoch": 0.82, + "learning_rate": 3.593817709085657e-05, + "loss": 3.1019, + "step": 5780 + }, + { + "epoch": 0.82, + "learning_rate": 3.5909287880976454e-05, + "loss": 3.241, + "step": 5781 + }, + { + "epoch": 0.82, + "learning_rate": 3.588039867109635e-05, + "loss": 3.1829, + "step": 5782 + }, + { + "epoch": 0.82, + "learning_rate": 3.5851509461216235e-05, + "loss": 3.1955, + "step": 5783 + }, + { + "epoch": 0.82, + "learning_rate": 3.582262025133613e-05, + "loss": 3.2168, + "step": 5784 + }, + { + "epoch": 0.82, + "learning_rate": 3.5793731041456016e-05, + "loss": 3.1748, + "step": 5785 + }, + { + "epoch": 0.82, + "learning_rate": 3.576484183157591e-05, + "loss": 3.2433, + "step": 5786 + }, + { + "epoch": 0.82, + "learning_rate": 3.57359526216958e-05, + "loss": 3.0951, + "step": 5787 + }, + { + "epoch": 0.82, + "learning_rate": 3.570706341181569e-05, + "loss": 3.1137, + "step": 5788 + }, + { + "epoch": 0.82, + "learning_rate": 3.567817420193558e-05, + "loss": 3.1623, + "step": 5789 + }, + { + "epoch": 0.82, + "learning_rate": 3.564928499205547e-05, + "loss": 3.0718, + "step": 5790 + }, + { + "epoch": 0.82, + "learning_rate": 3.562039578217536e-05, + "loss": 3.1426, + "step": 5791 + }, + { + "epoch": 0.82, + "learning_rate": 3.5591506572295245e-05, + "loss": 3.1258, + "step": 5792 + }, + { + "epoch": 0.82, + "learning_rate": 3.556261736241514e-05, + "loss": 3.1174, + "step": 5793 + }, + { + "epoch": 0.82, + "learning_rate": 3.5533728152535026e-05, + "loss": 3.0803, + "step": 5794 + }, + { + "epoch": 0.83, + "learning_rate": 3.550483894265492e-05, + "loss": 3.174, + "step": 5795 + }, + { + "epoch": 0.83, + "learning_rate": 3.547594973277481e-05, + "loss": 3.209, + "step": 5796 + }, + { + "epoch": 0.83, + "learning_rate": 3.54470605228947e-05, + "loss": 3.1218, + "step": 5797 + }, + { + "epoch": 0.83, + "learning_rate": 3.5418171313014595e-05, + "loss": 3.1554, + "step": 5798 + }, + { + "epoch": 0.83, + "learning_rate": 3.538928210313448e-05, + "loss": 3.1106, + "step": 5799 + }, + { + "epoch": 0.83, + "learning_rate": 3.5360392893254376e-05, + "loss": 3.1266, + "step": 5800 + }, + { + "epoch": 0.83, + "learning_rate": 3.533150368337426e-05, + "loss": 3.1263, + "step": 5801 + }, + { + "epoch": 0.83, + "learning_rate": 3.530261447349415e-05, + "loss": 3.1582, + "step": 5802 + }, + { + "epoch": 0.83, + "learning_rate": 3.5273725263614044e-05, + "loss": 3.1887, + "step": 5803 + }, + { + "epoch": 0.83, + "learning_rate": 3.524483605373393e-05, + "loss": 3.0272, + "step": 5804 + }, + { + "epoch": 0.83, + "learning_rate": 3.521594684385382e-05, + "loss": 3.1066, + "step": 5805 + }, + { + "epoch": 0.83, + "learning_rate": 3.518705763397371e-05, + "loss": 3.1363, + "step": 5806 + }, + { + "epoch": 0.83, + "learning_rate": 3.51581684240936e-05, + "loss": 3.0005, + "step": 5807 + }, + { + "epoch": 0.83, + "learning_rate": 3.512927921421349e-05, + "loss": 3.1468, + "step": 5808 + }, + { + "epoch": 0.83, + "learning_rate": 3.5100390004333386e-05, + "loss": 3.137, + "step": 5809 + }, + { + "epoch": 0.83, + "learning_rate": 3.507150079445327e-05, + "loss": 2.8747, + "step": 5810 + }, + { + "epoch": 0.83, + "learning_rate": 3.504261158457317e-05, + "loss": 3.1055, + "step": 5811 + }, + { + "epoch": 0.83, + "learning_rate": 3.5013722374693054e-05, + "loss": 3.0413, + "step": 5812 + }, + { + "epoch": 0.83, + "learning_rate": 3.498483316481295e-05, + "loss": 3.0186, + "step": 5813 + }, + { + "epoch": 0.83, + "learning_rate": 3.4955943954932835e-05, + "loss": 3.0483, + "step": 5814 + }, + { + "epoch": 0.83, + "learning_rate": 3.492705474505272e-05, + "loss": 2.9472, + "step": 5815 + }, + { + "epoch": 0.83, + "learning_rate": 3.4898165535172616e-05, + "loss": 3.1869, + "step": 5816 + }, + { + "epoch": 0.83, + "learning_rate": 3.48692763252925e-05, + "loss": 3.091, + "step": 5817 + }, + { + "epoch": 0.83, + "learning_rate": 3.484038711541239e-05, + "loss": 3.0505, + "step": 5818 + }, + { + "epoch": 0.83, + "learning_rate": 3.4811497905532284e-05, + "loss": 3.0532, + "step": 5819 + }, + { + "epoch": 0.83, + "learning_rate": 3.478260869565218e-05, + "loss": 2.9293, + "step": 5820 + }, + { + "epoch": 0.83, + "learning_rate": 3.475371948577207e-05, + "loss": 3.1341, + "step": 5821 + }, + { + "epoch": 0.83, + "learning_rate": 3.472483027589196e-05, + "loss": 2.8325, + "step": 5822 + }, + { + "epoch": 0.83, + "learning_rate": 3.4695941066011846e-05, + "loss": 3.2622, + "step": 5823 + }, + { + "epoch": 0.83, + "learning_rate": 3.466705185613174e-05, + "loss": 3.1963, + "step": 5824 + }, + { + "epoch": 0.83, + "learning_rate": 3.4638162646251626e-05, + "loss": 3.0659, + "step": 5825 + }, + { + "epoch": 0.83, + "learning_rate": 3.4609273436371513e-05, + "loss": 3.1355, + "step": 5826 + }, + { + "epoch": 0.83, + "learning_rate": 3.458038422649141e-05, + "loss": 3.0923, + "step": 5827 + }, + { + "epoch": 0.83, + "learning_rate": 3.4551495016611294e-05, + "loss": 3.1681, + "step": 5828 + }, + { + "epoch": 0.83, + "learning_rate": 3.452260580673119e-05, + "loss": 3.0811, + "step": 5829 + }, + { + "epoch": 0.83, + "learning_rate": 3.4493716596851075e-05, + "loss": 3.1874, + "step": 5830 + }, + { + "epoch": 0.83, + "learning_rate": 3.446482738697097e-05, + "loss": 3.1023, + "step": 5831 + }, + { + "epoch": 0.83, + "learning_rate": 3.443593817709086e-05, + "loss": 3.1716, + "step": 5832 + }, + { + "epoch": 0.83, + "learning_rate": 3.440704896721075e-05, + "loss": 2.9498, + "step": 5833 + }, + { + "epoch": 0.83, + "learning_rate": 3.4378159757330644e-05, + "loss": 3.0727, + "step": 5834 + }, + { + "epoch": 0.83, + "learning_rate": 3.434927054745053e-05, + "loss": 3.0121, + "step": 5835 + }, + { + "epoch": 0.83, + "learning_rate": 3.432038133757042e-05, + "loss": 3.1766, + "step": 5836 + }, + { + "epoch": 0.83, + "learning_rate": 3.429149212769031e-05, + "loss": 3.0586, + "step": 5837 + }, + { + "epoch": 0.83, + "learning_rate": 3.42626029178102e-05, + "loss": 3.2201, + "step": 5838 + }, + { + "epoch": 0.83, + "learning_rate": 3.4233713707930086e-05, + "loss": 3.0646, + "step": 5839 + }, + { + "epoch": 0.83, + "learning_rate": 3.420482449804998e-05, + "loss": 3.1721, + "step": 5840 + }, + { + "epoch": 0.83, + "learning_rate": 3.417593528816987e-05, + "loss": 3.0589, + "step": 5841 + }, + { + "epoch": 0.83, + "learning_rate": 3.414704607828976e-05, + "loss": 3.1793, + "step": 5842 + }, + { + "epoch": 0.83, + "learning_rate": 3.411815686840965e-05, + "loss": 3.114, + "step": 5843 + }, + { + "epoch": 0.83, + "learning_rate": 3.408926765852954e-05, + "loss": 3.0718, + "step": 5844 + }, + { + "epoch": 0.83, + "learning_rate": 3.4060378448649435e-05, + "loss": 3.2071, + "step": 5845 + }, + { + "epoch": 0.83, + "learning_rate": 3.403148923876932e-05, + "loss": 3.1263, + "step": 5846 + }, + { + "epoch": 0.83, + "learning_rate": 3.400260002888921e-05, + "loss": 3.1385, + "step": 5847 + }, + { + "epoch": 0.83, + "learning_rate": 3.39737108190091e-05, + "loss": 3.032, + "step": 5848 + }, + { + "epoch": 0.83, + "learning_rate": 3.394482160912899e-05, + "loss": 3.0534, + "step": 5849 + }, + { + "epoch": 0.83, + "learning_rate": 3.3915932399248884e-05, + "loss": 3.0448, + "step": 5850 + }, + { + "epoch": 0.83, + "learning_rate": 3.388704318936877e-05, + "loss": 3.0444, + "step": 5851 + }, + { + "epoch": 0.83, + "learning_rate": 3.385815397948866e-05, + "loss": 3.1049, + "step": 5852 + }, + { + "epoch": 0.83, + "learning_rate": 3.382926476960855e-05, + "loss": 3.0882, + "step": 5853 + }, + { + "epoch": 0.83, + "learning_rate": 3.380037555972844e-05, + "loss": 3.1759, + "step": 5854 + }, + { + "epoch": 0.83, + "learning_rate": 3.377148634984833e-05, + "loss": 3.0127, + "step": 5855 + }, + { + "epoch": 0.83, + "learning_rate": 3.374259713996823e-05, + "loss": 3.1281, + "step": 5856 + }, + { + "epoch": 0.83, + "learning_rate": 3.3713707930088114e-05, + "loss": 3.1889, + "step": 5857 + }, + { + "epoch": 0.83, + "learning_rate": 3.368481872020801e-05, + "loss": 2.9567, + "step": 5858 + }, + { + "epoch": 0.83, + "learning_rate": 3.3655929510327895e-05, + "loss": 3.0444, + "step": 5859 + }, + { + "epoch": 0.83, + "learning_rate": 3.362704030044778e-05, + "loss": 3.0166, + "step": 5860 + }, + { + "epoch": 0.83, + "learning_rate": 3.3598151090567675e-05, + "loss": 3.1204, + "step": 5861 + }, + { + "epoch": 0.83, + "learning_rate": 3.356926188068756e-05, + "loss": 3.0538, + "step": 5862 + }, + { + "epoch": 0.83, + "learning_rate": 3.3540372670807456e-05, + "loss": 3.235, + "step": 5863 + }, + { + "epoch": 0.83, + "learning_rate": 3.351148346092734e-05, + "loss": 3.2553, + "step": 5864 + }, + { + "epoch": 0.84, + "learning_rate": 3.348259425104723e-05, + "loss": 2.9994, + "step": 5865 + }, + { + "epoch": 0.84, + "learning_rate": 3.3453705041167124e-05, + "loss": 3.061, + "step": 5866 + }, + { + "epoch": 0.84, + "learning_rate": 3.342481583128702e-05, + "loss": 3.1033, + "step": 5867 + }, + { + "epoch": 0.84, + "learning_rate": 3.339592662140691e-05, + "loss": 3.0325, + "step": 5868 + }, + { + "epoch": 0.84, + "learning_rate": 3.33670374115268e-05, + "loss": 3.1356, + "step": 5869 + }, + { + "epoch": 0.84, + "learning_rate": 3.3338148201646686e-05, + "loss": 3.1409, + "step": 5870 + }, + { + "epoch": 0.84, + "learning_rate": 3.330925899176658e-05, + "loss": 2.8731, + "step": 5871 + }, + { + "epoch": 0.84, + "learning_rate": 3.328036978188647e-05, + "loss": 2.917, + "step": 5872 + }, + { + "epoch": 0.84, + "learning_rate": 3.3251480572006354e-05, + "loss": 3.1651, + "step": 5873 + }, + { + "epoch": 0.84, + "learning_rate": 3.322259136212625e-05, + "loss": 3.0817, + "step": 5874 + }, + { + "epoch": 0.84, + "learning_rate": 3.3193702152246135e-05, + "loss": 3.1778, + "step": 5875 + }, + { + "epoch": 0.84, + "learning_rate": 3.316481294236603e-05, + "loss": 3.1177, + "step": 5876 + }, + { + "epoch": 0.84, + "learning_rate": 3.3135923732485916e-05, + "loss": 2.9929, + "step": 5877 + }, + { + "epoch": 0.84, + "learning_rate": 3.310703452260581e-05, + "loss": 3.0402, + "step": 5878 + }, + { + "epoch": 0.84, + "learning_rate": 3.30781453127257e-05, + "loss": 3.0111, + "step": 5879 + }, + { + "epoch": 0.84, + "learning_rate": 3.304925610284559e-05, + "loss": 3.1887, + "step": 5880 + }, + { + "epoch": 0.84, + "learning_rate": 3.302036689296548e-05, + "loss": 2.7138, + "step": 5881 + }, + { + "epoch": 0.84, + "learning_rate": 3.299147768308537e-05, + "loss": 3.0009, + "step": 5882 + }, + { + "epoch": 0.84, + "learning_rate": 3.296258847320526e-05, + "loss": 2.9829, + "step": 5883 + }, + { + "epoch": 0.84, + "learning_rate": 3.293369926332515e-05, + "loss": 3.099, + "step": 5884 + }, + { + "epoch": 0.84, + "learning_rate": 3.290481005344504e-05, + "loss": 3.1086, + "step": 5885 + }, + { + "epoch": 0.84, + "learning_rate": 3.2875920843564926e-05, + "loss": 3.0638, + "step": 5886 + }, + { + "epoch": 0.84, + "learning_rate": 3.284703163368482e-05, + "loss": 3.0933, + "step": 5887 + }, + { + "epoch": 0.84, + "learning_rate": 3.281814242380471e-05, + "loss": 3.1918, + "step": 5888 + }, + { + "epoch": 0.84, + "learning_rate": 3.27892532139246e-05, + "loss": 3.117, + "step": 5889 + }, + { + "epoch": 0.84, + "learning_rate": 3.2760364004044495e-05, + "loss": 3.009, + "step": 5890 + }, + { + "epoch": 0.84, + "learning_rate": 3.273147479416438e-05, + "loss": 3.1886, + "step": 5891 + }, + { + "epoch": 0.84, + "learning_rate": 3.2702585584284276e-05, + "loss": 2.9678, + "step": 5892 + }, + { + "epoch": 0.84, + "learning_rate": 3.267369637440416e-05, + "loss": 3.154, + "step": 5893 + }, + { + "epoch": 0.84, + "learning_rate": 3.264480716452405e-05, + "loss": 3.207, + "step": 5894 + }, + { + "epoch": 0.84, + "learning_rate": 3.2615917954643944e-05, + "loss": 3.0728, + "step": 5895 + }, + { + "epoch": 0.84, + "learning_rate": 3.258702874476383e-05, + "loss": 3.0138, + "step": 5896 + }, + { + "epoch": 0.84, + "learning_rate": 3.2558139534883724e-05, + "loss": 3.0363, + "step": 5897 + }, + { + "epoch": 0.84, + "learning_rate": 3.252925032500361e-05, + "loss": 3.1933, + "step": 5898 + }, + { + "epoch": 0.84, + "learning_rate": 3.25003611151235e-05, + "loss": 3.0942, + "step": 5899 + }, + { + "epoch": 0.84, + "learning_rate": 3.247147190524339e-05, + "loss": 3.1072, + "step": 5900 + }, + { + "epoch": 0.84, + "learning_rate": 3.2442582695363286e-05, + "loss": 3.1678, + "step": 5901 + }, + { + "epoch": 0.84, + "learning_rate": 3.241369348548317e-05, + "loss": 3.0824, + "step": 5902 + }, + { + "epoch": 0.84, + "learning_rate": 3.238480427560307e-05, + "loss": 3.0526, + "step": 5903 + }, + { + "epoch": 0.84, + "learning_rate": 3.2355915065722954e-05, + "loss": 3.0983, + "step": 5904 + }, + { + "epoch": 0.84, + "learning_rate": 3.232702585584285e-05, + "loss": 3.2043, + "step": 5905 + }, + { + "epoch": 0.84, + "learning_rate": 3.2298136645962735e-05, + "loss": 3.1118, + "step": 5906 + }, + { + "epoch": 0.84, + "learning_rate": 3.226924743608262e-05, + "loss": 3.1107, + "step": 5907 + }, + { + "epoch": 0.84, + "learning_rate": 3.2240358226202516e-05, + "loss": 3.1229, + "step": 5908 + }, + { + "epoch": 0.84, + "learning_rate": 3.22114690163224e-05, + "loss": 3.1159, + "step": 5909 + }, + { + "epoch": 0.84, + "learning_rate": 3.21825798064423e-05, + "loss": 3.0419, + "step": 5910 + }, + { + "epoch": 0.84, + "learning_rate": 3.2153690596562184e-05, + "loss": 3.2061, + "step": 5911 + }, + { + "epoch": 0.84, + "learning_rate": 3.212480138668207e-05, + "loss": 3.106, + "step": 5912 + }, + { + "epoch": 0.84, + "learning_rate": 3.2095912176801965e-05, + "loss": 3.0385, + "step": 5913 + }, + { + "epoch": 0.84, + "learning_rate": 3.206702296692186e-05, + "loss": 3.1538, + "step": 5914 + }, + { + "epoch": 0.84, + "learning_rate": 3.2038133757041745e-05, + "loss": 3.0929, + "step": 5915 + }, + { + "epoch": 0.84, + "learning_rate": 3.200924454716164e-05, + "loss": 3.0517, + "step": 5916 + }, + { + "epoch": 0.84, + "learning_rate": 3.1980355337281526e-05, + "loss": 2.9422, + "step": 5917 + }, + { + "epoch": 0.84, + "learning_rate": 3.195146612740142e-05, + "loss": 3.1486, + "step": 5918 + }, + { + "epoch": 0.84, + "learning_rate": 3.192257691752131e-05, + "loss": 3.1113, + "step": 5919 + }, + { + "epoch": 0.84, + "learning_rate": 3.1893687707641194e-05, + "loss": 3.1429, + "step": 5920 + }, + { + "epoch": 0.84, + "learning_rate": 3.186479849776109e-05, + "loss": 3.0844, + "step": 5921 + }, + { + "epoch": 0.84, + "learning_rate": 3.1835909287880975e-05, + "loss": 3.0432, + "step": 5922 + }, + { + "epoch": 0.84, + "learning_rate": 3.180702007800087e-05, + "loss": 2.9963, + "step": 5923 + }, + { + "epoch": 0.84, + "learning_rate": 3.1778130868120756e-05, + "loss": 2.9608, + "step": 5924 + }, + { + "epoch": 0.84, + "learning_rate": 3.174924165824065e-05, + "loss": 3.1549, + "step": 5925 + }, + { + "epoch": 0.84, + "learning_rate": 3.1720352448360544e-05, + "loss": 2.9382, + "step": 5926 + }, + { + "epoch": 0.84, + "learning_rate": 3.169146323848043e-05, + "loss": 3.1066, + "step": 5927 + }, + { + "epoch": 0.84, + "learning_rate": 3.166257402860032e-05, + "loss": 3.0371, + "step": 5928 + }, + { + "epoch": 0.84, + "learning_rate": 3.163368481872021e-05, + "loss": 3.04, + "step": 5929 + }, + { + "epoch": 0.84, + "learning_rate": 3.16047956088401e-05, + "loss": 3.1512, + "step": 5930 + }, + { + "epoch": 0.84, + "learning_rate": 3.157590639895999e-05, + "loss": 3.1722, + "step": 5931 + }, + { + "epoch": 0.84, + "learning_rate": 3.154701718907988e-05, + "loss": 2.9079, + "step": 5932 + }, + { + "epoch": 0.84, + "learning_rate": 3.1518127979199767e-05, + "loss": 2.8963, + "step": 5933 + }, + { + "epoch": 0.84, + "learning_rate": 3.148923876931966e-05, + "loss": 3.0942, + "step": 5934 + }, + { + "epoch": 0.85, + "learning_rate": 3.146034955943955e-05, + "loss": 3.1648, + "step": 5935 + }, + { + "epoch": 0.85, + "learning_rate": 3.143146034955944e-05, + "loss": 3.0909, + "step": 5936 + }, + { + "epoch": 0.85, + "learning_rate": 3.1402571139679335e-05, + "loss": 3.0367, + "step": 5937 + }, + { + "epoch": 0.85, + "learning_rate": 3.137368192979922e-05, + "loss": 2.9949, + "step": 5938 + }, + { + "epoch": 0.85, + "learning_rate": 3.1344792719919116e-05, + "loss": 3.0726, + "step": 5939 + }, + { + "epoch": 0.85, + "learning_rate": 3.1315903510039e-05, + "loss": 3.2591, + "step": 5940 + }, + { + "epoch": 0.85, + "learning_rate": 3.128701430015889e-05, + "loss": 2.9792, + "step": 5941 + }, + { + "epoch": 0.85, + "learning_rate": 3.1258125090278784e-05, + "loss": 3.0628, + "step": 5942 + }, + { + "epoch": 0.85, + "learning_rate": 3.122923588039867e-05, + "loss": 2.9426, + "step": 5943 + }, + { + "epoch": 0.85, + "learning_rate": 3.1200346670518565e-05, + "loss": 3.031, + "step": 5944 + }, + { + "epoch": 0.85, + "learning_rate": 3.117145746063845e-05, + "loss": 3.1403, + "step": 5945 + }, + { + "epoch": 0.85, + "learning_rate": 3.114256825075834e-05, + "loss": 3.1605, + "step": 5946 + }, + { + "epoch": 0.85, + "learning_rate": 3.111367904087823e-05, + "loss": 2.9976, + "step": 5947 + }, + { + "epoch": 0.85, + "learning_rate": 3.1084789830998127e-05, + "loss": 3.139, + "step": 5948 + }, + { + "epoch": 0.85, + "learning_rate": 3.1055900621118014e-05, + "loss": 3.1015, + "step": 5949 + }, + { + "epoch": 0.85, + "learning_rate": 3.102701141123791e-05, + "loss": 2.9959, + "step": 5950 + }, + { + "epoch": 0.85, + "learning_rate": 3.0998122201357794e-05, + "loss": 3.1794, + "step": 5951 + }, + { + "epoch": 0.85, + "learning_rate": 3.096923299147769e-05, + "loss": 3.0914, + "step": 5952 + }, + { + "epoch": 0.85, + "learning_rate": 3.0940343781597575e-05, + "loss": 3.0307, + "step": 5953 + }, + { + "epoch": 0.85, + "learning_rate": 3.091145457171746e-05, + "loss": 2.9845, + "step": 5954 + }, + { + "epoch": 0.85, + "learning_rate": 3.0882565361837356e-05, + "loss": 3.0399, + "step": 5955 + }, + { + "epoch": 0.85, + "learning_rate": 3.085367615195724e-05, + "loss": 3.2198, + "step": 5956 + }, + { + "epoch": 0.85, + "learning_rate": 3.082478694207713e-05, + "loss": 3.1431, + "step": 5957 + }, + { + "epoch": 0.85, + "learning_rate": 3.0795897732197024e-05, + "loss": 2.9562, + "step": 5958 + }, + { + "epoch": 0.85, + "learning_rate": 3.076700852231692e-05, + "loss": 3.0612, + "step": 5959 + }, + { + "epoch": 0.85, + "learning_rate": 3.0738119312436805e-05, + "loss": 2.9404, + "step": 5960 + }, + { + "epoch": 0.85, + "learning_rate": 3.07092301025567e-05, + "loss": 3.0973, + "step": 5961 + }, + { + "epoch": 0.85, + "learning_rate": 3.0680340892676586e-05, + "loss": 3.1981, + "step": 5962 + }, + { + "epoch": 0.85, + "learning_rate": 3.065145168279648e-05, + "loss": 3.0318, + "step": 5963 + }, + { + "epoch": 0.85, + "learning_rate": 3.062256247291637e-05, + "loss": 3.0276, + "step": 5964 + }, + { + "epoch": 0.85, + "learning_rate": 3.059367326303626e-05, + "loss": 2.9113, + "step": 5965 + }, + { + "epoch": 0.85, + "learning_rate": 3.056478405315615e-05, + "loss": 3.1369, + "step": 5966 + }, + { + "epoch": 0.85, + "learning_rate": 3.0535894843276035e-05, + "loss": 3.108, + "step": 5967 + }, + { + "epoch": 0.85, + "learning_rate": 3.050700563339593e-05, + "loss": 3.0283, + "step": 5968 + }, + { + "epoch": 0.85, + "learning_rate": 3.047811642351582e-05, + "loss": 3.0095, + "step": 5969 + }, + { + "epoch": 0.85, + "learning_rate": 3.0449227213635706e-05, + "loss": 3.19, + "step": 5970 + }, + { + "epoch": 0.85, + "learning_rate": 3.04203380037556e-05, + "loss": 3.0185, + "step": 5971 + }, + { + "epoch": 0.85, + "learning_rate": 3.0391448793875487e-05, + "loss": 3.0372, + "step": 5972 + }, + { + "epoch": 0.85, + "learning_rate": 3.036255958399538e-05, + "loss": 2.9137, + "step": 5973 + }, + { + "epoch": 0.85, + "learning_rate": 3.033367037411527e-05, + "loss": 3.1789, + "step": 5974 + }, + { + "epoch": 0.85, + "learning_rate": 3.0304781164235158e-05, + "loss": 3.2761, + "step": 5975 + }, + { + "epoch": 0.85, + "learning_rate": 3.0275891954355052e-05, + "loss": 2.9749, + "step": 5976 + }, + { + "epoch": 0.85, + "learning_rate": 3.024700274447494e-05, + "loss": 2.9962, + "step": 5977 + }, + { + "epoch": 0.85, + "learning_rate": 3.0218113534594833e-05, + "loss": 3.0592, + "step": 5978 + }, + { + "epoch": 0.85, + "learning_rate": 3.018922432471472e-05, + "loss": 3.1442, + "step": 5979 + }, + { + "epoch": 0.85, + "learning_rate": 3.016033511483461e-05, + "loss": 3.0294, + "step": 5980 + }, + { + "epoch": 0.85, + "learning_rate": 3.0131445904954504e-05, + "loss": 3.1141, + "step": 5981 + }, + { + "epoch": 0.85, + "learning_rate": 3.010255669507439e-05, + "loss": 2.9618, + "step": 5982 + }, + { + "epoch": 0.85, + "learning_rate": 3.0073667485194278e-05, + "loss": 2.9459, + "step": 5983 + }, + { + "epoch": 0.85, + "learning_rate": 3.0044778275314172e-05, + "loss": 3.0394, + "step": 5984 + }, + { + "epoch": 0.85, + "eval_loss": 3.3314239978790283, + "eval_runtime": 472.5204, + "eval_samples_per_second": 43.357, + "eval_steps_per_second": 14.452, + "step": 5984 + }, + { + "epoch": 0.85, + "learning_rate": 3.001588906543406e-05, + "loss": 3.1192, + "step": 5985 + }, + { + "epoch": 0.85, + "learning_rate": 2.9986999855553953e-05, + "loss": 2.9188, + "step": 5986 + }, + { + "epoch": 0.85, + "learning_rate": 2.9958110645673843e-05, + "loss": 2.913, + "step": 5987 + }, + { + "epoch": 0.85, + "learning_rate": 2.992922143579373e-05, + "loss": 3.1198, + "step": 5988 + }, + { + "epoch": 0.85, + "learning_rate": 2.9900332225913624e-05, + "loss": 2.9629, + "step": 5989 + }, + { + "epoch": 0.85, + "learning_rate": 2.987144301603351e-05, + "loss": 3.0012, + "step": 5990 + }, + { + "epoch": 0.85, + "learning_rate": 2.9842553806153402e-05, + "loss": 3.0383, + "step": 5991 + }, + { + "epoch": 0.85, + "learning_rate": 2.9813664596273296e-05, + "loss": 3.0143, + "step": 5992 + }, + { + "epoch": 0.85, + "learning_rate": 2.9784775386393183e-05, + "loss": 3.0603, + "step": 5993 + }, + { + "epoch": 0.85, + "learning_rate": 2.9755886176513076e-05, + "loss": 2.8586, + "step": 5994 + }, + { + "epoch": 0.85, + "learning_rate": 2.9726996966632964e-05, + "loss": 3.0506, + "step": 5995 + }, + { + "epoch": 0.85, + "learning_rate": 2.969810775675285e-05, + "loss": 3.1137, + "step": 5996 + }, + { + "epoch": 0.85, + "learning_rate": 2.9669218546872744e-05, + "loss": 2.989, + "step": 5997 + }, + { + "epoch": 0.85, + "learning_rate": 2.9640329336992635e-05, + "loss": 3.1343, + "step": 5998 + }, + { + "epoch": 0.85, + "learning_rate": 2.961144012711253e-05, + "loss": 2.9752, + "step": 5999 + }, + { + "epoch": 0.85, + "learning_rate": 2.9582550917232416e-05, + "loss": 3.0701, + "step": 6000 + }, + { + "epoch": 0.85, + "learning_rate": 2.9553661707352303e-05, + "loss": 3.1102, + "step": 6001 + }, + { + "epoch": 0.85, + "learning_rate": 2.9524772497472197e-05, + "loss": 3.1009, + "step": 6002 + }, + { + "epoch": 0.85, + "learning_rate": 2.9495883287592087e-05, + "loss": 3.004, + "step": 6003 + }, + { + "epoch": 0.85, + "learning_rate": 2.9466994077711974e-05, + "loss": 3.1166, + "step": 6004 + }, + { + "epoch": 0.85, + "learning_rate": 2.9438104867831868e-05, + "loss": 3.0657, + "step": 6005 + }, + { + "epoch": 0.86, + "learning_rate": 2.9409215657951755e-05, + "loss": 2.9392, + "step": 6006 + }, + { + "epoch": 0.86, + "learning_rate": 2.938032644807165e-05, + "loss": 3.1314, + "step": 6007 + }, + { + "epoch": 0.86, + "learning_rate": 2.9351437238191536e-05, + "loss": 3.0163, + "step": 6008 + }, + { + "epoch": 0.86, + "learning_rate": 2.9322548028311426e-05, + "loss": 3.2141, + "step": 6009 + }, + { + "epoch": 0.86, + "learning_rate": 2.929365881843132e-05, + "loss": 2.911, + "step": 6010 + }, + { + "epoch": 0.86, + "learning_rate": 2.9264769608551207e-05, + "loss": 2.9646, + "step": 6011 + }, + { + "epoch": 0.86, + "learning_rate": 2.92358803986711e-05, + "loss": 3.0786, + "step": 6012 + }, + { + "epoch": 0.86, + "learning_rate": 2.9206991188790988e-05, + "loss": 3.0391, + "step": 6013 + }, + { + "epoch": 0.86, + "learning_rate": 2.9178101978910875e-05, + "loss": 3.1584, + "step": 6014 + }, + { + "epoch": 0.86, + "learning_rate": 2.914921276903077e-05, + "loss": 3.0752, + "step": 6015 + }, + { + "epoch": 0.86, + "learning_rate": 2.912032355915066e-05, + "loss": 3.1557, + "step": 6016 + }, + { + "epoch": 0.86, + "learning_rate": 2.9091434349270546e-05, + "loss": 3.0763, + "step": 6017 + }, + { + "epoch": 0.86, + "learning_rate": 2.906254513939044e-05, + "loss": 3.075, + "step": 6018 + }, + { + "epoch": 0.86, + "learning_rate": 2.9033655929510327e-05, + "loss": 3.0939, + "step": 6019 + }, + { + "epoch": 0.86, + "learning_rate": 2.900476671963022e-05, + "loss": 3.0377, + "step": 6020 + }, + { + "epoch": 0.86, + "learning_rate": 2.897587750975011e-05, + "loss": 3.0022, + "step": 6021 + }, + { + "epoch": 0.86, + "learning_rate": 2.894698829987e-05, + "loss": 3.0906, + "step": 6022 + }, + { + "epoch": 0.86, + "learning_rate": 2.8918099089989892e-05, + "loss": 3.1968, + "step": 6023 + }, + { + "epoch": 0.86, + "learning_rate": 2.888920988010978e-05, + "loss": 3.0314, + "step": 6024 + }, + { + "epoch": 0.86, + "learning_rate": 2.8860320670229666e-05, + "loss": 3.1483, + "step": 6025 + }, + { + "epoch": 0.86, + "learning_rate": 2.883143146034956e-05, + "loss": 3.1315, + "step": 6026 + }, + { + "epoch": 0.86, + "learning_rate": 2.880254225046945e-05, + "loss": 3.1747, + "step": 6027 + }, + { + "epoch": 0.86, + "learning_rate": 2.8773653040589345e-05, + "loss": 3.1125, + "step": 6028 + }, + { + "epoch": 0.86, + "learning_rate": 2.874476383070923e-05, + "loss": 3.2254, + "step": 6029 + }, + { + "epoch": 0.86, + "learning_rate": 2.871587462082912e-05, + "loss": 3.0684, + "step": 6030 + }, + { + "epoch": 0.86, + "learning_rate": 2.8686985410949013e-05, + "loss": 3.0463, + "step": 6031 + }, + { + "epoch": 0.86, + "learning_rate": 2.8658096201068903e-05, + "loss": 2.9878, + "step": 6032 + }, + { + "epoch": 0.86, + "learning_rate": 2.8629206991188793e-05, + "loss": 3.1846, + "step": 6033 + }, + { + "epoch": 0.86, + "learning_rate": 2.8600317781308684e-05, + "loss": 3.2068, + "step": 6034 + }, + { + "epoch": 0.86, + "learning_rate": 2.857142857142857e-05, + "loss": 2.9923, + "step": 6035 + }, + { + "epoch": 0.86, + "learning_rate": 2.8542539361548465e-05, + "loss": 2.9978, + "step": 6036 + }, + { + "epoch": 0.86, + "learning_rate": 2.8513650151668352e-05, + "loss": 3.0313, + "step": 6037 + }, + { + "epoch": 0.86, + "learning_rate": 2.8484760941788242e-05, + "loss": 3.124, + "step": 6038 + }, + { + "epoch": 0.86, + "learning_rate": 2.8455871731908136e-05, + "loss": 2.8379, + "step": 6039 + }, + { + "epoch": 0.86, + "learning_rate": 2.8426982522028023e-05, + "loss": 3.0884, + "step": 6040 + }, + { + "epoch": 0.86, + "learning_rate": 2.8398093312147917e-05, + "loss": 2.9867, + "step": 6041 + }, + { + "epoch": 0.86, + "learning_rate": 2.8369204102267804e-05, + "loss": 3.1433, + "step": 6042 + }, + { + "epoch": 0.86, + "learning_rate": 2.834031489238769e-05, + "loss": 3.1496, + "step": 6043 + }, + { + "epoch": 0.86, + "learning_rate": 2.8311425682507585e-05, + "loss": 3.0847, + "step": 6044 + }, + { + "epoch": 0.86, + "learning_rate": 2.8282536472627475e-05, + "loss": 3.1325, + "step": 6045 + }, + { + "epoch": 0.86, + "learning_rate": 2.8253647262747362e-05, + "loss": 3.072, + "step": 6046 + }, + { + "epoch": 0.86, + "learning_rate": 2.8224758052867256e-05, + "loss": 3.1329, + "step": 6047 + }, + { + "epoch": 0.86, + "learning_rate": 2.8195868842987143e-05, + "loss": 2.9888, + "step": 6048 + }, + { + "epoch": 0.86, + "learning_rate": 2.8166979633107037e-05, + "loss": 3.1742, + "step": 6049 + }, + { + "epoch": 0.86, + "learning_rate": 2.8138090423226927e-05, + "loss": 3.077, + "step": 6050 + }, + { + "epoch": 0.86, + "learning_rate": 2.8109201213346814e-05, + "loss": 2.8865, + "step": 6051 + }, + { + "epoch": 0.86, + "learning_rate": 2.8080312003466708e-05, + "loss": 3.0586, + "step": 6052 + }, + { + "epoch": 0.86, + "learning_rate": 2.8051422793586595e-05, + "loss": 2.852, + "step": 6053 + }, + { + "epoch": 0.86, + "learning_rate": 2.802253358370649e-05, + "loss": 3.1931, + "step": 6054 + }, + { + "epoch": 0.86, + "learning_rate": 2.7993644373826376e-05, + "loss": 3.0282, + "step": 6055 + }, + { + "epoch": 0.86, + "learning_rate": 2.7964755163946267e-05, + "loss": 3.0781, + "step": 6056 + }, + { + "epoch": 0.86, + "learning_rate": 2.793586595406616e-05, + "loss": 3.1172, + "step": 6057 + }, + { + "epoch": 0.86, + "learning_rate": 2.7906976744186048e-05, + "loss": 3.1747, + "step": 6058 + }, + { + "epoch": 0.86, + "learning_rate": 2.7878087534305935e-05, + "loss": 3.1268, + "step": 6059 + }, + { + "epoch": 0.86, + "learning_rate": 2.784919832442583e-05, + "loss": 3.1027, + "step": 6060 + }, + { + "epoch": 0.86, + "learning_rate": 2.782030911454572e-05, + "loss": 3.1948, + "step": 6061 + }, + { + "epoch": 0.86, + "learning_rate": 2.779141990466561e-05, + "loss": 2.917, + "step": 6062 + }, + { + "epoch": 0.86, + "learning_rate": 2.77625306947855e-05, + "loss": 3.135, + "step": 6063 + }, + { + "epoch": 0.86, + "learning_rate": 2.7733641484905387e-05, + "loss": 3.2288, + "step": 6064 + }, + { + "epoch": 0.86, + "learning_rate": 2.770475227502528e-05, + "loss": 2.9871, + "step": 6065 + }, + { + "epoch": 0.86, + "learning_rate": 2.7675863065145168e-05, + "loss": 3.0479, + "step": 6066 + }, + { + "epoch": 0.86, + "learning_rate": 2.764697385526506e-05, + "loss": 3.2121, + "step": 6067 + }, + { + "epoch": 0.86, + "learning_rate": 2.7618084645384952e-05, + "loss": 2.9775, + "step": 6068 + }, + { + "epoch": 0.86, + "learning_rate": 2.758919543550484e-05, + "loss": 3.032, + "step": 6069 + }, + { + "epoch": 0.86, + "learning_rate": 2.7560306225624733e-05, + "loss": 2.9528, + "step": 6070 + }, + { + "epoch": 0.86, + "learning_rate": 2.753141701574462e-05, + "loss": 2.944, + "step": 6071 + }, + { + "epoch": 0.86, + "learning_rate": 2.750252780586451e-05, + "loss": 3.0658, + "step": 6072 + }, + { + "epoch": 0.86, + "learning_rate": 2.74736385959844e-05, + "loss": 3.151, + "step": 6073 + }, + { + "epoch": 0.86, + "learning_rate": 2.744474938610429e-05, + "loss": 3.1508, + "step": 6074 + }, + { + "epoch": 0.86, + "learning_rate": 2.7415860176224185e-05, + "loss": 3.1112, + "step": 6075 + }, + { + "epoch": 0.87, + "learning_rate": 2.7386970966344072e-05, + "loss": 3.1306, + "step": 6076 + }, + { + "epoch": 0.87, + "learning_rate": 2.735808175646396e-05, + "loss": 3.0435, + "step": 6077 + }, + { + "epoch": 0.87, + "learning_rate": 2.7329192546583853e-05, + "loss": 3.1812, + "step": 6078 + }, + { + "epoch": 0.87, + "learning_rate": 2.7300303336703743e-05, + "loss": 3.0725, + "step": 6079 + }, + { + "epoch": 0.87, + "learning_rate": 2.727141412682363e-05, + "loss": 3.0793, + "step": 6080 + }, + { + "epoch": 0.87, + "learning_rate": 2.7242524916943524e-05, + "loss": 3.0949, + "step": 6081 + }, + { + "epoch": 0.87, + "learning_rate": 2.721363570706341e-05, + "loss": 3.0673, + "step": 6082 + }, + { + "epoch": 0.87, + "learning_rate": 2.7184746497183305e-05, + "loss": 3.1137, + "step": 6083 + }, + { + "epoch": 0.87, + "learning_rate": 2.7155857287303192e-05, + "loss": 3.1059, + "step": 6084 + }, + { + "epoch": 0.87, + "learning_rate": 2.7126968077423083e-05, + "loss": 3.1905, + "step": 6085 + }, + { + "epoch": 0.87, + "learning_rate": 2.7098078867542976e-05, + "loss": 3.1762, + "step": 6086 + }, + { + "epoch": 0.87, + "learning_rate": 2.7069189657662863e-05, + "loss": 3.1323, + "step": 6087 + }, + { + "epoch": 0.87, + "learning_rate": 2.7040300447782757e-05, + "loss": 2.8869, + "step": 6088 + }, + { + "epoch": 0.87, + "learning_rate": 2.7011411237902644e-05, + "loss": 3.0922, + "step": 6089 + }, + { + "epoch": 0.87, + "learning_rate": 2.6982522028022535e-05, + "loss": 3.1239, + "step": 6090 + }, + { + "epoch": 0.87, + "learning_rate": 2.695363281814243e-05, + "loss": 3.0454, + "step": 6091 + }, + { + "epoch": 0.87, + "learning_rate": 2.6924743608262316e-05, + "loss": 3.1074, + "step": 6092 + }, + { + "epoch": 0.87, + "learning_rate": 2.6895854398382203e-05, + "loss": 3.1191, + "step": 6093 + }, + { + "epoch": 0.87, + "learning_rate": 2.6866965188502097e-05, + "loss": 3.0851, + "step": 6094 + }, + { + "epoch": 0.87, + "learning_rate": 2.6838075978621984e-05, + "loss": 2.9811, + "step": 6095 + }, + { + "epoch": 0.87, + "learning_rate": 2.6809186768741877e-05, + "loss": 3.1226, + "step": 6096 + }, + { + "epoch": 0.87, + "learning_rate": 2.6780297558861768e-05, + "loss": 3.1572, + "step": 6097 + }, + { + "epoch": 0.87, + "learning_rate": 2.6751408348981655e-05, + "loss": 3.0967, + "step": 6098 + }, + { + "epoch": 0.87, + "learning_rate": 2.672251913910155e-05, + "loss": 3.071, + "step": 6099 + }, + { + "epoch": 0.87, + "learning_rate": 2.6693629929221436e-05, + "loss": 3.1452, + "step": 6100 + }, + { + "epoch": 0.87, + "learning_rate": 2.6664740719341326e-05, + "loss": 3.0124, + "step": 6101 + }, + { + "epoch": 0.87, + "learning_rate": 2.6635851509461217e-05, + "loss": 3.021, + "step": 6102 + }, + { + "epoch": 0.87, + "learning_rate": 2.6606962299581107e-05, + "loss": 3.0597, + "step": 6103 + }, + { + "epoch": 0.87, + "learning_rate": 2.6578073089701e-05, + "loss": 3.1272, + "step": 6104 + }, + { + "epoch": 0.87, + "learning_rate": 2.6549183879820888e-05, + "loss": 3.0598, + "step": 6105 + }, + { + "epoch": 0.87, + "learning_rate": 2.6520294669940775e-05, + "loss": 3.0873, + "step": 6106 + }, + { + "epoch": 0.87, + "learning_rate": 2.649140546006067e-05, + "loss": 3.3011, + "step": 6107 + }, + { + "epoch": 0.87, + "learning_rate": 2.646251625018056e-05, + "loss": 2.9501, + "step": 6108 + }, + { + "epoch": 0.87, + "learning_rate": 2.6433627040300453e-05, + "loss": 3.1859, + "step": 6109 + }, + { + "epoch": 0.87, + "learning_rate": 2.640473783042034e-05, + "loss": 3.1604, + "step": 6110 + }, + { + "epoch": 0.87, + "learning_rate": 2.6375848620540227e-05, + "loss": 3.0502, + "step": 6111 + }, + { + "epoch": 0.87, + "learning_rate": 2.634695941066012e-05, + "loss": 3.1266, + "step": 6112 + }, + { + "epoch": 0.87, + "learning_rate": 2.6318070200780008e-05, + "loss": 3.1003, + "step": 6113 + }, + { + "epoch": 0.87, + "learning_rate": 2.62891809908999e-05, + "loss": 2.9669, + "step": 6114 + }, + { + "epoch": 0.87, + "learning_rate": 2.6260291781019792e-05, + "loss": 3.0027, + "step": 6115 + }, + { + "epoch": 0.87, + "learning_rate": 2.623140257113968e-05, + "loss": 3.2412, + "step": 6116 + }, + { + "epoch": 0.87, + "learning_rate": 2.6202513361259573e-05, + "loss": 3.1539, + "step": 6117 + }, + { + "epoch": 0.87, + "learning_rate": 2.617362415137946e-05, + "loss": 3.1696, + "step": 6118 + }, + { + "epoch": 0.87, + "learning_rate": 2.614473494149935e-05, + "loss": 3.1418, + "step": 6119 + }, + { + "epoch": 0.87, + "learning_rate": 2.6115845731619244e-05, + "loss": 2.9554, + "step": 6120 + }, + { + "epoch": 0.87, + "learning_rate": 2.608695652173913e-05, + "loss": 2.9984, + "step": 6121 + }, + { + "epoch": 0.87, + "learning_rate": 2.6058067311859025e-05, + "loss": 3.2487, + "step": 6122 + }, + { + "epoch": 0.87, + "learning_rate": 2.6029178101978912e-05, + "loss": 3.0161, + "step": 6123 + }, + { + "epoch": 0.87, + "learning_rate": 2.60002888920988e-05, + "loss": 3.0282, + "step": 6124 + }, + { + "epoch": 0.87, + "learning_rate": 2.5971399682218693e-05, + "loss": 3.081, + "step": 6125 + }, + { + "epoch": 0.87, + "learning_rate": 2.5942510472338584e-05, + "loss": 3.0999, + "step": 6126 + }, + { + "epoch": 0.87, + "learning_rate": 2.591362126245847e-05, + "loss": 3.1143, + "step": 6127 + }, + { + "epoch": 0.87, + "learning_rate": 2.5884732052578365e-05, + "loss": 3.0133, + "step": 6128 + }, + { + "epoch": 0.87, + "learning_rate": 2.585584284269825e-05, + "loss": 3.0934, + "step": 6129 + }, + { + "epoch": 0.87, + "learning_rate": 2.5826953632818145e-05, + "loss": 2.9122, + "step": 6130 + }, + { + "epoch": 0.87, + "learning_rate": 2.5798064422938033e-05, + "loss": 3.0558, + "step": 6131 + }, + { + "epoch": 0.87, + "learning_rate": 2.5769175213057923e-05, + "loss": 3.1037, + "step": 6132 + }, + { + "epoch": 0.87, + "learning_rate": 2.5740286003177817e-05, + "loss": 3.083, + "step": 6133 + }, + { + "epoch": 0.87, + "learning_rate": 2.5711396793297704e-05, + "loss": 3.2274, + "step": 6134 + }, + { + "epoch": 0.87, + "learning_rate": 2.568250758341759e-05, + "loss": 3.0305, + "step": 6135 + }, + { + "epoch": 0.87, + "learning_rate": 2.5653618373537485e-05, + "loss": 3.1308, + "step": 6136 + }, + { + "epoch": 0.87, + "learning_rate": 2.5624729163657375e-05, + "loss": 3.1218, + "step": 6137 + }, + { + "epoch": 0.87, + "learning_rate": 2.559583995377727e-05, + "loss": 3.1344, + "step": 6138 + }, + { + "epoch": 0.87, + "learning_rate": 2.5566950743897156e-05, + "loss": 3.2072, + "step": 6139 + }, + { + "epoch": 0.87, + "learning_rate": 2.5538061534017043e-05, + "loss": 3.0067, + "step": 6140 + }, + { + "epoch": 0.87, + "learning_rate": 2.5509172324136937e-05, + "loss": 3.1007, + "step": 6141 + }, + { + "epoch": 0.87, + "learning_rate": 2.5480283114256824e-05, + "loss": 3.1694, + "step": 6142 + }, + { + "epoch": 0.87, + "learning_rate": 2.5451393904376718e-05, + "loss": 3.1094, + "step": 6143 + }, + { + "epoch": 0.87, + "learning_rate": 2.5422504694496608e-05, + "loss": 3.0832, + "step": 6144 + }, + { + "epoch": 0.87, + "learning_rate": 2.5393615484616495e-05, + "loss": 3.2072, + "step": 6145 + }, + { + "epoch": 0.88, + "learning_rate": 2.536472627473639e-05, + "loss": 3.1934, + "step": 6146 + }, + { + "epoch": 0.88, + "learning_rate": 2.5335837064856276e-05, + "loss": 3.0024, + "step": 6147 + }, + { + "epoch": 0.88, + "learning_rate": 2.5306947854976167e-05, + "loss": 3.0558, + "step": 6148 + }, + { + "epoch": 0.88, + "learning_rate": 2.527805864509606e-05, + "loss": 3.0448, + "step": 6149 + }, + { + "epoch": 0.88, + "learning_rate": 2.5249169435215947e-05, + "loss": 3.2103, + "step": 6150 + }, + { + "epoch": 0.88, + "learning_rate": 2.522028022533584e-05, + "loss": 3.0402, + "step": 6151 + }, + { + "epoch": 0.88, + "learning_rate": 2.519139101545573e-05, + "loss": 3.0199, + "step": 6152 + }, + { + "epoch": 0.88, + "learning_rate": 2.5162501805575615e-05, + "loss": 2.9997, + "step": 6153 + }, + { + "epoch": 0.88, + "learning_rate": 2.513361259569551e-05, + "loss": 3.0992, + "step": 6154 + }, + { + "epoch": 0.88, + "learning_rate": 2.51047233858154e-05, + "loss": 3.0769, + "step": 6155 + }, + { + "epoch": 0.88, + "learning_rate": 2.5075834175935287e-05, + "loss": 3.0348, + "step": 6156 + }, + { + "epoch": 0.88, + "learning_rate": 2.504694496605518e-05, + "loss": 3.0612, + "step": 6157 + }, + { + "epoch": 0.88, + "learning_rate": 2.5018055756175068e-05, + "loss": 3.1116, + "step": 6158 + }, + { + "epoch": 0.88, + "learning_rate": 2.4989166546294958e-05, + "loss": 3.1809, + "step": 6159 + }, + { + "epoch": 0.88, + "learning_rate": 2.496027733641485e-05, + "loss": 2.9785, + "step": 6160 + }, + { + "epoch": 0.88, + "learning_rate": 2.4931388126534742e-05, + "loss": 3.1658, + "step": 6161 + }, + { + "epoch": 0.88, + "learning_rate": 2.4902498916654633e-05, + "loss": 3.0982, + "step": 6162 + }, + { + "epoch": 0.88, + "learning_rate": 2.487360970677452e-05, + "loss": 3.153, + "step": 6163 + }, + { + "epoch": 0.88, + "learning_rate": 2.484472049689441e-05, + "loss": 3.1239, + "step": 6164 + }, + { + "epoch": 0.88, + "learning_rate": 2.48158312870143e-05, + "loss": 3.0978, + "step": 6165 + }, + { + "epoch": 0.88, + "learning_rate": 2.478694207713419e-05, + "loss": 3.008, + "step": 6166 + }, + { + "epoch": 0.88, + "learning_rate": 2.475805286725408e-05, + "loss": 3.0674, + "step": 6167 + }, + { + "epoch": 0.88, + "learning_rate": 2.4729163657373972e-05, + "loss": 2.8771, + "step": 6168 + }, + { + "epoch": 0.88, + "learning_rate": 2.4700274447493862e-05, + "loss": 3.0125, + "step": 6169 + }, + { + "epoch": 0.88, + "learning_rate": 2.4671385237613753e-05, + "loss": 3.1738, + "step": 6170 + }, + { + "epoch": 0.88, + "learning_rate": 2.464249602773364e-05, + "loss": 3.0806, + "step": 6171 + }, + { + "epoch": 0.88, + "learning_rate": 2.4613606817853534e-05, + "loss": 2.9466, + "step": 6172 + }, + { + "epoch": 0.88, + "learning_rate": 2.4584717607973424e-05, + "loss": 3.0043, + "step": 6173 + }, + { + "epoch": 0.88, + "learning_rate": 2.4555828398093315e-05, + "loss": 3.0152, + "step": 6174 + }, + { + "epoch": 0.88, + "learning_rate": 2.4526939188213205e-05, + "loss": 3.0443, + "step": 6175 + }, + { + "epoch": 0.88, + "learning_rate": 2.4498049978333092e-05, + "loss": 3.0692, + "step": 6176 + }, + { + "epoch": 0.88, + "learning_rate": 2.4469160768452982e-05, + "loss": 3.1542, + "step": 6177 + }, + { + "epoch": 0.88, + "learning_rate": 2.4440271558572876e-05, + "loss": 3.1115, + "step": 6178 + }, + { + "epoch": 0.88, + "learning_rate": 2.4411382348692767e-05, + "loss": 3.1023, + "step": 6179 + }, + { + "epoch": 0.88, + "learning_rate": 2.4382493138812654e-05, + "loss": 3.1114, + "step": 6180 + }, + { + "epoch": 0.88, + "learning_rate": 2.4353603928932544e-05, + "loss": 3.1446, + "step": 6181 + }, + { + "epoch": 0.88, + "learning_rate": 2.4324714719052435e-05, + "loss": 3.0185, + "step": 6182 + }, + { + "epoch": 0.88, + "learning_rate": 2.4295825509172325e-05, + "loss": 2.9706, + "step": 6183 + }, + { + "epoch": 0.88, + "learning_rate": 2.4266936299292216e-05, + "loss": 3.0785, + "step": 6184 + }, + { + "epoch": 0.88, + "learning_rate": 2.4238047089412106e-05, + "loss": 2.9412, + "step": 6185 + }, + { + "epoch": 0.88, + "learning_rate": 2.4209157879531996e-05, + "loss": 3.1385, + "step": 6186 + }, + { + "epoch": 0.88, + "learning_rate": 2.4180268669651887e-05, + "loss": 3.2132, + "step": 6187 + }, + { + "epoch": 0.88, + "learning_rate": 2.4151379459771774e-05, + "loss": 3.1793, + "step": 6188 + }, + { + "epoch": 0.88, + "learning_rate": 2.4122490249891668e-05, + "loss": 3.1368, + "step": 6189 + }, + { + "epoch": 0.88, + "learning_rate": 2.4093601040011558e-05, + "loss": 2.9862, + "step": 6190 + }, + { + "epoch": 0.88, + "learning_rate": 2.406471183013145e-05, + "loss": 3.1003, + "step": 6191 + }, + { + "epoch": 0.88, + "learning_rate": 2.4035822620251336e-05, + "loss": 3.0735, + "step": 6192 + }, + { + "epoch": 0.88, + "learning_rate": 2.4006933410371226e-05, + "loss": 2.8659, + "step": 6193 + }, + { + "epoch": 0.88, + "learning_rate": 2.3978044200491117e-05, + "loss": 3.0916, + "step": 6194 + }, + { + "epoch": 0.88, + "learning_rate": 2.3949154990611007e-05, + "loss": 3.1532, + "step": 6195 + }, + { + "epoch": 0.88, + "learning_rate": 2.39202657807309e-05, + "loss": 3.1385, + "step": 6196 + }, + { + "epoch": 0.88, + "learning_rate": 2.3891376570850788e-05, + "loss": 2.9107, + "step": 6197 + }, + { + "epoch": 0.88, + "learning_rate": 2.3862487360970678e-05, + "loss": 3.0304, + "step": 6198 + }, + { + "epoch": 0.88, + "learning_rate": 2.383359815109057e-05, + "loss": 3.1211, + "step": 6199 + }, + { + "epoch": 0.88, + "learning_rate": 2.380470894121046e-05, + "loss": 3.0548, + "step": 6200 + }, + { + "epoch": 0.88, + "learning_rate": 2.377581973133035e-05, + "loss": 3.1243, + "step": 6201 + }, + { + "epoch": 0.88, + "learning_rate": 2.374693052145024e-05, + "loss": 3.1833, + "step": 6202 + }, + { + "epoch": 0.88, + "learning_rate": 2.371804131157013e-05, + "loss": 2.9298, + "step": 6203 + }, + { + "epoch": 0.88, + "learning_rate": 2.368915210169002e-05, + "loss": 3.1059, + "step": 6204 + }, + { + "epoch": 0.88, + "learning_rate": 2.3660262891809908e-05, + "loss": 3.1305, + "step": 6205 + }, + { + "epoch": 0.88, + "learning_rate": 2.36313736819298e-05, + "loss": 3.0229, + "step": 6206 + }, + { + "epoch": 0.88, + "learning_rate": 2.3602484472049692e-05, + "loss": 3.1692, + "step": 6207 + }, + { + "epoch": 0.88, + "learning_rate": 2.3573595262169583e-05, + "loss": 3.1101, + "step": 6208 + }, + { + "epoch": 0.88, + "learning_rate": 2.354470605228947e-05, + "loss": 2.9789, + "step": 6209 + }, + { + "epoch": 0.88, + "learning_rate": 2.351581684240936e-05, + "loss": 3.1158, + "step": 6210 + }, + { + "epoch": 0.88, + "learning_rate": 2.348692763252925e-05, + "loss": 3.0718, + "step": 6211 + }, + { + "epoch": 0.88, + "learning_rate": 2.345803842264914e-05, + "loss": 3.157, + "step": 6212 + }, + { + "epoch": 0.88, + "learning_rate": 2.3429149212769035e-05, + "loss": 3.1713, + "step": 6213 + }, + { + "epoch": 0.88, + "learning_rate": 2.3400260002888922e-05, + "loss": 2.9399, + "step": 6214 + }, + { + "epoch": 0.88, + "learning_rate": 2.3371370793008812e-05, + "loss": 3.1105, + "step": 6215 + }, + { + "epoch": 0.89, + "learning_rate": 2.3342481583128703e-05, + "loss": 3.0805, + "step": 6216 + }, + { + "epoch": 0.89, + "learning_rate": 2.3313592373248593e-05, + "loss": 3.1651, + "step": 6217 + }, + { + "epoch": 0.89, + "learning_rate": 2.3284703163368484e-05, + "loss": 2.9673, + "step": 6218 + }, + { + "epoch": 0.89, + "learning_rate": 2.3255813953488374e-05, + "loss": 3.0281, + "step": 6219 + }, + { + "epoch": 0.89, + "learning_rate": 2.3226924743608265e-05, + "loss": 3.0074, + "step": 6220 + }, + { + "epoch": 0.89, + "learning_rate": 2.3198035533728155e-05, + "loss": 3.175, + "step": 6221 + }, + { + "epoch": 0.89, + "learning_rate": 2.3169146323848042e-05, + "loss": 3.1241, + "step": 6222 + }, + { + "epoch": 0.89, + "learning_rate": 2.3140257113967932e-05, + "loss": 2.9221, + "step": 6223 + }, + { + "epoch": 0.89, + "learning_rate": 2.3111367904087823e-05, + "loss": 3.089, + "step": 6224 + }, + { + "epoch": 0.89, + "learning_rate": 2.3082478694207717e-05, + "loss": 3.1303, + "step": 6225 + }, + { + "epoch": 0.89, + "learning_rate": 2.3053589484327604e-05, + "loss": 3.0978, + "step": 6226 + }, + { + "epoch": 0.89, + "learning_rate": 2.3024700274447494e-05, + "loss": 2.8762, + "step": 6227 + }, + { + "epoch": 0.89, + "learning_rate": 2.2995811064567385e-05, + "loss": 2.7943, + "step": 6228 + }, + { + "epoch": 0.89, + "learning_rate": 2.2966921854687275e-05, + "loss": 3.1311, + "step": 6229 + }, + { + "epoch": 0.89, + "learning_rate": 2.2938032644807165e-05, + "loss": 3.0437, + "step": 6230 + }, + { + "epoch": 0.89, + "learning_rate": 2.2909143434927056e-05, + "loss": 3.0388, + "step": 6231 + }, + { + "epoch": 0.89, + "learning_rate": 2.2880254225046946e-05, + "loss": 3.1408, + "step": 6232 + }, + { + "epoch": 0.89, + "learning_rate": 2.2851365015166837e-05, + "loss": 3.0528, + "step": 6233 + }, + { + "epoch": 0.89, + "learning_rate": 2.2822475805286727e-05, + "loss": 3.1269, + "step": 6234 + }, + { + "epoch": 0.89, + "learning_rate": 2.2793586595406614e-05, + "loss": 3.1778, + "step": 6235 + }, + { + "epoch": 0.89, + "learning_rate": 2.2764697385526508e-05, + "loss": 3.1633, + "step": 6236 + }, + { + "epoch": 0.89, + "learning_rate": 2.27358081756464e-05, + "loss": 3.0909, + "step": 6237 + }, + { + "epoch": 0.89, + "learning_rate": 2.270691896576629e-05, + "loss": 3.0567, + "step": 6238 + }, + { + "epoch": 0.89, + "learning_rate": 2.2678029755886176e-05, + "loss": 2.9684, + "step": 6239 + }, + { + "epoch": 0.89, + "learning_rate": 2.2649140546006066e-05, + "loss": 3.0475, + "step": 6240 + }, + { + "epoch": 0.89, + "learning_rate": 2.2620251336125957e-05, + "loss": 3.0568, + "step": 6241 + }, + { + "epoch": 0.89, + "learning_rate": 2.259136212624585e-05, + "loss": 3.1187, + "step": 6242 + }, + { + "epoch": 0.89, + "learning_rate": 2.2562472916365738e-05, + "loss": 3.0751, + "step": 6243 + }, + { + "epoch": 0.89, + "learning_rate": 2.2533583706485628e-05, + "loss": 3.0804, + "step": 6244 + }, + { + "epoch": 0.89, + "learning_rate": 2.250469449660552e-05, + "loss": 3.1316, + "step": 6245 + }, + { + "epoch": 0.89, + "learning_rate": 2.247580528672541e-05, + "loss": 2.9817, + "step": 6246 + }, + { + "epoch": 0.89, + "learning_rate": 2.24469160768453e-05, + "loss": 3.0022, + "step": 6247 + }, + { + "epoch": 0.89, + "learning_rate": 2.241802686696519e-05, + "loss": 3.0777, + "step": 6248 + }, + { + "epoch": 0.89, + "learning_rate": 2.238913765708508e-05, + "loss": 3.0788, + "step": 6249 + }, + { + "epoch": 0.89, + "learning_rate": 2.236024844720497e-05, + "loss": 3.1099, + "step": 6250 + }, + { + "epoch": 0.89, + "learning_rate": 2.233135923732486e-05, + "loss": 3.0913, + "step": 6251 + }, + { + "epoch": 0.89, + "learning_rate": 2.230247002744475e-05, + "loss": 3.0541, + "step": 6252 + }, + { + "epoch": 0.89, + "learning_rate": 2.227358081756464e-05, + "loss": 2.9763, + "step": 6253 + }, + { + "epoch": 0.89, + "learning_rate": 2.2244691607684533e-05, + "loss": 3.0568, + "step": 6254 + }, + { + "epoch": 0.89, + "learning_rate": 2.2215802397804423e-05, + "loss": 3.2268, + "step": 6255 + }, + { + "epoch": 0.89, + "learning_rate": 2.218691318792431e-05, + "loss": 3.0721, + "step": 6256 + }, + { + "epoch": 0.89, + "learning_rate": 2.21580239780442e-05, + "loss": 3.1068, + "step": 6257 + }, + { + "epoch": 0.89, + "learning_rate": 2.212913476816409e-05, + "loss": 3.0956, + "step": 6258 + }, + { + "epoch": 0.89, + "learning_rate": 2.210024555828398e-05, + "loss": 2.968, + "step": 6259 + }, + { + "epoch": 0.89, + "learning_rate": 2.2071356348403872e-05, + "loss": 3.0614, + "step": 6260 + }, + { + "epoch": 0.89, + "learning_rate": 2.2042467138523762e-05, + "loss": 2.9459, + "step": 6261 + }, + { + "epoch": 0.89, + "learning_rate": 2.2013577928643653e-05, + "loss": 3.0805, + "step": 6262 + }, + { + "epoch": 0.89, + "learning_rate": 2.1984688718763543e-05, + "loss": 2.9983, + "step": 6263 + }, + { + "epoch": 0.89, + "learning_rate": 2.195579950888343e-05, + "loss": 3.1816, + "step": 6264 + }, + { + "epoch": 0.89, + "learning_rate": 2.1926910299003324e-05, + "loss": 3.1293, + "step": 6265 + }, + { + "epoch": 0.89, + "learning_rate": 2.1898021089123214e-05, + "loss": 3.0368, + "step": 6266 + }, + { + "epoch": 0.89, + "learning_rate": 2.1869131879243105e-05, + "loss": 3.0531, + "step": 6267 + }, + { + "epoch": 0.89, + "learning_rate": 2.1840242669362995e-05, + "loss": 3.1396, + "step": 6268 + }, + { + "epoch": 0.89, + "learning_rate": 2.1811353459482882e-05, + "loss": 2.9849, + "step": 6269 + }, + { + "epoch": 0.89, + "learning_rate": 2.1782464249602773e-05, + "loss": 3.1006, + "step": 6270 + }, + { + "epoch": 0.89, + "learning_rate": 2.1753575039722667e-05, + "loss": 3.1267, + "step": 6271 + }, + { + "epoch": 0.89, + "learning_rate": 2.1724685829842557e-05, + "loss": 3.1102, + "step": 6272 + }, + { + "epoch": 0.89, + "learning_rate": 2.1695796619962444e-05, + "loss": 3.1123, + "step": 6273 + }, + { + "epoch": 0.89, + "learning_rate": 2.1666907410082335e-05, + "loss": 3.0632, + "step": 6274 + }, + { + "epoch": 0.89, + "learning_rate": 2.1638018200202225e-05, + "loss": 3.1324, + "step": 6275 + }, + { + "epoch": 0.89, + "learning_rate": 2.1609128990322115e-05, + "loss": 2.9165, + "step": 6276 + }, + { + "epoch": 0.89, + "learning_rate": 2.1580239780442006e-05, + "loss": 3.1479, + "step": 6277 + }, + { + "epoch": 0.89, + "learning_rate": 2.1551350570561896e-05, + "loss": 3.0407, + "step": 6278 + }, + { + "epoch": 0.89, + "learning_rate": 2.1522461360681787e-05, + "loss": 3.1366, + "step": 6279 + }, + { + "epoch": 0.89, + "learning_rate": 2.1493572150801677e-05, + "loss": 3.132, + "step": 6280 + }, + { + "epoch": 0.89, + "learning_rate": 2.1464682940921564e-05, + "loss": 3.1849, + "step": 6281 + }, + { + "epoch": 0.89, + "learning_rate": 2.1435793731041455e-05, + "loss": 3.0809, + "step": 6282 + }, + { + "epoch": 0.89, + "learning_rate": 2.140690452116135e-05, + "loss": 3.1511, + "step": 6283 + }, + { + "epoch": 0.89, + "learning_rate": 2.137801531128124e-05, + "loss": 3.1877, + "step": 6284 + }, + { + "epoch": 0.89, + "learning_rate": 2.134912610140113e-05, + "loss": 3.0118, + "step": 6285 + }, + { + "epoch": 0.89, + "learning_rate": 2.1320236891521016e-05, + "loss": 3.0608, + "step": 6286 + }, + { + "epoch": 0.9, + "learning_rate": 2.1291347681640907e-05, + "loss": 2.9224, + "step": 6287 + }, + { + "epoch": 0.9, + "learning_rate": 2.1262458471760797e-05, + "loss": 3.0317, + "step": 6288 + }, + { + "epoch": 0.9, + "learning_rate": 2.123356926188069e-05, + "loss": 3.1024, + "step": 6289 + }, + { + "epoch": 0.9, + "learning_rate": 2.1204680052000578e-05, + "loss": 2.9715, + "step": 6290 + }, + { + "epoch": 0.9, + "learning_rate": 2.117579084212047e-05, + "loss": 3.0075, + "step": 6291 + }, + { + "epoch": 0.9, + "learning_rate": 2.114690163224036e-05, + "loss": 3.0364, + "step": 6292 + }, + { + "epoch": 0.9, + "learning_rate": 2.111801242236025e-05, + "loss": 3.0769, + "step": 6293 + }, + { + "epoch": 0.9, + "learning_rate": 2.108912321248014e-05, + "loss": 2.9219, + "step": 6294 + }, + { + "epoch": 0.9, + "learning_rate": 2.106023400260003e-05, + "loss": 3.1058, + "step": 6295 + }, + { + "epoch": 0.9, + "learning_rate": 2.103134479271992e-05, + "loss": 3.0322, + "step": 6296 + }, + { + "epoch": 0.9, + "learning_rate": 2.100245558283981e-05, + "loss": 2.9246, + "step": 6297 + }, + { + "epoch": 0.9, + "learning_rate": 2.0973566372959698e-05, + "loss": 3.0454, + "step": 6298 + }, + { + "epoch": 0.9, + "learning_rate": 2.094467716307959e-05, + "loss": 3.1417, + "step": 6299 + }, + { + "epoch": 0.9, + "learning_rate": 2.0915787953199483e-05, + "loss": 2.9522, + "step": 6300 + }, + { + "epoch": 0.9, + "learning_rate": 2.0886898743319373e-05, + "loss": 3.0798, + "step": 6301 + }, + { + "epoch": 0.9, + "learning_rate": 2.0858009533439263e-05, + "loss": 3.0087, + "step": 6302 + }, + { + "epoch": 0.9, + "learning_rate": 2.082912032355915e-05, + "loss": 3.03, + "step": 6303 + }, + { + "epoch": 0.9, + "learning_rate": 2.080023111367904e-05, + "loss": 3.1378, + "step": 6304 + }, + { + "epoch": 0.9, + "learning_rate": 2.077134190379893e-05, + "loss": 3.0412, + "step": 6305 + }, + { + "epoch": 0.9, + "learning_rate": 2.0742452693918825e-05, + "loss": 3.1259, + "step": 6306 + }, + { + "epoch": 0.9, + "learning_rate": 2.0713563484038712e-05, + "loss": 2.9775, + "step": 6307 + }, + { + "epoch": 0.9, + "learning_rate": 2.0684674274158603e-05, + "loss": 2.9517, + "step": 6308 + }, + { + "epoch": 0.9, + "learning_rate": 2.0655785064278493e-05, + "loss": 3.0625, + "step": 6309 + }, + { + "epoch": 0.9, + "learning_rate": 2.0626895854398384e-05, + "loss": 2.9316, + "step": 6310 + }, + { + "epoch": 0.9, + "learning_rate": 2.0598006644518274e-05, + "loss": 3.1149, + "step": 6311 + }, + { + "epoch": 0.9, + "learning_rate": 2.0569117434638164e-05, + "loss": 3.2036, + "step": 6312 + }, + { + "epoch": 0.9, + "learning_rate": 2.0540228224758055e-05, + "loss": 3.1569, + "step": 6313 + }, + { + "epoch": 0.9, + "learning_rate": 2.0511339014877945e-05, + "loss": 3.1524, + "step": 6314 + }, + { + "epoch": 0.9, + "learning_rate": 2.0482449804997832e-05, + "loss": 3.0428, + "step": 6315 + }, + { + "epoch": 0.9, + "learning_rate": 2.0453560595117723e-05, + "loss": 3.0148, + "step": 6316 + }, + { + "epoch": 0.9, + "learning_rate": 2.0424671385237613e-05, + "loss": 3.051, + "step": 6317 + }, + { + "epoch": 0.9, + "learning_rate": 2.0395782175357507e-05, + "loss": 3.0329, + "step": 6318 + }, + { + "epoch": 0.9, + "learning_rate": 2.0366892965477394e-05, + "loss": 2.9549, + "step": 6319 + }, + { + "epoch": 0.9, + "learning_rate": 2.0338003755597285e-05, + "loss": 3.1519, + "step": 6320 + }, + { + "epoch": 0.9, + "learning_rate": 2.0309114545717175e-05, + "loss": 2.9162, + "step": 6321 + }, + { + "epoch": 0.9, + "learning_rate": 2.0280225335837065e-05, + "loss": 3.012, + "step": 6322 + }, + { + "epoch": 0.9, + "learning_rate": 2.0251336125956956e-05, + "loss": 2.8915, + "step": 6323 + }, + { + "epoch": 0.9, + "learning_rate": 2.0222446916076846e-05, + "loss": 3.0236, + "step": 6324 + }, + { + "epoch": 0.9, + "learning_rate": 2.0193557706196737e-05, + "loss": 3.0933, + "step": 6325 + }, + { + "epoch": 0.9, + "learning_rate": 2.0164668496316627e-05, + "loss": 3.1941, + "step": 6326 + }, + { + "epoch": 0.9, + "learning_rate": 2.0135779286436518e-05, + "loss": 3.1418, + "step": 6327 + }, + { + "epoch": 0.9, + "learning_rate": 2.0106890076556405e-05, + "loss": 3.0804, + "step": 6328 + }, + { + "epoch": 0.9, + "learning_rate": 2.00780008666763e-05, + "loss": 3.0721, + "step": 6329 + }, + { + "epoch": 0.9, + "learning_rate": 2.004911165679619e-05, + "loss": 3.2047, + "step": 6330 + }, + { + "epoch": 0.9, + "learning_rate": 2.002022244691608e-05, + "loss": 3.0389, + "step": 6331 + }, + { + "epoch": 0.9, + "learning_rate": 1.9991333237035966e-05, + "loss": 3.1679, + "step": 6332 + }, + { + "epoch": 0.9, + "learning_rate": 1.9962444027155857e-05, + "loss": 3.0885, + "step": 6333 + }, + { + "epoch": 0.9, + "learning_rate": 1.9933554817275747e-05, + "loss": 3.0729, + "step": 6334 + }, + { + "epoch": 0.9, + "learning_rate": 1.990466560739564e-05, + "loss": 2.9343, + "step": 6335 + }, + { + "epoch": 0.9, + "learning_rate": 1.9875776397515528e-05, + "loss": 3.0759, + "step": 6336 + }, + { + "epoch": 0.9, + "eval_loss": 3.3094747066497803, + "eval_runtime": 472.097, + "eval_samples_per_second": 43.396, + "eval_steps_per_second": 14.465, + "step": 6336 + }, + { + "epoch": 0.9, + "learning_rate": 1.984688718763542e-05, + "loss": 3.1271, + "step": 6337 + }, + { + "epoch": 0.9, + "learning_rate": 1.981799797775531e-05, + "loss": 2.8808, + "step": 6338 + }, + { + "epoch": 0.9, + "learning_rate": 1.97891087678752e-05, + "loss": 2.9369, + "step": 6339 + }, + { + "epoch": 0.9, + "learning_rate": 1.976021955799509e-05, + "loss": 3.0485, + "step": 6340 + }, + { + "epoch": 0.9, + "learning_rate": 1.973133034811498e-05, + "loss": 3.0099, + "step": 6341 + }, + { + "epoch": 0.9, + "learning_rate": 1.970244113823487e-05, + "loss": 3.0134, + "step": 6342 + }, + { + "epoch": 0.9, + "learning_rate": 1.967355192835476e-05, + "loss": 3.1073, + "step": 6343 + }, + { + "epoch": 0.9, + "learning_rate": 1.964466271847465e-05, + "loss": 3.1248, + "step": 6344 + }, + { + "epoch": 0.9, + "learning_rate": 1.961577350859454e-05, + "loss": 2.9222, + "step": 6345 + }, + { + "epoch": 0.9, + "learning_rate": 1.958688429871443e-05, + "loss": 3.0365, + "step": 6346 + }, + { + "epoch": 0.9, + "learning_rate": 1.9557995088834323e-05, + "loss": 3.0688, + "step": 6347 + }, + { + "epoch": 0.9, + "learning_rate": 1.9529105878954213e-05, + "loss": 3.073, + "step": 6348 + }, + { + "epoch": 0.9, + "learning_rate": 1.95002166690741e-05, + "loss": 2.8182, + "step": 6349 + }, + { + "epoch": 0.9, + "learning_rate": 1.947132745919399e-05, + "loss": 3.1043, + "step": 6350 + }, + { + "epoch": 0.9, + "learning_rate": 1.944243824931388e-05, + "loss": 3.1836, + "step": 6351 + }, + { + "epoch": 0.9, + "learning_rate": 1.9413549039433772e-05, + "loss": 3.1243, + "step": 6352 + }, + { + "epoch": 0.9, + "learning_rate": 1.9384659829553662e-05, + "loss": 3.1342, + "step": 6353 + }, + { + "epoch": 0.9, + "learning_rate": 1.9355770619673553e-05, + "loss": 3.1523, + "step": 6354 + }, + { + "epoch": 0.9, + "learning_rate": 1.9326881409793443e-05, + "loss": 3.1038, + "step": 6355 + }, + { + "epoch": 0.9, + "learning_rate": 1.9297992199913334e-05, + "loss": 2.985, + "step": 6356 + }, + { + "epoch": 0.91, + "learning_rate": 1.9269102990033224e-05, + "loss": 2.9722, + "step": 6357 + }, + { + "epoch": 0.91, + "learning_rate": 1.9240213780153114e-05, + "loss": 3.0341, + "step": 6358 + }, + { + "epoch": 0.91, + "learning_rate": 1.9211324570273005e-05, + "loss": 3.0669, + "step": 6359 + }, + { + "epoch": 0.91, + "learning_rate": 1.9182435360392895e-05, + "loss": 2.9711, + "step": 6360 + }, + { + "epoch": 0.91, + "learning_rate": 1.9153546150512786e-05, + "loss": 3.0699, + "step": 6361 + }, + { + "epoch": 0.91, + "learning_rate": 1.9124656940632673e-05, + "loss": 2.9977, + "step": 6362 + }, + { + "epoch": 0.91, + "learning_rate": 1.9095767730752563e-05, + "loss": 3.0904, + "step": 6363 + }, + { + "epoch": 0.91, + "learning_rate": 1.9066878520872457e-05, + "loss": 3.2007, + "step": 6364 + }, + { + "epoch": 0.91, + "learning_rate": 1.9037989310992347e-05, + "loss": 3.0675, + "step": 6365 + }, + { + "epoch": 0.91, + "learning_rate": 1.9009100101112234e-05, + "loss": 2.9933, + "step": 6366 + }, + { + "epoch": 0.91, + "learning_rate": 1.8980210891232125e-05, + "loss": 3.0551, + "step": 6367 + }, + { + "epoch": 0.91, + "learning_rate": 1.8951321681352015e-05, + "loss": 3.1746, + "step": 6368 + }, + { + "epoch": 0.91, + "learning_rate": 1.8922432471471906e-05, + "loss": 2.9598, + "step": 6369 + }, + { + "epoch": 0.91, + "learning_rate": 1.8893543261591796e-05, + "loss": 2.9074, + "step": 6370 + }, + { + "epoch": 0.91, + "learning_rate": 1.8864654051711687e-05, + "loss": 3.0811, + "step": 6371 + }, + { + "epoch": 0.91, + "learning_rate": 1.8835764841831577e-05, + "loss": 2.9267, + "step": 6372 + }, + { + "epoch": 0.91, + "learning_rate": 1.8806875631951468e-05, + "loss": 2.8797, + "step": 6373 + }, + { + "epoch": 0.91, + "learning_rate": 1.8777986422071358e-05, + "loss": 2.917, + "step": 6374 + }, + { + "epoch": 0.91, + "learning_rate": 1.8749097212191245e-05, + "loss": 3.0835, + "step": 6375 + }, + { + "epoch": 0.91, + "learning_rate": 1.872020800231114e-05, + "loss": 2.9676, + "step": 6376 + }, + { + "epoch": 0.91, + "learning_rate": 1.869131879243103e-05, + "loss": 3.1162, + "step": 6377 + }, + { + "epoch": 0.91, + "learning_rate": 1.866242958255092e-05, + "loss": 3.0682, + "step": 6378 + }, + { + "epoch": 0.91, + "learning_rate": 1.8633540372670807e-05, + "loss": 3.166, + "step": 6379 + }, + { + "epoch": 0.91, + "learning_rate": 1.8604651162790697e-05, + "loss": 3.1365, + "step": 6380 + }, + { + "epoch": 0.91, + "learning_rate": 1.8575761952910588e-05, + "loss": 3.1739, + "step": 6381 + }, + { + "epoch": 0.91, + "learning_rate": 1.854687274303048e-05, + "loss": 3.0823, + "step": 6382 + }, + { + "epoch": 0.91, + "learning_rate": 1.851798353315037e-05, + "loss": 3.1357, + "step": 6383 + }, + { + "epoch": 0.91, + "learning_rate": 1.848909432327026e-05, + "loss": 3.0598, + "step": 6384 + }, + { + "epoch": 0.91, + "learning_rate": 1.846020511339015e-05, + "loss": 2.9985, + "step": 6385 + }, + { + "epoch": 0.91, + "learning_rate": 1.843131590351004e-05, + "loss": 3.013, + "step": 6386 + }, + { + "epoch": 0.91, + "learning_rate": 1.840242669362993e-05, + "loss": 3.0844, + "step": 6387 + }, + { + "epoch": 0.91, + "learning_rate": 1.837353748374982e-05, + "loss": 2.892, + "step": 6388 + }, + { + "epoch": 0.91, + "learning_rate": 1.834464827386971e-05, + "loss": 2.9809, + "step": 6389 + }, + { + "epoch": 0.91, + "learning_rate": 1.83157590639896e-05, + "loss": 3.1594, + "step": 6390 + }, + { + "epoch": 0.91, + "learning_rate": 1.828686985410949e-05, + "loss": 3.0109, + "step": 6391 + }, + { + "epoch": 0.91, + "learning_rate": 1.825798064422938e-05, + "loss": 3.0524, + "step": 6392 + }, + { + "epoch": 0.91, + "learning_rate": 1.8229091434349273e-05, + "loss": 3.11, + "step": 6393 + }, + { + "epoch": 0.91, + "learning_rate": 1.8200202224469163e-05, + "loss": 3.0801, + "step": 6394 + }, + { + "epoch": 0.91, + "learning_rate": 1.8171313014589054e-05, + "loss": 3.0253, + "step": 6395 + }, + { + "epoch": 0.91, + "learning_rate": 1.814242380470894e-05, + "loss": 3.1307, + "step": 6396 + }, + { + "epoch": 0.91, + "learning_rate": 1.811353459482883e-05, + "loss": 3.0491, + "step": 6397 + }, + { + "epoch": 0.91, + "learning_rate": 1.8084645384948722e-05, + "loss": 2.8271, + "step": 6398 + }, + { + "epoch": 0.91, + "learning_rate": 1.8055756175068612e-05, + "loss": 3.1495, + "step": 6399 + }, + { + "epoch": 0.91, + "learning_rate": 1.8026866965188503e-05, + "loss": 3.1997, + "step": 6400 + }, + { + "epoch": 0.91, + "learning_rate": 1.7997977755308393e-05, + "loss": 3.0777, + "step": 6401 + }, + { + "epoch": 0.91, + "learning_rate": 1.7969088545428283e-05, + "loss": 3.0433, + "step": 6402 + }, + { + "epoch": 0.91, + "learning_rate": 1.7940199335548174e-05, + "loss": 2.9814, + "step": 6403 + }, + { + "epoch": 0.91, + "learning_rate": 1.7911310125668064e-05, + "loss": 3.0668, + "step": 6404 + }, + { + "epoch": 0.91, + "learning_rate": 1.7882420915787955e-05, + "loss": 3.0224, + "step": 6405 + }, + { + "epoch": 0.91, + "learning_rate": 1.7853531705907845e-05, + "loss": 2.9634, + "step": 6406 + }, + { + "epoch": 0.91, + "learning_rate": 1.7824642496027736e-05, + "loss": 3.064, + "step": 6407 + }, + { + "epoch": 0.91, + "learning_rate": 1.7795753286147623e-05, + "loss": 3.1138, + "step": 6408 + }, + { + "epoch": 0.91, + "learning_rate": 1.7766864076267513e-05, + "loss": 2.8335, + "step": 6409 + }, + { + "epoch": 0.91, + "learning_rate": 1.7737974866387404e-05, + "loss": 3.0095, + "step": 6410 + }, + { + "epoch": 0.91, + "learning_rate": 1.7709085656507297e-05, + "loss": 3.1095, + "step": 6411 + }, + { + "epoch": 0.91, + "learning_rate": 1.7680196446627188e-05, + "loss": 2.9441, + "step": 6412 + }, + { + "epoch": 0.91, + "learning_rate": 1.7651307236747075e-05, + "loss": 3.0799, + "step": 6413 + }, + { + "epoch": 0.91, + "learning_rate": 1.7622418026866965e-05, + "loss": 3.0277, + "step": 6414 + }, + { + "epoch": 0.91, + "learning_rate": 1.7593528816986856e-05, + "loss": 2.9749, + "step": 6415 + }, + { + "epoch": 0.91, + "learning_rate": 1.7564639607106746e-05, + "loss": 3.1017, + "step": 6416 + }, + { + "epoch": 0.91, + "learning_rate": 1.7535750397226637e-05, + "loss": 3.1977, + "step": 6417 + }, + { + "epoch": 0.91, + "learning_rate": 1.7506861187346527e-05, + "loss": 3.0746, + "step": 6418 + }, + { + "epoch": 0.91, + "learning_rate": 1.7477971977466418e-05, + "loss": 3.0608, + "step": 6419 + }, + { + "epoch": 0.91, + "learning_rate": 1.7449082767586308e-05, + "loss": 3.1119, + "step": 6420 + }, + { + "epoch": 0.91, + "learning_rate": 1.7420193557706195e-05, + "loss": 2.9672, + "step": 6421 + }, + { + "epoch": 0.91, + "learning_rate": 1.739130434782609e-05, + "loss": 3.1024, + "step": 6422 + }, + { + "epoch": 0.91, + "learning_rate": 1.736241513794598e-05, + "loss": 3.1323, + "step": 6423 + }, + { + "epoch": 0.91, + "learning_rate": 1.733352592806587e-05, + "loss": 3.093, + "step": 6424 + }, + { + "epoch": 0.91, + "learning_rate": 1.7304636718185757e-05, + "loss": 3.0176, + "step": 6425 + }, + { + "epoch": 0.91, + "learning_rate": 1.7275747508305647e-05, + "loss": 3.1097, + "step": 6426 + }, + { + "epoch": 0.92, + "learning_rate": 1.7246858298425538e-05, + "loss": 2.881, + "step": 6427 + }, + { + "epoch": 0.92, + "learning_rate": 1.721796908854543e-05, + "loss": 3.1614, + "step": 6428 + }, + { + "epoch": 0.92, + "learning_rate": 1.7189079878665322e-05, + "loss": 3.0169, + "step": 6429 + }, + { + "epoch": 0.92, + "learning_rate": 1.716019066878521e-05, + "loss": 3.0446, + "step": 6430 + }, + { + "epoch": 0.92, + "learning_rate": 1.71313014589051e-05, + "loss": 2.9543, + "step": 6431 + }, + { + "epoch": 0.92, + "learning_rate": 1.710241224902499e-05, + "loss": 3.0359, + "step": 6432 + }, + { + "epoch": 0.92, + "learning_rate": 1.707352303914488e-05, + "loss": 3.0817, + "step": 6433 + }, + { + "epoch": 0.92, + "learning_rate": 1.704463382926477e-05, + "loss": 3.0606, + "step": 6434 + }, + { + "epoch": 0.92, + "learning_rate": 1.701574461938466e-05, + "loss": 3.0164, + "step": 6435 + }, + { + "epoch": 0.92, + "learning_rate": 1.698685540950455e-05, + "loss": 3.1532, + "step": 6436 + }, + { + "epoch": 0.92, + "learning_rate": 1.6957966199624442e-05, + "loss": 2.9962, + "step": 6437 + }, + { + "epoch": 0.92, + "learning_rate": 1.692907698974433e-05, + "loss": 3.1381, + "step": 6438 + }, + { + "epoch": 0.92, + "learning_rate": 1.690018777986422e-05, + "loss": 3.0888, + "step": 6439 + }, + { + "epoch": 0.92, + "learning_rate": 1.6871298569984113e-05, + "loss": 3.0824, + "step": 6440 + }, + { + "epoch": 0.92, + "learning_rate": 1.6842409360104004e-05, + "loss": 2.7761, + "step": 6441 + }, + { + "epoch": 0.92, + "learning_rate": 1.681352015022389e-05, + "loss": 3.1271, + "step": 6442 + }, + { + "epoch": 0.92, + "learning_rate": 1.678463094034378e-05, + "loss": 3.1044, + "step": 6443 + }, + { + "epoch": 0.92, + "learning_rate": 1.675574173046367e-05, + "loss": 3.0952, + "step": 6444 + }, + { + "epoch": 0.92, + "learning_rate": 1.6726852520583562e-05, + "loss": 2.8992, + "step": 6445 + }, + { + "epoch": 0.92, + "learning_rate": 1.6697963310703456e-05, + "loss": 3.1165, + "step": 6446 + }, + { + "epoch": 0.92, + "learning_rate": 1.6669074100823343e-05, + "loss": 3.2024, + "step": 6447 + }, + { + "epoch": 0.92, + "learning_rate": 1.6640184890943233e-05, + "loss": 2.7159, + "step": 6448 + }, + { + "epoch": 0.92, + "learning_rate": 1.6611295681063124e-05, + "loss": 3.1082, + "step": 6449 + }, + { + "epoch": 0.92, + "learning_rate": 1.6582406471183014e-05, + "loss": 3.1386, + "step": 6450 + }, + { + "epoch": 0.92, + "learning_rate": 1.6553517261302905e-05, + "loss": 3.0407, + "step": 6451 + }, + { + "epoch": 0.92, + "learning_rate": 1.6524628051422795e-05, + "loss": 3.0281, + "step": 6452 + }, + { + "epoch": 0.92, + "learning_rate": 1.6495738841542686e-05, + "loss": 3.0393, + "step": 6453 + }, + { + "epoch": 0.92, + "learning_rate": 1.6466849631662576e-05, + "loss": 3.0324, + "step": 6454 + }, + { + "epoch": 0.92, + "learning_rate": 1.6437960421782463e-05, + "loss": 2.9705, + "step": 6455 + }, + { + "epoch": 0.92, + "learning_rate": 1.6409071211902354e-05, + "loss": 2.9463, + "step": 6456 + }, + { + "epoch": 0.92, + "learning_rate": 1.6380182002022247e-05, + "loss": 3.0211, + "step": 6457 + }, + { + "epoch": 0.92, + "learning_rate": 1.6351292792142138e-05, + "loss": 2.9502, + "step": 6458 + }, + { + "epoch": 0.92, + "learning_rate": 1.6322403582262025e-05, + "loss": 2.9689, + "step": 6459 + }, + { + "epoch": 0.92, + "learning_rate": 1.6293514372381915e-05, + "loss": 3.1397, + "step": 6460 + }, + { + "epoch": 0.92, + "learning_rate": 1.6264625162501806e-05, + "loss": 3.0759, + "step": 6461 + }, + { + "epoch": 0.92, + "learning_rate": 1.6235735952621696e-05, + "loss": 2.9888, + "step": 6462 + }, + { + "epoch": 0.92, + "learning_rate": 1.6206846742741587e-05, + "loss": 3.0348, + "step": 6463 + }, + { + "epoch": 0.92, + "learning_rate": 1.6177957532861477e-05, + "loss": 3.0546, + "step": 6464 + }, + { + "epoch": 0.92, + "learning_rate": 1.6149068322981367e-05, + "loss": 3.0652, + "step": 6465 + }, + { + "epoch": 0.92, + "learning_rate": 1.6120179113101258e-05, + "loss": 2.9981, + "step": 6466 + }, + { + "epoch": 0.92, + "learning_rate": 1.609128990322115e-05, + "loss": 3.0046, + "step": 6467 + }, + { + "epoch": 0.92, + "learning_rate": 1.6062400693341035e-05, + "loss": 2.9306, + "step": 6468 + }, + { + "epoch": 0.92, + "learning_rate": 1.603351148346093e-05, + "loss": 3.0131, + "step": 6469 + }, + { + "epoch": 0.92, + "learning_rate": 1.600462227358082e-05, + "loss": 3.0597, + "step": 6470 + }, + { + "epoch": 0.92, + "learning_rate": 1.597573306370071e-05, + "loss": 2.9273, + "step": 6471 + }, + { + "epoch": 0.92, + "learning_rate": 1.5946843853820597e-05, + "loss": 3.0676, + "step": 6472 + }, + { + "epoch": 0.92, + "learning_rate": 1.5917954643940488e-05, + "loss": 3.0043, + "step": 6473 + }, + { + "epoch": 0.92, + "learning_rate": 1.5889065434060378e-05, + "loss": 3.0176, + "step": 6474 + }, + { + "epoch": 0.92, + "learning_rate": 1.5860176224180272e-05, + "loss": 3.1659, + "step": 6475 + }, + { + "epoch": 0.92, + "learning_rate": 1.583128701430016e-05, + "loss": 3.2128, + "step": 6476 + }, + { + "epoch": 0.92, + "learning_rate": 1.580239780442005e-05, + "loss": 3.0655, + "step": 6477 + }, + { + "epoch": 0.92, + "learning_rate": 1.577350859453994e-05, + "loss": 2.962, + "step": 6478 + }, + { + "epoch": 0.92, + "learning_rate": 1.574461938465983e-05, + "loss": 3.2248, + "step": 6479 + }, + { + "epoch": 0.92, + "learning_rate": 1.571573017477972e-05, + "loss": 3.1467, + "step": 6480 + }, + { + "epoch": 0.92, + "learning_rate": 1.568684096489961e-05, + "loss": 2.7129, + "step": 6481 + }, + { + "epoch": 0.92, + "learning_rate": 1.56579517550195e-05, + "loss": 3.1727, + "step": 6482 + }, + { + "epoch": 0.92, + "learning_rate": 1.5629062545139392e-05, + "loss": 2.9672, + "step": 6483 + }, + { + "epoch": 0.92, + "learning_rate": 1.5600173335259282e-05, + "loss": 3.0074, + "step": 6484 + }, + { + "epoch": 0.92, + "learning_rate": 1.557128412537917e-05, + "loss": 3.0807, + "step": 6485 + }, + { + "epoch": 0.92, + "learning_rate": 1.5542394915499063e-05, + "loss": 3.1327, + "step": 6486 + }, + { + "epoch": 0.92, + "learning_rate": 1.5513505705618954e-05, + "loss": 2.9074, + "step": 6487 + }, + { + "epoch": 0.92, + "learning_rate": 1.5484616495738844e-05, + "loss": 3.0862, + "step": 6488 + }, + { + "epoch": 0.92, + "learning_rate": 1.545572728585873e-05, + "loss": 3.0671, + "step": 6489 + }, + { + "epoch": 0.92, + "learning_rate": 1.542683807597862e-05, + "loss": 3.0044, + "step": 6490 + }, + { + "epoch": 0.92, + "learning_rate": 1.5397948866098512e-05, + "loss": 3.1161, + "step": 6491 + }, + { + "epoch": 0.92, + "learning_rate": 1.5369059656218402e-05, + "loss": 2.9369, + "step": 6492 + }, + { + "epoch": 0.92, + "learning_rate": 1.5340170446338293e-05, + "loss": 3.1136, + "step": 6493 + }, + { + "epoch": 0.92, + "learning_rate": 1.5311281236458183e-05, + "loss": 3.0749, + "step": 6494 + }, + { + "epoch": 0.92, + "learning_rate": 1.5282392026578074e-05, + "loss": 3.1212, + "step": 6495 + }, + { + "epoch": 0.92, + "learning_rate": 1.5253502816697964e-05, + "loss": 3.0919, + "step": 6496 + }, + { + "epoch": 0.93, + "learning_rate": 1.5224613606817853e-05, + "loss": 3.0911, + "step": 6497 + }, + { + "epoch": 0.93, + "learning_rate": 1.5195724396937743e-05, + "loss": 3.2119, + "step": 6498 + }, + { + "epoch": 0.93, + "learning_rate": 1.5166835187057636e-05, + "loss": 3.1722, + "step": 6499 + }, + { + "epoch": 0.93, + "learning_rate": 1.5137945977177526e-05, + "loss": 3.1116, + "step": 6500 + }, + { + "epoch": 0.93, + "learning_rate": 1.5109056767297416e-05, + "loss": 2.9683, + "step": 6501 + }, + { + "epoch": 0.93, + "learning_rate": 1.5080167557417305e-05, + "loss": 3.0567, + "step": 6502 + }, + { + "epoch": 0.93, + "learning_rate": 1.5051278347537196e-05, + "loss": 3.0653, + "step": 6503 + }, + { + "epoch": 0.93, + "learning_rate": 1.5022389137657086e-05, + "loss": 2.9544, + "step": 6504 + }, + { + "epoch": 0.93, + "learning_rate": 1.4993499927776976e-05, + "loss": 3.0834, + "step": 6505 + }, + { + "epoch": 0.93, + "learning_rate": 1.4964610717896865e-05, + "loss": 3.1934, + "step": 6506 + }, + { + "epoch": 0.93, + "learning_rate": 1.4935721508016756e-05, + "loss": 3.0452, + "step": 6507 + }, + { + "epoch": 0.93, + "learning_rate": 1.4906832298136648e-05, + "loss": 3.1946, + "step": 6508 + }, + { + "epoch": 0.93, + "learning_rate": 1.4877943088256538e-05, + "loss": 2.9808, + "step": 6509 + }, + { + "epoch": 0.93, + "learning_rate": 1.4849053878376425e-05, + "loss": 3.0162, + "step": 6510 + }, + { + "epoch": 0.93, + "learning_rate": 1.4820164668496317e-05, + "loss": 3.0212, + "step": 6511 + }, + { + "epoch": 0.93, + "learning_rate": 1.4791275458616208e-05, + "loss": 2.8753, + "step": 6512 + }, + { + "epoch": 0.93, + "learning_rate": 1.4762386248736098e-05, + "loss": 2.995, + "step": 6513 + }, + { + "epoch": 0.93, + "learning_rate": 1.4733497038855987e-05, + "loss": 2.9755, + "step": 6514 + }, + { + "epoch": 0.93, + "learning_rate": 1.4704607828975877e-05, + "loss": 2.9062, + "step": 6515 + }, + { + "epoch": 0.93, + "learning_rate": 1.4675718619095768e-05, + "loss": 3.1349, + "step": 6516 + }, + { + "epoch": 0.93, + "learning_rate": 1.464682940921566e-05, + "loss": 2.9564, + "step": 6517 + }, + { + "epoch": 0.93, + "learning_rate": 1.461794019933555e-05, + "loss": 3.0887, + "step": 6518 + }, + { + "epoch": 0.93, + "learning_rate": 1.4589050989455438e-05, + "loss": 3.1461, + "step": 6519 + }, + { + "epoch": 0.93, + "learning_rate": 1.456016177957533e-05, + "loss": 2.9868, + "step": 6520 + }, + { + "epoch": 0.93, + "learning_rate": 1.453127256969522e-05, + "loss": 2.8988, + "step": 6521 + }, + { + "epoch": 0.93, + "learning_rate": 1.450238335981511e-05, + "loss": 2.965, + "step": 6522 + }, + { + "epoch": 0.93, + "learning_rate": 1.4473494149935e-05, + "loss": 2.9583, + "step": 6523 + }, + { + "epoch": 0.93, + "learning_rate": 1.444460494005489e-05, + "loss": 3.0323, + "step": 6524 + }, + { + "epoch": 0.93, + "learning_rate": 1.441571573017478e-05, + "loss": 3.0841, + "step": 6525 + }, + { + "epoch": 0.93, + "learning_rate": 1.4386826520294672e-05, + "loss": 3.0853, + "step": 6526 + }, + { + "epoch": 0.93, + "learning_rate": 1.435793731041456e-05, + "loss": 3.1281, + "step": 6527 + }, + { + "epoch": 0.93, + "learning_rate": 1.4329048100534451e-05, + "loss": 2.9758, + "step": 6528 + }, + { + "epoch": 0.93, + "learning_rate": 1.4300158890654342e-05, + "loss": 3.1012, + "step": 6529 + }, + { + "epoch": 0.93, + "learning_rate": 1.4271269680774232e-05, + "loss": 3.1736, + "step": 6530 + }, + { + "epoch": 0.93, + "learning_rate": 1.4242380470894121e-05, + "loss": 2.9168, + "step": 6531 + }, + { + "epoch": 0.93, + "learning_rate": 1.4213491261014012e-05, + "loss": 2.9395, + "step": 6532 + }, + { + "epoch": 0.93, + "learning_rate": 1.4184602051133902e-05, + "loss": 3.179, + "step": 6533 + }, + { + "epoch": 0.93, + "learning_rate": 1.4155712841253792e-05, + "loss": 3.0333, + "step": 6534 + }, + { + "epoch": 0.93, + "learning_rate": 1.4126823631373681e-05, + "loss": 3.0926, + "step": 6535 + }, + { + "epoch": 0.93, + "learning_rate": 1.4097934421493572e-05, + "loss": 3.0646, + "step": 6536 + }, + { + "epoch": 0.93, + "learning_rate": 1.4069045211613464e-05, + "loss": 3.0286, + "step": 6537 + }, + { + "epoch": 0.93, + "learning_rate": 1.4040156001733354e-05, + "loss": 3.095, + "step": 6538 + }, + { + "epoch": 0.93, + "learning_rate": 1.4011266791853245e-05, + "loss": 3.0268, + "step": 6539 + }, + { + "epoch": 0.93, + "learning_rate": 1.3982377581973133e-05, + "loss": 3.0744, + "step": 6540 + }, + { + "epoch": 0.93, + "learning_rate": 1.3953488372093024e-05, + "loss": 3.1404, + "step": 6541 + }, + { + "epoch": 0.93, + "learning_rate": 1.3924599162212914e-05, + "loss": 3.1033, + "step": 6542 + }, + { + "epoch": 0.93, + "learning_rate": 1.3895709952332805e-05, + "loss": 3.0797, + "step": 6543 + }, + { + "epoch": 0.93, + "learning_rate": 1.3866820742452693e-05, + "loss": 3.1705, + "step": 6544 + }, + { + "epoch": 0.93, + "learning_rate": 1.3837931532572584e-05, + "loss": 3.0224, + "step": 6545 + }, + { + "epoch": 0.93, + "learning_rate": 1.3809042322692476e-05, + "loss": 2.8371, + "step": 6546 + }, + { + "epoch": 0.93, + "learning_rate": 1.3780153112812366e-05, + "loss": 2.915, + "step": 6547 + }, + { + "epoch": 0.93, + "learning_rate": 1.3751263902932255e-05, + "loss": 2.9611, + "step": 6548 + }, + { + "epoch": 0.93, + "learning_rate": 1.3722374693052146e-05, + "loss": 3.1396, + "step": 6549 + }, + { + "epoch": 0.93, + "learning_rate": 1.3693485483172036e-05, + "loss": 2.9345, + "step": 6550 + }, + { + "epoch": 0.93, + "learning_rate": 1.3664596273291926e-05, + "loss": 3.0005, + "step": 6551 + }, + { + "epoch": 0.93, + "learning_rate": 1.3635707063411815e-05, + "loss": 2.9932, + "step": 6552 + }, + { + "epoch": 0.93, + "learning_rate": 1.3606817853531706e-05, + "loss": 3.0044, + "step": 6553 + }, + { + "epoch": 0.93, + "learning_rate": 1.3577928643651596e-05, + "loss": 2.9093, + "step": 6554 + }, + { + "epoch": 0.93, + "learning_rate": 1.3549039433771488e-05, + "loss": 3.0998, + "step": 6555 + }, + { + "epoch": 0.93, + "learning_rate": 1.3520150223891379e-05, + "loss": 3.0678, + "step": 6556 + }, + { + "epoch": 0.93, + "learning_rate": 1.3491261014011267e-05, + "loss": 3.1757, + "step": 6557 + }, + { + "epoch": 0.93, + "learning_rate": 1.3462371804131158e-05, + "loss": 2.8707, + "step": 6558 + }, + { + "epoch": 0.93, + "learning_rate": 1.3433482594251048e-05, + "loss": 3.0569, + "step": 6559 + }, + { + "epoch": 0.93, + "learning_rate": 1.3404593384370939e-05, + "loss": 3.0525, + "step": 6560 + }, + { + "epoch": 0.93, + "learning_rate": 1.3375704174490827e-05, + "loss": 3.019, + "step": 6561 + }, + { + "epoch": 0.93, + "learning_rate": 1.3346814964610718e-05, + "loss": 3.0802, + "step": 6562 + }, + { + "epoch": 0.93, + "learning_rate": 1.3317925754730608e-05, + "loss": 3.1213, + "step": 6563 + }, + { + "epoch": 0.93, + "learning_rate": 1.32890365448505e-05, + "loss": 3.1184, + "step": 6564 + }, + { + "epoch": 0.93, + "learning_rate": 1.3260147334970387e-05, + "loss": 3.0327, + "step": 6565 + }, + { + "epoch": 0.93, + "learning_rate": 1.323125812509028e-05, + "loss": 3.0315, + "step": 6566 + }, + { + "epoch": 0.94, + "learning_rate": 1.320236891521017e-05, + "loss": 3.0842, + "step": 6567 + }, + { + "epoch": 0.94, + "learning_rate": 1.317347970533006e-05, + "loss": 3.0889, + "step": 6568 + }, + { + "epoch": 0.94, + "learning_rate": 1.314459049544995e-05, + "loss": 3.0679, + "step": 6569 + }, + { + "epoch": 0.94, + "learning_rate": 1.311570128556984e-05, + "loss": 3.0597, + "step": 6570 + }, + { + "epoch": 0.94, + "learning_rate": 1.308681207568973e-05, + "loss": 2.9619, + "step": 6571 + }, + { + "epoch": 0.94, + "learning_rate": 1.3057922865809622e-05, + "loss": 3.0097, + "step": 6572 + }, + { + "epoch": 0.94, + "learning_rate": 1.3029033655929513e-05, + "loss": 3.1552, + "step": 6573 + }, + { + "epoch": 0.94, + "learning_rate": 1.30001444460494e-05, + "loss": 3.0397, + "step": 6574 + }, + { + "epoch": 0.94, + "learning_rate": 1.2971255236169292e-05, + "loss": 3.1342, + "step": 6575 + }, + { + "epoch": 0.94, + "learning_rate": 1.2942366026289182e-05, + "loss": 3.0991, + "step": 6576 + }, + { + "epoch": 0.94, + "learning_rate": 1.2913476816409073e-05, + "loss": 3.0415, + "step": 6577 + }, + { + "epoch": 0.94, + "learning_rate": 1.2884587606528961e-05, + "loss": 3.0041, + "step": 6578 + }, + { + "epoch": 0.94, + "learning_rate": 1.2855698396648852e-05, + "loss": 2.9867, + "step": 6579 + }, + { + "epoch": 0.94, + "learning_rate": 1.2826809186768742e-05, + "loss": 2.9954, + "step": 6580 + }, + { + "epoch": 0.94, + "learning_rate": 1.2797919976888634e-05, + "loss": 3.0415, + "step": 6581 + }, + { + "epoch": 0.94, + "learning_rate": 1.2769030767008522e-05, + "loss": 3.0023, + "step": 6582 + }, + { + "epoch": 0.94, + "learning_rate": 1.2740141557128412e-05, + "loss": 2.8956, + "step": 6583 + }, + { + "epoch": 0.94, + "learning_rate": 1.2711252347248304e-05, + "loss": 2.8938, + "step": 6584 + }, + { + "epoch": 0.94, + "learning_rate": 1.2682363137368195e-05, + "loss": 3.0022, + "step": 6585 + }, + { + "epoch": 0.94, + "learning_rate": 1.2653473927488083e-05, + "loss": 3.0178, + "step": 6586 + }, + { + "epoch": 0.94, + "learning_rate": 1.2624584717607974e-05, + "loss": 3.0648, + "step": 6587 + }, + { + "epoch": 0.94, + "learning_rate": 1.2595695507727864e-05, + "loss": 3.0671, + "step": 6588 + }, + { + "epoch": 0.94, + "learning_rate": 1.2566806297847755e-05, + "loss": 3.0802, + "step": 6589 + }, + { + "epoch": 0.94, + "learning_rate": 1.2537917087967643e-05, + "loss": 3.0391, + "step": 6590 + }, + { + "epoch": 0.94, + "learning_rate": 1.2509027878087534e-05, + "loss": 3.1359, + "step": 6591 + }, + { + "epoch": 0.94, + "learning_rate": 1.2480138668207424e-05, + "loss": 2.9749, + "step": 6592 + }, + { + "epoch": 0.94, + "learning_rate": 1.2451249458327316e-05, + "loss": 3.0734, + "step": 6593 + }, + { + "epoch": 0.94, + "learning_rate": 1.2422360248447205e-05, + "loss": 3.0312, + "step": 6594 + }, + { + "epoch": 0.94, + "learning_rate": 1.2393471038567096e-05, + "loss": 3.1918, + "step": 6595 + }, + { + "epoch": 0.94, + "learning_rate": 1.2364581828686986e-05, + "loss": 2.9507, + "step": 6596 + }, + { + "epoch": 0.94, + "learning_rate": 1.2335692618806876e-05, + "loss": 3.1098, + "step": 6597 + }, + { + "epoch": 0.94, + "learning_rate": 1.2306803408926767e-05, + "loss": 2.9422, + "step": 6598 + }, + { + "epoch": 0.94, + "learning_rate": 1.2277914199046657e-05, + "loss": 3.2444, + "step": 6599 + }, + { + "epoch": 0.94, + "learning_rate": 1.2249024989166546e-05, + "loss": 3.0561, + "step": 6600 + }, + { + "epoch": 0.94, + "learning_rate": 1.2220135779286438e-05, + "loss": 3.0919, + "step": 6601 + }, + { + "epoch": 0.94, + "learning_rate": 1.2191246569406327e-05, + "loss": 2.9444, + "step": 6602 + }, + { + "epoch": 0.94, + "learning_rate": 1.2162357359526217e-05, + "loss": 3.1651, + "step": 6603 + }, + { + "epoch": 0.94, + "learning_rate": 1.2133468149646108e-05, + "loss": 3.0717, + "step": 6604 + }, + { + "epoch": 0.94, + "learning_rate": 1.2104578939765998e-05, + "loss": 2.861, + "step": 6605 + }, + { + "epoch": 0.94, + "learning_rate": 1.2075689729885887e-05, + "loss": 3.1206, + "step": 6606 + }, + { + "epoch": 0.94, + "learning_rate": 1.2046800520005779e-05, + "loss": 3.0615, + "step": 6607 + }, + { + "epoch": 0.94, + "learning_rate": 1.2017911310125668e-05, + "loss": 3.1829, + "step": 6608 + }, + { + "epoch": 0.94, + "learning_rate": 1.1989022100245558e-05, + "loss": 3.0526, + "step": 6609 + }, + { + "epoch": 0.94, + "learning_rate": 1.196013289036545e-05, + "loss": 3.0154, + "step": 6610 + }, + { + "epoch": 0.94, + "learning_rate": 1.1931243680485339e-05, + "loss": 3.0023, + "step": 6611 + }, + { + "epoch": 0.94, + "learning_rate": 1.190235447060523e-05, + "loss": 2.9746, + "step": 6612 + }, + { + "epoch": 0.94, + "learning_rate": 1.187346526072512e-05, + "loss": 3.0954, + "step": 6613 + }, + { + "epoch": 0.94, + "learning_rate": 1.184457605084501e-05, + "loss": 3.0068, + "step": 6614 + }, + { + "epoch": 0.94, + "learning_rate": 1.18156868409649e-05, + "loss": 3.1252, + "step": 6615 + }, + { + "epoch": 0.94, + "learning_rate": 1.1786797631084791e-05, + "loss": 3.038, + "step": 6616 + }, + { + "epoch": 0.94, + "learning_rate": 1.175790842120468e-05, + "loss": 3.0386, + "step": 6617 + }, + { + "epoch": 0.94, + "learning_rate": 1.172901921132457e-05, + "loss": 3.0446, + "step": 6618 + }, + { + "epoch": 0.94, + "learning_rate": 1.1700130001444461e-05, + "loss": 3.0548, + "step": 6619 + }, + { + "epoch": 0.94, + "learning_rate": 1.1671240791564351e-05, + "loss": 3.117, + "step": 6620 + }, + { + "epoch": 0.94, + "learning_rate": 1.1642351581684242e-05, + "loss": 2.9915, + "step": 6621 + }, + { + "epoch": 0.94, + "learning_rate": 1.1613462371804132e-05, + "loss": 3.1946, + "step": 6622 + }, + { + "epoch": 0.94, + "learning_rate": 1.1584573161924021e-05, + "loss": 2.9643, + "step": 6623 + }, + { + "epoch": 0.94, + "learning_rate": 1.1555683952043911e-05, + "loss": 3.0892, + "step": 6624 + }, + { + "epoch": 0.94, + "learning_rate": 1.1526794742163802e-05, + "loss": 2.959, + "step": 6625 + }, + { + "epoch": 0.94, + "learning_rate": 1.1497905532283692e-05, + "loss": 2.9867, + "step": 6626 + }, + { + "epoch": 0.94, + "learning_rate": 1.1469016322403583e-05, + "loss": 3.0531, + "step": 6627 + }, + { + "epoch": 0.94, + "learning_rate": 1.1440127112523473e-05, + "loss": 3.1347, + "step": 6628 + }, + { + "epoch": 0.94, + "learning_rate": 1.1411237902643364e-05, + "loss": 3.1322, + "step": 6629 + }, + { + "epoch": 0.94, + "learning_rate": 1.1382348692763254e-05, + "loss": 2.8694, + "step": 6630 + }, + { + "epoch": 0.94, + "learning_rate": 1.1353459482883144e-05, + "loss": 3.0304, + "step": 6631 + }, + { + "epoch": 0.94, + "learning_rate": 1.1324570273003033e-05, + "loss": 3.0735, + "step": 6632 + }, + { + "epoch": 0.94, + "learning_rate": 1.1295681063122925e-05, + "loss": 3.0781, + "step": 6633 + }, + { + "epoch": 0.94, + "learning_rate": 1.1266791853242814e-05, + "loss": 3.2113, + "step": 6634 + }, + { + "epoch": 0.94, + "learning_rate": 1.1237902643362705e-05, + "loss": 3.1034, + "step": 6635 + }, + { + "epoch": 0.94, + "learning_rate": 1.1209013433482595e-05, + "loss": 3.0906, + "step": 6636 + }, + { + "epoch": 0.94, + "learning_rate": 1.1180124223602485e-05, + "loss": 3.0979, + "step": 6637 + }, + { + "epoch": 0.95, + "learning_rate": 1.1151235013722374e-05, + "loss": 3.0023, + "step": 6638 + }, + { + "epoch": 0.95, + "learning_rate": 1.1122345803842266e-05, + "loss": 3.1516, + "step": 6639 + }, + { + "epoch": 0.95, + "learning_rate": 1.1093456593962155e-05, + "loss": 3.141, + "step": 6640 + }, + { + "epoch": 0.95, + "learning_rate": 1.1064567384082045e-05, + "loss": 3.0904, + "step": 6641 + }, + { + "epoch": 0.95, + "learning_rate": 1.1035678174201936e-05, + "loss": 3.1283, + "step": 6642 + }, + { + "epoch": 0.95, + "learning_rate": 1.1006788964321826e-05, + "loss": 2.9994, + "step": 6643 + }, + { + "epoch": 0.95, + "learning_rate": 1.0977899754441715e-05, + "loss": 3.1035, + "step": 6644 + }, + { + "epoch": 0.95, + "learning_rate": 1.0949010544561607e-05, + "loss": 3.1685, + "step": 6645 + }, + { + "epoch": 0.95, + "learning_rate": 1.0920121334681498e-05, + "loss": 2.9159, + "step": 6646 + }, + { + "epoch": 0.95, + "learning_rate": 1.0891232124801386e-05, + "loss": 3.0004, + "step": 6647 + }, + { + "epoch": 0.95, + "learning_rate": 1.0862342914921279e-05, + "loss": 3.0733, + "step": 6648 + }, + { + "epoch": 0.95, + "learning_rate": 1.0833453705041167e-05, + "loss": 2.9666, + "step": 6649 + }, + { + "epoch": 0.95, + "learning_rate": 1.0804564495161058e-05, + "loss": 3.0647, + "step": 6650 + }, + { + "epoch": 0.95, + "learning_rate": 1.0775675285280948e-05, + "loss": 3.0639, + "step": 6651 + }, + { + "epoch": 0.95, + "learning_rate": 1.0746786075400839e-05, + "loss": 3.1523, + "step": 6652 + }, + { + "epoch": 0.95, + "learning_rate": 1.0717896865520727e-05, + "loss": 3.0446, + "step": 6653 + }, + { + "epoch": 0.95, + "learning_rate": 1.068900765564062e-05, + "loss": 2.9252, + "step": 6654 + }, + { + "epoch": 0.95, + "learning_rate": 1.0660118445760508e-05, + "loss": 3.0308, + "step": 6655 + }, + { + "epoch": 0.95, + "learning_rate": 1.0631229235880399e-05, + "loss": 3.2152, + "step": 6656 + }, + { + "epoch": 0.95, + "learning_rate": 1.0602340026000289e-05, + "loss": 3.0464, + "step": 6657 + }, + { + "epoch": 0.95, + "learning_rate": 1.057345081612018e-05, + "loss": 3.028, + "step": 6658 + }, + { + "epoch": 0.95, + "learning_rate": 1.054456160624007e-05, + "loss": 3.1068, + "step": 6659 + }, + { + "epoch": 0.95, + "learning_rate": 1.051567239635996e-05, + "loss": 2.9705, + "step": 6660 + }, + { + "epoch": 0.95, + "learning_rate": 1.0486783186479849e-05, + "loss": 2.9226, + "step": 6661 + }, + { + "epoch": 0.95, + "learning_rate": 1.0457893976599741e-05, + "loss": 2.8683, + "step": 6662 + }, + { + "epoch": 0.95, + "learning_rate": 1.0429004766719632e-05, + "loss": 3.086, + "step": 6663 + }, + { + "epoch": 0.95, + "learning_rate": 1.040011555683952e-05, + "loss": 2.85, + "step": 6664 + }, + { + "epoch": 0.95, + "learning_rate": 1.0371226346959413e-05, + "loss": 3.0683, + "step": 6665 + }, + { + "epoch": 0.95, + "learning_rate": 1.0342337137079301e-05, + "loss": 3.0026, + "step": 6666 + }, + { + "epoch": 0.95, + "learning_rate": 1.0313447927199192e-05, + "loss": 3.0649, + "step": 6667 + }, + { + "epoch": 0.95, + "learning_rate": 1.0284558717319082e-05, + "loss": 3.0641, + "step": 6668 + }, + { + "epoch": 0.95, + "learning_rate": 1.0255669507438973e-05, + "loss": 3.0668, + "step": 6669 + }, + { + "epoch": 0.95, + "learning_rate": 1.0226780297558861e-05, + "loss": 3.0345, + "step": 6670 + }, + { + "epoch": 0.95, + "learning_rate": 1.0197891087678754e-05, + "loss": 3.0709, + "step": 6671 + }, + { + "epoch": 0.95, + "learning_rate": 1.0169001877798642e-05, + "loss": 2.8739, + "step": 6672 + }, + { + "epoch": 0.95, + "learning_rate": 1.0140112667918533e-05, + "loss": 3.0179, + "step": 6673 + }, + { + "epoch": 0.95, + "learning_rate": 1.0111223458038423e-05, + "loss": 3.0285, + "step": 6674 + }, + { + "epoch": 0.95, + "learning_rate": 1.0082334248158314e-05, + "loss": 3.0476, + "step": 6675 + }, + { + "epoch": 0.95, + "learning_rate": 1.0053445038278202e-05, + "loss": 3.0647, + "step": 6676 + }, + { + "epoch": 0.95, + "learning_rate": 1.0024555828398094e-05, + "loss": 3.0383, + "step": 6677 + }, + { + "epoch": 0.95, + "learning_rate": 9.995666618517983e-06, + "loss": 3.0514, + "step": 6678 + }, + { + "epoch": 0.95, + "learning_rate": 9.966777408637874e-06, + "loss": 3.198, + "step": 6679 + }, + { + "epoch": 0.95, + "learning_rate": 9.937888198757764e-06, + "loss": 3.1583, + "step": 6680 + }, + { + "epoch": 0.95, + "learning_rate": 9.908998988877655e-06, + "loss": 2.9471, + "step": 6681 + }, + { + "epoch": 0.95, + "learning_rate": 9.880109778997545e-06, + "loss": 3.0897, + "step": 6682 + }, + { + "epoch": 0.95, + "learning_rate": 9.851220569117435e-06, + "loss": 3.1209, + "step": 6683 + }, + { + "epoch": 0.95, + "learning_rate": 9.822331359237326e-06, + "loss": 2.9804, + "step": 6684 + }, + { + "epoch": 0.95, + "learning_rate": 9.793442149357215e-06, + "loss": 2.9855, + "step": 6685 + }, + { + "epoch": 0.95, + "learning_rate": 9.764552939477107e-06, + "loss": 3.0023, + "step": 6686 + }, + { + "epoch": 0.95, + "learning_rate": 9.735663729596995e-06, + "loss": 3.104, + "step": 6687 + }, + { + "epoch": 0.95, + "learning_rate": 9.706774519716886e-06, + "loss": 3.1989, + "step": 6688 + }, + { + "epoch": 0.95, + "eval_loss": 3.2940800189971924, + "eval_runtime": 471.7893, + "eval_samples_per_second": 43.424, + "eval_steps_per_second": 14.475, + "step": 6688 + }, + { + "epoch": 0.95, + "learning_rate": 9.677885309836776e-06, + "loss": 3.0624, + "step": 6689 + }, + { + "epoch": 0.95, + "learning_rate": 9.648996099956667e-06, + "loss": 3.0859, + "step": 6690 + }, + { + "epoch": 0.95, + "learning_rate": 9.620106890076557e-06, + "loss": 3.1474, + "step": 6691 + }, + { + "epoch": 0.95, + "learning_rate": 9.591217680196448e-06, + "loss": 2.9431, + "step": 6692 + }, + { + "epoch": 0.95, + "learning_rate": 9.562328470316336e-06, + "loss": 3.0516, + "step": 6693 + }, + { + "epoch": 0.95, + "learning_rate": 9.533439260436229e-06, + "loss": 3.0875, + "step": 6694 + }, + { + "epoch": 0.95, + "learning_rate": 9.504550050556117e-06, + "loss": 3.0427, + "step": 6695 + }, + { + "epoch": 0.95, + "learning_rate": 9.475660840676008e-06, + "loss": 2.9797, + "step": 6696 + }, + { + "epoch": 0.95, + "learning_rate": 9.446771630795898e-06, + "loss": 3.033, + "step": 6697 + }, + { + "epoch": 0.95, + "learning_rate": 9.417882420915789e-06, + "loss": 2.9754, + "step": 6698 + }, + { + "epoch": 0.95, + "learning_rate": 9.388993211035679e-06, + "loss": 2.9346, + "step": 6699 + }, + { + "epoch": 0.95, + "learning_rate": 9.36010400115557e-06, + "loss": 3.1591, + "step": 6700 + }, + { + "epoch": 0.95, + "learning_rate": 9.33121479127546e-06, + "loss": 3.022, + "step": 6701 + }, + { + "epoch": 0.95, + "learning_rate": 9.302325581395349e-06, + "loss": 3.0103, + "step": 6702 + }, + { + "epoch": 0.95, + "learning_rate": 9.27343637151524e-06, + "loss": 3.0677, + "step": 6703 + }, + { + "epoch": 0.95, + "learning_rate": 9.24454716163513e-06, + "loss": 3.1395, + "step": 6704 + }, + { + "epoch": 0.95, + "learning_rate": 9.21565795175502e-06, + "loss": 2.9199, + "step": 6705 + }, + { + "epoch": 0.95, + "learning_rate": 9.18676874187491e-06, + "loss": 3.1351, + "step": 6706 + }, + { + "epoch": 0.95, + "learning_rate": 9.1578795319948e-06, + "loss": 3.0665, + "step": 6707 + }, + { + "epoch": 0.96, + "learning_rate": 9.12899032211469e-06, + "loss": 3.0258, + "step": 6708 + }, + { + "epoch": 0.96, + "learning_rate": 9.100101112234582e-06, + "loss": 3.0946, + "step": 6709 + }, + { + "epoch": 0.96, + "learning_rate": 9.07121190235447e-06, + "loss": 3.0898, + "step": 6710 + }, + { + "epoch": 0.96, + "learning_rate": 9.042322692474361e-06, + "loss": 2.9603, + "step": 6711 + }, + { + "epoch": 0.96, + "learning_rate": 9.013433482594251e-06, + "loss": 3.1076, + "step": 6712 + }, + { + "epoch": 0.96, + "learning_rate": 8.984544272714142e-06, + "loss": 3.1297, + "step": 6713 + }, + { + "epoch": 0.96, + "learning_rate": 8.955655062834032e-06, + "loss": 3.1439, + "step": 6714 + }, + { + "epoch": 0.96, + "learning_rate": 8.926765852953923e-06, + "loss": 2.9976, + "step": 6715 + }, + { + "epoch": 0.96, + "learning_rate": 8.897876643073811e-06, + "loss": 3.1051, + "step": 6716 + }, + { + "epoch": 0.96, + "learning_rate": 8.868987433193702e-06, + "loss": 2.979, + "step": 6717 + }, + { + "epoch": 0.96, + "learning_rate": 8.840098223313594e-06, + "loss": 3.129, + "step": 6718 + }, + { + "epoch": 0.96, + "learning_rate": 8.811209013433483e-06, + "loss": 2.9428, + "step": 6719 + }, + { + "epoch": 0.96, + "learning_rate": 8.782319803553373e-06, + "loss": 3.0957, + "step": 6720 + }, + { + "epoch": 0.96, + "learning_rate": 8.753430593673264e-06, + "loss": 3.0369, + "step": 6721 + }, + { + "epoch": 0.96, + "learning_rate": 8.724541383793154e-06, + "loss": 3.0829, + "step": 6722 + }, + { + "epoch": 0.96, + "learning_rate": 8.695652173913044e-06, + "loss": 3.1139, + "step": 6723 + }, + { + "epoch": 0.96, + "learning_rate": 8.666762964032935e-06, + "loss": 2.9817, + "step": 6724 + }, + { + "epoch": 0.96, + "learning_rate": 8.637873754152824e-06, + "loss": 3.0779, + "step": 6725 + }, + { + "epoch": 0.96, + "learning_rate": 8.608984544272716e-06, + "loss": 3.1538, + "step": 6726 + }, + { + "epoch": 0.96, + "learning_rate": 8.580095334392604e-06, + "loss": 3.0275, + "step": 6727 + }, + { + "epoch": 0.96, + "learning_rate": 8.551206124512495e-06, + "loss": 2.9255, + "step": 6728 + }, + { + "epoch": 0.96, + "learning_rate": 8.522316914632385e-06, + "loss": 2.8025, + "step": 6729 + }, + { + "epoch": 0.96, + "learning_rate": 8.493427704752276e-06, + "loss": 3.1091, + "step": 6730 + }, + { + "epoch": 0.96, + "learning_rate": 8.464538494872165e-06, + "loss": 3.1142, + "step": 6731 + }, + { + "epoch": 0.96, + "learning_rate": 8.435649284992057e-06, + "loss": 3.0966, + "step": 6732 + }, + { + "epoch": 0.96, + "learning_rate": 8.406760075111945e-06, + "loss": 3.0464, + "step": 6733 + }, + { + "epoch": 0.96, + "learning_rate": 8.377870865231836e-06, + "loss": 3.0345, + "step": 6734 + }, + { + "epoch": 0.96, + "learning_rate": 8.348981655351728e-06, + "loss": 3.0332, + "step": 6735 + }, + { + "epoch": 0.96, + "learning_rate": 8.320092445471617e-06, + "loss": 2.8829, + "step": 6736 + }, + { + "epoch": 0.96, + "learning_rate": 8.291203235591507e-06, + "loss": 3.0757, + "step": 6737 + }, + { + "epoch": 0.96, + "learning_rate": 8.262314025711398e-06, + "loss": 3.1809, + "step": 6738 + }, + { + "epoch": 0.96, + "learning_rate": 8.233424815831288e-06, + "loss": 3.0978, + "step": 6739 + }, + { + "epoch": 0.96, + "learning_rate": 8.204535605951177e-06, + "loss": 3.0285, + "step": 6740 + }, + { + "epoch": 0.96, + "learning_rate": 8.175646396071069e-06, + "loss": 3.2434, + "step": 6741 + }, + { + "epoch": 0.96, + "learning_rate": 8.146757186190958e-06, + "loss": 2.9622, + "step": 6742 + }, + { + "epoch": 0.96, + "learning_rate": 8.117867976310848e-06, + "loss": 2.9389, + "step": 6743 + }, + { + "epoch": 0.96, + "learning_rate": 8.088978766430739e-06, + "loss": 3.2191, + "step": 6744 + }, + { + "epoch": 0.96, + "learning_rate": 8.060089556550629e-06, + "loss": 3.0754, + "step": 6745 + }, + { + "epoch": 0.96, + "learning_rate": 8.031200346670518e-06, + "loss": 3.0331, + "step": 6746 + }, + { + "epoch": 0.96, + "learning_rate": 8.00231113679041e-06, + "loss": 3.0158, + "step": 6747 + }, + { + "epoch": 0.96, + "learning_rate": 7.973421926910299e-06, + "loss": 3.0167, + "step": 6748 + }, + { + "epoch": 0.96, + "learning_rate": 7.944532717030189e-06, + "loss": 3.1016, + "step": 6749 + }, + { + "epoch": 0.96, + "learning_rate": 7.91564350715008e-06, + "loss": 2.97, + "step": 6750 + }, + { + "epoch": 0.96, + "learning_rate": 7.88675429726997e-06, + "loss": 3.0349, + "step": 6751 + }, + { + "epoch": 0.96, + "learning_rate": 7.85786508738986e-06, + "loss": 3.0124, + "step": 6752 + }, + { + "epoch": 0.96, + "learning_rate": 7.82897587750975e-06, + "loss": 3.0379, + "step": 6753 + }, + { + "epoch": 0.96, + "learning_rate": 7.800086667629641e-06, + "loss": 3.1907, + "step": 6754 + }, + { + "epoch": 0.96, + "learning_rate": 7.771197457749532e-06, + "loss": 3.0362, + "step": 6755 + }, + { + "epoch": 0.96, + "learning_rate": 7.742308247869422e-06, + "loss": 3.059, + "step": 6756 + }, + { + "epoch": 0.96, + "learning_rate": 7.71341903798931e-06, + "loss": 3.0152, + "step": 6757 + }, + { + "epoch": 0.96, + "learning_rate": 7.684529828109201e-06, + "loss": 3.0157, + "step": 6758 + }, + { + "epoch": 0.96, + "learning_rate": 7.655640618229092e-06, + "loss": 3.0653, + "step": 6759 + }, + { + "epoch": 0.96, + "learning_rate": 7.626751408348982e-06, + "loss": 3.1621, + "step": 6760 + }, + { + "epoch": 0.96, + "learning_rate": 7.597862198468872e-06, + "loss": 3.0576, + "step": 6761 + }, + { + "epoch": 0.96, + "learning_rate": 7.568972988588763e-06, + "loss": 3.1017, + "step": 6762 + }, + { + "epoch": 0.96, + "learning_rate": 7.540083778708653e-06, + "loss": 3.0208, + "step": 6763 + }, + { + "epoch": 0.96, + "learning_rate": 7.511194568828543e-06, + "loss": 3.0832, + "step": 6764 + }, + { + "epoch": 0.96, + "learning_rate": 7.482305358948433e-06, + "loss": 3.0535, + "step": 6765 + }, + { + "epoch": 0.96, + "learning_rate": 7.453416149068324e-06, + "loss": 2.9272, + "step": 6766 + }, + { + "epoch": 0.96, + "learning_rate": 7.424526939188213e-06, + "loss": 3.0445, + "step": 6767 + }, + { + "epoch": 0.96, + "learning_rate": 7.395637729308104e-06, + "loss": 3.1716, + "step": 6768 + }, + { + "epoch": 0.96, + "learning_rate": 7.3667485194279935e-06, + "loss": 2.9934, + "step": 6769 + }, + { + "epoch": 0.96, + "learning_rate": 7.337859309547884e-06, + "loss": 3.1079, + "step": 6770 + }, + { + "epoch": 0.96, + "learning_rate": 7.308970099667775e-06, + "loss": 2.8938, + "step": 6771 + }, + { + "epoch": 0.96, + "learning_rate": 7.280080889787665e-06, + "loss": 2.8871, + "step": 6772 + }, + { + "epoch": 0.96, + "learning_rate": 7.251191679907555e-06, + "loss": 2.9292, + "step": 6773 + }, + { + "epoch": 0.96, + "learning_rate": 7.222302470027445e-06, + "loss": 3.0288, + "step": 6774 + }, + { + "epoch": 0.96, + "learning_rate": 7.193413260147336e-06, + "loss": 3.0358, + "step": 6775 + }, + { + "epoch": 0.96, + "learning_rate": 7.164524050267226e-06, + "loss": 3.0653, + "step": 6776 + }, + { + "epoch": 0.96, + "learning_rate": 7.135634840387116e-06, + "loss": 3.0944, + "step": 6777 + }, + { + "epoch": 0.97, + "learning_rate": 7.106745630507006e-06, + "loss": 3.1736, + "step": 6778 + }, + { + "epoch": 0.97, + "learning_rate": 7.077856420626896e-06, + "loss": 2.9908, + "step": 6779 + }, + { + "epoch": 0.97, + "learning_rate": 7.048967210746786e-06, + "loss": 2.9514, + "step": 6780 + }, + { + "epoch": 0.97, + "learning_rate": 7.020078000866677e-06, + "loss": 2.9189, + "step": 6781 + }, + { + "epoch": 0.97, + "learning_rate": 6.991188790986567e-06, + "loss": 3.1035, + "step": 6782 + }, + { + "epoch": 0.97, + "learning_rate": 6.962299581106457e-06, + "loss": 3.0636, + "step": 6783 + }, + { + "epoch": 0.97, + "learning_rate": 6.933410371226347e-06, + "loss": 3.0794, + "step": 6784 + }, + { + "epoch": 0.97, + "learning_rate": 6.904521161346238e-06, + "loss": 3.0011, + "step": 6785 + }, + { + "epoch": 0.97, + "learning_rate": 6.8756319514661276e-06, + "loss": 3.1414, + "step": 6786 + }, + { + "epoch": 0.97, + "learning_rate": 6.846742741586018e-06, + "loss": 3.1243, + "step": 6787 + }, + { + "epoch": 0.97, + "learning_rate": 6.817853531705908e-06, + "loss": 2.9568, + "step": 6788 + }, + { + "epoch": 0.97, + "learning_rate": 6.788964321825798e-06, + "loss": 3.0535, + "step": 6789 + }, + { + "epoch": 0.97, + "learning_rate": 6.760075111945689e-06, + "loss": 3.0086, + "step": 6790 + }, + { + "epoch": 0.97, + "learning_rate": 6.731185902065579e-06, + "loss": 3.0518, + "step": 6791 + }, + { + "epoch": 0.97, + "learning_rate": 6.702296692185469e-06, + "loss": 2.9835, + "step": 6792 + }, + { + "epoch": 0.97, + "learning_rate": 6.673407482305359e-06, + "loss": 2.9584, + "step": 6793 + }, + { + "epoch": 0.97, + "learning_rate": 6.64451827242525e-06, + "loss": 3.036, + "step": 6794 + }, + { + "epoch": 0.97, + "learning_rate": 6.61562906254514e-06, + "loss": 3.0023, + "step": 6795 + }, + { + "epoch": 0.97, + "learning_rate": 6.58673985266503e-06, + "loss": 2.9843, + "step": 6796 + }, + { + "epoch": 0.97, + "learning_rate": 6.55785064278492e-06, + "loss": 3.0756, + "step": 6797 + }, + { + "epoch": 0.97, + "learning_rate": 6.528961432904811e-06, + "loss": 3.0288, + "step": 6798 + }, + { + "epoch": 0.97, + "learning_rate": 6.5000722230247e-06, + "loss": 2.9967, + "step": 6799 + }, + { + "epoch": 0.97, + "learning_rate": 6.471183013144591e-06, + "loss": 3.0806, + "step": 6800 + }, + { + "epoch": 0.97, + "learning_rate": 6.442293803264481e-06, + "loss": 3.0906, + "step": 6801 + }, + { + "epoch": 0.97, + "learning_rate": 6.413404593384371e-06, + "loss": 2.9722, + "step": 6802 + }, + { + "epoch": 0.97, + "learning_rate": 6.384515383504261e-06, + "loss": 3.0042, + "step": 6803 + }, + { + "epoch": 0.97, + "learning_rate": 6.355626173624152e-06, + "loss": 3.1041, + "step": 6804 + }, + { + "epoch": 0.97, + "learning_rate": 6.326736963744042e-06, + "loss": 3.0505, + "step": 6805 + }, + { + "epoch": 0.97, + "learning_rate": 6.297847753863932e-06, + "loss": 3.1167, + "step": 6806 + }, + { + "epoch": 0.97, + "learning_rate": 6.268958543983822e-06, + "loss": 3.0737, + "step": 6807 + }, + { + "epoch": 0.97, + "learning_rate": 6.240069334103712e-06, + "loss": 2.9312, + "step": 6808 + }, + { + "epoch": 0.97, + "learning_rate": 6.2111801242236025e-06, + "loss": 3.0523, + "step": 6809 + }, + { + "epoch": 0.97, + "learning_rate": 6.182290914343493e-06, + "loss": 3.1139, + "step": 6810 + }, + { + "epoch": 0.97, + "learning_rate": 6.153401704463383e-06, + "loss": 3.1251, + "step": 6811 + }, + { + "epoch": 0.97, + "learning_rate": 6.124512494583273e-06, + "loss": 3.1028, + "step": 6812 + }, + { + "epoch": 0.97, + "learning_rate": 6.0956232847031634e-06, + "loss": 3.0303, + "step": 6813 + }, + { + "epoch": 0.97, + "learning_rate": 6.066734074823054e-06, + "loss": 2.9433, + "step": 6814 + }, + { + "epoch": 0.97, + "learning_rate": 6.0378448649429435e-06, + "loss": 3.0504, + "step": 6815 + }, + { + "epoch": 0.97, + "learning_rate": 6.008955655062834e-06, + "loss": 3.0348, + "step": 6816 + }, + { + "epoch": 0.97, + "learning_rate": 5.980066445182725e-06, + "loss": 3.0943, + "step": 6817 + }, + { + "epoch": 0.97, + "learning_rate": 5.951177235302615e-06, + "loss": 3.1233, + "step": 6818 + }, + { + "epoch": 0.97, + "learning_rate": 5.922288025422505e-06, + "loss": 3.0396, + "step": 6819 + }, + { + "epoch": 0.97, + "learning_rate": 5.893398815542396e-06, + "loss": 3.0299, + "step": 6820 + }, + { + "epoch": 0.97, + "learning_rate": 5.864509605662285e-06, + "loss": 3.071, + "step": 6821 + }, + { + "epoch": 0.97, + "learning_rate": 5.835620395782176e-06, + "loss": 3.1082, + "step": 6822 + }, + { + "epoch": 0.97, + "learning_rate": 5.806731185902066e-06, + "loss": 3.0669, + "step": 6823 + }, + { + "epoch": 0.97, + "learning_rate": 5.777841976021956e-06, + "loss": 2.9796, + "step": 6824 + }, + { + "epoch": 0.97, + "learning_rate": 5.748952766141846e-06, + "loss": 2.9577, + "step": 6825 + }, + { + "epoch": 0.97, + "learning_rate": 5.720063556261737e-06, + "loss": 3.0809, + "step": 6826 + }, + { + "epoch": 0.97, + "learning_rate": 5.691174346381627e-06, + "loss": 3.0723, + "step": 6827 + }, + { + "epoch": 0.97, + "learning_rate": 5.662285136501517e-06, + "loss": 2.9341, + "step": 6828 + }, + { + "epoch": 0.97, + "learning_rate": 5.633395926621407e-06, + "loss": 2.9875, + "step": 6829 + }, + { + "epoch": 0.97, + "learning_rate": 5.6045067167412975e-06, + "loss": 2.9204, + "step": 6830 + }, + { + "epoch": 0.97, + "learning_rate": 5.575617506861187e-06, + "loss": 3.0588, + "step": 6831 + }, + { + "epoch": 0.97, + "learning_rate": 5.5467282969810775e-06, + "loss": 2.9367, + "step": 6832 + }, + { + "epoch": 0.97, + "learning_rate": 5.517839087100968e-06, + "loss": 3.0618, + "step": 6833 + }, + { + "epoch": 0.97, + "learning_rate": 5.4889498772208576e-06, + "loss": 3.0858, + "step": 6834 + }, + { + "epoch": 0.97, + "learning_rate": 5.460060667340749e-06, + "loss": 3.086, + "step": 6835 + }, + { + "epoch": 0.97, + "learning_rate": 5.431171457460639e-06, + "loss": 2.8882, + "step": 6836 + }, + { + "epoch": 0.97, + "learning_rate": 5.402282247580529e-06, + "loss": 3.1037, + "step": 6837 + }, + { + "epoch": 0.97, + "learning_rate": 5.373393037700419e-06, + "loss": 2.9876, + "step": 6838 + }, + { + "epoch": 0.97, + "learning_rate": 5.34450382782031e-06, + "loss": 2.9321, + "step": 6839 + }, + { + "epoch": 0.97, + "learning_rate": 5.315614617940199e-06, + "loss": 3.0526, + "step": 6840 + }, + { + "epoch": 0.97, + "learning_rate": 5.28672540806009e-06, + "loss": 2.7436, + "step": 6841 + }, + { + "epoch": 0.97, + "learning_rate": 5.25783619817998e-06, + "loss": 2.9718, + "step": 6842 + }, + { + "epoch": 0.97, + "learning_rate": 5.228946988299871e-06, + "loss": 3.1092, + "step": 6843 + }, + { + "epoch": 0.97, + "learning_rate": 5.20005777841976e-06, + "loss": 2.9495, + "step": 6844 + }, + { + "epoch": 0.97, + "learning_rate": 5.171168568539651e-06, + "loss": 2.9479, + "step": 6845 + }, + { + "epoch": 0.97, + "learning_rate": 5.142279358659541e-06, + "loss": 3.1482, + "step": 6846 + }, + { + "epoch": 0.97, + "learning_rate": 5.113390148779431e-06, + "loss": 2.9749, + "step": 6847 + }, + { + "epoch": 0.98, + "learning_rate": 5.084500938899321e-06, + "loss": 3.0205, + "step": 6848 + }, + { + "epoch": 0.98, + "learning_rate": 5.0556117290192116e-06, + "loss": 2.9451, + "step": 6849 + }, + { + "epoch": 0.98, + "learning_rate": 5.026722519139101e-06, + "loss": 3.0533, + "step": 6850 + }, + { + "epoch": 0.98, + "learning_rate": 4.997833309258992e-06, + "loss": 3.0174, + "step": 6851 + }, + { + "epoch": 0.98, + "learning_rate": 4.968944099378882e-06, + "loss": 2.9644, + "step": 6852 + }, + { + "epoch": 0.98, + "learning_rate": 4.9400548894987725e-06, + "loss": 3.1064, + "step": 6853 + }, + { + "epoch": 0.98, + "learning_rate": 4.911165679618663e-06, + "loss": 3.1061, + "step": 6854 + }, + { + "epoch": 0.98, + "learning_rate": 4.882276469738553e-06, + "loss": 3.0388, + "step": 6855 + }, + { + "epoch": 0.98, + "learning_rate": 4.853387259858443e-06, + "loss": 3.088, + "step": 6856 + }, + { + "epoch": 0.98, + "learning_rate": 4.824498049978333e-06, + "loss": 2.6724, + "step": 6857 + }, + { + "epoch": 0.98, + "learning_rate": 4.795608840098224e-06, + "loss": 3.093, + "step": 6858 + }, + { + "epoch": 0.98, + "learning_rate": 4.766719630218114e-06, + "loss": 3.15, + "step": 6859 + }, + { + "epoch": 0.98, + "learning_rate": 4.737830420338004e-06, + "loss": 3.016, + "step": 6860 + }, + { + "epoch": 0.98, + "learning_rate": 4.708941210457894e-06, + "loss": 2.842, + "step": 6861 + }, + { + "epoch": 0.98, + "learning_rate": 4.680052000577785e-06, + "loss": 3.1013, + "step": 6862 + }, + { + "epoch": 0.98, + "learning_rate": 4.651162790697674e-06, + "loss": 3.0444, + "step": 6863 + }, + { + "epoch": 0.98, + "learning_rate": 4.622273580817565e-06, + "loss": 3.098, + "step": 6864 + }, + { + "epoch": 0.98, + "learning_rate": 4.593384370937455e-06, + "loss": 3.0278, + "step": 6865 + }, + { + "epoch": 0.98, + "learning_rate": 4.564495161057345e-06, + "loss": 3.0869, + "step": 6866 + }, + { + "epoch": 0.98, + "learning_rate": 4.535605951177235e-06, + "loss": 2.7475, + "step": 6867 + }, + { + "epoch": 0.98, + "learning_rate": 4.506716741297126e-06, + "loss": 3.1009, + "step": 6868 + }, + { + "epoch": 0.98, + "learning_rate": 4.477827531417016e-06, + "loss": 3.0051, + "step": 6869 + }, + { + "epoch": 0.98, + "learning_rate": 4.448938321536906e-06, + "loss": 2.9816, + "step": 6870 + }, + { + "epoch": 0.98, + "learning_rate": 4.420049111656797e-06, + "loss": 3.052, + "step": 6871 + }, + { + "epoch": 0.98, + "learning_rate": 4.3911599017766865e-06, + "loss": 3.0582, + "step": 6872 + }, + { + "epoch": 0.98, + "learning_rate": 4.362270691896577e-06, + "loss": 2.9568, + "step": 6873 + }, + { + "epoch": 0.98, + "learning_rate": 4.3333814820164674e-06, + "loss": 2.86, + "step": 6874 + }, + { + "epoch": 0.98, + "learning_rate": 4.304492272136358e-06, + "loss": 3.1061, + "step": 6875 + }, + { + "epoch": 0.98, + "learning_rate": 4.2756030622562475e-06, + "loss": 3.0858, + "step": 6876 + }, + { + "epoch": 0.98, + "learning_rate": 4.246713852376138e-06, + "loss": 2.994, + "step": 6877 + }, + { + "epoch": 0.98, + "learning_rate": 4.217824642496028e-06, + "loss": 3.067, + "step": 6878 + }, + { + "epoch": 0.98, + "learning_rate": 4.188935432615918e-06, + "loss": 2.9959, + "step": 6879 + }, + { + "epoch": 0.98, + "learning_rate": 4.160046222735808e-06, + "loss": 2.8929, + "step": 6880 + }, + { + "epoch": 0.98, + "learning_rate": 4.131157012855699e-06, + "loss": 3.0604, + "step": 6881 + }, + { + "epoch": 0.98, + "learning_rate": 4.102267802975588e-06, + "loss": 3.0014, + "step": 6882 + }, + { + "epoch": 0.98, + "learning_rate": 4.073378593095479e-06, + "loss": 2.997, + "step": 6883 + }, + { + "epoch": 0.98, + "learning_rate": 4.044489383215369e-06, + "loss": 2.9582, + "step": 6884 + }, + { + "epoch": 0.98, + "learning_rate": 4.015600173335259e-06, + "loss": 3.0997, + "step": 6885 + }, + { + "epoch": 0.98, + "learning_rate": 3.986710963455149e-06, + "loss": 3.0248, + "step": 6886 + }, + { + "epoch": 0.98, + "learning_rate": 3.95782175357504e-06, + "loss": 3.1458, + "step": 6887 + }, + { + "epoch": 0.98, + "learning_rate": 3.92893254369493e-06, + "loss": 2.9781, + "step": 6888 + }, + { + "epoch": 0.98, + "learning_rate": 3.900043333814821e-06, + "loss": 3.1087, + "step": 6889 + }, + { + "epoch": 0.98, + "learning_rate": 3.871154123934711e-06, + "loss": 2.7062, + "step": 6890 + }, + { + "epoch": 0.98, + "learning_rate": 3.842264914054601e-06, + "loss": 3.1132, + "step": 6891 + }, + { + "epoch": 0.98, + "learning_rate": 3.813375704174491e-06, + "loss": 3.1139, + "step": 6892 + }, + { + "epoch": 0.98, + "learning_rate": 3.7844864942943815e-06, + "loss": 2.9154, + "step": 6893 + }, + { + "epoch": 0.98, + "learning_rate": 3.7555972844142715e-06, + "loss": 2.9927, + "step": 6894 + }, + { + "epoch": 0.98, + "learning_rate": 3.726708074534162e-06, + "loss": 3.1764, + "step": 6895 + }, + { + "epoch": 0.98, + "learning_rate": 3.697818864654052e-06, + "loss": 3.0163, + "step": 6896 + }, + { + "epoch": 0.98, + "learning_rate": 3.668929654773942e-06, + "loss": 2.9691, + "step": 6897 + }, + { + "epoch": 0.98, + "learning_rate": 3.6400404448938324e-06, + "loss": 2.9428, + "step": 6898 + }, + { + "epoch": 0.98, + "learning_rate": 3.6111512350137224e-06, + "loss": 3.0541, + "step": 6899 + }, + { + "epoch": 0.98, + "learning_rate": 3.582262025133613e-06, + "loss": 3.0231, + "step": 6900 + }, + { + "epoch": 0.98, + "learning_rate": 3.553372815253503e-06, + "loss": 3.0052, + "step": 6901 + }, + { + "epoch": 0.98, + "learning_rate": 3.524483605373393e-06, + "loss": 2.9979, + "step": 6902 + }, + { + "epoch": 0.98, + "learning_rate": 3.4955943954932833e-06, + "loss": 2.8792, + "step": 6903 + }, + { + "epoch": 0.98, + "learning_rate": 3.4667051856131733e-06, + "loss": 3.1465, + "step": 6904 + }, + { + "epoch": 0.98, + "learning_rate": 3.4378159757330638e-06, + "loss": 3.148, + "step": 6905 + }, + { + "epoch": 0.98, + "learning_rate": 3.408926765852954e-06, + "loss": 3.0153, + "step": 6906 + }, + { + "epoch": 0.98, + "learning_rate": 3.3800375559728447e-06, + "loss": 3.0236, + "step": 6907 + }, + { + "epoch": 0.98, + "learning_rate": 3.3511483460927347e-06, + "loss": 3.0493, + "step": 6908 + }, + { + "epoch": 0.98, + "learning_rate": 3.322259136212625e-06, + "loss": 3.0399, + "step": 6909 + }, + { + "epoch": 0.98, + "learning_rate": 3.293369926332515e-06, + "loss": 3.0536, + "step": 6910 + }, + { + "epoch": 0.98, + "learning_rate": 3.2644807164524056e-06, + "loss": 3.0736, + "step": 6911 + }, + { + "epoch": 0.98, + "learning_rate": 3.2355915065722956e-06, + "loss": 3.1185, + "step": 6912 + }, + { + "epoch": 0.98, + "learning_rate": 3.2067022966921856e-06, + "loss": 3.0759, + "step": 6913 + }, + { + "epoch": 0.98, + "learning_rate": 3.177813086812076e-06, + "loss": 2.9949, + "step": 6914 + }, + { + "epoch": 0.98, + "learning_rate": 3.148923876931966e-06, + "loss": 2.9747, + "step": 6915 + }, + { + "epoch": 0.98, + "learning_rate": 3.120034667051856e-06, + "loss": 3.0707, + "step": 6916 + }, + { + "epoch": 0.98, + "learning_rate": 3.0911454571717465e-06, + "loss": 3.0587, + "step": 6917 + }, + { + "epoch": 0.98, + "learning_rate": 3.0622562472916365e-06, + "loss": 3.1504, + "step": 6918 + }, + { + "epoch": 0.99, + "learning_rate": 3.033367037411527e-06, + "loss": 3.1535, + "step": 6919 + }, + { + "epoch": 0.99, + "learning_rate": 3.004477827531417e-06, + "loss": 2.9331, + "step": 6920 + }, + { + "epoch": 0.99, + "learning_rate": 2.9755886176513074e-06, + "loss": 2.8455, + "step": 6921 + }, + { + "epoch": 0.99, + "learning_rate": 2.946699407771198e-06, + "loss": 2.8775, + "step": 6922 + }, + { + "epoch": 0.99, + "learning_rate": 2.917810197891088e-06, + "loss": 2.9542, + "step": 6923 + }, + { + "epoch": 0.99, + "learning_rate": 2.888920988010978e-06, + "loss": 3.0825, + "step": 6924 + }, + { + "epoch": 0.99, + "learning_rate": 2.8600317781308683e-06, + "loss": 2.9671, + "step": 6925 + }, + { + "epoch": 0.99, + "learning_rate": 2.8311425682507583e-06, + "loss": 3.0341, + "step": 6926 + }, + { + "epoch": 0.99, + "learning_rate": 2.8022533583706487e-06, + "loss": 3.0661, + "step": 6927 + }, + { + "epoch": 0.99, + "learning_rate": 2.7733641484905388e-06, + "loss": 3.0443, + "step": 6928 + }, + { + "epoch": 0.99, + "learning_rate": 2.7444749386104288e-06, + "loss": 2.8629, + "step": 6929 + }, + { + "epoch": 0.99, + "learning_rate": 2.7155857287303196e-06, + "loss": 3.1635, + "step": 6930 + }, + { + "epoch": 0.99, + "learning_rate": 2.6866965188502097e-06, + "loss": 3.1478, + "step": 6931 + }, + { + "epoch": 0.99, + "learning_rate": 2.6578073089700997e-06, + "loss": 2.8978, + "step": 6932 + }, + { + "epoch": 0.99, + "learning_rate": 2.62891809908999e-06, + "loss": 2.9421, + "step": 6933 + }, + { + "epoch": 0.99, + "learning_rate": 2.60002888920988e-06, + "loss": 3.0774, + "step": 6934 + }, + { + "epoch": 0.99, + "learning_rate": 2.5711396793297706e-06, + "loss": 3.1546, + "step": 6935 + }, + { + "epoch": 0.99, + "learning_rate": 2.5422504694496606e-06, + "loss": 3.0412, + "step": 6936 + }, + { + "epoch": 0.99, + "learning_rate": 2.5133612595695506e-06, + "loss": 3.2098, + "step": 6937 + }, + { + "epoch": 0.99, + "learning_rate": 2.484472049689441e-06, + "loss": 3.1855, + "step": 6938 + }, + { + "epoch": 0.99, + "learning_rate": 2.4555828398093315e-06, + "loss": 2.9524, + "step": 6939 + }, + { + "epoch": 0.99, + "learning_rate": 2.4266936299292215e-06, + "loss": 3.0207, + "step": 6940 + }, + { + "epoch": 0.99, + "learning_rate": 2.397804420049112e-06, + "loss": 3.0075, + "step": 6941 + }, + { + "epoch": 0.99, + "learning_rate": 2.368915210169002e-06, + "loss": 2.9822, + "step": 6942 + }, + { + "epoch": 0.99, + "learning_rate": 2.3400260002888924e-06, + "loss": 3.1671, + "step": 6943 + }, + { + "epoch": 0.99, + "learning_rate": 2.3111367904087824e-06, + "loss": 3.0587, + "step": 6944 + }, + { + "epoch": 0.99, + "learning_rate": 2.2822475805286724e-06, + "loss": 2.9804, + "step": 6945 + }, + { + "epoch": 0.99, + "learning_rate": 2.253358370648563e-06, + "loss": 3.0, + "step": 6946 + }, + { + "epoch": 0.99, + "learning_rate": 2.224469160768453e-06, + "loss": 3.1084, + "step": 6947 + }, + { + "epoch": 0.99, + "learning_rate": 2.1955799508883433e-06, + "loss": 2.9242, + "step": 6948 + }, + { + "epoch": 0.99, + "learning_rate": 2.1666907410082337e-06, + "loss": 3.1705, + "step": 6949 + }, + { + "epoch": 0.99, + "learning_rate": 2.1378015311281237e-06, + "loss": 2.9616, + "step": 6950 + }, + { + "epoch": 0.99, + "learning_rate": 2.108912321248014e-06, + "loss": 3.0162, + "step": 6951 + }, + { + "epoch": 0.99, + "learning_rate": 2.080023111367904e-06, + "loss": 3.0889, + "step": 6952 + }, + { + "epoch": 0.99, + "learning_rate": 2.051133901487794e-06, + "loss": 3.18, + "step": 6953 + }, + { + "epoch": 0.99, + "learning_rate": 2.0222446916076846e-06, + "loss": 2.9249, + "step": 6954 + }, + { + "epoch": 0.99, + "learning_rate": 1.9933554817275746e-06, + "loss": 3.0066, + "step": 6955 + }, + { + "epoch": 0.99, + "learning_rate": 1.964466271847465e-06, + "loss": 3.0889, + "step": 6956 + }, + { + "epoch": 0.99, + "learning_rate": 1.9355770619673555e-06, + "loss": 3.0842, + "step": 6957 + }, + { + "epoch": 0.99, + "learning_rate": 1.9066878520872455e-06, + "loss": 2.9891, + "step": 6958 + }, + { + "epoch": 0.99, + "learning_rate": 1.8777986422071358e-06, + "loss": 2.9088, + "step": 6959 + }, + { + "epoch": 0.99, + "learning_rate": 1.848909432327026e-06, + "loss": 3.0368, + "step": 6960 + }, + { + "epoch": 0.99, + "learning_rate": 1.8200202224469162e-06, + "loss": 3.0724, + "step": 6961 + }, + { + "epoch": 0.99, + "learning_rate": 1.7911310125668064e-06, + "loss": 3.0317, + "step": 6962 + }, + { + "epoch": 0.99, + "learning_rate": 1.7622418026866964e-06, + "loss": 3.0485, + "step": 6963 + }, + { + "epoch": 0.99, + "learning_rate": 1.7333525928065867e-06, + "loss": 3.0453, + "step": 6964 + }, + { + "epoch": 0.99, + "learning_rate": 1.704463382926477e-06, + "loss": 3.0199, + "step": 6965 + }, + { + "epoch": 0.99, + "learning_rate": 1.6755741730463673e-06, + "loss": 3.0125, + "step": 6966 + }, + { + "epoch": 0.99, + "learning_rate": 1.6466849631662576e-06, + "loss": 2.9638, + "step": 6967 + }, + { + "epoch": 0.99, + "learning_rate": 1.6177957532861478e-06, + "loss": 3.0054, + "step": 6968 + }, + { + "epoch": 0.99, + "learning_rate": 1.588906543406038e-06, + "loss": 2.8664, + "step": 6969 + }, + { + "epoch": 0.99, + "learning_rate": 1.560017333525928e-06, + "loss": 2.8927, + "step": 6970 + }, + { + "epoch": 0.99, + "learning_rate": 1.5311281236458183e-06, + "loss": 2.8634, + "step": 6971 + }, + { + "epoch": 0.99, + "learning_rate": 1.5022389137657085e-06, + "loss": 2.9638, + "step": 6972 + }, + { + "epoch": 0.99, + "learning_rate": 1.473349703885599e-06, + "loss": 3.0843, + "step": 6973 + }, + { + "epoch": 0.99, + "learning_rate": 1.444460494005489e-06, + "loss": 3.1211, + "step": 6974 + }, + { + "epoch": 0.99, + "learning_rate": 1.4155712841253792e-06, + "loss": 3.0891, + "step": 6975 + }, + { + "epoch": 0.99, + "learning_rate": 1.3866820742452694e-06, + "loss": 3.0453, + "step": 6976 + }, + { + "epoch": 0.99, + "learning_rate": 1.3577928643651598e-06, + "loss": 2.896, + "step": 6977 + }, + { + "epoch": 0.99, + "learning_rate": 1.3289036544850498e-06, + "loss": 3.1392, + "step": 6978 + }, + { + "epoch": 0.99, + "learning_rate": 1.30001444460494e-06, + "loss": 2.8895, + "step": 6979 + }, + { + "epoch": 0.99, + "learning_rate": 1.2711252347248303e-06, + "loss": 3.0352, + "step": 6980 + }, + { + "epoch": 0.99, + "learning_rate": 1.2422360248447205e-06, + "loss": 2.9906, + "step": 6981 + }, + { + "epoch": 0.99, + "learning_rate": 1.2133468149646107e-06, + "loss": 2.8133, + "step": 6982 + }, + { + "epoch": 0.99, + "learning_rate": 1.184457605084501e-06, + "loss": 3.0085, + "step": 6983 + }, + { + "epoch": 0.99, + "learning_rate": 1.1555683952043912e-06, + "loss": 2.9782, + "step": 6984 + }, + { + "epoch": 0.99, + "learning_rate": 1.1266791853242814e-06, + "loss": 3.1336, + "step": 6985 + }, + { + "epoch": 0.99, + "learning_rate": 1.0977899754441716e-06, + "loss": 2.9151, + "step": 6986 + }, + { + "epoch": 0.99, + "learning_rate": 1.0689007655640619e-06, + "loss": 3.0004, + "step": 6987 + }, + { + "epoch": 0.99, + "learning_rate": 1.040011555683952e-06, + "loss": 2.9352, + "step": 6988 + }, + { + "epoch": 1.0, + "learning_rate": 1.0111223458038423e-06, + "loss": 3.1152, + "step": 6989 + }, + { + "epoch": 1.0, + "learning_rate": 9.822331359237325e-07, + "loss": 2.9697, + "step": 6990 + }, + { + "epoch": 1.0, + "learning_rate": 9.533439260436228e-07, + "loss": 3.1496, + "step": 6991 + }, + { + "epoch": 1.0, + "learning_rate": 9.24454716163513e-07, + "loss": 2.913, + "step": 6992 + }, + { + "epoch": 1.0, + "learning_rate": 8.955655062834032e-07, + "loss": 3.0368, + "step": 6993 + }, + { + "epoch": 1.0, + "learning_rate": 8.666762964032933e-07, + "loss": 2.9764, + "step": 6994 + }, + { + "epoch": 1.0, + "learning_rate": 8.377870865231837e-07, + "loss": 3.0714, + "step": 6995 + }, + { + "epoch": 1.0, + "learning_rate": 8.088978766430739e-07, + "loss": 3.0611, + "step": 6996 + }, + { + "epoch": 1.0, + "learning_rate": 7.80008666762964e-07, + "loss": 3.0141, + "step": 6997 + }, + { + "epoch": 1.0, + "learning_rate": 7.511194568828542e-07, + "loss": 2.9486, + "step": 6998 + }, + { + "epoch": 1.0, + "learning_rate": 7.222302470027445e-07, + "loss": 3.0568, + "step": 6999 + }, + { + "epoch": 1.0, + "learning_rate": 6.933410371226347e-07, + "loss": 2.9752, + "step": 7000 + }, + { + "epoch": 1.0, + "learning_rate": 6.644518272425249e-07, + "loss": 3.0737, + "step": 7001 + }, + { + "epoch": 1.0, + "learning_rate": 6.355626173624151e-07, + "loss": 3.0703, + "step": 7002 + }, + { + "epoch": 1.0, + "learning_rate": 6.066734074823054e-07, + "loss": 2.9669, + "step": 7003 + }, + { + "epoch": 1.0, + "learning_rate": 5.777841976021956e-07, + "loss": 3.0173, + "step": 7004 + }, + { + "epoch": 1.0, + "learning_rate": 5.488949877220858e-07, + "loss": 3.0607, + "step": 7005 + }, + { + "epoch": 1.0, + "learning_rate": 5.20005777841976e-07, + "loss": 3.0821, + "step": 7006 + }, + { + "epoch": 1.0, + "learning_rate": 4.911165679618663e-07, + "loss": 3.0936, + "step": 7007 + }, + { + "epoch": 1.0, + "learning_rate": 4.622273580817565e-07, + "loss": 3.0225, + "step": 7008 + }, + { + "epoch": 1.0, + "learning_rate": 4.3333814820164667e-07, + "loss": 3.1118, + "step": 7009 + }, + { + "epoch": 1.0, + "learning_rate": 4.0444893832153695e-07, + "loss": 2.9692, + "step": 7010 + }, + { + "epoch": 1.0, + "learning_rate": 3.755597284414271e-07, + "loss": 2.9797, + "step": 7011 + }, + { + "epoch": 1.0, + "learning_rate": 3.4667051856131735e-07, + "loss": 3.0496, + "step": 7012 + }, + { + "epoch": 1.0, + "learning_rate": 3.1778130868120757e-07, + "loss": 3.0703, + "step": 7013 + }, + { + "epoch": 1.0, + "learning_rate": 2.888920988010978e-07, + "loss": 3.0217, + "step": 7014 + }, + { + "epoch": 1.0, + "learning_rate": 2.60002888920988e-07, + "loss": 3.0717, + "step": 7015 + }, + { + "epoch": 1.0, + "learning_rate": 2.3111367904087825e-07, + "loss": 2.8553, + "step": 7016 + }, + { + "epoch": 1.0, + "learning_rate": 2.0222446916076847e-07, + "loss": 3.0506, + "step": 7017 + }, + { + "epoch": 1.0, + "learning_rate": 1.7333525928065867e-07, + "loss": 3.0767, + "step": 7018 + }, + { + "epoch": 1.0, + "learning_rate": 1.444460494005489e-07, + "loss": 2.9865, + "step": 7019 + }, + { + "epoch": 1.0, + "learning_rate": 1.1555683952043912e-07, + "loss": 3.0635, + "step": 7020 + }, + { + "epoch": 1.0, + "learning_rate": 8.666762964032934e-08, + "loss": 3.1211, + "step": 7021 + }, + { + "epoch": 1.0, + "learning_rate": 5.777841976021956e-08, + "loss": 3.0291, + "step": 7022 + }, + { + "epoch": 1.0, + "learning_rate": 2.888920988010978e-08, + "loss": 2.9262, + "step": 7023 + } + ], + "logging_steps": 1, + "max_steps": 7023, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 6.982535499177001e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7023/training_args.bin b/checkpoint-7023/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c77f4971d5b253e03551d5ec924cbcc8d74d913b --- /dev/null +++ b/checkpoint-7023/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28dac321ffa4fc6b1816289fdd947bf7a05151e15fa1a3250a46751d33968167 +size 5947 diff --git a/checkpoint-7023/zero_to_fp32.py b/checkpoint-7023/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..c98caae31534368be22b67fc4ae906836c992a8d --- /dev/null +++ b/checkpoint-7023/zero_to_fp32.py @@ -0,0 +1,587 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/pytorch_model.bin b/pytorch_model.bin index 9f3e652372206a8ce18df6640e01359aa18f072d..ddf899646ea36e92b400ddf8707cb8787ce98198 100644 --- a/pytorch_model.bin +++ b/pytorch_model.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dacdc9b55ee848db7afc26a62a0f8b1e7e0292efb43052826553ced61e8db2c +oid sha256:86777aa7f151785d50e9b81bbbf8ad2484d6bee18618a77dc33d533354c0eac0 size 14017936677 diff --git a/training_args.bin b/training_args.bin index 27df21b1ba1f123f64bd4ae309a426a24a3e0152..3b2da1ef4b4876032c9ff6e6ebb2b9ecfcb90b45 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eab8eb7ea02a01dbed44a6358bf0acb0288a7622be895f8329fd7053ca6cdfb6 +oid sha256:86756e665c69be190c3de2e236f3cb1a2a4bdb410bba7277fd198efa1b9a6a27 size 6011