maxymoo2 committed on
Commit
23291af
1 Parent(s): 196cd6f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ GoNotoCurrent.ttf filter=lfs diff=lfs merge=lfs -text
GoNotoCurrent.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83ab5c39e2b1c34a955136275ce0db068cb20d9643ead033d6b8124a73ab4f64
3
+ size 15645492
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PIXELForPreTraining"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "cache_dir": null,
7
+ "decoder_hidden_size": 512,
8
+ "decoder_intermediate_size": 2048,
9
+ "decoder_num_attention_heads": 16,
10
+ "decoder_num_hidden_layers": 8,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "image_size": [
15
+ 16,
16
+ 8464
17
+ ],
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "layer_norm_eps": 1e-12,
21
+ "mask_ratio": 0.25,
22
+ "model_type": "pixel",
23
+ "norm_pix_loss": true,
24
+ "num_attention_heads": 12,
25
+ "num_channels": 3,
26
+ "num_hidden_layers": 12,
27
+ "patch_size": 16,
28
+ "qkv_bias": true,
29
+ "revision": "main",
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.17.0",
32
+ "use_auth_token": null
33
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d593546cef05352ccecad390d9c0ddab8c146c78a6d1a3241da6886295101225
3
+ size 893440890
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acbb8d91acce6a6993455e4982d6f10c9368165bb837e125fe96d2e63d6eda4e
3
+ size 449474626
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a2addad55b174f852181f57b499544b406cb0c30e1eadb1c2c06ef02f08b3f5
3
+ size 15006
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:380a90181621c43326bc092827ea6d5fe8ee877030e58694534faf09f2ad66be
3
+ size 1064
text_renderer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "background_color": "white",
3
+ "dpi": 120,
4
+ "font_color": "black",
5
+ "font_file": "GoNotoCurrent.ttf",
6
+ "font_size": 8,
7
+ "max_seq_length": 529,
8
+ "pad_size": 3,
9
+ "pixels_per_patch": 16,
10
+ "text_renderer_type": "PyGameTextRenderer"
11
+ }
trainer_state.json ADDED
@@ -0,0 +1,3366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.190901512444921,
5
+ "global_step": 50000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 2.9999999999999997e-06,
13
+ "loss": 1.0412,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.0,
18
+ "learning_rate": 5.999999999999999e-06,
19
+ "loss": 0.835,
20
+ "step": 200
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 8.999999999999999e-06,
25
+ "loss": 0.7822,
26
+ "step": 300
27
+ },
28
+ {
29
+ "epoch": 0.01,
30
+ "learning_rate": 1.1999999999999999e-05,
31
+ "loss": 0.7718,
32
+ "step": 400
33
+ },
34
+ {
35
+ "epoch": 0.01,
36
+ "learning_rate": 1.4999999999999999e-05,
37
+ "loss": 0.7707,
38
+ "step": 500
39
+ },
40
+ {
41
+ "epoch": 0.01,
42
+ "learning_rate": 1.7999999999999997e-05,
43
+ "loss": 0.7697,
44
+ "step": 600
45
+ },
46
+ {
47
+ "epoch": 0.02,
48
+ "learning_rate": 2.1e-05,
49
+ "loss": 0.769,
50
+ "step": 700
51
+ },
52
+ {
53
+ "epoch": 0.02,
54
+ "learning_rate": 2.3999999999999997e-05,
55
+ "loss": 0.7682,
56
+ "step": 800
57
+ },
58
+ {
59
+ "epoch": 0.02,
60
+ "learning_rate": 2.6999999999999996e-05,
61
+ "loss": 0.7674,
62
+ "step": 900
63
+ },
64
+ {
65
+ "epoch": 0.02,
66
+ "learning_rate": 2.9999999999999997e-05,
67
+ "loss": 0.767,
68
+ "step": 1000
69
+ },
70
+ {
71
+ "epoch": 0.02,
72
+ "eval_runtime": 45.7675,
73
+ "eval_samples_per_second": 235.975,
74
+ "eval_steps_per_second": 7.385,
75
+ "step": 1000
76
+ },
77
+ {
78
+ "epoch": 0.03,
79
+ "learning_rate": 3.2999999999999996e-05,
80
+ "loss": 0.7665,
81
+ "step": 1100
82
+ },
83
+ {
84
+ "epoch": 0.03,
85
+ "learning_rate": 3.5999999999999994e-05,
86
+ "loss": 0.7662,
87
+ "step": 1200
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "learning_rate": 3.9e-05,
92
+ "loss": 0.7661,
93
+ "step": 1300
94
+ },
95
+ {
96
+ "epoch": 0.03,
97
+ "learning_rate": 4.2e-05,
98
+ "loss": 0.766,
99
+ "step": 1400
100
+ },
101
+ {
102
+ "epoch": 0.04,
103
+ "learning_rate": 4.4999999999999996e-05,
104
+ "loss": 0.7659,
105
+ "step": 1500
106
+ },
107
+ {
108
+ "epoch": 0.04,
109
+ "learning_rate": 4.7999999999999994e-05,
110
+ "loss": 0.7656,
111
+ "step": 1600
112
+ },
113
+ {
114
+ "epoch": 0.04,
115
+ "learning_rate": 5.1e-05,
116
+ "loss": 0.7655,
117
+ "step": 1700
118
+ },
119
+ {
120
+ "epoch": 0.04,
121
+ "learning_rate": 5.399999999999999e-05,
122
+ "loss": 0.7655,
123
+ "step": 1800
124
+ },
125
+ {
126
+ "epoch": 0.05,
127
+ "learning_rate": 5.6999999999999996e-05,
128
+ "loss": 0.7653,
129
+ "step": 1900
130
+ },
131
+ {
132
+ "epoch": 0.05,
133
+ "learning_rate": 5.9999999999999995e-05,
134
+ "loss": 0.7655,
135
+ "step": 2000
136
+ },
137
+ {
138
+ "epoch": 0.05,
139
+ "eval_runtime": 45.5917,
140
+ "eval_samples_per_second": 236.885,
141
+ "eval_steps_per_second": 7.414,
142
+ "step": 2000
143
+ },
144
+ {
145
+ "epoch": 0.05,
146
+ "learning_rate": 6.299999999999999e-05,
147
+ "loss": 0.7651,
148
+ "step": 2100
149
+ },
150
+ {
151
+ "epoch": 0.05,
152
+ "learning_rate": 6.599999999999999e-05,
153
+ "loss": 0.7653,
154
+ "step": 2200
155
+ },
156
+ {
157
+ "epoch": 0.05,
158
+ "learning_rate": 6.9e-05,
159
+ "loss": 0.7654,
160
+ "step": 2300
161
+ },
162
+ {
163
+ "epoch": 0.06,
164
+ "learning_rate": 7.199999999999999e-05,
165
+ "loss": 0.765,
166
+ "step": 2400
167
+ },
168
+ {
169
+ "epoch": 0.06,
170
+ "learning_rate": 7.5e-05,
171
+ "loss": 0.7649,
172
+ "step": 2500
173
+ },
174
+ {
175
+ "epoch": 0.06,
176
+ "learning_rate": 7.8e-05,
177
+ "loss": 0.7648,
178
+ "step": 2600
179
+ },
180
+ {
181
+ "epoch": 0.06,
182
+ "learning_rate": 8.1e-05,
183
+ "loss": 0.7647,
184
+ "step": 2700
185
+ },
186
+ {
187
+ "epoch": 0.07,
188
+ "learning_rate": 8.4e-05,
189
+ "loss": 0.7645,
190
+ "step": 2800
191
+ },
192
+ {
193
+ "epoch": 0.07,
194
+ "learning_rate": 8.699999999999999e-05,
195
+ "loss": 0.7645,
196
+ "step": 2900
197
+ },
198
+ {
199
+ "epoch": 0.07,
200
+ "learning_rate": 8.999999999999999e-05,
201
+ "loss": 0.7644,
202
+ "step": 3000
203
+ },
204
+ {
205
+ "epoch": 0.07,
206
+ "eval_runtime": 45.7281,
207
+ "eval_samples_per_second": 236.179,
208
+ "eval_steps_per_second": 7.392,
209
+ "step": 3000
210
+ },
211
+ {
212
+ "epoch": 0.07,
213
+ "learning_rate": 9.3e-05,
214
+ "loss": 0.7641,
215
+ "step": 3100
216
+ },
217
+ {
218
+ "epoch": 0.08,
219
+ "learning_rate": 9.599999999999999e-05,
220
+ "loss": 0.764,
221
+ "step": 3200
222
+ },
223
+ {
224
+ "epoch": 0.08,
225
+ "learning_rate": 9.9e-05,
226
+ "loss": 0.7638,
227
+ "step": 3300
228
+ },
229
+ {
230
+ "epoch": 0.08,
231
+ "learning_rate": 0.000102,
232
+ "loss": 0.763,
233
+ "step": 3400
234
+ },
235
+ {
236
+ "epoch": 0.08,
237
+ "learning_rate": 0.00010499999999999999,
238
+ "loss": 0.7665,
239
+ "step": 3500
240
+ },
241
+ {
242
+ "epoch": 0.09,
243
+ "learning_rate": 0.00010799999999999998,
244
+ "loss": 0.7669,
245
+ "step": 3600
246
+ },
247
+ {
248
+ "epoch": 0.09,
249
+ "learning_rate": 0.00011099999999999999,
250
+ "loss": 0.7653,
251
+ "step": 3700
252
+ },
253
+ {
254
+ "epoch": 0.09,
255
+ "learning_rate": 0.00011399999999999999,
256
+ "loss": 0.7535,
257
+ "step": 3800
258
+ },
259
+ {
260
+ "epoch": 0.09,
261
+ "learning_rate": 0.000117,
262
+ "loss": 0.7218,
263
+ "step": 3900
264
+ },
265
+ {
266
+ "epoch": 0.1,
267
+ "learning_rate": 0.00011999999999999999,
268
+ "loss": 0.6956,
269
+ "step": 4000
270
+ },
271
+ {
272
+ "epoch": 0.1,
273
+ "eval_runtime": 45.9121,
274
+ "eval_samples_per_second": 235.232,
275
+ "eval_steps_per_second": 7.362,
276
+ "step": 4000
277
+ },
278
+ {
279
+ "epoch": 0.1,
280
+ "learning_rate": 0.00012299999999999998,
281
+ "loss": 0.6758,
282
+ "step": 4100
283
+ },
284
+ {
285
+ "epoch": 0.1,
286
+ "learning_rate": 0.00012599999999999997,
287
+ "loss": 0.6557,
288
+ "step": 4200
289
+ },
290
+ {
291
+ "epoch": 0.1,
292
+ "learning_rate": 0.000129,
293
+ "loss": 0.6402,
294
+ "step": 4300
295
+ },
296
+ {
297
+ "epoch": 0.1,
298
+ "learning_rate": 0.00013199999999999998,
299
+ "loss": 0.6302,
300
+ "step": 4400
301
+ },
302
+ {
303
+ "epoch": 0.11,
304
+ "learning_rate": 0.000135,
305
+ "loss": 0.623,
306
+ "step": 4500
307
+ },
308
+ {
309
+ "epoch": 0.11,
310
+ "learning_rate": 0.000138,
311
+ "loss": 0.6169,
312
+ "step": 4600
313
+ },
314
+ {
315
+ "epoch": 0.11,
316
+ "learning_rate": 0.00014099999999999998,
317
+ "loss": 0.6121,
318
+ "step": 4700
319
+ },
320
+ {
321
+ "epoch": 0.11,
322
+ "learning_rate": 0.00014399999999999998,
323
+ "loss": 0.607,
324
+ "step": 4800
325
+ },
326
+ {
327
+ "epoch": 0.12,
328
+ "learning_rate": 0.000147,
329
+ "loss": 0.6039,
330
+ "step": 4900
331
+ },
332
+ {
333
+ "epoch": 0.12,
334
+ "learning_rate": 0.00015,
335
+ "loss": 0.6012,
336
+ "step": 5000
337
+ },
338
+ {
339
+ "epoch": 0.12,
340
+ "eval_runtime": 46.0979,
341
+ "eval_samples_per_second": 234.284,
342
+ "eval_steps_per_second": 7.332,
343
+ "step": 5000
344
+ },
345
+ {
346
+ "epoch": 0.12,
347
+ "learning_rate": 0.0001499996172456075,
348
+ "loss": 0.5981,
349
+ "step": 5100
350
+ },
351
+ {
352
+ "epoch": 0.12,
353
+ "learning_rate": 0.00014999846898661572,
354
+ "loss": 0.5954,
355
+ "step": 5200
356
+ },
357
+ {
358
+ "epoch": 0.13,
359
+ "learning_rate": 0.00014999655523558183,
360
+ "loss": 0.5935,
361
+ "step": 5300
362
+ },
363
+ {
364
+ "epoch": 0.13,
365
+ "learning_rate": 0.00014999387601343436,
366
+ "loss": 0.5911,
367
+ "step": 5400
368
+ },
369
+ {
370
+ "epoch": 0.13,
371
+ "learning_rate": 0.00014999043134947282,
372
+ "loss": 0.5895,
373
+ "step": 5500
374
+ },
375
+ {
376
+ "epoch": 0.13,
377
+ "learning_rate": 0.00014998622128136748,
378
+ "loss": 0.5877,
379
+ "step": 5600
380
+ },
381
+ {
382
+ "epoch": 0.14,
383
+ "learning_rate": 0.000149981245855159,
384
+ "loss": 0.5866,
385
+ "step": 5700
386
+ },
387
+ {
388
+ "epoch": 0.14,
389
+ "learning_rate": 0.00014997550512525784,
390
+ "loss": 0.5845,
391
+ "step": 5800
392
+ },
393
+ {
394
+ "epoch": 0.14,
395
+ "learning_rate": 0.0001499689991544437,
396
+ "loss": 0.5784,
397
+ "step": 5900
398
+ },
399
+ {
400
+ "epoch": 0.14,
401
+ "learning_rate": 0.00014996172801386482,
402
+ "loss": 0.5684,
403
+ "step": 6000
404
+ },
405
+ {
406
+ "epoch": 0.14,
407
+ "eval_runtime": 46.0154,
408
+ "eval_samples_per_second": 234.704,
409
+ "eval_steps_per_second": 7.345,
410
+ "step": 6000
411
+ },
412
+ {
413
+ "epoch": 0.15,
414
+ "learning_rate": 0.00014995369178303722,
415
+ "loss": 0.5642,
416
+ "step": 6100
417
+ },
418
+ {
419
+ "epoch": 0.15,
420
+ "learning_rate": 0.0001499448905498439,
421
+ "loss": 0.5625,
422
+ "step": 6200
423
+ },
424
+ {
425
+ "epoch": 0.15,
426
+ "learning_rate": 0.00014993532441053364,
427
+ "loss": 0.5601,
428
+ "step": 6300
429
+ },
430
+ {
431
+ "epoch": 0.15,
432
+ "learning_rate": 0.0001499249934697203,
433
+ "loss": 0.5581,
434
+ "step": 6400
435
+ },
436
+ {
437
+ "epoch": 0.15,
438
+ "learning_rate": 0.0001499138978403813,
439
+ "loss": 0.554,
440
+ "step": 6500
441
+ },
442
+ {
443
+ "epoch": 0.16,
444
+ "learning_rate": 0.00014990203764385677,
445
+ "loss": 0.5462,
446
+ "step": 6600
447
+ },
448
+ {
449
+ "epoch": 0.16,
450
+ "learning_rate": 0.00014988941300984784,
451
+ "loss": 0.5284,
452
+ "step": 6700
453
+ },
454
+ {
455
+ "epoch": 0.16,
456
+ "learning_rate": 0.0001498760240764155,
457
+ "loss": 0.5032,
458
+ "step": 6800
459
+ },
460
+ {
461
+ "epoch": 0.16,
462
+ "learning_rate": 0.000149861870989979,
463
+ "loss": 0.4751,
464
+ "step": 6900
465
+ },
466
+ {
467
+ "epoch": 0.17,
468
+ "learning_rate": 0.0001498469539053142,
469
+ "loss": 0.4574,
470
+ "step": 7000
471
+ },
472
+ {
473
+ "epoch": 0.17,
474
+ "eval_runtime": 45.9402,
475
+ "eval_samples_per_second": 235.088,
476
+ "eval_steps_per_second": 7.357,
477
+ "step": 7000
478
+ },
479
+ {
480
+ "epoch": 0.17,
481
+ "learning_rate": 0.00014983127298555198,
482
+ "loss": 0.4453,
483
+ "step": 7100
484
+ },
485
+ {
486
+ "epoch": 0.17,
487
+ "learning_rate": 0.00014981482840217632,
488
+ "loss": 0.437,
489
+ "step": 7200
490
+ },
491
+ {
492
+ "epoch": 0.17,
493
+ "learning_rate": 0.00014979762033502262,
494
+ "loss": 0.4306,
495
+ "step": 7300
496
+ },
497
+ {
498
+ "epoch": 0.18,
499
+ "learning_rate": 0.00014977964897227547,
500
+ "loss": 0.4254,
501
+ "step": 7400
502
+ },
503
+ {
504
+ "epoch": 0.18,
505
+ "learning_rate": 0.00014976091451046687,
506
+ "loss": 0.4204,
507
+ "step": 7500
508
+ },
509
+ {
510
+ "epoch": 0.18,
511
+ "learning_rate": 0.00014974141715447386,
512
+ "loss": 0.4178,
513
+ "step": 7600
514
+ },
515
+ {
516
+ "epoch": 0.18,
517
+ "learning_rate": 0.00014972115711751644,
518
+ "loss": 0.4135,
519
+ "step": 7700
520
+ },
521
+ {
522
+ "epoch": 0.19,
523
+ "learning_rate": 0.00014970013462115505,
524
+ "loss": 0.4099,
525
+ "step": 7800
526
+ },
527
+ {
528
+ "epoch": 0.19,
529
+ "learning_rate": 0.00014967834989528843,
530
+ "loss": 0.4077,
531
+ "step": 7900
532
+ },
533
+ {
534
+ "epoch": 0.19,
535
+ "learning_rate": 0.00014965580317815078,
536
+ "loss": 0.405,
537
+ "step": 8000
538
+ },
539
+ {
540
+ "epoch": 0.19,
541
+ "eval_runtime": 45.7648,
542
+ "eval_samples_per_second": 235.989,
543
+ "eval_steps_per_second": 7.386,
544
+ "step": 8000
545
+ },
546
+ {
547
+ "epoch": 0.19,
548
+ "learning_rate": 0.00014963249471630944,
549
+ "loss": 0.4017,
550
+ "step": 8100
551
+ },
552
+ {
553
+ "epoch": 0.2,
554
+ "learning_rate": 0.000149608424764662,
555
+ "loss": 0.4006,
556
+ "step": 8200
557
+ },
558
+ {
559
+ "epoch": 0.2,
560
+ "learning_rate": 0.0001495835935864336,
561
+ "loss": 0.3977,
562
+ "step": 8300
563
+ },
564
+ {
565
+ "epoch": 0.2,
566
+ "learning_rate": 0.00014955800145317397,
567
+ "loss": 0.3964,
568
+ "step": 8400
569
+ },
570
+ {
571
+ "epoch": 0.2,
572
+ "learning_rate": 0.00014953164864475466,
573
+ "loss": 0.3949,
574
+ "step": 8500
575
+ },
576
+ {
577
+ "epoch": 0.2,
578
+ "learning_rate": 0.0001495045354493657,
579
+ "loss": 0.3961,
580
+ "step": 8600
581
+ },
582
+ {
583
+ "epoch": 0.21,
584
+ "learning_rate": 0.00014947666216351272,
585
+ "loss": 0.398,
586
+ "step": 8700
587
+ },
588
+ {
589
+ "epoch": 0.21,
590
+ "learning_rate": 0.00014944802909201344,
591
+ "loss": 0.3924,
592
+ "step": 8800
593
+ },
594
+ {
595
+ "epoch": 0.21,
596
+ "learning_rate": 0.00014941863654799456,
597
+ "loss": 0.3938,
598
+ "step": 8900
599
+ },
600
+ {
601
+ "epoch": 0.21,
602
+ "learning_rate": 0.00014938848485288825,
603
+ "loss": 0.3885,
604
+ "step": 9000
605
+ },
606
+ {
607
+ "epoch": 0.21,
608
+ "eval_runtime": 45.9868,
609
+ "eval_samples_per_second": 234.85,
610
+ "eval_steps_per_second": 7.35,
611
+ "step": 9000
612
+ },
613
+ {
614
+ "epoch": 0.22,
615
+ "learning_rate": 0.0001493575743364286,
616
+ "loss": 0.391,
617
+ "step": 9100
618
+ },
619
+ {
620
+ "epoch": 0.22,
621
+ "learning_rate": 0.00014932590533664808,
622
+ "loss": 0.3884,
623
+ "step": 9200
624
+ },
625
+ {
626
+ "epoch": 0.22,
627
+ "learning_rate": 0.0001492934781998738,
628
+ "loss": 0.3856,
629
+ "step": 9300
630
+ },
631
+ {
632
+ "epoch": 0.22,
633
+ "learning_rate": 0.0001492602932807237,
634
+ "loss": 0.3843,
635
+ "step": 9400
636
+ },
637
+ {
638
+ "epoch": 0.23,
639
+ "learning_rate": 0.00014922635094210277,
640
+ "loss": 0.3848,
641
+ "step": 9500
642
+ },
643
+ {
644
+ "epoch": 0.23,
645
+ "learning_rate": 0.000149191651555199,
646
+ "loss": 0.3795,
647
+ "step": 9600
648
+ },
649
+ {
650
+ "epoch": 0.23,
651
+ "learning_rate": 0.0001491561954994793,
652
+ "loss": 0.3735,
653
+ "step": 9700
654
+ },
655
+ {
656
+ "epoch": 0.23,
657
+ "learning_rate": 0.00014911998316268537,
658
+ "loss": 0.3658,
659
+ "step": 9800
660
+ },
661
+ {
662
+ "epoch": 0.24,
663
+ "learning_rate": 0.00014908301494082963,
664
+ "loss": 0.362,
665
+ "step": 9900
666
+ },
667
+ {
668
+ "epoch": 0.24,
669
+ "learning_rate": 0.00014904529123819054,
670
+ "loss": 0.3595,
671
+ "step": 10000
672
+ },
673
+ {
674
+ "epoch": 0.24,
675
+ "eval_runtime": 46.3224,
676
+ "eval_samples_per_second": 233.148,
677
+ "eval_steps_per_second": 7.297,
678
+ "step": 10000
679
+ },
680
+ {
681
+ "epoch": 0.24,
682
+ "learning_rate": 0.00014900681246730852,
683
+ "loss": 0.3585,
684
+ "step": 10100
685
+ },
686
+ {
687
+ "epoch": 0.24,
688
+ "learning_rate": 0.00014896757904898125,
689
+ "loss": 0.3578,
690
+ "step": 10200
691
+ },
692
+ {
693
+ "epoch": 0.25,
694
+ "learning_rate": 0.00014892759141225904,
695
+ "loss": 0.3568,
696
+ "step": 10300
697
+ },
698
+ {
699
+ "epoch": 0.25,
700
+ "learning_rate": 0.00014888684999444035,
701
+ "loss": 0.355,
702
+ "step": 10400
703
+ },
704
+ {
705
+ "epoch": 0.25,
706
+ "learning_rate": 0.00014884535524106675,
707
+ "loss": 0.3537,
708
+ "step": 10500
709
+ },
710
+ {
711
+ "epoch": 0.25,
712
+ "learning_rate": 0.00014880310760591824,
713
+ "loss": 0.3523,
714
+ "step": 10600
715
+ },
716
+ {
717
+ "epoch": 0.25,
718
+ "learning_rate": 0.0001487601075510082,
719
+ "loss": 0.3524,
720
+ "step": 10700
721
+ },
722
+ {
723
+ "epoch": 0.26,
724
+ "learning_rate": 0.0001487163555465783,
725
+ "loss": 0.3515,
726
+ "step": 10800
727
+ },
728
+ {
729
+ "epoch": 0.26,
730
+ "learning_rate": 0.0001486718520710935,
731
+ "loss": 0.3508,
732
+ "step": 10900
733
+ },
734
+ {
735
+ "epoch": 0.26,
736
+ "learning_rate": 0.00014862659761123663,
737
+ "loss": 0.3493,
738
+ "step": 11000
739
+ },
740
+ {
741
+ "epoch": 0.26,
742
+ "eval_runtime": 46.1625,
743
+ "eval_samples_per_second": 233.956,
744
+ "eval_steps_per_second": 7.322,
745
+ "step": 11000
746
+ },
747
+ {
748
+ "epoch": 0.26,
749
+ "learning_rate": 0.00014858059266190327,
750
+ "loss": 0.3472,
751
+ "step": 11100
752
+ },
753
+ {
754
+ "epoch": 0.27,
755
+ "learning_rate": 0.00014853383772619612,
756
+ "loss": 0.3463,
757
+ "step": 11200
758
+ },
759
+ {
760
+ "epoch": 0.27,
761
+ "learning_rate": 0.00014848633331541967,
762
+ "loss": 0.3363,
763
+ "step": 11300
764
+ },
765
+ {
766
+ "epoch": 0.27,
767
+ "learning_rate": 0.0001484380799490746,
768
+ "loss": 0.3265,
769
+ "step": 11400
770
+ },
771
+ {
772
+ "epoch": 0.27,
773
+ "learning_rate": 0.00014838907815485194,
774
+ "loss": 0.3235,
775
+ "step": 11500
776
+ },
777
+ {
778
+ "epoch": 0.28,
779
+ "learning_rate": 0.00014833932846862748,
780
+ "loss": 0.3218,
781
+ "step": 11600
782
+ },
783
+ {
784
+ "epoch": 0.28,
785
+ "learning_rate": 0.00014828883143445582,
786
+ "loss": 0.3203,
787
+ "step": 11700
788
+ },
789
+ {
790
+ "epoch": 0.28,
791
+ "learning_rate": 0.0001482375876045644,
792
+ "loss": 0.3204,
793
+ "step": 11800
794
+ },
795
+ {
796
+ "epoch": 0.28,
797
+ "learning_rate": 0.0001481855975393476,
798
+ "loss": 0.3184,
799
+ "step": 11900
800
+ },
801
+ {
802
+ "epoch": 0.29,
803
+ "learning_rate": 0.0001481328618073604,
804
+ "loss": 0.318,
805
+ "step": 12000
806
+ },
807
+ {
808
+ "epoch": 0.29,
809
+ "eval_runtime": 46.1354,
810
+ "eval_samples_per_second": 234.094,
811
+ "eval_steps_per_second": 7.326,
812
+ "step": 12000
813
+ },
814
+ {
815
+ "epoch": 0.29,
816
+ "learning_rate": 0.0001480793809853123,
817
+ "loss": 0.3163,
818
+ "step": 12100
819
+ },
820
+ {
821
+ "epoch": 0.29,
822
+ "learning_rate": 0.00014802515565806107,
823
+ "loss": 0.3155,
824
+ "step": 12200
825
+ },
826
+ {
827
+ "epoch": 0.29,
828
+ "learning_rate": 0.00014797018641860612,
829
+ "loss": 0.314,
830
+ "step": 12300
831
+ },
832
+ {
833
+ "epoch": 0.3,
834
+ "learning_rate": 0.0001479144738680823,
835
+ "loss": 0.3136,
836
+ "step": 12400
837
+ },
838
+ {
839
+ "epoch": 0.3,
840
+ "learning_rate": 0.00014785801861575312,
841
+ "loss": 0.3117,
842
+ "step": 12500
843
+ },
844
+ {
845
+ "epoch": 0.3,
846
+ "learning_rate": 0.00014780082127900416,
847
+ "loss": 0.3086,
848
+ "step": 12600
849
+ },
850
+ {
851
+ "epoch": 0.3,
852
+ "learning_rate": 0.00014774288248333635,
853
+ "loss": 0.3074,
854
+ "step": 12700
855
+ },
856
+ {
857
+ "epoch": 0.3,
858
+ "learning_rate": 0.00014768420286235908,
859
+ "loss": 0.3074,
860
+ "step": 12800
861
+ },
862
+ {
863
+ "epoch": 0.31,
864
+ "learning_rate": 0.00014762478305778328,
865
+ "loss": 0.3064,
866
+ "step": 12900
867
+ },
868
+ {
869
+ "epoch": 0.31,
870
+ "learning_rate": 0.0001475646237194144,
871
+ "loss": 0.3057,
872
+ "step": 13000
873
+ },
874
+ {
875
+ "epoch": 0.31,
876
+ "eval_runtime": 46.1242,
877
+ "eval_samples_per_second": 234.15,
878
+ "eval_steps_per_second": 7.328,
879
+ "step": 13000
880
+ },
881
+ {
882
+ "epoch": 0.31,
883
+ "learning_rate": 0.00014750372550514533,
884
+ "loss": 0.3048,
885
+ "step": 13100
886
+ },
887
+ {
888
+ "epoch": 0.31,
889
+ "learning_rate": 0.0001474420890809492,
890
+ "loss": 0.3037,
891
+ "step": 13200
892
+ },
893
+ {
894
+ "epoch": 0.32,
895
+ "learning_rate": 0.00014737971512087202,
896
+ "loss": 0.3029,
897
+ "step": 13300
898
+ },
899
+ {
900
+ "epoch": 0.32,
901
+ "learning_rate": 0.00014731660430702552,
902
+ "loss": 0.3024,
903
+ "step": 13400
904
+ },
905
+ {
906
+ "epoch": 0.32,
907
+ "learning_rate": 0.00014725275732957937,
908
+ "loss": 0.3011,
909
+ "step": 13500
910
+ },
911
+ {
912
+ "epoch": 0.32,
913
+ "learning_rate": 0.00014718817488675387,
914
+ "loss": 0.3006,
915
+ "step": 13600
916
+ },
917
+ {
918
+ "epoch": 0.33,
919
+ "learning_rate": 0.00014712285768481235,
920
+ "loss": 0.3009,
921
+ "step": 13700
922
+ },
923
+ {
924
+ "epoch": 0.33,
925
+ "learning_rate": 0.00014705680643805323,
926
+ "loss": 0.2991,
927
+ "step": 13800
928
+ },
929
+ {
930
+ "epoch": 0.33,
931
+ "learning_rate": 0.00014699002186880232,
932
+ "loss": 0.2991,
933
+ "step": 13900
934
+ },
935
+ {
936
+ "epoch": 0.33,
937
+ "learning_rate": 0.00014692250470740503,
938
+ "loss": 0.2979,
939
+ "step": 14000
940
+ },
941
+ {
942
+ "epoch": 0.33,
943
+ "eval_runtime": 46.2531,
944
+ "eval_samples_per_second": 233.498,
945
+ "eval_steps_per_second": 7.308,
946
+ "step": 14000
947
+ },
948
+ {
949
+ "epoch": 0.34,
950
+ "learning_rate": 0.00014685425569221819,
951
+ "loss": 0.2975,
952
+ "step": 14100
953
+ },
954
+ {
955
+ "epoch": 0.34,
956
+ "learning_rate": 0.00014678527556960207,
957
+ "loss": 0.2955,
958
+ "step": 14200
959
+ },
960
+ {
961
+ "epoch": 0.34,
962
+ "learning_rate": 0.0001467155650939123,
963
+ "loss": 0.295,
964
+ "step": 14300
965
+ },
966
+ {
967
+ "epoch": 0.34,
968
+ "learning_rate": 0.00014664512502749141,
969
+ "loss": 0.2941,
970
+ "step": 14400
971
+ },
972
+ {
973
+ "epoch": 0.35,
974
+ "learning_rate": 0.00014657395614066075,
975
+ "loss": 0.2931,
976
+ "step": 14500
977
+ },
978
+ {
979
+ "epoch": 0.35,
980
+ "learning_rate": 0.0001465020592117118,
981
+ "loss": 0.2921,
982
+ "step": 14600
983
+ },
984
+ {
985
+ "epoch": 0.35,
986
+ "learning_rate": 0.0001464294350268979,
987
+ "loss": 0.2918,
988
+ "step": 14700
989
+ },
990
+ {
991
+ "epoch": 0.35,
992
+ "learning_rate": 0.00014635608438042546,
993
+ "loss": 0.2907,
994
+ "step": 14800
995
+ },
996
+ {
997
+ "epoch": 0.35,
998
+ "learning_rate": 0.00014628200807444543,
999
+ "loss": 0.2899,
1000
+ "step": 14900
1001
+ },
1002
+ {
1003
+ "epoch": 0.36,
1004
+ "learning_rate": 0.0001462072069190444,
1005
+ "loss": 0.2898,
1006
+ "step": 15000
1007
+ },
1008
+ {
1009
+ "epoch": 0.36,
1010
+ "eval_runtime": 46.2774,
1011
+ "eval_samples_per_second": 233.375,
1012
+ "eval_steps_per_second": 7.304,
1013
+ "step": 15000
1014
+ },
1015
+ {
1016
+ "epoch": 0.36,
1017
+ "learning_rate": 0.00014613168173223585,
1018
+ "loss": 0.2885,
1019
+ "step": 15100
1020
+ },
1021
+ {
1022
+ "epoch": 0.36,
1023
+ "learning_rate": 0.00014605543333995113,
1024
+ "loss": 0.288,
1025
+ "step": 15200
1026
+ },
1027
+ {
1028
+ "epoch": 0.36,
1029
+ "learning_rate": 0.00014597846257603038,
1030
+ "loss": 0.2875,
1031
+ "step": 15300
1032
+ },
1033
+ {
1034
+ "epoch": 0.37,
1035
+ "learning_rate": 0.0001459007702822136,
1036
+ "loss": 0.2876,
1037
+ "step": 15400
1038
+ },
1039
+ {
1040
+ "epoch": 0.37,
1041
+ "learning_rate": 0.00014582235730813128,
1042
+ "loss": 0.2862,
1043
+ "step": 15500
1044
+ },
1045
+ {
1046
+ "epoch": 0.37,
1047
+ "learning_rate": 0.00014574322451129507,
1048
+ "loss": 0.2849,
1049
+ "step": 15600
1050
+ },
1051
+ {
1052
+ "epoch": 0.37,
1053
+ "learning_rate": 0.00014566337275708863,
1054
+ "loss": 0.2852,
1055
+ "step": 15700
1056
+ },
1057
+ {
1058
+ "epoch": 0.38,
1059
+ "learning_rate": 0.0001455828029187579,
1060
+ "loss": 0.2833,
1061
+ "step": 15800
1062
+ },
1063
+ {
1064
+ "epoch": 0.38,
1065
+ "learning_rate": 0.00014550151587740178,
1066
+ "loss": 0.2836,
1067
+ "step": 15900
1068
+ },
1069
+ {
1070
+ "epoch": 0.38,
1071
+ "learning_rate": 0.00014541951252196225,
1072
+ "loss": 0.2817,
1073
+ "step": 16000
1074
+ },
1075
+ {
1076
+ "epoch": 0.38,
1077
+ "eval_runtime": 46.1169,
1078
+ "eval_samples_per_second": 234.187,
1079
+ "eval_steps_per_second": 7.329,
1080
+ "step": 16000
1081
+ },
1082
+ {
1083
+ "epoch": 0.38,
1084
+ "learning_rate": 0.00014533679374921493,
1085
+ "loss": 0.2824,
1086
+ "step": 16100
1087
+ },
1088
+ {
1089
+ "epoch": 0.39,
1090
+ "learning_rate": 0.00014525336046375905,
1091
+ "loss": 0.2817,
1092
+ "step": 16200
1093
+ },
1094
+ {
1095
+ "epoch": 0.39,
1096
+ "learning_rate": 0.00014516921357800766,
1097
+ "loss": 0.2812,
1098
+ "step": 16300
1099
+ },
1100
+ {
1101
+ "epoch": 0.39,
1102
+ "learning_rate": 0.00014508435401217759,
1103
+ "loss": 0.2812,
1104
+ "step": 16400
1105
+ },
1106
+ {
1107
+ "epoch": 0.39,
1108
+ "learning_rate": 0.00014499878269427948,
1109
+ "loss": 0.2795,
1110
+ "step": 16500
1111
+ },
1112
+ {
1113
+ "epoch": 0.4,
1114
+ "learning_rate": 0.00014491250056010758,
1115
+ "loss": 0.2788,
1116
+ "step": 16600
1117
+ },
1118
+ {
1119
+ "epoch": 0.4,
1120
+ "learning_rate": 0.00014482550855322943,
1121
+ "loss": 0.2775,
1122
+ "step": 16700
1123
+ },
1124
+ {
1125
+ "epoch": 0.4,
1126
+ "learning_rate": 0.0001447378076249757,
1127
+ "loss": 0.2773,
1128
+ "step": 16800
1129
+ },
1130
+ {
1131
+ "epoch": 0.4,
1132
+ "learning_rate": 0.00014464939873442973,
1133
+ "loss": 0.2769,
1134
+ "step": 16900
1135
+ },
1136
+ {
1137
+ "epoch": 0.4,
1138
+ "learning_rate": 0.00014456028284841693,
1139
+ "loss": 0.2765,
1140
+ "step": 17000
1141
+ },
1142
+ {
1143
+ "epoch": 0.4,
1144
+ "eval_runtime": 46.3516,
1145
+ "eval_samples_per_second": 233.002,
1146
+ "eval_steps_per_second": 7.292,
1147
+ "step": 17000
1148
+ },
1149
+ {
1150
+ "epoch": 0.41,
1151
+ "learning_rate": 0.00014447046094149437,
1152
+ "loss": 0.2752,
1153
+ "step": 17100
1154
+ },
1155
+ {
1156
+ "epoch": 0.41,
1157
+ "learning_rate": 0.00014437993399594003,
1158
+ "loss": 0.2765,
1159
+ "step": 17200
1160
+ },
1161
+ {
1162
+ "epoch": 0.41,
1163
+ "learning_rate": 0.0001442887030017421,
1164
+ "loss": 0.2752,
1165
+ "step": 17300
1166
+ },
1167
+ {
1168
+ "epoch": 0.41,
1169
+ "learning_rate": 0.00014419676895658807,
1170
+ "loss": 0.2748,
1171
+ "step": 17400
1172
+ },
1173
+ {
1174
+ "epoch": 0.42,
1175
+ "learning_rate": 0.000144104132865854,
1176
+ "loss": 0.2739,
1177
+ "step": 17500
1178
+ },
1179
+ {
1180
+ "epoch": 0.42,
1181
+ "learning_rate": 0.0001440107957425933,
1182
+ "loss": 0.2729,
1183
+ "step": 17600
1184
+ },
1185
+ {
1186
+ "epoch": 0.42,
1187
+ "learning_rate": 0.0001439167586075258,
1188
+ "loss": 0.2722,
1189
+ "step": 17700
1190
+ },
1191
+ {
1192
+ "epoch": 0.42,
1193
+ "learning_rate": 0.0001438220224890265,
1194
+ "loss": 0.2725,
1195
+ "step": 17800
1196
+ },
1197
+ {
1198
+ "epoch": 0.43,
1199
+ "learning_rate": 0.00014372658842311449,
1200
+ "loss": 0.2726,
1201
+ "step": 17900
1202
+ },
1203
+ {
1204
+ "epoch": 0.43,
1205
+ "learning_rate": 0.00014363045745344137,
1206
+ "loss": 0.2715,
1207
+ "step": 18000
1208
+ },
1209
+ {
1210
+ "epoch": 0.43,
1211
+ "eval_runtime": 46.2247,
1212
+ "eval_samples_per_second": 233.641,
1213
+ "eval_steps_per_second": 7.312,
1214
+ "step": 18000
1215
+ },
1216
+ {
1217
+ "epoch": 0.43,
1218
+ "learning_rate": 0.00014353363063128005,
1219
+ "loss": 0.2705,
1220
+ "step": 18100
1221
+ },
1222
+ {
1223
+ "epoch": 0.43,
1224
+ "learning_rate": 0.0001434361090155131,
1225
+ "loss": 0.2706,
1226
+ "step": 18200
1227
+ },
1228
+ {
1229
+ "epoch": 0.44,
1230
+ "learning_rate": 0.00014333789367262136,
1231
+ "loss": 0.2701,
1232
+ "step": 18300
1233
+ },
1234
+ {
1235
+ "epoch": 0.44,
1236
+ "learning_rate": 0.00014323898567667202,
1237
+ "loss": 0.2693,
1238
+ "step": 18400
1239
+ },
1240
+ {
1241
+ "epoch": 0.44,
1242
+ "learning_rate": 0.00014313938610930712,
1243
+ "loss": 0.2693,
1244
+ "step": 18500
1245
+ },
1246
+ {
1247
+ "epoch": 0.44,
1248
+ "learning_rate": 0.00014303909605973154,
1249
+ "loss": 0.2691,
1250
+ "step": 18600
1251
+ },
1252
+ {
1253
+ "epoch": 0.45,
1254
+ "learning_rate": 0.0001429381166247012,
1255
+ "loss": 0.2681,
1256
+ "step": 18700
1257
+ },
1258
+ {
1259
+ "epoch": 0.45,
1260
+ "learning_rate": 0.00014283644890851103,
1261
+ "loss": 0.2672,
1262
+ "step": 18800
1263
+ },
1264
+ {
1265
+ "epoch": 0.45,
1266
+ "learning_rate": 0.00014273409402298291,
1267
+ "loss": 0.2671,
1268
+ "step": 18900
1269
+ },
1270
+ {
1271
+ "epoch": 0.45,
1272
+ "learning_rate": 0.00014263105308745343,
1273
+ "loss": 0.2676,
1274
+ "step": 19000
1275
+ },
1276
+ {
1277
+ "epoch": 0.45,
1278
+ "eval_runtime": 46.3331,
1279
+ "eval_samples_per_second": 233.095,
1280
+ "eval_steps_per_second": 7.295,
1281
+ "step": 19000
1282
+ },
1283
+ {
1284
+ "epoch": 0.45,
1285
+ "learning_rate": 0.00014252732722876176,
1286
+ "loss": 0.2654,
1287
+ "step": 19100
1288
+ },
1289
+ {
1290
+ "epoch": 0.46,
1291
+ "learning_rate": 0.0001424229175812373,
1292
+ "loss": 0.2649,
1293
+ "step": 19200
1294
+ },
1295
+ {
1296
+ "epoch": 0.46,
1297
+ "learning_rate": 0.00014231782528668717,
1298
+ "loss": 0.2647,
1299
+ "step": 19300
1300
+ },
1301
+ {
1302
+ "epoch": 0.46,
1303
+ "learning_rate": 0.00014221205149438394,
1304
+ "loss": 0.2649,
1305
+ "step": 19400
1306
+ },
1307
+ {
1308
+ "epoch": 0.46,
1309
+ "learning_rate": 0.0001421055973610528,
1310
+ "loss": 0.264,
1311
+ "step": 19500
1312
+ },
1313
+ {
1314
+ "epoch": 0.47,
1315
+ "learning_rate": 0.00014199846405085913,
1316
+ "loss": 0.2647,
1317
+ "step": 19600
1318
+ },
1319
+ {
1320
+ "epoch": 0.47,
1321
+ "learning_rate": 0.00014189065273539564,
1322
+ "loss": 0.2635,
1323
+ "step": 19700
1324
+ },
1325
+ {
1326
+ "epoch": 0.47,
1327
+ "learning_rate": 0.00014178216459366958,
1328
+ "loss": 0.2623,
1329
+ "step": 19800
1330
+ },
1331
+ {
1332
+ "epoch": 0.47,
1333
+ "learning_rate": 0.00014167300081208988,
1334
+ "loss": 0.2627,
1335
+ "step": 19900
1336
+ },
1337
+ {
1338
+ "epoch": 0.48,
1339
+ "learning_rate": 0.00014156316258445421,
1340
+ "loss": 0.2932,
1341
+ "step": 20000
1342
+ },
1343
+ {
1344
+ "epoch": 0.48,
1345
+ "eval_runtime": 46.169,
1346
+ "eval_samples_per_second": 233.923,
1347
+ "eval_steps_per_second": 7.321,
1348
+ "step": 20000
1349
+ },
1350
+ {
1351
+ "epoch": 0.48,
1352
+ "learning_rate": 0.00014145265111193583,
1353
+ "loss": 0.2645,
1354
+ "step": 20100
1355
+ },
1356
+ {
1357
+ "epoch": 0.48,
1358
+ "learning_rate": 0.00014134146760307043,
1359
+ "loss": 0.2625,
1360
+ "step": 20200
1361
+ },
1362
+ {
1363
+ "epoch": 0.48,
1364
+ "learning_rate": 0.00014122961327374313,
1365
+ "loss": 0.2615,
1366
+ "step": 20300
1367
+ },
1368
+ {
1369
+ "epoch": 0.49,
1370
+ "learning_rate": 0.0001411170893471749,
1371
+ "loss": 0.2605,
1372
+ "step": 20400
1373
+ },
1374
+ {
1375
+ "epoch": 0.49,
1376
+ "learning_rate": 0.00014100389705390938,
1377
+ "loss": 0.26,
1378
+ "step": 20500
1379
+ },
1380
+ {
1381
+ "epoch": 0.49,
1382
+ "learning_rate": 0.0001408900376317994,
1383
+ "loss": 0.2583,
1384
+ "step": 20600
1385
+ },
1386
+ {
1387
+ "epoch": 0.49,
1388
+ "learning_rate": 0.0001407755123259933,
1389
+ "loss": 0.258,
1390
+ "step": 20700
1391
+ },
1392
+ {
1393
+ "epoch": 0.5,
1394
+ "learning_rate": 0.00014066032238892152,
1395
+ "loss": 0.2569,
1396
+ "step": 20800
1397
+ },
1398
+ {
1399
+ "epoch": 0.5,
1400
+ "learning_rate": 0.00014054446908028272,
1401
+ "loss": 0.2568,
1402
+ "step": 20900
1403
+ },
1404
+ {
1405
+ "epoch": 0.5,
1406
+ "learning_rate": 0.00014042795366703018,
1407
+ "loss": 0.2563,
1408
+ "step": 21000
1409
+ },
1410
+ {
1411
+ "epoch": 0.5,
1412
+ "eval_runtime": 46.2726,
1413
+ "eval_samples_per_second": 233.4,
1414
+ "eval_steps_per_second": 7.305,
1415
+ "step": 21000
1416
+ },
1417
+ {
1418
+ "epoch": 0.5,
1419
+ "learning_rate": 0.0001403107774233577,
1420
+ "loss": 0.256,
1421
+ "step": 21100
1422
+ },
1423
+ {
1424
+ "epoch": 0.5,
1425
+ "learning_rate": 0.00014019294163068597,
1426
+ "loss": 0.2548,
1427
+ "step": 21200
1428
+ },
1429
+ {
1430
+ "epoch": 0.51,
1431
+ "learning_rate": 0.00014007444757764835,
1432
+ "loss": 0.2543,
1433
+ "step": 21300
1434
+ },
1435
+ {
1436
+ "epoch": 0.51,
1437
+ "learning_rate": 0.0001399552965600768,
1438
+ "loss": 0.2537,
1439
+ "step": 21400
1440
+ },
1441
+ {
1442
+ "epoch": 0.51,
1443
+ "learning_rate": 0.0001398354898809877,
1444
+ "loss": 0.2531,
1445
+ "step": 21500
1446
+ },
1447
+ {
1448
+ "epoch": 0.51,
1449
+ "learning_rate": 0.0001397150288505678,
1450
+ "loss": 0.2531,
1451
+ "step": 21600
1452
+ },
1453
+ {
1454
+ "epoch": 0.52,
1455
+ "learning_rate": 0.00013959391478615959,
1456
+ "loss": 0.2526,
1457
+ "step": 21700
1458
+ },
1459
+ {
1460
+ "epoch": 0.52,
1461
+ "learning_rate": 0.00013947214901224706,
1462
+ "loss": 0.2522,
1463
+ "step": 21800
1464
+ },
1465
+ {
1466
+ "epoch": 0.52,
1467
+ "learning_rate": 0.0001393497328604412,
1468
+ "loss": 0.2515,
1469
+ "step": 21900
1470
+ },
1471
+ {
1472
+ "epoch": 0.52,
1473
+ "learning_rate": 0.00013922666766946545,
1474
+ "loss": 0.2513,
1475
+ "step": 22000
1476
+ },
1477
+ {
1478
+ "epoch": 0.52,
1479
+ "eval_runtime": 46.224,
1480
+ "eval_samples_per_second": 233.645,
1481
+ "eval_steps_per_second": 7.312,
1482
+ "step": 22000
1483
+ },
1484
+ {
1485
+ "epoch": 0.53,
1486
+ "learning_rate": 0.00013910295478514106,
1487
+ "loss": 0.2504,
1488
+ "step": 22100
1489
+ },
1490
+ {
1491
+ "epoch": 0.53,
1492
+ "learning_rate": 0.0001389785955603722,
1493
+ "loss": 0.2503,
1494
+ "step": 22200
1495
+ },
1496
+ {
1497
+ "epoch": 0.53,
1498
+ "learning_rate": 0.00013885359135513154,
1499
+ "loss": 0.2501,
1500
+ "step": 22300
1501
+ },
1502
+ {
1503
+ "epoch": 0.53,
1504
+ "learning_rate": 0.000138727943536445,
1505
+ "loss": 0.2488,
1506
+ "step": 22400
1507
+ },
1508
+ {
1509
+ "epoch": 0.54,
1510
+ "learning_rate": 0.00013860165347837698,
1511
+ "loss": 0.2492,
1512
+ "step": 22500
1513
+ },
1514
+ {
1515
+ "epoch": 0.54,
1516
+ "learning_rate": 0.00013847472256201535,
1517
+ "loss": 0.2483,
1518
+ "step": 22600
1519
+ },
1520
+ {
1521
+ "epoch": 0.54,
1522
+ "learning_rate": 0.00013834715217545625,
1523
+ "loss": 0.248,
1524
+ "step": 22700
1525
+ },
1526
+ {
1527
+ "epoch": 0.54,
1528
+ "learning_rate": 0.000138218943713789,
1529
+ "loss": 0.2479,
1530
+ "step": 22800
1531
+ },
1532
+ {
1533
+ "epoch": 0.55,
1534
+ "learning_rate": 0.0001380900985790808,
1535
+ "loss": 0.2485,
1536
+ "step": 22900
1537
+ },
1538
+ {
1539
+ "epoch": 0.55,
1540
+ "learning_rate": 0.00013796061818036138,
1541
+ "loss": 0.2467,
1542
+ "step": 23000
1543
+ },
1544
+ {
1545
+ "epoch": 0.55,
1546
+ "eval_runtime": 46.1546,
1547
+ "eval_samples_per_second": 233.996,
1548
+ "eval_steps_per_second": 7.323,
1549
+ "step": 23000
1550
+ },
1551
+ {
1552
+ "epoch": 0.55,
1553
+ "learning_rate": 0.00013783050393360768,
1554
+ "loss": 0.2468,
1555
+ "step": 23100
1556
+ },
1557
+ {
1558
+ "epoch": 0.55,
1559
+ "learning_rate": 0.0001376997572617282,
1560
+ "loss": 0.2463,
1561
+ "step": 23200
1562
+ },
1563
+ {
1564
+ "epoch": 0.55,
1565
+ "learning_rate": 0.00013756837959454766,
1566
+ "loss": 0.2456,
1567
+ "step": 23300
1568
+ },
1569
+ {
1570
+ "epoch": 0.56,
1571
+ "learning_rate": 0.0001374363723687911,
1572
+ "loss": 0.2459,
1573
+ "step": 23400
1574
+ },
1575
+ {
1576
+ "epoch": 0.56,
1577
+ "learning_rate": 0.00013730373702806846,
1578
+ "loss": 0.2447,
1579
+ "step": 23500
1580
+ },
1581
+ {
1582
+ "epoch": 0.56,
1583
+ "learning_rate": 0.00013717047502285855,
1584
+ "loss": 0.245,
1585
+ "step": 23600
1586
+ },
1587
+ {
1588
+ "epoch": 0.56,
1589
+ "learning_rate": 0.0001370365878104933,
1590
+ "loss": 0.2446,
1591
+ "step": 23700
1592
+ },
1593
+ {
1594
+ "epoch": 0.57,
1595
+ "learning_rate": 0.00013690207685514185,
1596
+ "loss": 0.2442,
1597
+ "step": 23800
1598
+ },
1599
+ {
1600
+ "epoch": 0.57,
1601
+ "learning_rate": 0.0001367669436277944,
1602
+ "loss": 0.2439,
1603
+ "step": 23900
1604
+ },
1605
+ {
1606
+ "epoch": 0.57,
1607
+ "learning_rate": 0.0001366311896062463,
1608
+ "loss": 0.2438,
1609
+ "step": 24000
1610
+ },
1611
+ {
1612
+ "epoch": 0.57,
1613
+ "eval_runtime": 46.5558,
1614
+ "eval_samples_per_second": 231.98,
1615
+ "eval_steps_per_second": 7.26,
1616
+ "step": 24000
1617
+ },
1618
+ {
1619
+ "epoch": 0.57,
1620
+ "learning_rate": 0.00013649481627508181,
1621
+ "loss": 0.2436,
1622
+ "step": 24100
1623
+ },
1624
+ {
1625
+ "epoch": 0.58,
1626
+ "learning_rate": 0.0001363578251256578,
1627
+ "loss": 0.2429,
1628
+ "step": 24200
1629
+ },
1630
+ {
1631
+ "epoch": 0.58,
1632
+ "learning_rate": 0.00013622021765608754,
1633
+ "loss": 0.2424,
1634
+ "step": 24300
1635
+ },
1636
+ {
1637
+ "epoch": 0.58,
1638
+ "learning_rate": 0.00013608199537122425,
1639
+ "loss": 0.242,
1640
+ "step": 24400
1641
+ },
1642
+ {
1643
+ "epoch": 0.58,
1644
+ "learning_rate": 0.0001359431597826447,
1645
+ "loss": 0.2422,
1646
+ "step": 24500
1647
+ },
1648
+ {
1649
+ "epoch": 0.59,
1650
+ "learning_rate": 0.0001358037124086327,
1651
+ "loss": 0.2418,
1652
+ "step": 24600
1653
+ },
1654
+ {
1655
+ "epoch": 0.59,
1656
+ "learning_rate": 0.00013566365477416233,
1657
+ "loss": 0.2407,
1658
+ "step": 24700
1659
+ },
1660
+ {
1661
+ "epoch": 0.59,
1662
+ "learning_rate": 0.00013552298841088144,
1663
+ "loss": 0.2416,
1664
+ "step": 24800
1665
+ },
1666
+ {
1667
+ "epoch": 0.59,
1668
+ "learning_rate": 0.00013538171485709486,
1669
+ "loss": 0.2411,
1670
+ "step": 24900
1671
+ },
1672
+ {
1673
+ "epoch": 0.6,
1674
+ "learning_rate": 0.00013523983565774753,
1675
+ "loss": 0.2401,
1676
+ "step": 25000
1677
+ },
1678
+ {
1679
+ "epoch": 0.6,
1680
+ "eval_runtime": 46.0773,
1681
+ "eval_samples_per_second": 234.389,
1682
+ "eval_steps_per_second": 7.336,
1683
+ "step": 25000
1684
+ },
1685
+ {
1686
+ "epoch": 0.6,
1687
+ "learning_rate": 0.00013509735236440766,
1688
+ "loss": 0.2401,
1689
+ "step": 25100
1690
+ },
1691
+ {
1692
+ "epoch": 0.6,
1693
+ "learning_rate": 0.00013495426653524972,
1694
+ "loss": 0.2402,
1695
+ "step": 25200
1696
+ },
1697
+ {
1698
+ "epoch": 0.6,
1699
+ "learning_rate": 0.00013481057973503742,
1700
+ "loss": 0.24,
1701
+ "step": 25300
1702
+ },
1703
+ {
1704
+ "epoch": 0.6,
1705
+ "learning_rate": 0.00013466629353510651,
1706
+ "loss": 0.239,
1707
+ "step": 25400
1708
+ },
1709
+ {
1710
+ "epoch": 0.61,
1711
+ "learning_rate": 0.00013452140951334787,
1712
+ "loss": 0.239,
1713
+ "step": 25500
1714
+ },
1715
+ {
1716
+ "epoch": 0.61,
1717
+ "learning_rate": 0.00013437592925418985,
1718
+ "loss": 0.2388,
1719
+ "step": 25600
1720
+ },
1721
+ {
1722
+ "epoch": 0.61,
1723
+ "learning_rate": 0.00013422985434858133,
1724
+ "loss": 0.238,
1725
+ "step": 25700
1726
+ },
1727
+ {
1728
+ "epoch": 0.61,
1729
+ "learning_rate": 0.00013408318639397405,
1730
+ "loss": 0.2387,
1731
+ "step": 25800
1732
+ },
1733
+ {
1734
+ "epoch": 0.62,
1735
+ "learning_rate": 0.00013393592699430525,
1736
+ "loss": 0.2372,
1737
+ "step": 25900
1738
+ },
1739
+ {
1740
+ "epoch": 0.62,
1741
+ "learning_rate": 0.00013378807775998012,
1742
+ "loss": 0.2377,
1743
+ "step": 26000
1744
+ },
1745
+ {
1746
+ "epoch": 0.62,
1747
+ "eval_runtime": 46.2501,
1748
+ "eval_samples_per_second": 233.513,
1749
+ "eval_steps_per_second": 7.308,
1750
+ "step": 26000
1751
+ },
1752
+ {
1753
+ "epoch": 0.62,
1754
+ "learning_rate": 0.00013363964030785422,
1755
+ "loss": 0.2373,
1756
+ "step": 26100
1757
+ },
1758
+ {
1759
+ "epoch": 0.62,
1760
+ "learning_rate": 0.00013349061626121578,
1761
+ "loss": 0.238,
1762
+ "step": 26200
1763
+ },
1764
+ {
1765
+ "epoch": 0.63,
1766
+ "learning_rate": 0.00013334100724976783,
1767
+ "loss": 0.2367,
1768
+ "step": 26300
1769
+ },
1770
+ {
1771
+ "epoch": 0.63,
1772
+ "learning_rate": 0.0001331908149096106,
1773
+ "loss": 0.2367,
1774
+ "step": 26400
1775
+ },
1776
+ {
1777
+ "epoch": 0.63,
1778
+ "learning_rate": 0.00013304004088322342,
1779
+ "loss": 0.2356,
1780
+ "step": 26500
1781
+ },
1782
+ {
1783
+ "epoch": 0.63,
1784
+ "learning_rate": 0.00013288868681944692,
1785
+ "loss": 0.2365,
1786
+ "step": 26600
1787
+ },
1788
+ {
1789
+ "epoch": 0.64,
1790
+ "learning_rate": 0.00013273675437346487,
1791
+ "loss": 0.236,
1792
+ "step": 26700
1793
+ },
1794
+ {
1795
+ "epoch": 0.64,
1796
+ "learning_rate": 0.00013258424520678618,
1797
+ "loss": 0.2356,
1798
+ "step": 26800
1799
+ },
1800
+ {
1801
+ "epoch": 0.64,
1802
+ "learning_rate": 0.00013243116098722663,
1803
+ "loss": 0.2363,
1804
+ "step": 26900
1805
+ },
1806
+ {
1807
+ "epoch": 0.64,
1808
+ "learning_rate": 0.00013227750338889077,
1809
+ "loss": 0.2345,
1810
+ "step": 27000
1811
+ },
1812
+ {
1813
+ "epoch": 0.64,
1814
+ "eval_runtime": 46.2738,
1815
+ "eval_samples_per_second": 233.394,
1816
+ "eval_steps_per_second": 7.304,
1817
+ "step": 27000
1818
+ },
1819
+ {
1820
+ "epoch": 0.65,
1821
+ "learning_rate": 0.00013212327409215343,
1822
+ "loss": 0.2351,
1823
+ "step": 27100
1824
+ },
1825
+ {
1826
+ "epoch": 0.65,
1827
+ "learning_rate": 0.0001319684747836415,
1828
+ "loss": 0.2351,
1829
+ "step": 27200
1830
+ },
1831
+ {
1832
+ "epoch": 0.65,
1833
+ "learning_rate": 0.0001318131071562154,
1834
+ "loss": 0.2342,
1835
+ "step": 27300
1836
+ },
1837
+ {
1838
+ "epoch": 0.65,
1839
+ "learning_rate": 0.00013165717290895067,
1840
+ "loss": 0.2338,
1841
+ "step": 27400
1842
+ },
1843
+ {
1844
+ "epoch": 0.65,
1845
+ "learning_rate": 0.0001315006737471192,
1846
+ "loss": 0.234,
1847
+ "step": 27500
1848
+ },
1849
+ {
1850
+ "epoch": 0.66,
1851
+ "learning_rate": 0.0001313436113821708,
1852
+ "loss": 0.233,
1853
+ "step": 27600
1854
+ },
1855
+ {
1856
+ "epoch": 0.66,
1857
+ "learning_rate": 0.00013118598753171425,
1858
+ "loss": 0.2331,
1859
+ "step": 27700
1860
+ },
1861
+ {
1862
+ "epoch": 0.66,
1863
+ "learning_rate": 0.0001310278039194988,
1864
+ "loss": 0.2329,
1865
+ "step": 27800
1866
+ },
1867
+ {
1868
+ "epoch": 0.66,
1869
+ "learning_rate": 0.00013086906227539506,
1870
+ "loss": 0.2332,
1871
+ "step": 27900
1872
+ },
1873
+ {
1874
+ "epoch": 0.67,
1875
+ "learning_rate": 0.00013070976433537623,
1876
+ "loss": 0.2338,
1877
+ "step": 28000
1878
+ },
1879
+ {
1880
+ "epoch": 0.67,
1881
+ "eval_runtime": 46.2625,
1882
+ "eval_samples_per_second": 233.45,
1883
+ "eval_steps_per_second": 7.306,
1884
+ "step": 28000
1885
+ },
1886
+ {
1887
+ "epoch": 0.67,
1888
+ "learning_rate": 0.00013054991184149905,
1889
+ "loss": 0.2325,
1890
+ "step": 28100
1891
+ },
1892
+ {
1893
+ "epoch": 0.67,
1894
+ "learning_rate": 0.00013038950654188476,
1895
+ "loss": 0.2312,
1896
+ "step": 28200
1897
+ },
1898
+ {
1899
+ "epoch": 0.67,
1900
+ "learning_rate": 0.00013022855019070005,
1901
+ "loss": 0.2323,
1902
+ "step": 28300
1903
+ },
1904
+ {
1905
+ "epoch": 0.68,
1906
+ "learning_rate": 0.0001300670445481378,
1907
+ "loss": 0.2319,
1908
+ "step": 28400
1909
+ },
1910
+ {
1911
+ "epoch": 0.68,
1912
+ "learning_rate": 0.0001299049913803978,
1913
+ "loss": 0.2324,
1914
+ "step": 28500
1915
+ },
1916
+ {
1917
+ "epoch": 0.68,
1918
+ "learning_rate": 0.00012974239245966754,
1919
+ "loss": 0.2313,
1920
+ "step": 28600
1921
+ },
1922
+ {
1923
+ "epoch": 0.68,
1924
+ "learning_rate": 0.0001295792495641028,
1925
+ "loss": 0.2318,
1926
+ "step": 28700
1927
+ },
1928
+ {
1929
+ "epoch": 0.69,
1930
+ "learning_rate": 0.00012941556447780813,
1931
+ "loss": 0.2309,
1932
+ "step": 28800
1933
+ },
1934
+ {
1935
+ "epoch": 0.69,
1936
+ "learning_rate": 0.0001292513389908174,
1937
+ "loss": 0.231,
1938
+ "step": 28900
1939
+ },
1940
+ {
1941
+ "epoch": 0.69,
1942
+ "learning_rate": 0.0001290865748990742,
1943
+ "loss": 0.2298,
1944
+ "step": 29000
1945
+ },
1946
+ {
1947
+ "epoch": 0.69,
1948
+ "eval_runtime": 46.1555,
1949
+ "eval_samples_per_second": 233.992,
1950
+ "eval_steps_per_second": 7.323,
1951
+ "step": 29000
1952
+ },
1953
+ {
1954
+ "epoch": 0.69,
1955
+ "learning_rate": 0.00012892127400441228,
1956
+ "loss": 0.2302,
1957
+ "step": 29100
1958
+ },
1959
+ {
1960
+ "epoch": 0.7,
1961
+ "learning_rate": 0.00012875543811453576,
1962
+ "loss": 0.2305,
1963
+ "step": 29200
1964
+ },
1965
+ {
1966
+ "epoch": 0.7,
1967
+ "learning_rate": 0.0001285890690429993,
1968
+ "loss": 0.2293,
1969
+ "step": 29300
1970
+ },
1971
+ {
1972
+ "epoch": 0.7,
1973
+ "learning_rate": 0.00012842216860918846,
1974
+ "loss": 0.2298,
1975
+ "step": 29400
1976
+ },
1977
+ {
1978
+ "epoch": 0.7,
1979
+ "learning_rate": 0.0001282547386382996,
1980
+ "loss": 0.2296,
1981
+ "step": 29500
1982
+ },
1983
+ {
1984
+ "epoch": 0.71,
1985
+ "learning_rate": 0.0001280867809613201,
1986
+ "loss": 0.2291,
1987
+ "step": 29600
1988
+ },
1989
+ {
1990
+ "epoch": 0.71,
1991
+ "learning_rate": 0.0001279182974150082,
1992
+ "loss": 0.2279,
1993
+ "step": 29700
1994
+ },
1995
+ {
1996
+ "epoch": 0.71,
1997
+ "learning_rate": 0.00012774928984187297,
1998
+ "loss": 0.2278,
1999
+ "step": 29800
2000
+ },
2001
+ {
2002
+ "epoch": 0.71,
2003
+ "learning_rate": 0.00012757976009015413,
2004
+ "loss": 0.228,
2005
+ "step": 29900
2006
+ },
2007
+ {
2008
+ "epoch": 0.71,
2009
+ "learning_rate": 0.0001274097100138019,
2010
+ "loss": 0.2282,
2011
+ "step": 30000
2012
+ },
2013
+ {
2014
+ "epoch": 0.71,
2015
+ "eval_runtime": 46.6895,
2016
+ "eval_samples_per_second": 231.315,
2017
+ "eval_steps_per_second": 7.239,
2018
+ "step": 30000
2019
+ },
2020
+ {
2021
+ "epoch": 0.72,
2022
+ "learning_rate": 0.00012723914147245663,
2023
+ "loss": 0.2276,
2024
+ "step": 30100
2025
+ },
2026
+ {
2027
+ "epoch": 0.72,
2028
+ "learning_rate": 0.00012706805633142863,
2029
+ "loss": 0.2276,
2030
+ "step": 30200
2031
+ },
2032
+ {
2033
+ "epoch": 0.72,
2034
+ "learning_rate": 0.00012689645646167755,
2035
+ "loss": 0.2281,
2036
+ "step": 30300
2037
+ },
2038
+ {
2039
+ "epoch": 0.72,
2040
+ "learning_rate": 0.00012672434373979207,
2041
+ "loss": 0.2265,
2042
+ "step": 30400
2043
+ },
2044
+ {
2045
+ "epoch": 0.73,
2046
+ "learning_rate": 0.00012655172004796936,
2047
+ "loss": 0.2286,
2048
+ "step": 30500
2049
+ },
2050
+ {
2051
+ "epoch": 0.73,
2052
+ "learning_rate": 0.00012637858727399448,
2053
+ "loss": 0.227,
2054
+ "step": 30600
2055
+ },
2056
+ {
2057
+ "epoch": 0.73,
2058
+ "learning_rate": 0.00012620494731121966,
2059
+ "loss": 0.2267,
2060
+ "step": 30700
2061
+ },
2062
+ {
2063
+ "epoch": 0.73,
2064
+ "learning_rate": 0.00012603080205854372,
2065
+ "loss": 0.2266,
2066
+ "step": 30800
2067
+ },
2068
+ {
2069
+ "epoch": 0.74,
2070
+ "learning_rate": 0.00012585615342039126,
2071
+ "loss": 0.2258,
2072
+ "step": 30900
2073
+ },
2074
+ {
2075
+ "epoch": 0.74,
2076
+ "learning_rate": 0.0001256810033066918,
2077
+ "loss": 0.226,
2078
+ "step": 31000
2079
+ },
2080
+ {
2081
+ "epoch": 0.74,
2082
+ "eval_runtime": 47.0689,
2083
+ "eval_samples_per_second": 229.451,
2084
+ "eval_steps_per_second": 7.181,
2085
+ "step": 31000
2086
+ },
2087
+ {
2088
+ "epoch": 0.74,
2089
+ "learning_rate": 0.0001255053536328589,
2090
+ "loss": 0.2257,
2091
+ "step": 31100
2092
+ },
2093
+ {
2094
+ "epoch": 0.74,
2095
+ "learning_rate": 0.0001253292063197693,
2096
+ "loss": 0.2256,
2097
+ "step": 31200
2098
+ },
2099
+ {
2100
+ "epoch": 0.75,
2101
+ "learning_rate": 0.0001251525632937418,
2102
+ "loss": 0.2257,
2103
+ "step": 31300
2104
+ },
2105
+ {
2106
+ "epoch": 0.75,
2107
+ "learning_rate": 0.00012497542648651615,
2108
+ "loss": 0.2248,
2109
+ "step": 31400
2110
+ },
2111
+ {
2112
+ "epoch": 0.75,
2113
+ "learning_rate": 0.00012479779783523216,
2114
+ "loss": 0.225,
2115
+ "step": 31500
2116
+ },
2117
+ {
2118
+ "epoch": 0.75,
2119
+ "learning_rate": 0.00012461967928240828,
2120
+ "loss": 0.2246,
2121
+ "step": 31600
2122
+ },
2123
+ {
2124
+ "epoch": 0.76,
2125
+ "learning_rate": 0.00012444107277592047,
2126
+ "loss": 0.2247,
2127
+ "step": 31700
2128
+ },
2129
+ {
2130
+ "epoch": 0.76,
2131
+ "learning_rate": 0.0001242619802689809,
2132
+ "loss": 0.2246,
2133
+ "step": 31800
2134
+ },
2135
+ {
2136
+ "epoch": 0.76,
2137
+ "learning_rate": 0.00012408240372011647,
2138
+ "loss": 0.2238,
2139
+ "step": 31900
2140
+ },
2141
+ {
2142
+ "epoch": 0.76,
2143
+ "learning_rate": 0.0001239023450931476,
2144
+ "loss": 0.2243,
2145
+ "step": 32000
2146
+ },
2147
+ {
2148
+ "epoch": 0.76,
2149
+ "eval_runtime": 47.1954,
2150
+ "eval_samples_per_second": 228.836,
2151
+ "eval_steps_per_second": 7.162,
2152
+ "step": 32000
2153
+ },
2154
+ {
2155
+ "epoch": 0.76,
2156
+ "learning_rate": 0.00012372180635716656,
2157
+ "loss": 0.2235,
2158
+ "step": 32100
2159
+ },
2160
+ {
2161
+ "epoch": 0.77,
2162
+ "learning_rate": 0.00012354078948651604,
2163
+ "loss": 0.2239,
2164
+ "step": 32200
2165
+ },
2166
+ {
2167
+ "epoch": 0.77,
2168
+ "learning_rate": 0.00012335929646076758,
2169
+ "loss": 0.2231,
2170
+ "step": 32300
2171
+ },
2172
+ {
2173
+ "epoch": 0.77,
2174
+ "learning_rate": 0.00012317732926469976,
2175
+ "loss": 0.2225,
2176
+ "step": 32400
2177
+ },
2178
+ {
2179
+ "epoch": 0.77,
2180
+ "learning_rate": 0.00012299488988827675,
2181
+ "loss": 0.2233,
2182
+ "step": 32500
2183
+ },
2184
+ {
2185
+ "epoch": 0.78,
2186
+ "learning_rate": 0.0001228119803266263,
2187
+ "loss": 0.223,
2188
+ "step": 32600
2189
+ },
2190
+ {
2191
+ "epoch": 0.78,
2192
+ "learning_rate": 0.0001226286025800181,
2193
+ "loss": 0.2229,
2194
+ "step": 32700
2195
+ },
2196
+ {
2197
+ "epoch": 0.78,
2198
+ "learning_rate": 0.00012244475865384177,
2199
+ "loss": 0.222,
2200
+ "step": 32800
2201
+ },
2202
+ {
2203
+ "epoch": 0.78,
2204
+ "learning_rate": 0.00012226045055858505,
2205
+ "loss": 0.2217,
2206
+ "step": 32900
2207
+ },
2208
+ {
2209
+ "epoch": 0.79,
2210
+ "learning_rate": 0.00012207568030981174,
2211
+ "loss": 0.2222,
2212
+ "step": 33000
2213
+ },
2214
+ {
2215
+ "epoch": 0.79,
2216
+ "eval_runtime": 47.0101,
2217
+ "eval_samples_per_second": 229.738,
2218
+ "eval_steps_per_second": 7.19,
2219
+ "step": 33000
2220
+ },
2221
+ {
2222
+ "epoch": 0.79,
2223
+ "learning_rate": 0.00012189044992813972,
2224
+ "loss": 0.2213,
2225
+ "step": 33100
2226
+ },
2227
+ {
2228
+ "epoch": 0.79,
2229
+ "learning_rate": 0.0001217047614392187,
2230
+ "loss": 0.2206,
2231
+ "step": 33200
2232
+ },
2233
+ {
2234
+ "epoch": 0.79,
2235
+ "learning_rate": 0.00012151861687370828,
2236
+ "loss": 0.2221,
2237
+ "step": 33300
2238
+ },
2239
+ {
2240
+ "epoch": 0.8,
2241
+ "learning_rate": 0.00012133201826725558,
2242
+ "loss": 0.2209,
2243
+ "step": 33400
2244
+ },
2245
+ {
2246
+ "epoch": 0.8,
2247
+ "learning_rate": 0.0001211449676604731,
2248
+ "loss": 0.2211,
2249
+ "step": 33500
2250
+ },
2251
+ {
2252
+ "epoch": 0.8,
2253
+ "learning_rate": 0.00012095746709891632,
2254
+ "loss": 0.2205,
2255
+ "step": 33600
2256
+ },
2257
+ {
2258
+ "epoch": 0.8,
2259
+ "learning_rate": 0.00012076951863306127,
2260
+ "loss": 0.2203,
2261
+ "step": 33700
2262
+ },
2263
+ {
2264
+ "epoch": 0.81,
2265
+ "learning_rate": 0.0001205811243182823,
2266
+ "loss": 0.22,
2267
+ "step": 33800
2268
+ },
2269
+ {
2270
+ "epoch": 0.81,
2271
+ "learning_rate": 0.00012039228621482949,
2272
+ "loss": 0.2192,
2273
+ "step": 33900
2274
+ },
2275
+ {
2276
+ "epoch": 0.81,
2277
+ "learning_rate": 0.00012020300638780604,
2278
+ "loss": 0.219,
2279
+ "step": 34000
2280
+ },
2281
+ {
2282
+ "epoch": 0.81,
2283
+ "eval_runtime": 47.0946,
2284
+ "eval_samples_per_second": 229.325,
2285
+ "eval_steps_per_second": 7.177,
2286
+ "step": 34000
2287
+ },
2288
+ {
2289
+ "epoch": 0.81,
2290
+ "learning_rate": 0.00012001328690714582,
2291
+ "loss": 0.2194,
2292
+ "step": 34100
2293
+ },
2294
+ {
2295
+ "epoch": 0.81,
2296
+ "learning_rate": 0.00011982312984759068,
2297
+ "loss": 0.2194,
2298
+ "step": 34200
2299
+ },
2300
+ {
2301
+ "epoch": 0.82,
2302
+ "learning_rate": 0.00011963253728866778,
2303
+ "loss": 0.2189,
2304
+ "step": 34300
2305
+ },
2306
+ {
2307
+ "epoch": 0.82,
2308
+ "learning_rate": 0.00011944151131466675,
2309
+ "loss": 0.219,
2310
+ "step": 34400
2311
+ },
2312
+ {
2313
+ "epoch": 0.82,
2314
+ "learning_rate": 0.00011925005401461709,
2315
+ "loss": 0.2184,
2316
+ "step": 34500
2317
+ },
2318
+ {
2319
+ "epoch": 0.82,
2320
+ "learning_rate": 0.00011905816748226513,
2321
+ "loss": 0.2182,
2322
+ "step": 34600
2323
+ },
2324
+ {
2325
+ "epoch": 0.83,
2326
+ "learning_rate": 0.00011886585381605125,
2327
+ "loss": 0.2188,
2328
+ "step": 34700
2329
+ },
2330
+ {
2331
+ "epoch": 0.83,
2332
+ "learning_rate": 0.00011867311511908693,
2333
+ "loss": 0.2179,
2334
+ "step": 34800
2335
+ },
2336
+ {
2337
+ "epoch": 0.83,
2338
+ "learning_rate": 0.00011847995349913162,
2339
+ "loss": 0.218,
2340
+ "step": 34900
2341
+ },
2342
+ {
2343
+ "epoch": 0.83,
2344
+ "learning_rate": 0.00011828637106856989,
2345
+ "loss": 0.2173,
2346
+ "step": 35000
2347
+ },
2348
+ {
2349
+ "epoch": 0.83,
2350
+ "eval_runtime": 46.7598,
2351
+ "eval_samples_per_second": 230.968,
2352
+ "eval_steps_per_second": 7.228,
2353
+ "step": 35000
2354
+ },
2355
+ {
2356
+ "epoch": 0.84,
2357
+ "learning_rate": 0.00011809236994438816,
2358
+ "loss": 0.2171,
2359
+ "step": 35100
2360
+ },
2361
+ {
2362
+ "epoch": 0.84,
2363
+ "learning_rate": 0.00011789795224815164,
2364
+ "loss": 0.2175,
2365
+ "step": 35200
2366
+ },
2367
+ {
2368
+ "epoch": 0.84,
2369
+ "learning_rate": 0.00011770312010598116,
2370
+ "loss": 0.2167,
2371
+ "step": 35300
2372
+ },
2373
+ {
2374
+ "epoch": 0.84,
2375
+ "learning_rate": 0.00011750787564852973,
2376
+ "loss": 0.2167,
2377
+ "step": 35400
2378
+ },
2379
+ {
2380
+ "epoch": 0.85,
2381
+ "learning_rate": 0.00011731222101095955,
2382
+ "loss": 0.2171,
2383
+ "step": 35500
2384
+ },
2385
+ {
2386
+ "epoch": 0.85,
2387
+ "learning_rate": 0.00011711615833291833,
2388
+ "loss": 0.2161,
2389
+ "step": 35600
2390
+ },
2391
+ {
2392
+ "epoch": 0.85,
2393
+ "learning_rate": 0.0001169196897585161,
2394
+ "loss": 0.2168,
2395
+ "step": 35700
2396
+ },
2397
+ {
2398
+ "epoch": 0.85,
2399
+ "learning_rate": 0.00011672281743630175,
2400
+ "loss": 0.2162,
2401
+ "step": 35800
2402
+ },
2403
+ {
2404
+ "epoch": 0.86,
2405
+ "learning_rate": 0.0001165255435192394,
2406
+ "loss": 0.2152,
2407
+ "step": 35900
2408
+ },
2409
+ {
2410
+ "epoch": 0.86,
2411
+ "learning_rate": 0.00011632787016468506,
2412
+ "loss": 0.216,
2413
+ "step": 36000
2414
+ },
2415
+ {
2416
+ "epoch": 0.86,
2417
+ "eval_runtime": 47.0992,
2418
+ "eval_samples_per_second": 229.303,
2419
+ "eval_steps_per_second": 7.176,
2420
+ "step": 36000
2421
+ },
2422
+ {
2423
+ "epoch": 0.86,
2424
+ "learning_rate": 0.0001161297995343628,
2425
+ "loss": 0.2157,
2426
+ "step": 36100
2427
+ },
2428
+ {
2429
+ "epoch": 0.86,
2430
+ "learning_rate": 0.00011593133379434138,
2431
+ "loss": 0.215,
2432
+ "step": 36200
2433
+ },
2434
+ {
2435
+ "epoch": 0.86,
2436
+ "learning_rate": 0.00011573247511501028,
2437
+ "loss": 0.2154,
2438
+ "step": 36300
2439
+ },
2440
+ {
2441
+ "epoch": 0.87,
2442
+ "learning_rate": 0.00011553322567105619,
2443
+ "loss": 0.2155,
2444
+ "step": 36400
2445
+ },
2446
+ {
2447
+ "epoch": 0.87,
2448
+ "learning_rate": 0.00011533358764143905,
2449
+ "loss": 0.2149,
2450
+ "step": 36500
2451
+ },
2452
+ {
2453
+ "epoch": 0.87,
2454
+ "learning_rate": 0.00011513356320936841,
2455
+ "loss": 0.2144,
2456
+ "step": 36600
2457
+ },
2458
+ {
2459
+ "epoch": 0.87,
2460
+ "learning_rate": 0.00011493315456227943,
2461
+ "loss": 0.2147,
2462
+ "step": 36700
2463
+ },
2464
+ {
2465
+ "epoch": 0.88,
2466
+ "learning_rate": 0.00011473236389180894,
2467
+ "loss": 0.2145,
2468
+ "step": 36800
2469
+ },
2470
+ {
2471
+ "epoch": 0.88,
2472
+ "learning_rate": 0.00011453119339377154,
2473
+ "loss": 0.2146,
2474
+ "step": 36900
2475
+ },
2476
+ {
2477
+ "epoch": 0.88,
2478
+ "learning_rate": 0.00011432964526813558,
2479
+ "loss": 0.2145,
2480
+ "step": 37000
2481
+ },
2482
+ {
2483
+ "epoch": 0.88,
2484
+ "eval_runtime": 46.8321,
2485
+ "eval_samples_per_second": 230.611,
2486
+ "eval_steps_per_second": 7.217,
2487
+ "step": 37000
2488
+ },
2489
+ {
2490
+ "epoch": 0.88,
2491
+ "learning_rate": 0.00011412772171899904,
2492
+ "loss": 0.2132,
2493
+ "step": 37100
2494
+ },
2495
+ {
2496
+ "epoch": 0.89,
2497
+ "learning_rate": 0.00011392542495456556,
2498
+ "loss": 0.2133,
2499
+ "step": 37200
2500
+ },
2501
+ {
2502
+ "epoch": 0.89,
2503
+ "learning_rate": 0.00011372275718712006,
2504
+ "loss": 0.2125,
2505
+ "step": 37300
2506
+ },
2507
+ {
2508
+ "epoch": 0.89,
2509
+ "learning_rate": 0.00011351972063300484,
2510
+ "loss": 0.2135,
2511
+ "step": 37400
2512
+ },
2513
+ {
2514
+ "epoch": 0.89,
2515
+ "learning_rate": 0.00011331631751259515,
2516
+ "loss": 0.213,
2517
+ "step": 37500
2518
+ },
2519
+ {
2520
+ "epoch": 0.9,
2521
+ "learning_rate": 0.00011311255005027487,
2522
+ "loss": 0.2132,
2523
+ "step": 37600
2524
+ },
2525
+ {
2526
+ "epoch": 0.9,
2527
+ "learning_rate": 0.00011290842047441232,
2528
+ "loss": 0.2125,
2529
+ "step": 37700
2530
+ },
2531
+ {
2532
+ "epoch": 0.9,
2533
+ "learning_rate": 0.00011270393101733585,
2534
+ "loss": 0.2122,
2535
+ "step": 37800
2536
+ },
2537
+ {
2538
+ "epoch": 0.9,
2539
+ "learning_rate": 0.00011249908391530946,
2540
+ "loss": 0.2113,
2541
+ "step": 37900
2542
+ },
2543
+ {
2544
+ "epoch": 0.91,
2545
+ "learning_rate": 0.00011229388140850814,
2546
+ "loss": 0.2119,
2547
+ "step": 38000
2548
+ },
2549
+ {
2550
+ "epoch": 0.91,
2551
+ "eval_runtime": 46.8036,
2552
+ "eval_samples_per_second": 230.751,
2553
+ "eval_steps_per_second": 7.222,
2554
+ "step": 38000
2555
+ },
2556
+ {
2557
+ "epoch": 0.91,
2558
+ "learning_rate": 0.00011208832574099368,
2559
+ "loss": 0.2113,
2560
+ "step": 38100
2561
+ },
2562
+ {
2563
+ "epoch": 0.91,
2564
+ "learning_rate": 0.00011188241916068993,
2565
+ "loss": 0.2111,
2566
+ "step": 38200
2567
+ },
2568
+ {
2569
+ "epoch": 0.91,
2570
+ "learning_rate": 0.00011167616391935826,
2571
+ "loss": 0.2111,
2572
+ "step": 38300
2573
+ },
2574
+ {
2575
+ "epoch": 0.91,
2576
+ "learning_rate": 0.00011146956227257293,
2577
+ "loss": 0.2119,
2578
+ "step": 38400
2579
+ },
2580
+ {
2581
+ "epoch": 0.92,
2582
+ "learning_rate": 0.00011126261647969645,
2583
+ "loss": 0.2115,
2584
+ "step": 38500
2585
+ },
2586
+ {
2587
+ "epoch": 0.92,
2588
+ "learning_rate": 0.00011105532880385487,
2589
+ "loss": 0.2104,
2590
+ "step": 38600
2591
+ },
2592
+ {
2593
+ "epoch": 0.92,
2594
+ "learning_rate": 0.00011084770151191299,
2595
+ "loss": 0.2107,
2596
+ "step": 38700
2597
+ },
2598
+ {
2599
+ "epoch": 0.92,
2600
+ "learning_rate": 0.00011063973687444962,
2601
+ "loss": 0.2097,
2602
+ "step": 38800
2603
+ },
2604
+ {
2605
+ "epoch": 0.93,
2606
+ "learning_rate": 0.00011043143716573272,
2607
+ "loss": 0.2107,
2608
+ "step": 38900
2609
+ },
2610
+ {
2611
+ "epoch": 0.93,
2612
+ "learning_rate": 0.00011022280466369448,
2613
+ "loss": 0.2113,
2614
+ "step": 39000
2615
+ },
2616
+ {
2617
+ "epoch": 0.93,
2618
+ "eval_runtime": 47.0898,
2619
+ "eval_samples_per_second": 229.349,
2620
+ "eval_steps_per_second": 7.178,
2621
+ "step": 39000
2622
+ },
2623
+ {
2624
+ "epoch": 0.93,
2625
+ "learning_rate": 0.00011001384164990662,
2626
+ "loss": 0.2099,
2627
+ "step": 39100
2628
+ },
2629
+ {
2630
+ "epoch": 0.93,
2631
+ "learning_rate": 0.00010980455040955506,
2632
+ "loss": 0.21,
2633
+ "step": 39200
2634
+ },
2635
+ {
2636
+ "epoch": 0.94,
2637
+ "learning_rate": 0.00010959493323141538,
2638
+ "loss": 0.2091,
2639
+ "step": 39300
2640
+ },
2641
+ {
2642
+ "epoch": 0.94,
2643
+ "learning_rate": 0.00010938499240782739,
2644
+ "loss": 0.2098,
2645
+ "step": 39400
2646
+ },
2647
+ {
2648
+ "epoch": 0.94,
2649
+ "learning_rate": 0.00010917473023467032,
2650
+ "loss": 0.2096,
2651
+ "step": 39500
2652
+ },
2653
+ {
2654
+ "epoch": 0.94,
2655
+ "learning_rate": 0.00010896414901133761,
2656
+ "loss": 0.2085,
2657
+ "step": 39600
2658
+ },
2659
+ {
2660
+ "epoch": 0.95,
2661
+ "learning_rate": 0.00010875325104071177,
2662
+ "loss": 0.2093,
2663
+ "step": 39700
2664
+ },
2665
+ {
2666
+ "epoch": 0.95,
2667
+ "learning_rate": 0.00010854203862913927,
2668
+ "loss": 0.2084,
2669
+ "step": 39800
2670
+ },
2671
+ {
2672
+ "epoch": 0.95,
2673
+ "learning_rate": 0.00010833051408640509,
2674
+ "loss": 0.2083,
2675
+ "step": 39900
2676
+ },
2677
+ {
2678
+ "epoch": 0.95,
2679
+ "learning_rate": 0.00010811867972570786,
2680
+ "loss": 0.2084,
2681
+ "step": 40000
2682
+ },
2683
+ {
2684
+ "epoch": 0.95,
2685
+ "eval_runtime": 46.8854,
2686
+ "eval_samples_per_second": 230.349,
2687
+ "eval_steps_per_second": 7.209,
2688
+ "step": 40000
2689
+ },
2690
+ {
2691
+ "epoch": 0.96,
2692
+ "learning_rate": 0.00010790653786363416,
2693
+ "loss": 0.2082,
2694
+ "step": 40100
2695
+ },
2696
+ {
2697
+ "epoch": 0.96,
2698
+ "learning_rate": 0.00010769409082013337,
2699
+ "loss": 0.2081,
2700
+ "step": 40200
2701
+ },
2702
+ {
2703
+ "epoch": 0.96,
2704
+ "learning_rate": 0.00010748134091849238,
2705
+ "loss": 0.2077,
2706
+ "step": 40300
2707
+ },
2708
+ {
2709
+ "epoch": 0.96,
2710
+ "learning_rate": 0.00010726829048531,
2711
+ "loss": 0.2078,
2712
+ "step": 40400
2713
+ },
2714
+ {
2715
+ "epoch": 0.96,
2716
+ "learning_rate": 0.00010705494185047165,
2717
+ "loss": 0.2077,
2718
+ "step": 40500
2719
+ },
2720
+ {
2721
+ "epoch": 0.97,
2722
+ "learning_rate": 0.0001068412973471238,
2723
+ "loss": 0.2073,
2724
+ "step": 40600
2725
+ },
2726
+ {
2727
+ "epoch": 0.97,
2728
+ "learning_rate": 0.00010662735931164853,
2729
+ "loss": 0.2076,
2730
+ "step": 40700
2731
+ },
2732
+ {
2733
+ "epoch": 0.97,
2734
+ "learning_rate": 0.0001064131300836379,
2735
+ "loss": 0.2069,
2736
+ "step": 40800
2737
+ },
2738
+ {
2739
+ "epoch": 0.97,
2740
+ "learning_rate": 0.0001061986120058684,
2741
+ "loss": 0.2067,
2742
+ "step": 40900
2743
+ },
2744
+ {
2745
+ "epoch": 0.98,
2746
+ "learning_rate": 0.00010598380742427543,
2747
+ "loss": 0.206,
2748
+ "step": 41000
2749
+ },
2750
+ {
2751
+ "epoch": 0.98,
2752
+ "eval_runtime": 46.6481,
2753
+ "eval_samples_per_second": 231.521,
2754
+ "eval_steps_per_second": 7.246,
2755
+ "step": 41000
2756
+ },
2757
+ {
2758
+ "epoch": 0.98,
2759
+ "learning_rate": 0.00010576871868792746,
2760
+ "loss": 0.206,
2761
+ "step": 41100
2762
+ },
2763
+ {
2764
+ "epoch": 0.98,
2765
+ "learning_rate": 0.0001055533481490004,
2766
+ "loss": 0.2058,
2767
+ "step": 41200
2768
+ },
2769
+ {
2770
+ "epoch": 0.98,
2771
+ "learning_rate": 0.000105337698162752,
2772
+ "loss": 0.206,
2773
+ "step": 41300
2774
+ },
2775
+ {
2776
+ "epoch": 0.99,
2777
+ "learning_rate": 0.00010512177108749594,
2778
+ "loss": 0.2057,
2779
+ "step": 41400
2780
+ },
2781
+ {
2782
+ "epoch": 0.99,
2783
+ "learning_rate": 0.00010490556928457616,
2784
+ "loss": 0.2039,
2785
+ "step": 41500
2786
+ },
2787
+ {
2788
+ "epoch": 0.99,
2789
+ "learning_rate": 0.00010468909511834088,
2790
+ "loss": 0.205,
2791
+ "step": 41600
2792
+ },
2793
+ {
2794
+ "epoch": 0.99,
2795
+ "learning_rate": 0.00010447235095611692,
2796
+ "loss": 0.2045,
2797
+ "step": 41700
2798
+ },
2799
+ {
2800
+ "epoch": 1.0,
2801
+ "learning_rate": 0.00010425533916818376,
2802
+ "loss": 0.2047,
2803
+ "step": 41800
2804
+ },
2805
+ {
2806
+ "epoch": 1.0,
2807
+ "learning_rate": 0.00010403806212774747,
2808
+ "loss": 0.205,
2809
+ "step": 41900
2810
+ },
2811
+ {
2812
+ "epoch": 1.0,
2813
+ "learning_rate": 0.000103820522210915,
2814
+ "loss": 0.2042,
2815
+ "step": 42000
2816
+ },
2817
+ {
2818
+ "epoch": 1.0,
2819
+ "eval_runtime": 46.7967,
2820
+ "eval_samples_per_second": 230.786,
2821
+ "eval_steps_per_second": 7.223,
2822
+ "step": 42000
2823
+ },
2824
+ {
2825
+ "epoch": 1.0,
2826
+ "learning_rate": 0.00010360272179666802,
2827
+ "loss": 0.204,
2828
+ "step": 42100
2829
+ },
2830
+ {
2831
+ "epoch": 1.01,
2832
+ "learning_rate": 0.00010338466326683697,
2833
+ "loss": 0.2037,
2834
+ "step": 42200
2835
+ },
2836
+ {
2837
+ "epoch": 1.01,
2838
+ "learning_rate": 0.00010316634900607497,
2839
+ "loss": 0.2033,
2840
+ "step": 42300
2841
+ },
2842
+ {
2843
+ "epoch": 1.01,
2844
+ "learning_rate": 0.00010294778140183182,
2845
+ "loss": 0.2035,
2846
+ "step": 42400
2847
+ },
2848
+ {
2849
+ "epoch": 1.01,
2850
+ "learning_rate": 0.00010272896284432785,
2851
+ "loss": 0.2037,
2852
+ "step": 42500
2853
+ },
2854
+ {
2855
+ "epoch": 1.01,
2856
+ "learning_rate": 0.00010250989572652766,
2857
+ "loss": 0.2028,
2858
+ "step": 42600
2859
+ },
2860
+ {
2861
+ "epoch": 1.02,
2862
+ "learning_rate": 0.00010229058244411427,
2863
+ "loss": 0.2019,
2864
+ "step": 42700
2865
+ },
2866
+ {
2867
+ "epoch": 1.02,
2868
+ "learning_rate": 0.00010207102539546251,
2869
+ "loss": 0.2032,
2870
+ "step": 42800
2871
+ },
2872
+ {
2873
+ "epoch": 1.02,
2874
+ "learning_rate": 0.00010185122698161311,
2875
+ "loss": 0.2026,
2876
+ "step": 42900
2877
+ },
2878
+ {
2879
+ "epoch": 1.02,
2880
+ "learning_rate": 0.00010163118960624632,
2881
+ "loss": 0.2024,
2882
+ "step": 43000
2883
+ },
2884
+ {
2885
+ "epoch": 1.02,
2886
+ "eval_runtime": 46.9319,
2887
+ "eval_samples_per_second": 230.121,
2888
+ "eval_steps_per_second": 7.202,
2889
+ "step": 43000
2890
+ },
2891
+ {
2892
+ "epoch": 1.03,
2893
+ "learning_rate": 0.00010141091567565561,
2894
+ "loss": 0.2028,
2895
+ "step": 43100
2896
+ },
2897
+ {
2898
+ "epoch": 1.03,
2899
+ "learning_rate": 0.00010119040759872142,
2900
+ "loss": 0.2018,
2901
+ "step": 43200
2902
+ },
2903
+ {
2904
+ "epoch": 1.03,
2905
+ "learning_rate": 0.00010096966778688472,
2906
+ "loss": 0.2016,
2907
+ "step": 43300
2908
+ },
2909
+ {
2910
+ "epoch": 1.03,
2911
+ "learning_rate": 0.00010074869865412074,
2912
+ "loss": 0.2024,
2913
+ "step": 43400
2914
+ },
2915
+ {
2916
+ "epoch": 1.04,
2917
+ "learning_rate": 0.00010052750261691254,
2918
+ "loss": 0.2017,
2919
+ "step": 43500
2920
+ },
2921
+ {
2922
+ "epoch": 1.04,
2923
+ "learning_rate": 0.0001003060820942245,
2924
+ "loss": 0.2015,
2925
+ "step": 43600
2926
+ },
2927
+ {
2928
+ "epoch": 1.04,
2929
+ "learning_rate": 0.00010008443950747599,
2930
+ "loss": 0.2014,
2931
+ "step": 43700
2932
+ },
2933
+ {
2934
+ "epoch": 1.04,
2935
+ "learning_rate": 9.986257728051483e-05,
2936
+ "loss": 0.2014,
2937
+ "step": 43800
2938
+ },
2939
+ {
2940
+ "epoch": 1.05,
2941
+ "learning_rate": 9.964049783959082e-05,
2942
+ "loss": 0.2012,
2943
+ "step": 43900
2944
+ },
2945
+ {
2946
+ "epoch": 1.05,
2947
+ "learning_rate": 9.94182036133291e-05,
2948
+ "loss": 0.201,
2949
+ "step": 44000
2950
+ },
2951
+ {
2952
+ "epoch": 1.05,
2953
+ "eval_runtime": 47.2136,
2954
+ "eval_samples_per_second": 228.748,
2955
+ "eval_steps_per_second": 7.159,
2956
+ "step": 44000
2957
+ },
2958
+ {
2959
+ "epoch": 1.05,
2960
+ "learning_rate": 9.919569703270376e-05,
2961
+ "loss": 0.1998,
2962
+ "step": 44100
2963
+ },
2964
+ {
2965
+ "epoch": 1.05,
2966
+ "learning_rate": 9.89729805310111e-05,
2967
+ "loss": 0.2004,
2968
+ "step": 44200
2969
+ },
2970
+ {
2971
+ "epoch": 1.06,
2972
+ "learning_rate": 9.875005654384307e-05,
2973
+ "loss": 0.2009,
2974
+ "step": 44300
2975
+ },
2976
+ {
2977
+ "epoch": 1.06,
2978
+ "learning_rate": 9.852692750906071e-05,
2979
+ "loss": 0.1999,
2980
+ "step": 44400
2981
+ },
2982
+ {
2983
+ "epoch": 1.06,
2984
+ "learning_rate": 9.830359586676737e-05,
2985
+ "loss": 0.1997,
2986
+ "step": 44500
2987
+ },
2988
+ {
2989
+ "epoch": 1.06,
2990
+ "learning_rate": 9.808006405928215e-05,
2991
+ "loss": 0.2006,
2992
+ "step": 44600
2993
+ },
2994
+ {
2995
+ "epoch": 1.06,
2996
+ "learning_rate": 9.785633453111306e-05,
2997
+ "loss": 0.1999,
2998
+ "step": 44700
2999
+ },
3000
+ {
3001
+ "epoch": 1.07,
3002
+ "learning_rate": 9.763240972893037e-05,
3003
+ "loss": 0.1992,
3004
+ "step": 44800
3005
+ },
3006
+ {
3007
+ "epoch": 1.07,
3008
+ "learning_rate": 9.740829210153984e-05,
3009
+ "loss": 0.1991,
3010
+ "step": 44900
3011
+ },
3012
+ {
3013
+ "epoch": 1.07,
3014
+ "learning_rate": 9.718398409985593e-05,
3015
+ "loss": 0.199,
3016
+ "step": 45000
3017
+ },
3018
+ {
3019
+ "epoch": 1.07,
3020
+ "eval_runtime": 46.9221,
3021
+ "eval_samples_per_second": 230.169,
3022
+ "eval_steps_per_second": 7.203,
3023
+ "step": 45000
3024
+ },
3025
+ {
3026
+ "epoch": 1.07,
3027
+ "learning_rate": 9.695948817687504e-05,
3028
+ "loss": 0.1987,
3029
+ "step": 45100
3030
+ },
3031
+ {
3032
+ "epoch": 1.08,
3033
+ "learning_rate": 9.673480678764858e-05,
3034
+ "loss": 0.1982,
3035
+ "step": 45200
3036
+ },
3037
+ {
3038
+ "epoch": 1.08,
3039
+ "learning_rate": 9.650994238925626e-05,
3040
+ "loss": 0.1989,
3041
+ "step": 45300
3042
+ },
3043
+ {
3044
+ "epoch": 1.08,
3045
+ "learning_rate": 9.628489744077911e-05,
3046
+ "loss": 0.1985,
3047
+ "step": 45400
3048
+ },
3049
+ {
3050
+ "epoch": 1.08,
3051
+ "learning_rate": 9.60596744032726e-05,
3052
+ "loss": 0.1981,
3053
+ "step": 45500
3054
+ },
3055
+ {
3056
+ "epoch": 1.09,
3057
+ "learning_rate": 9.583427573973982e-05,
3058
+ "loss": 0.1976,
3059
+ "step": 45600
3060
+ },
3061
+ {
3062
+ "epoch": 1.09,
3063
+ "learning_rate": 9.560870391510441e-05,
3064
+ "loss": 0.1981,
3065
+ "step": 45700
3066
+ },
3067
+ {
3068
+ "epoch": 1.09,
3069
+ "learning_rate": 9.538296139618371e-05,
3070
+ "loss": 0.1978,
3071
+ "step": 45800
3072
+ },
3073
+ {
3074
+ "epoch": 1.09,
3075
+ "learning_rate": 9.515705065166178e-05,
3076
+ "loss": 0.1977,
3077
+ "step": 45900
3078
+ },
3079
+ {
3080
+ "epoch": 1.1,
3081
+ "learning_rate": 9.493097415206228e-05,
3082
+ "loss": 0.1974,
3083
+ "step": 46000
3084
+ },
3085
+ {
3086
+ "epoch": 1.1,
3087
+ "eval_runtime": 47.1161,
3088
+ "eval_samples_per_second": 229.221,
3089
+ "eval_steps_per_second": 7.174,
3090
+ "step": 46000
3091
+ },
3092
+ {
3093
+ "epoch": 1.1,
3094
+ "learning_rate": 9.47047343697216e-05,
3095
+ "loss": 0.1978,
3096
+ "step": 46100
3097
+ },
3098
+ {
3099
+ "epoch": 1.1,
3100
+ "learning_rate": 9.447833377876176e-05,
3101
+ "loss": 0.1974,
3102
+ "step": 46200
3103
+ },
3104
+ {
3105
+ "epoch": 1.1,
3106
+ "learning_rate": 9.425177485506336e-05,
3107
+ "loss": 0.1971,
3108
+ "step": 46300
3109
+ },
3110
+ {
3111
+ "epoch": 1.11,
3112
+ "learning_rate": 9.402506007623848e-05,
3113
+ "loss": 0.1968,
3114
+ "step": 46400
3115
+ },
3116
+ {
3117
+ "epoch": 1.11,
3118
+ "learning_rate": 9.379819192160362e-05,
3119
+ "loss": 0.1969,
3120
+ "step": 46500
3121
+ },
3122
+ {
3123
+ "epoch": 1.11,
3124
+ "learning_rate": 9.357117287215258e-05,
3125
+ "loss": 0.1966,
3126
+ "step": 46600
3127
+ },
3128
+ {
3129
+ "epoch": 1.11,
3130
+ "learning_rate": 9.334400541052928e-05,
3131
+ "loss": 0.1971,
3132
+ "step": 46700
3133
+ },
3134
+ {
3135
+ "epoch": 1.11,
3136
+ "learning_rate": 9.311669202100073e-05,
3137
+ "loss": 0.1962,
3138
+ "step": 46800
3139
+ },
3140
+ {
3141
+ "epoch": 1.12,
3142
+ "learning_rate": 9.288923518942968e-05,
3143
+ "loss": 0.1959,
3144
+ "step": 46900
3145
+ },
3146
+ {
3147
+ "epoch": 1.12,
3148
+ "learning_rate": 9.26616374032477e-05,
3149
+ "loss": 0.1964,
3150
+ "step": 47000
3151
+ },
3152
+ {
3153
+ "epoch": 1.12,
3154
+ "eval_runtime": 46.7963,
3155
+ "eval_samples_per_second": 230.788,
3156
+ "eval_steps_per_second": 7.223,
3157
+ "step": 47000
3158
+ },
3159
+ {
3160
+ "epoch": 1.12,
3161
+ "learning_rate": 9.243390115142761e-05,
3162
+ "loss": 0.196,
3163
+ "step": 47100
3164
+ },
3165
+ {
3166
+ "epoch": 1.12,
3167
+ "learning_rate": 9.220602892445661e-05,
3168
+ "loss": 0.1955,
3169
+ "step": 47200
3170
+ },
3171
+ {
3172
+ "epoch": 1.13,
3173
+ "learning_rate": 9.197802321430889e-05,
3174
+ "loss": 0.1958,
3175
+ "step": 47300
3176
+ },
3177
+ {
3178
+ "epoch": 1.13,
3179
+ "learning_rate": 9.174988651441833e-05,
3180
+ "loss": 0.1951,
3181
+ "step": 47400
3182
+ },
3183
+ {
3184
+ "epoch": 1.13,
3185
+ "learning_rate": 9.152162131965137e-05,
3186
+ "loss": 0.1954,
3187
+ "step": 47500
3188
+ },
3189
+ {
3190
+ "epoch": 1.13,
3191
+ "learning_rate": 9.129323012627956e-05,
3192
+ "loss": 0.1948,
3193
+ "step": 47600
3194
+ },
3195
+ {
3196
+ "epoch": 1.14,
3197
+ "learning_rate": 9.106471543195244e-05,
3198
+ "loss": 0.1954,
3199
+ "step": 47700
3200
+ },
3201
+ {
3202
+ "epoch": 1.14,
3203
+ "learning_rate": 9.08360797356701e-05,
3204
+ "loss": 0.1953,
3205
+ "step": 47800
3206
+ },
3207
+ {
3208
+ "epoch": 1.14,
3209
+ "learning_rate": 9.060732553775582e-05,
3210
+ "loss": 0.1949,
3211
+ "step": 47900
3212
+ },
3213
+ {
3214
+ "epoch": 1.14,
3215
+ "learning_rate": 9.037845533982892e-05,
3216
+ "loss": 0.1947,
3217
+ "step": 48000
3218
+ },
3219
+ {
3220
+ "epoch": 1.14,
3221
+ "eval_runtime": 46.9646,
3222
+ "eval_samples_per_second": 229.96,
3223
+ "eval_steps_per_second": 7.197,
3224
+ "step": 48000
3225
+ },
3226
+ {
3227
+ "epoch": 1.15,
3228
+ "learning_rate": 9.014947164477721e-05,
3229
+ "loss": 0.1946,
3230
+ "step": 48100
3231
+ },
3232
+ {
3233
+ "epoch": 1.15,
3234
+ "learning_rate": 8.992037695672967e-05,
3235
+ "loss": 0.1938,
3236
+ "step": 48200
3237
+ },
3238
+ {
3239
+ "epoch": 1.15,
3240
+ "learning_rate": 8.969117378102912e-05,
3241
+ "loss": 0.1946,
3242
+ "step": 48300
3243
+ },
3244
+ {
3245
+ "epoch": 1.15,
3246
+ "learning_rate": 8.946186462420478e-05,
3247
+ "loss": 0.1942,
3248
+ "step": 48400
3249
+ },
3250
+ {
3251
+ "epoch": 1.16,
3252
+ "learning_rate": 8.923245199394482e-05,
3253
+ "loss": 0.1934,
3254
+ "step": 48500
3255
+ },
3256
+ {
3257
+ "epoch": 1.16,
3258
+ "learning_rate": 8.900293839906903e-05,
3259
+ "loss": 0.194,
3260
+ "step": 48600
3261
+ },
3262
+ {
3263
+ "epoch": 1.16,
3264
+ "learning_rate": 8.87733263495013e-05,
3265
+ "loss": 0.1936,
3266
+ "step": 48700
3267
+ },
3268
+ {
3269
+ "epoch": 1.16,
3270
+ "learning_rate": 8.85436183562422e-05,
3271
+ "loss": 0.1933,
3272
+ "step": 48800
3273
+ },
3274
+ {
3275
+ "epoch": 1.16,
3276
+ "learning_rate": 8.83138169313416e-05,
3277
+ "loss": 0.1933,
3278
+ "step": 48900
3279
+ },
3280
+ {
3281
+ "epoch": 1.17,
3282
+ "learning_rate": 8.808392458787103e-05,
3283
+ "loss": 0.1931,
3284
+ "step": 49000
3285
+ },
3286
+ {
3287
+ "epoch": 1.17,
3288
+ "eval_runtime": 46.9712,
3289
+ "eval_samples_per_second": 229.928,
3290
+ "eval_steps_per_second": 7.196,
3291
+ "step": 49000
3292
+ },
3293
+ {
3294
+ "epoch": 1.17,
3295
+ "learning_rate": 8.78539438398963e-05,
3296
+ "loss": 0.1922,
3297
+ "step": 49100
3298
+ },
3299
+ {
3300
+ "epoch": 1.17,
3301
+ "learning_rate": 8.762387720245008e-05,
3302
+ "loss": 0.1922,
3303
+ "step": 49200
3304
+ },
3305
+ {
3306
+ "epoch": 1.17,
3307
+ "learning_rate": 8.73937271915042e-05,
3308
+ "loss": 0.1926,
3309
+ "step": 49300
3310
+ },
3311
+ {
3312
+ "epoch": 1.18,
3313
+ "learning_rate": 8.716349632394235e-05,
3314
+ "loss": 0.1924,
3315
+ "step": 49400
3316
+ },
3317
+ {
3318
+ "epoch": 1.18,
3319
+ "learning_rate": 8.69331871175324e-05,
3320
+ "loss": 0.1927,
3321
+ "step": 49500
3322
+ },
3323
+ {
3324
+ "epoch": 1.18,
3325
+ "learning_rate": 8.67028020908989e-05,
3326
+ "loss": 0.1924,
3327
+ "step": 49600
3328
+ },
3329
+ {
3330
+ "epoch": 1.18,
3331
+ "learning_rate": 8.647234376349565e-05,
3332
+ "loss": 0.1921,
3333
+ "step": 49700
3334
+ },
3335
+ {
3336
+ "epoch": 1.19,
3337
+ "learning_rate": 8.624181465557794e-05,
3338
+ "loss": 0.1914,
3339
+ "step": 49800
3340
+ },
3341
+ {
3342
+ "epoch": 1.19,
3343
+ "learning_rate": 8.601121728817519e-05,
3344
+ "loss": 0.1917,
3345
+ "step": 49900
3346
+ },
3347
+ {
3348
+ "epoch": 1.19,
3349
+ "learning_rate": 8.578055418306327e-05,
3350
+ "loss": 0.1918,
3351
+ "step": 50000
3352
+ },
3353
+ {
3354
+ "epoch": 1.19,
3355
+ "eval_runtime": 47.0452,
3356
+ "eval_samples_per_second": 229.566,
3357
+ "eval_steps_per_second": 7.185,
3358
+ "step": 50000
3359
+ }
3360
+ ],
3361
+ "max_steps": 100000,
3362
+ "num_train_epochs": 3,
3363
+ "total_flos": 3.504974211922538e+21,
3364
+ "trial_name": null,
3365
+ "trial_params": null
3366
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8912f4d3b0db3368d2797156e551b6addea5deca25403aff52088102f2b395cb
3
+ size 3640