theblackcat102 commited on
Commit
97c3f4f
1 Parent(s): 4290c60

Upload 7 files

Browse files
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "EleutherAI/pythia-1b-deduped",
3
+ "architectures": [
4
+ "GPTNeoXForCausalLM"
5
+ ],
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 0,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8192,
12
+ "layer_norm_eps": 1e-05,
13
+ "max_position_embeddings": 2048,
14
+ "model_type": "gpt_neox",
15
+ "num_attention_heads": 8,
16
+ "num_hidden_layers": 16,
17
+ "rotary_emb_base": 10000,
18
+ "rotary_pct": 0.25,
19
+ "tie_word_embeddings": false,
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.25.1",
22
+ "use_cache": true,
23
+ "use_parallel_residual": true,
24
+ "vocab_size": 50281
25
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c6085d42d75ecaac9ea9c9ddaf539e08d1c1b7c54e756eba552afee742e4054
3
+ size 2090527556
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<human>",
4
+ "<bot>",
5
+ "</prefix>",
6
+ "<prefix>"
7
+ ],
8
+ "bos_token": "<|endoftext|>",
9
+ "eos_token": "<|endoftext|>",
10
+ "pad_token": "<|padding|>",
11
+ "sep_token": "<|endoftext|>",
12
+ "unk_token": "<|endoftext|>"
13
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "EleutherAI/pythia-1b-deduped",
7
+ "special_tokens_map_file": "/fsx/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/3523781c8df75f7741687a4284f6f70e1afa12f4/special_tokens_map.json",
8
+ "tokenizer_class": "GPTNeoXTokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,2404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.3287175905000616,
5
+ "global_step": 2000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1.4084967333570947e-06,
13
+ "loss": 2.5231,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.0,
18
+ "learning_rate": 2.0507482022971233e-06,
19
+ "loss": 2.3436,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.0,
24
+ "learning_rate": 2.385606273598312e-06,
25
+ "loss": 2.22,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.01,
30
+ "learning_rate": 2.6136695401116585e-06,
31
+ "loss": 2.1055,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.01,
36
+ "learning_rate": 2.7868297632261957e-06,
37
+ "loss": 2.1275,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.01,
42
+ "learning_rate": 2.926458092787486e-06,
43
+ "loss": 2.0425,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.01,
48
+ "learning_rate": 3.0434580045013773e-06,
49
+ "loss": 2.0407,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.01,
54
+ "learning_rate": 3.1441512086208035e-06,
55
+ "loss": 2.0558,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.01,
60
+ "learning_rate": 3.232532087697698e-06,
61
+ "loss": 1.9887,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.02,
66
+ "learning_rate": 3.3112862237770753e-06,
67
+ "loss": 1.9845,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.02,
72
+ "learning_rate": 3.3823062961420163e-06,
73
+ "loss": 1.9856,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.02,
78
+ "learning_rate": 3.446976436243603e-06,
79
+ "loss": 1.9968,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.02,
84
+ "learning_rate": 3.506339534926595e-06,
85
+ "loss": 1.9247,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.02,
90
+ "learning_rate": 3.5612009452606784e-06,
91
+ "loss": 1.9817,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.02,
96
+ "learning_rate": 3.612195557913627e-06,
97
+ "loss": 1.9644,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.03,
102
+ "learning_rate": 3.65983275401539e-06,
103
+ "loss": 1.9639,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.03,
108
+ "learning_rate": 3.7045274519126395e-06,
109
+ "loss": 1.9587,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.03,
114
+ "learning_rate": 3.7466221106030114e-06,
115
+ "loss": 1.9849,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.03,
120
+ "learning_rate": 3.786402677560832e-06,
121
+ "loss": 1.9745,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.03,
126
+ "learning_rate": 3.824110376935989e-06,
127
+ "loss": 1.9429,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.03,
132
+ "learning_rate": 3.8599505757615295e-06,
133
+ "loss": 1.9484,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 0.04,
138
+ "learning_rate": 3.894099556414216e-06,
139
+ "loss": 1.9214,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 0.04,
144
+ "learning_rate": 3.9267097619885385e-06,
145
+ "loss": 1.9274,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 0.04,
150
+ "learning_rate": 3.95791391001684e-06,
151
+ "loss": 1.9185,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 0.04,
156
+ "learning_rate": 3.987828255432777e-06,
157
+ "loss": 1.9578,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 0.04,
162
+ "learning_rate": 4.016555205552159e-06,
163
+ "loss": 1.907,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 0.04,
168
+ "learning_rate": 4.044185435607626e-06,
169
+ "loss": 1.9448,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 0.05,
174
+ "learning_rate": 4.070799615107415e-06,
175
+ "loss": 1.8884,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 0.05,
180
+ "learning_rate": 4.096469827889988e-06,
181
+ "loss": 1.9402,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 0.05,
186
+ "learning_rate": 4.121260748862021e-06,
187
+ "loss": 1.9153,
188
+ "step": 300
189
+ },
190
+ {
191
+ "epoch": 0.05,
192
+ "learning_rate": 4.145230625795312e-06,
193
+ "loss": 1.9106,
194
+ "step": 310
195
+ },
196
+ {
197
+ "epoch": 0.05,
198
+ "learning_rate": 4.1684321036962525e-06,
199
+ "loss": 1.958,
200
+ "step": 320
201
+ },
202
+ {
203
+ "epoch": 0.05,
204
+ "learning_rate": 4.190912921100477e-06,
205
+ "loss": 1.9117,
206
+ "step": 330
207
+ },
208
+ {
209
+ "epoch": 0.06,
210
+ "learning_rate": 4.212716501452232e-06,
211
+ "loss": 1.9097,
212
+ "step": 340
213
+ },
214
+ {
215
+ "epoch": 0.06,
216
+ "learning_rate": 4.233882457984791e-06,
217
+ "loss": 1.9279,
218
+ "step": 350
219
+ },
220
+ {
221
+ "epoch": 0.06,
222
+ "learning_rate": 4.2544470268536555e-06,
223
+ "loss": 1.9164,
224
+ "step": 360
225
+ },
226
+ {
227
+ "epoch": 0.06,
228
+ "learning_rate": 4.27444344042015e-06,
229
+ "loss": 1.9323,
230
+ "step": 370
231
+ },
232
+ {
233
+ "epoch": 0.06,
234
+ "learning_rate": 4.293902250342989e-06,
235
+ "loss": 1.9134,
236
+ "step": 380
237
+ },
238
+ {
239
+ "epoch": 0.06,
240
+ "learning_rate": 4.312851608364853e-06,
241
+ "loss": 1.8835,
242
+ "step": 390
243
+ },
244
+ {
245
+ "epoch": 0.07,
246
+ "learning_rate": 4.3313175112718595e-06,
247
+ "loss": 1.8969,
248
+ "step": 400
249
+ },
250
+ {
251
+ "epoch": 0.07,
252
+ "learning_rate": 4.3493240153753665e-06,
253
+ "loss": 1.9238,
254
+ "step": 410
255
+ },
256
+ {
257
+ "epoch": 0.07,
258
+ "learning_rate": 4.366893424956263e-06,
259
+ "loss": 1.8946,
260
+ "step": 420
261
+ },
262
+ {
263
+ "epoch": 0.07,
264
+ "learning_rate": 4.38404645837504e-06,
265
+ "loss": 1.8781,
266
+ "step": 430
267
+ },
268
+ {
269
+ "epoch": 0.07,
270
+ "learning_rate": 4.400802394950703e-06,
271
+ "loss": 1.8955,
272
+ "step": 440
273
+ },
274
+ {
275
+ "epoch": 0.07,
276
+ "learning_rate": 4.4171792052198945e-06,
277
+ "loss": 1.8515,
278
+ "step": 450
279
+ },
280
+ {
281
+ "epoch": 0.08,
282
+ "learning_rate": 4.433193666783084e-06,
283
+ "loss": 1.8978,
284
+ "step": 460
285
+ },
286
+ {
287
+ "epoch": 0.08,
288
+ "learning_rate": 4.448861467610187e-06,
289
+ "loss": 1.889,
290
+ "step": 470
291
+ },
292
+ {
293
+ "epoch": 0.08,
294
+ "learning_rate": 4.4641972984001906e-06,
295
+ "loss": 1.8667,
296
+ "step": 480
297
+ },
298
+ {
299
+ "epoch": 0.08,
300
+ "learning_rate": 4.479214935357724e-06,
301
+ "loss": 1.9304,
302
+ "step": 490
303
+ },
304
+ {
305
+ "epoch": 0.08,
306
+ "learning_rate": 4.493927314555554e-06,
307
+ "loss": 1.9042,
308
+ "step": 500
309
+ },
310
+ {
311
+ "epoch": 0.08,
312
+ "eval_gsm8k_hard_accuracy": 0.8928993119172672,
313
+ "eval_gsm8k_hard_loss": 0.4951171875,
314
+ "eval_gsm8k_hard_runtime": 2.1138,
315
+ "eval_gsm8k_hard_samples_per_second": 124.893,
316
+ "eval_gsm8k_hard_steps_per_second": 8.042,
317
+ "step": 500
318
+ },
319
+ {
320
+ "epoch": 0.08,
321
+ "eval_webgpt_accuracy": 0.4654303097674511,
322
+ "eval_webgpt_loss": 2.478515625,
323
+ "eval_webgpt_runtime": 13.6298,
324
+ "eval_webgpt_samples_per_second": 287.312,
325
+ "eval_webgpt_steps_per_second": 17.975,
326
+ "step": 500
327
+ },
328
+ {
329
+ "epoch": 0.08,
330
+ "eval_squad_v2_accuracy": 0.8681394546082154,
331
+ "eval_squad_v2_loss": 0.51806640625,
332
+ "eval_squad_v2_runtime": 80.2931,
333
+ "eval_squad_v2_samples_per_second": 324.611,
334
+ "eval_squad_v2_steps_per_second": 20.288,
335
+ "step": 500
336
+ },
337
+ {
338
+ "epoch": 0.08,
339
+ "eval_adversarial_qa_accuracy": 0.7833800465144016,
340
+ "eval_adversarial_qa_loss": 1.310546875,
341
+ "eval_adversarial_qa_runtime": 19.1554,
342
+ "eval_adversarial_qa_samples_per_second": 313.228,
343
+ "eval_adversarial_qa_steps_per_second": 19.577,
344
+ "step": 500
345
+ },
346
+ {
347
+ "epoch": 0.08,
348
+ "eval_private_tuning_accuracy": 0.6404945703123248,
349
+ "eval_private_tuning_loss": 1.3779296875,
350
+ "eval_private_tuning_runtime": 68.286,
351
+ "eval_private_tuning_samples_per_second": 310.137,
352
+ "eval_private_tuning_steps_per_second": 19.389,
353
+ "step": 500
354
+ },
355
+ {
356
+ "epoch": 0.08,
357
+ "eval_oa_translated_accuracy": 0.6488286598439107,
358
+ "eval_oa_translated_loss": 1.5576171875,
359
+ "eval_oa_translated_runtime": 524.5762,
360
+ "eval_oa_translated_samples_per_second": 254.918,
361
+ "eval_oa_translated_steps_per_second": 15.933,
362
+ "step": 500
363
+ },
364
+ {
365
+ "epoch": 0.08,
366
+ "eval_prosocial_dialogue_accuracy": 0.52144098641849,
367
+ "eval_prosocial_dialogue_loss": 1.90625,
368
+ "eval_prosocial_dialogue_runtime": 90.5414,
369
+ "eval_prosocial_dialogue_samples_per_second": 298.018,
370
+ "eval_prosocial_dialogue_steps_per_second": 18.632,
371
+ "step": 500
372
+ },
373
+ {
374
+ "epoch": 0.08,
375
+ "eval_math_qa_accuracy": 0.5165153098127461,
376
+ "eval_math_qa_loss": 2.20703125,
377
+ "eval_math_qa_runtime": 17.7049,
378
+ "eval_math_qa_samples_per_second": 337.082,
379
+ "eval_math_qa_steps_per_second": 21.068,
380
+ "step": 500
381
+ },
382
+ {
383
+ "epoch": 0.08,
384
+ "eval_wikihow_accuracy": 0.5831415499792042,
385
+ "eval_wikihow_loss": 2.140625,
386
+ "eval_wikihow_runtime": 9.4415,
387
+ "eval_wikihow_samples_per_second": 242.863,
388
+ "eval_wikihow_steps_per_second": 15.252,
389
+ "step": 500
390
+ },
391
+ {
392
+ "epoch": 0.08,
393
+ "eval_joke_accuracy": 0.45545868081880214,
394
+ "eval_joke_loss": 2.529296875,
395
+ "eval_joke_runtime": 1.3918,
396
+ "eval_joke_samples_per_second": 54.606,
397
+ "eval_joke_steps_per_second": 3.593,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.08,
402
+ "eval_gsm8k_accuracy": 0.7111711283077483,
403
+ "eval_gsm8k_loss": 1.1416015625,
404
+ "eval_gsm8k_runtime": 6.0874,
405
+ "eval_gsm8k_samples_per_second": 245.588,
406
+ "eval_gsm8k_steps_per_second": 15.442,
407
+ "step": 500
408
+ },
409
+ {
410
+ "epoch": 0.08,
411
+ "eval_ted_trans_en-hi_accuracy": 0.5343350158469304,
412
+ "eval_ted_trans_en-hi_loss": 2.244140625,
413
+ "eval_ted_trans_en-hi_runtime": 1.2611,
414
+ "eval_ted_trans_en-hi_samples_per_second": 81.672,
415
+ "eval_ted_trans_en-hi_steps_per_second": 5.551,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 0.08,
420
+ "eval_ted_trans_de-ja_accuracy": 0.5195722742027878,
421
+ "eval_ted_trans_de-ja_loss": 2.314453125,
422
+ "eval_ted_trans_de-ja_runtime": 3.7052,
423
+ "eval_ted_trans_de-ja_samples_per_second": 193.783,
424
+ "eval_ted_trans_de-ja_steps_per_second": 12.145,
425
+ "step": 500
426
+ },
427
+ {
428
+ "epoch": 0.08,
429
+ "eval_ted_trans_nl-en_accuracy": 0.6433630400125447,
430
+ "eval_ted_trans_nl-en_loss": 1.7353515625,
431
+ "eval_ted_trans_nl-en_runtime": 3.5777,
432
+ "eval_ted_trans_nl-en_samples_per_second": 215.5,
433
+ "eval_ted_trans_nl-en_steps_per_second": 13.696,
434
+ "step": 500
435
+ },
436
+ {
437
+ "epoch": 0.08,
438
+ "eval_ted_trans_en-ja_accuracy": 0.5440905817396176,
439
+ "eval_ted_trans_en-ja_loss": 2.13671875,
440
+ "eval_ted_trans_en-ja_runtime": 3.7065,
441
+ "eval_ted_trans_en-ja_samples_per_second": 216.109,
442
+ "eval_ted_trans_en-ja_steps_per_second": 13.76,
443
+ "step": 500
444
+ },
445
+ {
446
+ "epoch": 0.08,
447
+ "eval_ted_trans_en-es_accuracy": 0.7055326931870142,
448
+ "eval_ted_trans_en-es_loss": 1.3369140625,
449
+ "eval_ted_trans_en-es_runtime": 3.3519,
450
+ "eval_ted_trans_en-es_samples_per_second": 246.427,
451
+ "eval_ted_trans_en-es_steps_per_second": 15.514,
452
+ "step": 500
453
+ },
454
+ {
455
+ "epoch": 0.08,
456
+ "eval_ted_trans_en-ms_accuracy": 0.5517070757050965,
457
+ "eval_ted_trans_en-ms_loss": 2.32421875,
458
+ "eval_ted_trans_en-ms_runtime": 0.8373,
459
+ "eval_ted_trans_en-ms_samples_per_second": 50.159,
460
+ "eval_ted_trans_en-ms_steps_per_second": 3.583,
461
+ "step": 500
462
+ },
463
+ {
464
+ "epoch": 0.08,
465
+ "eval_xsum_accuracy": 0.5663549372439451,
466
+ "eval_xsum_loss": NaN,
467
+ "eval_xsum_runtime": 140.8411,
468
+ "eval_xsum_samples_per_second": 289.752,
469
+ "eval_xsum_steps_per_second": 18.113,
470
+ "step": 500
471
+ },
472
+ {
473
+ "epoch": 0.08,
474
+ "eval_cnn_dailymail_accuracy": 0.6501961820900207,
475
+ "eval_cnn_dailymail_loss": NaN,
476
+ "eval_cnn_dailymail_runtime": 207.5377,
477
+ "eval_cnn_dailymail_samples_per_second": 276.687,
478
+ "eval_cnn_dailymail_steps_per_second": 17.293,
479
+ "step": 500
480
+ },
481
+ {
482
+ "epoch": 0.08,
483
+ "eval_multi_news_accuracy": 0.5168385769568282,
484
+ "eval_multi_news_loss": NaN,
485
+ "eval_multi_news_runtime": 36.1577,
486
+ "eval_multi_news_samples_per_second": 248.771,
487
+ "eval_multi_news_steps_per_second": 15.571,
488
+ "step": 500
489
+ },
490
+ {
491
+ "epoch": 0.08,
492
+ "eval_tldr_news_accuracy": 0.5048904354368475,
493
+ "eval_tldr_news_loss": 2.384765625,
494
+ "eval_tldr_news_runtime": 5.9032,
495
+ "eval_tldr_news_samples_per_second": 241.901,
496
+ "eval_tldr_news_steps_per_second": 15.246,
497
+ "step": 500
498
+ },
499
+ {
500
+ "epoch": 0.08,
501
+ "eval_scitldr_accuracy": 0.5,
502
+ "eval_scitldr_loss": NaN,
503
+ "eval_scitldr_runtime": 2.3974,
504
+ "eval_scitldr_samples_per_second": 166.428,
505
+ "eval_scitldr_steps_per_second": 10.428,
506
+ "step": 500
507
+ },
508
+ {
509
+ "epoch": 0.08,
510
+ "eval_samsum_accuracy": 0.5789020336200051,
511
+ "eval_samsum_loss": 1.619140625,
512
+ "eval_samsum_runtime": 10.1073,
513
+ "eval_samsum_samples_per_second": 291.572,
514
+ "eval_samsum_steps_per_second": 18.304,
515
+ "step": 500
516
+ },
517
+ {
518
+ "epoch": 0.08,
519
+ "eval_debate_sum_accuracy": 0.9321960723793357,
520
+ "eval_debate_sum_loss": NaN,
521
+ "eval_debate_sum_runtime": 188.2954,
522
+ "eval_debate_sum_samples_per_second": 255.524,
523
+ "eval_debate_sum_steps_per_second": 15.975,
524
+ "step": 500
525
+ },
526
+ {
527
+ "epoch": 0.08,
528
+ "eval_billsum_accuracy": 0.6453599014888616,
529
+ "eval_billsum_loss": NaN,
530
+ "eval_billsum_runtime": 22.3683,
531
+ "eval_billsum_samples_per_second": 169.436,
532
+ "eval_billsum_steps_per_second": 10.595,
533
+ "step": 500
534
+ },
535
+ {
536
+ "epoch": 0.08,
537
+ "eval_wmt2019_zh-en_accuracy": 0.5524590644131345,
538
+ "eval_wmt2019_zh-en_loss": 2.146484375,
539
+ "eval_wmt2019_zh-en_runtime": 13.8635,
540
+ "eval_wmt2019_zh-en_samples_per_second": 287.158,
541
+ "eval_wmt2019_zh-en_steps_per_second": 17.961,
542
+ "step": 500
543
+ },
544
+ {
545
+ "epoch": 0.08,
546
+ "eval_wmt2019_ru-en_accuracy": 0.6308636370293347,
547
+ "eval_wmt2019_ru-en_loss": 1.580078125,
548
+ "eval_wmt2019_ru-en_runtime": 11.2038,
549
+ "eval_wmt2019_ru-en_samples_per_second": 267.766,
550
+ "eval_wmt2019_ru-en_steps_per_second": 16.78,
551
+ "step": 500
552
+ },
553
+ {
554
+ "epoch": 0.08,
555
+ "eval_wmt2019_de-en_accuracy": 0.657534107930853,
556
+ "eval_wmt2019_de-en_loss": 1.501953125,
557
+ "eval_wmt2019_de-en_runtime": 9.454,
558
+ "eval_wmt2019_de-en_samples_per_second": 317.115,
559
+ "eval_wmt2019_de-en_steps_per_second": 19.886,
560
+ "step": 500
561
+ },
562
+ {
563
+ "epoch": 0.08,
564
+ "eval_wmt2019_fr-de_accuracy": 0.6479142094481346,
565
+ "eval_wmt2019_fr-de_loss": 1.5439453125,
566
+ "eval_wmt2019_fr-de_runtime": 5.2625,
567
+ "eval_wmt2019_fr-de_samples_per_second": 287.315,
568
+ "eval_wmt2019_fr-de_steps_per_second": 18.052,
569
+ "step": 500
570
+ },
571
+ {
572
+ "epoch": 0.08,
573
+ "eval_essay_instruction_accuracy": 0.5775154438520775,
574
+ "eval_essay_instruction_loss": 2.09765625,
575
+ "eval_essay_instruction_runtime": 3.0793,
576
+ "eval_essay_instruction_samples_per_second": 134.122,
577
+ "eval_essay_instruction_steps_per_second": 8.443,
578
+ "step": 500
579
+ },
580
+ {
581
+ "epoch": 0.08,
582
+ "eval_reddit_eli5_accuracy": 0.42260973997710893,
583
+ "eval_reddit_eli5_loss": 2.76171875,
584
+ "eval_reddit_eli5_runtime": 203.2121,
585
+ "eval_reddit_eli5_samples_per_second": 268.326,
586
+ "eval_reddit_eli5_steps_per_second": 16.771,
587
+ "step": 500
588
+ },
589
+ {
590
+ "epoch": 0.08,
591
+ "eval_reddit_askh_accuracy": 0.42568767737796204,
592
+ "eval_reddit_askh_loss": 2.84375,
593
+ "eval_reddit_askh_runtime": 111.1784,
594
+ "eval_reddit_askh_samples_per_second": 177.238,
595
+ "eval_reddit_askh_steps_per_second": 11.081,
596
+ "step": 500
597
+ },
598
+ {
599
+ "epoch": 0.08,
600
+ "eval_reddit_asks_accuracy": 0.43555163138333913,
601
+ "eval_reddit_asks_loss": 2.689453125,
602
+ "eval_reddit_asks_runtime": 119.9403,
603
+ "eval_reddit_asks_samples_per_second": 219.743,
604
+ "eval_reddit_asks_steps_per_second": 13.74,
605
+ "step": 500
606
+ },
607
+ {
608
+ "epoch": 0.08,
609
+ "learning_rate": 4.5083465988888945e-06,
610
+ "loss": 1.8966,
611
+ "step": 510
612
+ },
613
+ {
614
+ "epoch": 0.09,
615
+ "learning_rate": 4.5224842384899045e-06,
616
+ "loss": 1.9039,
617
+ "step": 520
618
+ },
619
+ {
620
+ "epoch": 0.09,
621
+ "learning_rate": 4.5363510253542444e-06,
622
+ "loss": 1.9029,
623
+ "step": 530
624
+ },
625
+ {
626
+ "epoch": 0.09,
627
+ "learning_rate": 4.549957142832593e-06,
628
+ "loss": 1.8759,
629
+ "step": 540
630
+ },
631
+ {
632
+ "epoch": 0.09,
633
+ "learning_rate": 4.563312210555719e-06,
634
+ "loss": 1.9042,
635
+ "step": 550
636
+ },
637
+ {
638
+ "epoch": 0.09,
639
+ "learning_rate": 4.576425325289549e-06,
640
+ "loss": 1.9205,
641
+ "step": 560
642
+ },
643
+ {
644
+ "epoch": 0.09,
645
+ "learning_rate": 4.589305098154845e-06,
646
+ "loss": 1.9324,
647
+ "step": 570
648
+ },
649
+ {
650
+ "epoch": 0.1,
651
+ "learning_rate": 4.601959688592886e-06,
652
+ "loss": 1.8757,
653
+ "step": 580
654
+ },
655
+ {
656
+ "epoch": 0.1,
657
+ "learning_rate": 4.614396835412691e-06,
658
+ "loss": 1.895,
659
+ "step": 590
660
+ },
661
+ {
662
+ "epoch": 0.1,
663
+ "learning_rate": 4.626623885215616e-06,
664
+ "loss": 1.9004,
665
+ "step": 600
666
+ },
667
+ {
668
+ "epoch": 0.1,
669
+ "learning_rate": 4.638647818458763e-06,
670
+ "loss": 1.8705,
671
+ "step": 610
672
+ },
673
+ {
674
+ "epoch": 0.1,
675
+ "learning_rate": 4.650475273388737e-06,
676
+ "loss": 1.8929,
677
+ "step": 620
678
+ },
679
+ {
680
+ "epoch": 0.1,
681
+ "learning_rate": 4.662112568051194e-06,
682
+ "loss": 1.8745,
683
+ "step": 630
684
+ },
685
+ {
686
+ "epoch": 0.11,
687
+ "learning_rate": 4.673565720558918e-06,
688
+ "loss": 1.8768,
689
+ "step": 640
690
+ },
691
+ {
692
+ "epoch": 0.11,
693
+ "learning_rate": 4.6848404677811685e-06,
694
+ "loss": 1.885,
695
+ "step": 650
696
+ },
697
+ {
698
+ "epoch": 0.11,
699
+ "learning_rate": 4.695942282599635e-06,
700
+ "loss": 1.8496,
701
+ "step": 660
702
+ },
703
+ {
704
+ "epoch": 0.11,
705
+ "learning_rate": 4.706876389860915e-06,
706
+ "loss": 1.9061,
707
+ "step": 670
708
+ },
709
+ {
710
+ "epoch": 0.11,
711
+ "learning_rate": 4.717647781141908e-06,
712
+ "loss": 1.8839,
713
+ "step": 680
714
+ },
715
+ {
716
+ "epoch": 0.11,
717
+ "learning_rate": 4.7282612284325845e-06,
718
+ "loss": 1.921,
719
+ "step": 690
720
+ },
721
+ {
722
+ "epoch": 0.12,
723
+ "learning_rate": 4.738721296830016e-06,
724
+ "loss": 1.8519,
725
+ "step": 700
726
+ },
727
+ {
728
+ "epoch": 0.12,
729
+ "learning_rate": 4.749032356328167e-06,
730
+ "loss": 1.8901,
731
+ "step": 710
732
+ },
733
+ {
734
+ "epoch": 0.12,
735
+ "learning_rate": 4.759198592779668e-06,
736
+ "loss": 1.8678,
737
+ "step": 720
738
+ },
739
+ {
740
+ "epoch": 0.12,
741
+ "learning_rate": 4.769224018098397e-06,
742
+ "loss": 1.859,
743
+ "step": 730
744
+ },
745
+ {
746
+ "epoch": 0.12,
747
+ "learning_rate": 4.7791124797650865e-06,
748
+ "loss": 1.8315,
749
+ "step": 740
750
+ },
751
+ {
752
+ "epoch": 0.12,
753
+ "learning_rate": 4.788867669692332e-06,
754
+ "loss": 1.8915,
755
+ "step": 750
756
+ },
757
+ {
758
+ "epoch": 0.12,
759
+ "learning_rate": 4.798493132500121e-06,
760
+ "loss": 1.8936,
761
+ "step": 760
762
+ },
763
+ {
764
+ "epoch": 0.13,
765
+ "learning_rate": 4.8079922732483016e-06,
766
+ "loss": 1.8869,
767
+ "step": 770
768
+ },
769
+ {
770
+ "epoch": 0.13,
771
+ "learning_rate": 4.817368364668191e-06,
772
+ "loss": 1.8556,
773
+ "step": 780
774
+ },
775
+ {
776
+ "epoch": 0.13,
777
+ "learning_rate": 4.8266245539317745e-06,
778
+ "loss": 1.8594,
779
+ "step": 790
780
+ },
781
+ {
782
+ "epoch": 0.13,
783
+ "learning_rate": 4.835763868993521e-06,
784
+ "loss": 1.8646,
785
+ "step": 800
786
+ },
787
+ {
788
+ "epoch": 0.13,
789
+ "learning_rate": 4.844789224536785e-06,
790
+ "loss": 1.8758,
791
+ "step": 810
792
+ },
793
+ {
794
+ "epoch": 0.13,
795
+ "learning_rate": 4.853703427554027e-06,
796
+ "loss": 1.8349,
797
+ "step": 820
798
+ },
799
+ {
800
+ "epoch": 0.14,
801
+ "learning_rate": 4.862509182587578e-06,
802
+ "loss": 1.8517,
803
+ "step": 830
804
+ },
805
+ {
806
+ "epoch": 0.14,
807
+ "learning_rate": 4.871209096655434e-06,
808
+ "loss": 1.8563,
809
+ "step": 840
810
+ },
811
+ {
812
+ "epoch": 0.14,
813
+ "learning_rate": 4.879805683884512e-06,
814
+ "loss": 1.8749,
815
+ "step": 850
816
+ },
817
+ {
818
+ "epoch": 0.14,
819
+ "learning_rate": 4.888301369871998e-06,
820
+ "loss": 1.8276,
821
+ "step": 860
822
+ },
823
+ {
824
+ "epoch": 0.14,
825
+ "learning_rate": 4.8966984957936845e-06,
826
+ "loss": 1.7967,
827
+ "step": 870
828
+ },
829
+ {
830
+ "epoch": 0.14,
831
+ "learning_rate": 4.904999322276735e-06,
832
+ "loss": 1.9041,
833
+ "step": 880
834
+ },
835
+ {
836
+ "epoch": 0.15,
837
+ "learning_rate": 4.913206033052878e-06,
838
+ "loss": 1.8417,
839
+ "step": 890
840
+ },
841
+ {
842
+ "epoch": 0.15,
843
+ "learning_rate": 4.921320738406821e-06,
844
+ "loss": 1.8611,
845
+ "step": 900
846
+ },
847
+ {
848
+ "epoch": 0.15,
849
+ "learning_rate": 4.929345478433492e-06,
850
+ "loss": 1.8924,
851
+ "step": 910
852
+ },
853
+ {
854
+ "epoch": 0.15,
855
+ "learning_rate": 4.937282226116702e-06,
856
+ "loss": 1.8684,
857
+ "step": 920
858
+ },
859
+ {
860
+ "epoch": 0.15,
861
+ "learning_rate": 4.945132890240829e-06,
862
+ "loss": 1.8292,
863
+ "step": 930
864
+ },
865
+ {
866
+ "epoch": 0.15,
867
+ "learning_rate": 4.952899318146298e-06,
868
+ "loss": 1.8498,
869
+ "step": 940
870
+ },
871
+ {
872
+ "epoch": 0.16,
873
+ "learning_rate": 4.96058329833879e-06,
874
+ "loss": 1.8944,
875
+ "step": 950
876
+ },
877
+ {
878
+ "epoch": 0.16,
879
+ "learning_rate": 4.968186562961406e-06,
880
+ "loss": 1.885,
881
+ "step": 960
882
+ },
883
+ {
884
+ "epoch": 0.16,
885
+ "learning_rate": 4.975710790138337e-06,
886
+ "loss": 1.8469,
887
+ "step": 970
888
+ },
889
+ {
890
+ "epoch": 0.16,
891
+ "learning_rate": 4.9831576061979556e-06,
892
+ "loss": 1.8536,
893
+ "step": 980
894
+ },
895
+ {
896
+ "epoch": 0.16,
897
+ "learning_rate": 4.990528587782728e-06,
898
+ "loss": 1.8514,
899
+ "step": 990
900
+ },
901
+ {
902
+ "epoch": 0.16,
903
+ "learning_rate": 4.99782526385276e-06,
904
+ "loss": 1.8168,
905
+ "step": 1000
906
+ },
907
+ {
908
+ "epoch": 0.16,
909
+ "eval_gsm8k_hard_accuracy": 0.9009201579740238,
910
+ "eval_gsm8k_hard_loss": 0.44189453125,
911
+ "eval_gsm8k_hard_runtime": 1.5125,
912
+ "eval_gsm8k_hard_samples_per_second": 174.543,
913
+ "eval_gsm8k_hard_steps_per_second": 11.24,
914
+ "step": 1000
915
+ },
916
+ {
917
+ "epoch": 0.16,
918
+ "eval_webgpt_accuracy": 0.4676998865211623,
919
+ "eval_webgpt_loss": 2.44921875,
920
+ "eval_webgpt_runtime": 15.1394,
921
+ "eval_webgpt_samples_per_second": 258.664,
922
+ "eval_webgpt_steps_per_second": 16.183,
923
+ "step": 1000
924
+ },
925
+ {
926
+ "epoch": 0.16,
927
+ "eval_squad_v2_accuracy": 0.8740146386480032,
928
+ "eval_squad_v2_loss": 0.456298828125,
929
+ "eval_squad_v2_runtime": 77.6449,
930
+ "eval_squad_v2_samples_per_second": 335.682,
931
+ "eval_squad_v2_steps_per_second": 20.98,
932
+ "step": 1000
933
+ },
934
+ {
935
+ "epoch": 0.16,
936
+ "eval_adversarial_qa_accuracy": 0.7841552865406405,
937
+ "eval_adversarial_qa_loss": 1.16796875,
938
+ "eval_adversarial_qa_runtime": 20.3196,
939
+ "eval_adversarial_qa_samples_per_second": 295.282,
940
+ "eval_adversarial_qa_steps_per_second": 18.455,
941
+ "step": 1000
942
+ },
943
+ {
944
+ "epoch": 0.16,
945
+ "eval_private_tuning_accuracy": 0.6468452789452516,
946
+ "eval_private_tuning_loss": 1.33203125,
947
+ "eval_private_tuning_runtime": 62.217,
948
+ "eval_private_tuning_samples_per_second": 340.389,
949
+ "eval_private_tuning_steps_per_second": 21.28,
950
+ "step": 1000
951
+ },
952
+ {
953
+ "epoch": 0.16,
954
+ "eval_oa_translated_accuracy": 0.6605713712776647,
955
+ "eval_oa_translated_loss": 1.4912109375,
956
+ "eval_oa_translated_runtime": 498.0305,
957
+ "eval_oa_translated_samples_per_second": 268.506,
958
+ "eval_oa_translated_steps_per_second": 16.782,
959
+ "step": 1000
960
+ },
961
+ {
962
+ "epoch": 0.16,
963
+ "eval_prosocial_dialogue_accuracy": 0.5267998067934173,
964
+ "eval_prosocial_dialogue_loss": 1.9033203125,
965
+ "eval_prosocial_dialogue_runtime": 126.2272,
966
+ "eval_prosocial_dialogue_samples_per_second": 213.765,
967
+ "eval_prosocial_dialogue_steps_per_second": 13.365,
968
+ "step": 1000
969
+ },
970
+ {
971
+ "epoch": 0.16,
972
+ "eval_math_qa_accuracy": 0.5343074095293895,
973
+ "eval_math_qa_loss": 2.080078125,
974
+ "eval_math_qa_runtime": 19.3631,
975
+ "eval_math_qa_samples_per_second": 308.215,
976
+ "eval_math_qa_steps_per_second": 19.263,
977
+ "step": 1000
978
+ },
979
+ {
980
+ "epoch": 0.16,
981
+ "eval_wikihow_accuracy": 0.5909261056425897,
982
+ "eval_wikihow_loss": 2.078125,
983
+ "eval_wikihow_runtime": 7.7313,
984
+ "eval_wikihow_samples_per_second": 296.588,
985
+ "eval_wikihow_steps_per_second": 18.626,
986
+ "step": 1000
987
+ },
988
+ {
989
+ "epoch": 0.16,
990
+ "eval_joke_accuracy": 0.45830174374526156,
991
+ "eval_joke_loss": 2.498046875,
992
+ "eval_joke_runtime": 2.2389,
993
+ "eval_joke_samples_per_second": 33.945,
994
+ "eval_joke_steps_per_second": 2.233,
995
+ "step": 1000
996
+ },
997
+ {
998
+ "epoch": 0.16,
999
+ "eval_gsm8k_accuracy": 0.7258224765956256,
1000
+ "eval_gsm8k_loss": 1.0576171875,
1001
+ "eval_gsm8k_runtime": 6.1435,
1002
+ "eval_gsm8k_samples_per_second": 243.346,
1003
+ "eval_gsm8k_steps_per_second": 15.301,
1004
+ "step": 1000
1005
+ },
1006
+ {
1007
+ "epoch": 0.16,
1008
+ "eval_ted_trans_en-hi_accuracy": 0.5460148777895856,
1009
+ "eval_ted_trans_en-hi_loss": 2.138671875,
1010
+ "eval_ted_trans_en-hi_runtime": 0.5653,
1011
+ "eval_ted_trans_en-hi_samples_per_second": 182.218,
1012
+ "eval_ted_trans_en-hi_steps_per_second": 12.384,
1013
+ "step": 1000
1014
+ },
1015
+ {
1016
+ "epoch": 0.16,
1017
+ "eval_ted_trans_de-ja_accuracy": 0.5330968145857443,
1018
+ "eval_ted_trans_de-ja_loss": 2.193359375,
1019
+ "eval_ted_trans_de-ja_runtime": 3.858,
1020
+ "eval_ted_trans_de-ja_samples_per_second": 186.106,
1021
+ "eval_ted_trans_de-ja_steps_per_second": 11.664,
1022
+ "step": 1000
1023
+ },
1024
+ {
1025
+ "epoch": 0.16,
1026
+ "eval_ted_trans_nl-en_accuracy": 0.6414471117584333,
1027
+ "eval_ted_trans_nl-en_loss": 1.689453125,
1028
+ "eval_ted_trans_nl-en_runtime": 3.1044,
1029
+ "eval_ted_trans_nl-en_samples_per_second": 248.36,
1030
+ "eval_ted_trans_nl-en_steps_per_second": 15.784,
1031
+ "step": 1000
1032
+ },
1033
+ {
1034
+ "epoch": 0.16,
1035
+ "eval_ted_trans_en-ja_accuracy": 0.5530009680542111,
1036
+ "eval_ted_trans_en-ja_loss": 2.05859375,
1037
+ "eval_ted_trans_en-ja_runtime": 3.8822,
1038
+ "eval_ted_trans_en-ja_samples_per_second": 206.324,
1039
+ "eval_ted_trans_en-ja_steps_per_second": 13.137,
1040
+ "step": 1000
1041
+ },
1042
+ {
1043
+ "epoch": 0.16,
1044
+ "eval_ted_trans_en-es_accuracy": 0.7106248418922337,
1045
+ "eval_ted_trans_en-es_loss": 1.2880859375,
1046
+ "eval_ted_trans_en-es_runtime": 2.9435,
1047
+ "eval_ted_trans_en-es_samples_per_second": 280.621,
1048
+ "eval_ted_trans_en-es_steps_per_second": 17.666,
1049
+ "step": 1000
1050
+ },
1051
+ {
1052
+ "epoch": 0.16,
1053
+ "eval_ted_trans_en-ms_accuracy": 0.5987135081642752,
1054
+ "eval_ted_trans_en-ms_loss": 1.984375,
1055
+ "eval_ted_trans_en-ms_runtime": 1.4559,
1056
+ "eval_ted_trans_en-ms_samples_per_second": 28.848,
1057
+ "eval_ted_trans_en-ms_steps_per_second": 2.061,
1058
+ "step": 1000
1059
+ },
1060
+ {
1061
+ "epoch": 0.16,
1062
+ "eval_xsum_accuracy": 0.5722584941442159,
1063
+ "eval_xsum_loss": NaN,
1064
+ "eval_xsum_runtime": 141.7203,
1065
+ "eval_xsum_samples_per_second": 287.955,
1066
+ "eval_xsum_steps_per_second": 18.0,
1067
+ "step": 1000
1068
+ },
1069
+ {
1070
+ "epoch": 0.16,
1071
+ "eval_cnn_dailymail_accuracy": 0.6576155822271417,
1072
+ "eval_cnn_dailymail_loss": NaN,
1073
+ "eval_cnn_dailymail_runtime": 209.5351,
1074
+ "eval_cnn_dailymail_samples_per_second": 274.05,
1075
+ "eval_cnn_dailymail_steps_per_second": 17.128,
1076
+ "step": 1000
1077
+ },
1078
+ {
1079
+ "epoch": 0.16,
1080
+ "eval_multi_news_accuracy": 0.5226291863275405,
1081
+ "eval_multi_news_loss": NaN,
1082
+ "eval_multi_news_runtime": 36.8798,
1083
+ "eval_multi_news_samples_per_second": 243.9,
1084
+ "eval_multi_news_steps_per_second": 15.266,
1085
+ "step": 1000
1086
+ },
1087
+ {
1088
+ "epoch": 0.16,
1089
+ "eval_tldr_news_accuracy": 0.5359729145114267,
1090
+ "eval_tldr_news_loss": 2.20703125,
1091
+ "eval_tldr_news_runtime": 4.9335,
1092
+ "eval_tldr_news_samples_per_second": 289.451,
1093
+ "eval_tldr_news_steps_per_second": 18.243,
1094
+ "step": 1000
1095
+ },
1096
+ {
1097
+ "epoch": 0.16,
1098
+ "eval_scitldr_accuracy": 0.49700598802395207,
1099
+ "eval_scitldr_loss": NaN,
1100
+ "eval_scitldr_runtime": 1.5917,
1101
+ "eval_scitldr_samples_per_second": 250.67,
1102
+ "eval_scitldr_steps_per_second": 15.706,
1103
+ "step": 1000
1104
+ },
1105
+ {
1106
+ "epoch": 0.16,
1107
+ "eval_samsum_accuracy": 0.590799585469913,
1108
+ "eval_samsum_loss": 1.5537109375,
1109
+ "eval_samsum_runtime": 10.6642,
1110
+ "eval_samsum_samples_per_second": 276.345,
1111
+ "eval_samsum_steps_per_second": 17.348,
1112
+ "step": 1000
1113
+ },
1114
+ {
1115
+ "epoch": 0.16,
1116
+ "eval_debate_sum_accuracy": 0.9329163674973446,
1117
+ "eval_debate_sum_loss": NaN,
1118
+ "eval_debate_sum_runtime": 196.1179,
1119
+ "eval_debate_sum_samples_per_second": 245.332,
1120
+ "eval_debate_sum_steps_per_second": 15.338,
1121
+ "step": 1000
1122
+ },
1123
+ {
1124
+ "epoch": 0.16,
1125
+ "eval_billsum_accuracy": 0.6510711811280909,
1126
+ "eval_billsum_loss": NaN,
1127
+ "eval_billsum_runtime": 16.6536,
1128
+ "eval_billsum_samples_per_second": 227.579,
1129
+ "eval_billsum_steps_per_second": 14.231,
1130
+ "step": 1000
1131
+ },
1132
+ {
1133
+ "epoch": 0.16,
1134
+ "eval_wmt2019_zh-en_accuracy": 0.5587294145226513,
1135
+ "eval_wmt2019_zh-en_loss": 2.111328125,
1136
+ "eval_wmt2019_zh-en_runtime": 12.8767,
1137
+ "eval_wmt2019_zh-en_samples_per_second": 309.164,
1138
+ "eval_wmt2019_zh-en_steps_per_second": 19.337,
1139
+ "step": 1000
1140
+ },
1141
+ {
1142
+ "epoch": 0.16,
1143
+ "eval_wmt2019_ru-en_accuracy": 0.6366095054310483,
1144
+ "eval_wmt2019_ru-en_loss": 1.552734375,
1145
+ "eval_wmt2019_ru-en_runtime": 10.1123,
1146
+ "eval_wmt2019_ru-en_samples_per_second": 296.667,
1147
+ "eval_wmt2019_ru-en_steps_per_second": 18.591,
1148
+ "step": 1000
1149
+ },
1150
+ {
1151
+ "epoch": 0.16,
1152
+ "eval_wmt2019_de-en_accuracy": 0.6681106028096029,
1153
+ "eval_wmt2019_de-en_loss": 1.4462890625,
1154
+ "eval_wmt2019_de-en_runtime": 9.913,
1155
+ "eval_wmt2019_de-en_samples_per_second": 302.432,
1156
+ "eval_wmt2019_de-en_steps_per_second": 18.965,
1157
+ "step": 1000
1158
+ },
1159
+ {
1160
+ "epoch": 0.16,
1161
+ "eval_wmt2019_fr-de_accuracy": 0.6540822971254923,
1162
+ "eval_wmt2019_fr-de_loss": 1.51171875,
1163
+ "eval_wmt2019_fr-de_runtime": 5.7364,
1164
+ "eval_wmt2019_fr-de_samples_per_second": 263.579,
1165
+ "eval_wmt2019_fr-de_steps_per_second": 16.561,
1166
+ "step": 1000
1167
+ },
1168
+ {
1169
+ "epoch": 0.16,
1170
+ "eval_essay_instruction_accuracy": 0.5807311500380807,
1171
+ "eval_essay_instruction_loss": 2.072265625,
1172
+ "eval_essay_instruction_runtime": 4.2906,
1173
+ "eval_essay_instruction_samples_per_second": 96.257,
1174
+ "eval_essay_instruction_steps_per_second": 6.06,
1175
+ "step": 1000
1176
+ },
1177
+ {
1178
+ "epoch": 0.16,
1179
+ "eval_reddit_eli5_accuracy": 0.42394149731958486,
1180
+ "eval_reddit_eli5_loss": 2.748046875,
1181
+ "eval_reddit_eli5_runtime": 220.7861,
1182
+ "eval_reddit_eli5_samples_per_second": 246.968,
1183
+ "eval_reddit_eli5_steps_per_second": 15.436,
1184
+ "step": 1000
1185
+ },
1186
+ {
1187
+ "epoch": 0.16,
1188
+ "eval_reddit_askh_accuracy": 0.42705794870139985,
1189
+ "eval_reddit_askh_loss": 2.826171875,
1190
+ "eval_reddit_askh_runtime": 106.0159,
1191
+ "eval_reddit_askh_samples_per_second": 185.868,
1192
+ "eval_reddit_askh_steps_per_second": 11.621,
1193
+ "step": 1000
1194
+ },
1195
+ {
1196
+ "epoch": 0.16,
1197
+ "eval_reddit_asks_accuracy": 0.43686793419350517,
1198
+ "eval_reddit_asks_loss": 2.67578125,
1199
+ "eval_reddit_asks_runtime": 110.0537,
1200
+ "eval_reddit_asks_samples_per_second": 239.483,
1201
+ "eval_reddit_asks_steps_per_second": 14.975,
1202
+ "step": 1000
1203
+ },
1204
+ {
1205
+ "epoch": 0.17,
1206
+ "learning_rate": 4.997313753581662e-06,
1207
+ "loss": 1.8558,
1208
+ "step": 1010
1209
+ },
1210
+ {
1211
+ "epoch": 0.17,
1212
+ "learning_rate": 4.992836676217766e-06,
1213
+ "loss": 1.862,
1214
+ "step": 1020
1215
+ },
1216
+ {
1217
+ "epoch": 0.17,
1218
+ "learning_rate": 4.988359598853868e-06,
1219
+ "loss": 1.8667,
1220
+ "step": 1030
1221
+ },
1222
+ {
1223
+ "epoch": 0.17,
1224
+ "learning_rate": 4.9838825214899716e-06,
1225
+ "loss": 1.8597,
1226
+ "step": 1040
1227
+ },
1228
+ {
1229
+ "epoch": 0.17,
1230
+ "learning_rate": 4.979405444126075e-06,
1231
+ "loss": 1.8472,
1232
+ "step": 1050
1233
+ },
1234
+ {
1235
+ "epoch": 0.17,
1236
+ "learning_rate": 4.974928366762178e-06,
1237
+ "loss": 1.8552,
1238
+ "step": 1060
1239
+ },
1240
+ {
1241
+ "epoch": 0.18,
1242
+ "learning_rate": 4.9704512893982816e-06,
1243
+ "loss": 1.8532,
1244
+ "step": 1070
1245
+ },
1246
+ {
1247
+ "epoch": 0.18,
1248
+ "learning_rate": 4.965974212034385e-06,
1249
+ "loss": 1.8235,
1250
+ "step": 1080
1251
+ },
1252
+ {
1253
+ "epoch": 0.18,
1254
+ "learning_rate": 4.961497134670487e-06,
1255
+ "loss": 1.861,
1256
+ "step": 1090
1257
+ },
1258
+ {
1259
+ "epoch": 0.18,
1260
+ "learning_rate": 4.957020057306591e-06,
1261
+ "loss": 1.7964,
1262
+ "step": 1100
1263
+ },
1264
+ {
1265
+ "epoch": 0.18,
1266
+ "learning_rate": 4.952542979942694e-06,
1267
+ "loss": 1.8203,
1268
+ "step": 1110
1269
+ },
1270
+ {
1271
+ "epoch": 0.18,
1272
+ "learning_rate": 4.9480659025787965e-06,
1273
+ "loss": 1.8759,
1274
+ "step": 1120
1275
+ },
1276
+ {
1277
+ "epoch": 0.19,
1278
+ "learning_rate": 4.9435888252149e-06,
1279
+ "loss": 1.8592,
1280
+ "step": 1130
1281
+ },
1282
+ {
1283
+ "epoch": 0.19,
1284
+ "learning_rate": 4.939111747851003e-06,
1285
+ "loss": 1.8487,
1286
+ "step": 1140
1287
+ },
1288
+ {
1289
+ "epoch": 0.19,
1290
+ "learning_rate": 4.9346346704871065e-06,
1291
+ "loss": 1.8112,
1292
+ "step": 1150
1293
+ },
1294
+ {
1295
+ "epoch": 0.19,
1296
+ "learning_rate": 4.930157593123209e-06,
1297
+ "loss": 1.8028,
1298
+ "step": 1160
1299
+ },
1300
+ {
1301
+ "epoch": 0.19,
1302
+ "learning_rate": 4.925680515759313e-06,
1303
+ "loss": 1.8361,
1304
+ "step": 1170
1305
+ },
1306
+ {
1307
+ "epoch": 0.19,
1308
+ "learning_rate": 4.921203438395416e-06,
1309
+ "loss": 1.8625,
1310
+ "step": 1180
1311
+ },
1312
+ {
1313
+ "epoch": 0.2,
1314
+ "learning_rate": 4.916726361031519e-06,
1315
+ "loss": 1.8345,
1316
+ "step": 1190
1317
+ },
1318
+ {
1319
+ "epoch": 0.2,
1320
+ "learning_rate": 4.912249283667622e-06,
1321
+ "loss": 1.8506,
1322
+ "step": 1200
1323
+ },
1324
+ {
1325
+ "epoch": 0.2,
1326
+ "learning_rate": 4.907772206303726e-06,
1327
+ "loss": 1.8326,
1328
+ "step": 1210
1329
+ },
1330
+ {
1331
+ "epoch": 0.2,
1332
+ "learning_rate": 4.903295128939828e-06,
1333
+ "loss": 1.8399,
1334
+ "step": 1220
1335
+ },
1336
+ {
1337
+ "epoch": 0.2,
1338
+ "learning_rate": 4.8988180515759315e-06,
1339
+ "loss": 1.8706,
1340
+ "step": 1230
1341
+ },
1342
+ {
1343
+ "epoch": 0.2,
1344
+ "learning_rate": 4.894340974212035e-06,
1345
+ "loss": 1.8227,
1346
+ "step": 1240
1347
+ },
1348
+ {
1349
+ "epoch": 0.21,
1350
+ "learning_rate": 4.889863896848137e-06,
1351
+ "loss": 1.8461,
1352
+ "step": 1250
1353
+ },
1354
+ {
1355
+ "epoch": 0.21,
1356
+ "learning_rate": 4.8853868194842415e-06,
1357
+ "loss": 1.874,
1358
+ "step": 1260
1359
+ },
1360
+ {
1361
+ "epoch": 0.21,
1362
+ "learning_rate": 4.880909742120344e-06,
1363
+ "loss": 1.8178,
1364
+ "step": 1270
1365
+ },
1366
+ {
1367
+ "epoch": 0.21,
1368
+ "learning_rate": 4.876432664756447e-06,
1369
+ "loss": 1.8141,
1370
+ "step": 1280
1371
+ },
1372
+ {
1373
+ "epoch": 0.21,
1374
+ "learning_rate": 4.871955587392551e-06,
1375
+ "loss": 1.8258,
1376
+ "step": 1290
1377
+ },
1378
+ {
1379
+ "epoch": 0.21,
1380
+ "learning_rate": 4.867478510028654e-06,
1381
+ "loss": 1.8595,
1382
+ "step": 1300
1383
+ },
1384
+ {
1385
+ "epoch": 0.22,
1386
+ "learning_rate": 4.8630014326647565e-06,
1387
+ "loss": 1.8495,
1388
+ "step": 1310
1389
+ },
1390
+ {
1391
+ "epoch": 0.22,
1392
+ "learning_rate": 4.85852435530086e-06,
1393
+ "loss": 1.8492,
1394
+ "step": 1320
1395
+ },
1396
+ {
1397
+ "epoch": 0.22,
1398
+ "learning_rate": 4.854047277936963e-06,
1399
+ "loss": 1.8339,
1400
+ "step": 1330
1401
+ },
1402
+ {
1403
+ "epoch": 0.22,
1404
+ "learning_rate": 4.849570200573066e-06,
1405
+ "loss": 1.8218,
1406
+ "step": 1340
1407
+ },
1408
+ {
1409
+ "epoch": 0.22,
1410
+ "learning_rate": 4.84509312320917e-06,
1411
+ "loss": 1.8411,
1412
+ "step": 1350
1413
+ },
1414
+ {
1415
+ "epoch": 0.22,
1416
+ "learning_rate": 4.840616045845273e-06,
1417
+ "loss": 1.8275,
1418
+ "step": 1360
1419
+ },
1420
+ {
1421
+ "epoch": 0.23,
1422
+ "learning_rate": 4.836138968481376e-06,
1423
+ "loss": 1.8358,
1424
+ "step": 1370
1425
+ },
1426
+ {
1427
+ "epoch": 0.23,
1428
+ "learning_rate": 4.831661891117479e-06,
1429
+ "loss": 1.8395,
1430
+ "step": 1380
1431
+ },
1432
+ {
1433
+ "epoch": 0.23,
1434
+ "learning_rate": 4.827184813753582e-06,
1435
+ "loss": 1.8114,
1436
+ "step": 1390
1437
+ },
1438
+ {
1439
+ "epoch": 0.23,
1440
+ "learning_rate": 4.822707736389685e-06,
1441
+ "loss": 1.8167,
1442
+ "step": 1400
1443
+ },
1444
+ {
1445
+ "epoch": 0.23,
1446
+ "learning_rate": 4.818230659025788e-06,
1447
+ "loss": 1.8502,
1448
+ "step": 1410
1449
+ },
1450
+ {
1451
+ "epoch": 0.23,
1452
+ "learning_rate": 4.8137535816618915e-06,
1453
+ "loss": 1.8494,
1454
+ "step": 1420
1455
+ },
1456
+ {
1457
+ "epoch": 0.24,
1458
+ "learning_rate": 4.809276504297995e-06,
1459
+ "loss": 1.8546,
1460
+ "step": 1430
1461
+ },
1462
+ {
1463
+ "epoch": 0.24,
1464
+ "learning_rate": 4.804799426934098e-06,
1465
+ "loss": 1.8219,
1466
+ "step": 1440
1467
+ },
1468
+ {
1469
+ "epoch": 0.24,
1470
+ "learning_rate": 4.8003223495702015e-06,
1471
+ "loss": 1.856,
1472
+ "step": 1450
1473
+ },
1474
+ {
1475
+ "epoch": 0.24,
1476
+ "learning_rate": 4.795845272206304e-06,
1477
+ "loss": 1.8235,
1478
+ "step": 1460
1479
+ },
1480
+ {
1481
+ "epoch": 0.24,
1482
+ "learning_rate": 4.791368194842407e-06,
1483
+ "loss": 1.8084,
1484
+ "step": 1470
1485
+ },
1486
+ {
1487
+ "epoch": 0.24,
1488
+ "learning_rate": 4.786891117478511e-06,
1489
+ "loss": 1.8036,
1490
+ "step": 1480
1491
+ },
1492
+ {
1493
+ "epoch": 0.24,
1494
+ "learning_rate": 4.782414040114613e-06,
1495
+ "loss": 1.807,
1496
+ "step": 1490
1497
+ },
1498
+ {
1499
+ "epoch": 0.25,
1500
+ "learning_rate": 4.7779369627507165e-06,
1501
+ "loss": 1.8223,
1502
+ "step": 1500
1503
+ },
1504
+ {
1505
+ "epoch": 0.25,
1506
+ "eval_gsm8k_hard_accuracy": 0.9040755669557429,
1507
+ "eval_gsm8k_hard_loss": 0.418212890625,
1508
+ "eval_gsm8k_hard_runtime": 1.5226,
1509
+ "eval_gsm8k_hard_samples_per_second": 173.393,
1510
+ "eval_gsm8k_hard_steps_per_second": 11.165,
1511
+ "step": 1500
1512
+ },
1513
+ {
1514
+ "epoch": 0.25,
1515
+ "eval_webgpt_accuracy": 0.4685187206997972,
1516
+ "eval_webgpt_loss": 2.43359375,
1517
+ "eval_webgpt_runtime": 16.9148,
1518
+ "eval_webgpt_samples_per_second": 231.513,
1519
+ "eval_webgpt_steps_per_second": 14.484,
1520
+ "step": 1500
1521
+ },
1522
+ {
1523
+ "epoch": 0.25,
1524
+ "eval_squad_v2_accuracy": 0.8753178869062296,
1525
+ "eval_squad_v2_loss": 0.424560546875,
1526
+ "eval_squad_v2_runtime": 78.195,
1527
+ "eval_squad_v2_samples_per_second": 333.32,
1528
+ "eval_squad_v2_steps_per_second": 20.833,
1529
+ "step": 1500
1530
+ },
1531
+ {
1532
+ "epoch": 0.25,
1533
+ "eval_adversarial_qa_accuracy": 0.7873755143419405,
1534
+ "eval_adversarial_qa_loss": 1.076171875,
1535
+ "eval_adversarial_qa_runtime": 18.1293,
1536
+ "eval_adversarial_qa_samples_per_second": 330.956,
1537
+ "eval_adversarial_qa_steps_per_second": 20.685,
1538
+ "step": 1500
1539
+ },
1540
+ {
1541
+ "epoch": 0.25,
1542
+ "eval_private_tuning_accuracy": 0.6490207424086791,
1543
+ "eval_private_tuning_loss": 1.30859375,
1544
+ "eval_private_tuning_runtime": 65.2643,
1545
+ "eval_private_tuning_samples_per_second": 324.496,
1546
+ "eval_private_tuning_steps_per_second": 20.287,
1547
+ "step": 1500
1548
+ },
1549
+ {
1550
+ "epoch": 0.25,
1551
+ "eval_oa_translated_accuracy": 0.6676994033045951,
1552
+ "eval_oa_translated_loss": 1.447265625,
1553
+ "eval_oa_translated_runtime": 495.9674,
1554
+ "eval_oa_translated_samples_per_second": 269.623,
1555
+ "eval_oa_translated_steps_per_second": 16.852,
1556
+ "step": 1500
1557
+ },
1558
+ {
1559
+ "epoch": 0.25,
1560
+ "eval_prosocial_dialogue_accuracy": 0.52370774081413,
1561
+ "eval_prosocial_dialogue_loss": 1.802734375,
1562
+ "eval_prosocial_dialogue_runtime": 117.5764,
1563
+ "eval_prosocial_dialogue_samples_per_second": 229.493,
1564
+ "eval_prosocial_dialogue_steps_per_second": 14.348,
1565
+ "step": 1500
1566
+ },
1567
+ {
1568
+ "epoch": 0.25,
1569
+ "eval_math_qa_accuracy": 0.5447965302424216,
1570
+ "eval_math_qa_loss": 2.015625,
1571
+ "eval_math_qa_runtime": 19.5318,
1572
+ "eval_math_qa_samples_per_second": 305.553,
1573
+ "eval_math_qa_steps_per_second": 19.097,
1574
+ "step": 1500
1575
+ },
1576
+ {
1577
+ "epoch": 0.25,
1578
+ "eval_wikihow_accuracy": 0.5924303341189519,
1579
+ "eval_wikihow_loss": 2.048828125,
1580
+ "eval_wikihow_runtime": 8.7505,
1581
+ "eval_wikihow_samples_per_second": 262.042,
1582
+ "eval_wikihow_steps_per_second": 16.456,
1583
+ "step": 1500
1584
+ },
1585
+ {
1586
+ "epoch": 0.25,
1587
+ "eval_joke_accuracy": 0.46341925701288855,
1588
+ "eval_joke_loss": 2.439453125,
1589
+ "eval_joke_runtime": 0.9638,
1590
+ "eval_joke_samples_per_second": 78.857,
1591
+ "eval_joke_steps_per_second": 5.188,
1592
+ "step": 1500
1593
+ },
1594
+ {
1595
+ "epoch": 0.25,
1596
+ "eval_gsm8k_accuracy": 0.7337848616728005,
1597
+ "eval_gsm8k_loss": 1.01171875,
1598
+ "eval_gsm8k_runtime": 6.3369,
1599
+ "eval_gsm8k_samples_per_second": 235.921,
1600
+ "eval_gsm8k_steps_per_second": 14.834,
1601
+ "step": 1500
1602
+ },
1603
+ {
1604
+ "epoch": 0.25,
1605
+ "eval_ted_trans_en-hi_accuracy": 0.5467649647887324,
1606
+ "eval_ted_trans_en-hi_loss": 2.080078125,
1607
+ "eval_ted_trans_en-hi_runtime": 0.6116,
1608
+ "eval_ted_trans_en-hi_samples_per_second": 168.402,
1609
+ "eval_ted_trans_en-hi_steps_per_second": 11.445,
1610
+ "step": 1500
1611
+ },
1612
+ {
1613
+ "epoch": 0.25,
1614
+ "eval_ted_trans_de-ja_accuracy": 0.5412865271482303,
1615
+ "eval_ted_trans_de-ja_loss": 2.130859375,
1616
+ "eval_ted_trans_de-ja_runtime": 3.781,
1617
+ "eval_ted_trans_de-ja_samples_per_second": 189.896,
1618
+ "eval_ted_trans_de-ja_steps_per_second": 11.902,
1619
+ "step": 1500
1620
+ },
1621
+ {
1622
+ "epoch": 0.25,
1623
+ "eval_ted_trans_nl-en_accuracy": 0.6488075112486671,
1624
+ "eval_ted_trans_nl-en_loss": 1.65625,
1625
+ "eval_ted_trans_nl-en_runtime": 3.2481,
1626
+ "eval_ted_trans_nl-en_samples_per_second": 237.369,
1627
+ "eval_ted_trans_nl-en_steps_per_second": 15.086,
1628
+ "step": 1500
1629
+ },
1630
+ {
1631
+ "epoch": 0.25,
1632
+ "eval_ted_trans_en-ja_accuracy": 0.5640618403329245,
1633
+ "eval_ted_trans_en-ja_loss": 1.990234375,
1634
+ "eval_ted_trans_en-ja_runtime": 3.5175,
1635
+ "eval_ted_trans_en-ja_samples_per_second": 227.717,
1636
+ "eval_ted_trans_en-ja_steps_per_second": 14.499,
1637
+ "step": 1500
1638
+ },
1639
+ {
1640
+ "epoch": 0.25,
1641
+ "eval_ted_trans_en-es_accuracy": 0.7159835441109249,
1642
+ "eval_ted_trans_en-es_loss": 1.25,
1643
+ "eval_ted_trans_en-es_runtime": 3.1192,
1644
+ "eval_ted_trans_en-es_samples_per_second": 264.814,
1645
+ "eval_ted_trans_en-es_steps_per_second": 16.671,
1646
+ "step": 1500
1647
+ },
1648
+ {
1649
+ "epoch": 0.25,
1650
+ "eval_ted_trans_en-ms_accuracy": 0.5680356259277586,
1651
+ "eval_ted_trans_en-ms_loss": 2.125,
1652
+ "eval_ted_trans_en-ms_runtime": 1.4378,
1653
+ "eval_ted_trans_en-ms_samples_per_second": 29.212,
1654
+ "eval_ted_trans_en-ms_steps_per_second": 2.087,
1655
+ "step": 1500
1656
+ },
1657
+ {
1658
+ "epoch": 0.25,
1659
+ "eval_xsum_accuracy": 0.575791398307109,
1660
+ "eval_xsum_loss": NaN,
1661
+ "eval_xsum_runtime": 142.8893,
1662
+ "eval_xsum_samples_per_second": 285.599,
1663
+ "eval_xsum_steps_per_second": 17.853,
1664
+ "step": 1500
1665
+ },
1666
+ {
1667
+ "epoch": 0.25,
1668
+ "eval_cnn_dailymail_accuracy": 0.6578154814514997,
1669
+ "eval_cnn_dailymail_loss": NaN,
1670
+ "eval_cnn_dailymail_runtime": 210.3199,
1671
+ "eval_cnn_dailymail_samples_per_second": 273.027,
1672
+ "eval_cnn_dailymail_steps_per_second": 17.064,
1673
+ "step": 1500
1674
+ },
1675
+ {
1676
+ "epoch": 0.25,
1677
+ "eval_multi_news_accuracy": 0.5236211410651092,
1678
+ "eval_multi_news_loss": NaN,
1679
+ "eval_multi_news_runtime": 35.5484,
1680
+ "eval_multi_news_samples_per_second": 253.035,
1681
+ "eval_multi_news_steps_per_second": 15.838,
1682
+ "step": 1500
1683
+ },
1684
+ {
1685
+ "epoch": 0.25,
1686
+ "eval_tldr_news_accuracy": 0.5471644879149816,
1687
+ "eval_tldr_news_loss": 2.119140625,
1688
+ "eval_tldr_news_runtime": 4.1143,
1689
+ "eval_tldr_news_samples_per_second": 347.078,
1690
+ "eval_tldr_news_steps_per_second": 21.875,
1691
+ "step": 1500
1692
+ },
1693
+ {
1694
+ "epoch": 0.25,
1695
+ "eval_scitldr_accuracy": 0.49550898203592814,
1696
+ "eval_scitldr_loss": NaN,
1697
+ "eval_scitldr_runtime": 2.4172,
1698
+ "eval_scitldr_samples_per_second": 165.069,
1699
+ "eval_scitldr_steps_per_second": 10.343,
1700
+ "step": 1500
1701
+ },
1702
+ {
1703
+ "epoch": 0.25,
1704
+ "eval_samsum_accuracy": 0.5926165192931454,
1705
+ "eval_samsum_loss": 1.5224609375,
1706
+ "eval_samsum_runtime": 10.6814,
1707
+ "eval_samsum_samples_per_second": 275.901,
1708
+ "eval_samsum_steps_per_second": 17.32,
1709
+ "step": 1500
1710
+ },
1711
+ {
1712
+ "epoch": 0.25,
1713
+ "eval_debate_sum_accuracy": 0.9358983757089394,
1714
+ "eval_debate_sum_loss": NaN,
1715
+ "eval_debate_sum_runtime": 196.0638,
1716
+ "eval_debate_sum_samples_per_second": 245.4,
1717
+ "eval_debate_sum_steps_per_second": 15.342,
1718
+ "step": 1500
1719
+ },
1720
+ {
1721
+ "epoch": 0.25,
1722
+ "eval_billsum_accuracy": 0.653463309552768,
1723
+ "eval_billsum_loss": NaN,
1724
+ "eval_billsum_runtime": 16.6514,
1725
+ "eval_billsum_samples_per_second": 227.609,
1726
+ "eval_billsum_steps_per_second": 14.233,
1727
+ "step": 1500
1728
+ },
1729
+ {
1730
+ "epoch": 0.25,
1731
+ "eval_wmt2019_zh-en_accuracy": 0.5612179149240267,
1732
+ "eval_wmt2019_zh-en_loss": 2.091796875,
1733
+ "eval_wmt2019_zh-en_runtime": 13.0415,
1734
+ "eval_wmt2019_zh-en_samples_per_second": 305.255,
1735
+ "eval_wmt2019_zh-en_steps_per_second": 19.093,
1736
+ "step": 1500
1737
+ },
1738
+ {
1739
+ "epoch": 0.25,
1740
+ "eval_wmt2019_ru-en_accuracy": 0.6424937502741108,
1741
+ "eval_wmt2019_ru-en_loss": 1.5146484375,
1742
+ "eval_wmt2019_ru-en_runtime": 9.2157,
1743
+ "eval_wmt2019_ru-en_samples_per_second": 325.531,
1744
+ "eval_wmt2019_ru-en_steps_per_second": 20.4,
1745
+ "step": 1500
1746
+ },
1747
+ {
1748
+ "epoch": 0.25,
1749
+ "eval_wmt2019_de-en_accuracy": 0.6710659487470143,
1750
+ "eval_wmt2019_de-en_loss": 1.4248046875,
1751
+ "eval_wmt2019_de-en_runtime": 10.6387,
1752
+ "eval_wmt2019_de-en_samples_per_second": 281.801,
1753
+ "eval_wmt2019_de-en_steps_per_second": 17.671,
1754
+ "step": 1500
1755
+ },
1756
+ {
1757
+ "epoch": 0.25,
1758
+ "eval_wmt2019_fr-de_accuracy": 0.6660826692300537,
1759
+ "eval_wmt2019_fr-de_loss": 1.4521484375,
1760
+ "eval_wmt2019_fr-de_runtime": 5.6356,
1761
+ "eval_wmt2019_fr-de_samples_per_second": 268.295,
1762
+ "eval_wmt2019_fr-de_steps_per_second": 16.857,
1763
+ "step": 1500
1764
+ },
1765
+ {
1766
+ "epoch": 0.25,
1767
+ "eval_essay_instruction_accuracy": 0.5827727003469578,
1768
+ "eval_essay_instruction_loss": 2.05859375,
1769
+ "eval_essay_instruction_runtime": 4.1866,
1770
+ "eval_essay_instruction_samples_per_second": 98.649,
1771
+ "eval_essay_instruction_steps_per_second": 6.21,
1772
+ "step": 1500
1773
+ },
1774
+ {
1775
+ "epoch": 0.25,
1776
+ "eval_reddit_eli5_accuracy": 0.42421905510230506,
1777
+ "eval_reddit_eli5_loss": 2.73828125,
1778
+ "eval_reddit_eli5_runtime": 199.4145,
1779
+ "eval_reddit_eli5_samples_per_second": 273.436,
1780
+ "eval_reddit_eli5_steps_per_second": 17.09,
1781
+ "step": 1500
1782
+ },
1783
+ {
1784
+ "epoch": 0.25,
1785
+ "eval_reddit_askh_accuracy": 0.4276221576585814,
1786
+ "eval_reddit_askh_loss": 2.81640625,
1787
+ "eval_reddit_askh_runtime": 110.2213,
1788
+ "eval_reddit_askh_samples_per_second": 178.777,
1789
+ "eval_reddit_askh_steps_per_second": 11.178,
1790
+ "step": 1500
1791
+ },
1792
+ {
1793
+ "epoch": 0.25,
1794
+ "eval_reddit_asks_accuracy": 0.4372012246587393,
1795
+ "eval_reddit_asks_loss": 2.66796875,
1796
+ "eval_reddit_asks_runtime": 131.6507,
1797
+ "eval_reddit_asks_samples_per_second": 200.196,
1798
+ "eval_reddit_asks_steps_per_second": 12.518,
1799
+ "step": 1500
1800
+ },
1801
+ {
1802
+ "epoch": 0.25,
1803
+ "learning_rate": 4.77345988538682e-06,
1804
+ "loss": 1.8033,
1805
+ "step": 1510
1806
+ },
1807
+ {
1808
+ "epoch": 0.25,
1809
+ "learning_rate": 4.768982808022923e-06,
1810
+ "loss": 1.8477,
1811
+ "step": 1520
1812
+ },
1813
+ {
1814
+ "epoch": 0.25,
1815
+ "learning_rate": 4.7645057306590265e-06,
1816
+ "loss": 1.8417,
1817
+ "step": 1530
1818
+ },
1819
+ {
1820
+ "epoch": 0.25,
1821
+ "learning_rate": 4.76002865329513e-06,
1822
+ "loss": 1.7781,
1823
+ "step": 1540
1824
+ },
1825
+ {
1826
+ "epoch": 0.25,
1827
+ "learning_rate": 4.755551575931232e-06,
1828
+ "loss": 1.808,
1829
+ "step": 1550
1830
+ },
1831
+ {
1832
+ "epoch": 0.26,
1833
+ "learning_rate": 4.751074498567336e-06,
1834
+ "loss": 1.8719,
1835
+ "step": 1560
1836
+ },
1837
+ {
1838
+ "epoch": 0.26,
1839
+ "learning_rate": 4.746597421203439e-06,
1840
+ "loss": 1.8382,
1841
+ "step": 1570
1842
+ },
1843
+ {
1844
+ "epoch": 0.26,
1845
+ "learning_rate": 4.742120343839542e-06,
1846
+ "loss": 1.7991,
1847
+ "step": 1580
1848
+ },
1849
+ {
1850
+ "epoch": 0.26,
1851
+ "learning_rate": 4.737643266475645e-06,
1852
+ "loss": 1.809,
1853
+ "step": 1590
1854
+ },
1855
+ {
1856
+ "epoch": 0.26,
1857
+ "learning_rate": 4.733166189111748e-06,
1858
+ "loss": 1.8206,
1859
+ "step": 1600
1860
+ },
1861
+ {
1862
+ "epoch": 0.26,
1863
+ "learning_rate": 4.7286891117478515e-06,
1864
+ "loss": 1.8475,
1865
+ "step": 1610
1866
+ },
1867
+ {
1868
+ "epoch": 0.27,
1869
+ "learning_rate": 4.724212034383955e-06,
1870
+ "loss": 1.8342,
1871
+ "step": 1620
1872
+ },
1873
+ {
1874
+ "epoch": 0.27,
1875
+ "learning_rate": 4.719734957020058e-06,
1876
+ "loss": 1.8436,
1877
+ "step": 1630
1878
+ },
1879
+ {
1880
+ "epoch": 0.27,
1881
+ "learning_rate": 4.715257879656161e-06,
1882
+ "loss": 1.8198,
1883
+ "step": 1640
1884
+ },
1885
+ {
1886
+ "epoch": 0.27,
1887
+ "learning_rate": 4.710780802292264e-06,
1888
+ "loss": 1.8271,
1889
+ "step": 1650
1890
+ },
1891
+ {
1892
+ "epoch": 0.27,
1893
+ "learning_rate": 4.706303724928367e-06,
1894
+ "loss": 1.8584,
1895
+ "step": 1660
1896
+ },
1897
+ {
1898
+ "epoch": 0.27,
1899
+ "learning_rate": 4.701826647564471e-06,
1900
+ "loss": 1.8485,
1901
+ "step": 1670
1902
+ },
1903
+ {
1904
+ "epoch": 0.28,
1905
+ "learning_rate": 4.697349570200573e-06,
1906
+ "loss": 1.7872,
1907
+ "step": 1680
1908
+ },
1909
+ {
1910
+ "epoch": 0.28,
1911
+ "learning_rate": 4.6928724928366764e-06,
1912
+ "loss": 1.8026,
1913
+ "step": 1690
1914
+ },
1915
+ {
1916
+ "epoch": 0.28,
1917
+ "learning_rate": 4.68839541547278e-06,
1918
+ "loss": 1.8138,
1919
+ "step": 1700
1920
+ },
1921
+ {
1922
+ "epoch": 0.28,
1923
+ "learning_rate": 4.683918338108882e-06,
1924
+ "loss": 1.7871,
1925
+ "step": 1710
1926
+ },
1927
+ {
1928
+ "epoch": 0.28,
1929
+ "learning_rate": 4.6794412607449864e-06,
1930
+ "loss": 1.8459,
1931
+ "step": 1720
1932
+ },
1933
+ {
1934
+ "epoch": 0.28,
1935
+ "learning_rate": 4.67496418338109e-06,
1936
+ "loss": 1.8081,
1937
+ "step": 1730
1938
+ },
1939
+ {
1940
+ "epoch": 0.29,
1941
+ "learning_rate": 4.670487106017192e-06,
1942
+ "loss": 1.7956,
1943
+ "step": 1740
1944
+ },
1945
+ {
1946
+ "epoch": 0.29,
1947
+ "learning_rate": 4.666010028653296e-06,
1948
+ "loss": 1.779,
1949
+ "step": 1750
1950
+ },
1951
+ {
1952
+ "epoch": 0.29,
1953
+ "learning_rate": 4.661532951289399e-06,
1954
+ "loss": 1.8106,
1955
+ "step": 1760
1956
+ },
1957
+ {
1958
+ "epoch": 0.29,
1959
+ "learning_rate": 4.657055873925501e-06,
1960
+ "loss": 1.8225,
1961
+ "step": 1770
1962
+ },
1963
+ {
1964
+ "epoch": 0.29,
1965
+ "learning_rate": 4.652578796561605e-06,
1966
+ "loss": 1.8263,
1967
+ "step": 1780
1968
+ },
1969
+ {
1970
+ "epoch": 0.29,
1971
+ "learning_rate": 4.648101719197708e-06,
1972
+ "loss": 1.8232,
1973
+ "step": 1790
1974
+ },
1975
+ {
1976
+ "epoch": 0.3,
1977
+ "learning_rate": 4.643624641833811e-06,
1978
+ "loss": 1.751,
1979
+ "step": 1800
1980
+ },
1981
+ {
1982
+ "epoch": 0.3,
1983
+ "learning_rate": 4.639147564469915e-06,
1984
+ "loss": 1.8275,
1985
+ "step": 1810
1986
+ },
1987
+ {
1988
+ "epoch": 0.3,
1989
+ "learning_rate": 4.634670487106018e-06,
1990
+ "loss": 1.7738,
1991
+ "step": 1820
1992
+ },
1993
+ {
1994
+ "epoch": 0.3,
1995
+ "learning_rate": 4.6301934097421206e-06,
1996
+ "loss": 1.8064,
1997
+ "step": 1830
1998
+ },
1999
+ {
2000
+ "epoch": 0.3,
2001
+ "learning_rate": 4.625716332378224e-06,
2002
+ "loss": 1.7775,
2003
+ "step": 1840
2004
+ },
2005
+ {
2006
+ "epoch": 0.3,
2007
+ "learning_rate": 4.621239255014327e-06,
2008
+ "loss": 1.799,
2009
+ "step": 1850
2010
+ },
2011
+ {
2012
+ "epoch": 0.31,
2013
+ "learning_rate": 4.61676217765043e-06,
2014
+ "loss": 1.8349,
2015
+ "step": 1860
2016
+ },
2017
+ {
2018
+ "epoch": 0.31,
2019
+ "learning_rate": 4.612285100286533e-06,
2020
+ "loss": 1.7826,
2021
+ "step": 1870
2022
+ },
2023
+ {
2024
+ "epoch": 0.31,
2025
+ "learning_rate": 4.607808022922636e-06,
2026
+ "loss": 1.8322,
2027
+ "step": 1880
2028
+ },
2029
+ {
2030
+ "epoch": 0.31,
2031
+ "learning_rate": 4.60333094555874e-06,
2032
+ "loss": 1.8115,
2033
+ "step": 1890
2034
+ },
2035
+ {
2036
+ "epoch": 0.31,
2037
+ "learning_rate": 4.598853868194843e-06,
2038
+ "loss": 1.7811,
2039
+ "step": 1900
2040
+ },
2041
+ {
2042
+ "epoch": 0.31,
2043
+ "learning_rate": 4.594376790830946e-06,
2044
+ "loss": 1.8113,
2045
+ "step": 1910
2046
+ },
2047
+ {
2048
+ "epoch": 0.32,
2049
+ "learning_rate": 4.589899713467049e-06,
2050
+ "loss": 1.82,
2051
+ "step": 1920
2052
+ },
2053
+ {
2054
+ "epoch": 0.32,
2055
+ "learning_rate": 4.585422636103152e-06,
2056
+ "loss": 1.8562,
2057
+ "step": 1930
2058
+ },
2059
+ {
2060
+ "epoch": 0.32,
2061
+ "learning_rate": 4.5809455587392556e-06,
2062
+ "loss": 1.8197,
2063
+ "step": 1940
2064
+ },
2065
+ {
2066
+ "epoch": 0.32,
2067
+ "learning_rate": 4.576468481375359e-06,
2068
+ "loss": 1.8255,
2069
+ "step": 1950
2070
+ },
2071
+ {
2072
+ "epoch": 0.32,
2073
+ "learning_rate": 4.571991404011461e-06,
2074
+ "loss": 1.7827,
2075
+ "step": 1960
2076
+ },
2077
+ {
2078
+ "epoch": 0.32,
2079
+ "learning_rate": 4.567514326647565e-06,
2080
+ "loss": 1.7943,
2081
+ "step": 1970
2082
+ },
2083
+ {
2084
+ "epoch": 0.33,
2085
+ "learning_rate": 4.563037249283668e-06,
2086
+ "loss": 1.7536,
2087
+ "step": 1980
2088
+ },
2089
+ {
2090
+ "epoch": 0.33,
2091
+ "learning_rate": 4.558560171919771e-06,
2092
+ "loss": 1.783,
2093
+ "step": 1990
2094
+ },
2095
+ {
2096
+ "epoch": 0.33,
2097
+ "learning_rate": 4.554083094555875e-06,
2098
+ "loss": 1.7924,
2099
+ "step": 2000
2100
+ },
2101
+ {
2102
+ "epoch": 0.33,
2103
+ "eval_gsm8k_hard_accuracy": 0.9076584829607915,
2104
+ "eval_gsm8k_hard_loss": 0.39697265625,
2105
+ "eval_gsm8k_hard_runtime": 2.2124,
2106
+ "eval_gsm8k_hard_samples_per_second": 119.327,
2107
+ "eval_gsm8k_hard_steps_per_second": 7.684,
2108
+ "step": 2000
2109
+ },
2110
+ {
2111
+ "epoch": 0.33,
2112
+ "eval_webgpt_accuracy": 0.4696865932858906,
2113
+ "eval_webgpt_loss": 2.423828125,
2114
+ "eval_webgpt_runtime": 14.9073,
2115
+ "eval_webgpt_samples_per_second": 262.691,
2116
+ "eval_webgpt_steps_per_second": 16.435,
2117
+ "step": 2000
2118
+ },
2119
+ {
2120
+ "epoch": 0.33,
2121
+ "eval_squad_v2_accuracy": 0.8851732615724923,
2122
+ "eval_squad_v2_loss": 0.40966796875,
2123
+ "eval_squad_v2_runtime": 80.4777,
2124
+ "eval_squad_v2_samples_per_second": 323.866,
2125
+ "eval_squad_v2_steps_per_second": 20.242,
2126
+ "step": 2000
2127
+ },
2128
+ {
2129
+ "epoch": 0.33,
2130
+ "eval_adversarial_qa_accuracy": 0.7928618283737849,
2131
+ "eval_adversarial_qa_loss": 1.1015625,
2132
+ "eval_adversarial_qa_runtime": 18.1784,
2133
+ "eval_adversarial_qa_samples_per_second": 330.062,
2134
+ "eval_adversarial_qa_steps_per_second": 20.629,
2135
+ "step": 2000
2136
+ },
2137
+ {
2138
+ "epoch": 0.33,
2139
+ "eval_private_tuning_accuracy": 0.6519236031057545,
2140
+ "eval_private_tuning_loss": 1.2939453125,
2141
+ "eval_private_tuning_runtime": 67.7066,
2142
+ "eval_private_tuning_samples_per_second": 312.791,
2143
+ "eval_private_tuning_steps_per_second": 19.555,
2144
+ "step": 2000
2145
+ },
2146
+ {
2147
+ "epoch": 0.33,
2148
+ "eval_oa_translated_accuracy": 0.6739938039772579,
2149
+ "eval_oa_translated_loss": 1.4169921875,
2150
+ "eval_oa_translated_runtime": 489.9008,
2151
+ "eval_oa_translated_samples_per_second": 272.961,
2152
+ "eval_oa_translated_steps_per_second": 17.061,
2153
+ "step": 2000
2154
+ },
2155
+ {
2156
+ "epoch": 0.33,
2157
+ "eval_prosocial_dialogue_accuracy": 0.5339765946198812,
2158
+ "eval_prosocial_dialogue_loss": 1.8515625,
2159
+ "eval_prosocial_dialogue_runtime": 112.602,
2160
+ "eval_prosocial_dialogue_samples_per_second": 239.632,
2161
+ "eval_prosocial_dialogue_steps_per_second": 14.982,
2162
+ "step": 2000
2163
+ },
2164
+ {
2165
+ "epoch": 0.33,
2166
+ "eval_math_qa_accuracy": 0.5540153422185813,
2167
+ "eval_math_qa_loss": 1.9658203125,
2168
+ "eval_math_qa_runtime": 19.4551,
2169
+ "eval_math_qa_samples_per_second": 306.758,
2170
+ "eval_math_qa_steps_per_second": 19.172,
2171
+ "step": 2000
2172
+ },
2173
+ {
2174
+ "epoch": 0.33,
2175
+ "eval_wikihow_accuracy": 0.5962636905587134,
2176
+ "eval_wikihow_loss": 2.03515625,
2177
+ "eval_wikihow_runtime": 8.6232,
2178
+ "eval_wikihow_samples_per_second": 265.91,
2179
+ "eval_wikihow_steps_per_second": 16.699,
2180
+ "step": 2000
2181
+ },
2182
+ {
2183
+ "epoch": 0.33,
2184
+ "eval_joke_accuracy": 0.4670204700530705,
2185
+ "eval_joke_loss": 2.40234375,
2186
+ "eval_joke_runtime": 0.908,
2187
+ "eval_joke_samples_per_second": 83.698,
2188
+ "eval_joke_steps_per_second": 5.506,
2189
+ "step": 2000
2190
+ },
2191
+ {
2192
+ "epoch": 0.33,
2193
+ "eval_gsm8k_accuracy": 0.7402429297099117,
2194
+ "eval_gsm8k_loss": 0.97998046875,
2195
+ "eval_gsm8k_runtime": 5.491,
2196
+ "eval_gsm8k_samples_per_second": 272.266,
2197
+ "eval_gsm8k_steps_per_second": 17.119,
2198
+ "step": 2000
2199
+ },
2200
+ {
2201
+ "epoch": 0.33,
2202
+ "eval_ted_trans_en-hi_accuracy": 0.5490196078431373,
2203
+ "eval_ted_trans_en-hi_loss": 2.10546875,
2204
+ "eval_ted_trans_en-hi_runtime": 1.4695,
2205
+ "eval_ted_trans_en-hi_samples_per_second": 70.093,
2206
+ "eval_ted_trans_en-hi_steps_per_second": 4.764,
2207
+ "step": 2000
2208
+ },
2209
+ {
2210
+ "epoch": 0.33,
2211
+ "eval_ted_trans_de-ja_accuracy": 0.5422406826169489,
2212
+ "eval_ted_trans_de-ja_loss": 2.119140625,
2213
+ "eval_ted_trans_de-ja_runtime": 2.8137,
2214
+ "eval_ted_trans_de-ja_samples_per_second": 255.176,
2215
+ "eval_ted_trans_de-ja_steps_per_second": 15.993,
2216
+ "step": 2000
2217
+ },
2218
+ {
2219
+ "epoch": 0.33,
2220
+ "eval_ted_trans_nl-en_accuracy": 0.6510702489011417,
2221
+ "eval_ted_trans_nl-en_loss": 1.6396484375,
2222
+ "eval_ted_trans_nl-en_runtime": 4.3357,
2223
+ "eval_ted_trans_nl-en_samples_per_second": 177.827,
2224
+ "eval_ted_trans_nl-en_steps_per_second": 11.302,
2225
+ "step": 2000
2226
+ },
2227
+ {
2228
+ "epoch": 0.33,
2229
+ "eval_ted_trans_en-ja_accuracy": 0.556157479064968,
2230
+ "eval_ted_trans_en-ja_loss": 2.017578125,
2231
+ "eval_ted_trans_en-ja_runtime": 3.2862,
2232
+ "eval_ted_trans_en-ja_samples_per_second": 243.744,
2233
+ "eval_ted_trans_en-ja_steps_per_second": 15.519,
2234
+ "step": 2000
2235
+ },
2236
+ {
2237
+ "epoch": 0.33,
2238
+ "eval_ted_trans_en-es_accuracy": 0.7188412420341738,
2239
+ "eval_ted_trans_en-es_loss": 1.2294921875,
2240
+ "eval_ted_trans_en-es_runtime": 4.2374,
2241
+ "eval_ted_trans_en-es_samples_per_second": 194.932,
2242
+ "eval_ted_trans_en-es_steps_per_second": 12.272,
2243
+ "step": 2000
2244
+ },
2245
+ {
2246
+ "epoch": 0.33,
2247
+ "eval_ted_trans_en-ms_accuracy": 0.5734784760019792,
2248
+ "eval_ted_trans_en-ms_loss": 2.06640625,
2249
+ "eval_ted_trans_en-ms_runtime": 0.629,
2250
+ "eval_ted_trans_en-ms_samples_per_second": 66.768,
2251
+ "eval_ted_trans_en-ms_steps_per_second": 4.769,
2252
+ "step": 2000
2253
+ },
2254
+ {
2255
+ "epoch": 0.33,
2256
+ "eval_xsum_accuracy": 0.5781070399698887,
2257
+ "eval_xsum_loss": NaN,
2258
+ "eval_xsum_runtime": 144.895,
2259
+ "eval_xsum_samples_per_second": 281.645,
2260
+ "eval_xsum_steps_per_second": 17.606,
2261
+ "step": 2000
2262
+ },
2263
+ {
2264
+ "epoch": 0.33,
2265
+ "eval_cnn_dailymail_accuracy": 0.661322804206144,
2266
+ "eval_cnn_dailymail_loss": NaN,
2267
+ "eval_cnn_dailymail_runtime": 208.0253,
2268
+ "eval_cnn_dailymail_samples_per_second": 276.039,
2269
+ "eval_cnn_dailymail_steps_per_second": 17.253,
2270
+ "step": 2000
2271
+ },
2272
+ {
2273
+ "epoch": 0.33,
2274
+ "eval_multi_news_accuracy": 0.5257933653652501,
2275
+ "eval_multi_news_loss": NaN,
2276
+ "eval_multi_news_runtime": 34.8299,
2277
+ "eval_multi_news_samples_per_second": 258.255,
2278
+ "eval_multi_news_steps_per_second": 16.164,
2279
+ "step": 2000
2280
+ },
2281
+ {
2282
+ "epoch": 0.33,
2283
+ "eval_tldr_news_accuracy": 0.554453117652591,
2284
+ "eval_tldr_news_loss": 2.056640625,
2285
+ "eval_tldr_news_runtime": 4.219,
2286
+ "eval_tldr_news_samples_per_second": 338.467,
2287
+ "eval_tldr_news_steps_per_second": 21.332,
2288
+ "step": 2000
2289
+ },
2290
+ {
2291
+ "epoch": 0.33,
2292
+ "eval_scitldr_accuracy": 0.5014970059880239,
2293
+ "eval_scitldr_loss": NaN,
2294
+ "eval_scitldr_runtime": 2.5157,
2295
+ "eval_scitldr_samples_per_second": 158.605,
2296
+ "eval_scitldr_steps_per_second": 9.938,
2297
+ "step": 2000
2298
+ },
2299
+ {
2300
+ "epoch": 0.33,
2301
+ "eval_samsum_accuracy": 0.5984307075274895,
2302
+ "eval_samsum_loss": 1.5048828125,
2303
+ "eval_samsum_runtime": 9.7229,
2304
+ "eval_samsum_samples_per_second": 303.099,
2305
+ "eval_samsum_steps_per_second": 19.027,
2306
+ "step": 2000
2307
+ },
2308
+ {
2309
+ "epoch": 0.33,
2310
+ "eval_debate_sum_accuracy": 0.9366294751454436,
2311
+ "eval_debate_sum_loss": NaN,
2312
+ "eval_debate_sum_runtime": 191.1458,
2313
+ "eval_debate_sum_samples_per_second": 251.714,
2314
+ "eval_debate_sum_steps_per_second": 15.737,
2315
+ "step": 2000
2316
+ },
2317
+ {
2318
+ "epoch": 0.33,
2319
+ "eval_billsum_accuracy": 0.6557828398195577,
2320
+ "eval_billsum_loss": NaN,
2321
+ "eval_billsum_runtime": 21.6214,
2322
+ "eval_billsum_samples_per_second": 175.289,
2323
+ "eval_billsum_steps_per_second": 10.961,
2324
+ "step": 2000
2325
+ },
2326
+ {
2327
+ "epoch": 0.33,
2328
+ "eval_wmt2019_zh-en_accuracy": 0.5675863966606366,
2329
+ "eval_wmt2019_zh-en_loss": 2.0625,
2330
+ "eval_wmt2019_zh-en_runtime": 12.8505,
2331
+ "eval_wmt2019_zh-en_samples_per_second": 309.793,
2332
+ "eval_wmt2019_zh-en_steps_per_second": 19.377,
2333
+ "step": 2000
2334
+ },
2335
+ {
2336
+ "epoch": 0.33,
2337
+ "eval_wmt2019_ru-en_accuracy": 0.6426618715553412,
2338
+ "eval_wmt2019_ru-en_loss": 1.517578125,
2339
+ "eval_wmt2019_ru-en_runtime": 9.7092,
2340
+ "eval_wmt2019_ru-en_samples_per_second": 308.986,
2341
+ "eval_wmt2019_ru-en_steps_per_second": 19.363,
2342
+ "step": 2000
2343
+ },
2344
+ {
2345
+ "epoch": 0.33,
2346
+ "eval_wmt2019_de-en_accuracy": 0.6722804744747176,
2347
+ "eval_wmt2019_de-en_loss": 1.4169921875,
2348
+ "eval_wmt2019_de-en_runtime": 10.2232,
2349
+ "eval_wmt2019_de-en_samples_per_second": 293.255,
2350
+ "eval_wmt2019_de-en_steps_per_second": 18.39,
2351
+ "step": 2000
2352
+ },
2353
+ {
2354
+ "epoch": 0.33,
2355
+ "eval_wmt2019_fr-de_accuracy": 0.6611832925051939,
2356
+ "eval_wmt2019_fr-de_loss": 1.466796875,
2357
+ "eval_wmt2019_fr-de_runtime": 5.2327,
2358
+ "eval_wmt2019_fr-de_samples_per_second": 288.95,
2359
+ "eval_wmt2019_fr-de_steps_per_second": 18.155,
2360
+ "step": 2000
2361
+ },
2362
+ {
2363
+ "epoch": 0.33,
2364
+ "eval_essay_instruction_accuracy": 0.5831323516967082,
2365
+ "eval_essay_instruction_loss": 2.0546875,
2366
+ "eval_essay_instruction_runtime": 4.364,
2367
+ "eval_essay_instruction_samples_per_second": 94.638,
2368
+ "eval_essay_instruction_steps_per_second": 5.958,
2369
+ "step": 2000
2370
+ },
2371
+ {
2372
+ "epoch": 0.33,
2373
+ "eval_reddit_eli5_accuracy": 0.4256390849199412,
2374
+ "eval_reddit_eli5_loss": 2.732421875,
2375
+ "eval_reddit_eli5_runtime": 199.1709,
2376
+ "eval_reddit_eli5_samples_per_second": 273.77,
2377
+ "eval_reddit_eli5_steps_per_second": 17.111,
2378
+ "step": 2000
2379
+ },
2380
+ {
2381
+ "epoch": 0.33,
2382
+ "eval_reddit_askh_accuracy": 0.428839058527484,
2383
+ "eval_reddit_askh_loss": 2.80859375,
2384
+ "eval_reddit_askh_runtime": 129.3643,
2385
+ "eval_reddit_askh_samples_per_second": 152.322,
2386
+ "eval_reddit_askh_steps_per_second": 9.523,
2387
+ "step": 2000
2388
+ },
2389
+ {
2390
+ "epoch": 0.33,
2391
+ "eval_reddit_asks_accuracy": 0.43882877148313176,
2392
+ "eval_reddit_asks_loss": 2.662109375,
2393
+ "eval_reddit_asks_runtime": 99.4788,
2394
+ "eval_reddit_asks_samples_per_second": 264.941,
2395
+ "eval_reddit_asks_steps_per_second": 16.566,
2396
+ "step": 2000
2397
+ }
2398
+ ],
2399
+ "max_steps": 12168,
2400
+ "num_train_epochs": 2,
2401
+ "total_flos": 1.6560121131357438e+19,
2402
+ "trial_name": null,
2403
+ "trial_params": null
2404
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab66bb7fe1ee40d1b84fee573586d7e0b1f414a1ef36c8ada27b52ae4613e463
3
+ size 4539