Andric Valdez committed on
Commit
f6b1ea6
•
1 Parent(s): 902c035
config.json → best/config.json RENAMED
File without changes
special_tokens_map.json → best/special_tokens_map.json RENAMED
File without changes
tokenizer.json → best/tokenizer.json RENAMED
File without changes
tokenizer_config.json → best/tokenizer_config.json RENAMED
File without changes
best/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5b1005c88e7a47ac094d8a7bd77607b48d53a9b5dd46659d3ae79ec2d04818e
3
+ size 4728
vocab.txt → best/vocab.txt RENAMED
File without changes
checkpoint-23952/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nghuyong/ernie-2.0-base-en",
3
+ "architectures": [
4
+ "ErnieForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "human",
13
+ "1": "machine"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "label2id": {
18
+ "human": 0,
19
+ "machine": 1
20
+ },
21
+ "layer_norm_eps": 1e-05,
22
+ "max_position_embeddings": 512,
23
+ "model_type": "ernie",
24
+ "num_attention_heads": 12,
25
+ "num_hidden_layers": 12,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "problem_type": "single_label_classification",
29
+ "task_type_vocab_size": 3,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.37.2",
32
+ "type_vocab_size": 4,
33
+ "use_cache": true,
34
+ "use_task_id": false,
35
+ "vocab_size": 30522
36
+ }
checkpoint-23952/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35c01a4675f28d78dc9e14eeccb44123773490203400bbe9961db68d5343aa95
3
+ size 14244
checkpoint-23952/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a22170cd9f763354940d719206be4ca903b4f9ab6f58ac350ab57faff52de78
3
+ size 1064
checkpoint-23952/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-23952/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-23952/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "full_tokenizer_file": null,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
checkpoint-23952/trainer_state.json ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.30735036730766296,
3
+ "best_model_checkpoint": "nghuyong/ernie-2.0-base-en/subtaskA/0/checkpoint-23952",
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 23952,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "learning_rate": 1.9860832776664444e-05,
14
+ "loss": 0.4761,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.04,
19
+ "learning_rate": 1.9721665553328883e-05,
20
+ "loss": 0.3754,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.06,
25
+ "learning_rate": 1.9582498329993322e-05,
26
+ "loss": 0.3187,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.08,
31
+ "learning_rate": 1.944333110665776e-05,
32
+ "loss": 0.3292,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.1,
37
+ "learning_rate": 1.9304163883322203e-05,
38
+ "loss": 0.269,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.13,
43
+ "learning_rate": 1.9164996659986642e-05,
44
+ "loss": 0.2716,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.15,
49
+ "learning_rate": 1.902582943665108e-05,
50
+ "loss": 0.2496,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.17,
55
+ "learning_rate": 1.888666221331552e-05,
56
+ "loss": 0.2035,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.19,
61
+ "learning_rate": 1.8747494989979962e-05,
62
+ "loss": 0.2654,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.21,
67
+ "learning_rate": 1.86083277666444e-05,
68
+ "loss": 0.2489,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.23,
73
+ "learning_rate": 1.8469160543308843e-05,
74
+ "loss": 0.2162,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.25,
79
+ "learning_rate": 1.832999331997328e-05,
80
+ "loss": 0.2375,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.27,
85
+ "learning_rate": 1.819082609663772e-05,
86
+ "loss": 0.2367,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.29,
91
+ "learning_rate": 1.805165887330216e-05,
92
+ "loss": 0.238,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.31,
97
+ "learning_rate": 1.7912491649966602e-05,
98
+ "loss": 0.2017,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 0.33,
103
+ "learning_rate": 1.777332442663104e-05,
104
+ "loss": 0.2096,
105
+ "step": 8000
106
+ },
107
+ {
108
+ "epoch": 0.35,
109
+ "learning_rate": 1.763415720329548e-05,
110
+ "loss": 0.2028,
111
+ "step": 8500
112
+ },
113
+ {
114
+ "epoch": 0.38,
115
+ "learning_rate": 1.7494989979959922e-05,
116
+ "loss": 0.267,
117
+ "step": 9000
118
+ },
119
+ {
120
+ "epoch": 0.4,
121
+ "learning_rate": 1.735582275662436e-05,
122
+ "loss": 0.2098,
123
+ "step": 9500
124
+ },
125
+ {
126
+ "epoch": 0.42,
127
+ "learning_rate": 1.72166555332888e-05,
128
+ "loss": 0.2167,
129
+ "step": 10000
130
+ },
131
+ {
132
+ "epoch": 0.44,
133
+ "learning_rate": 1.707748830995324e-05,
134
+ "loss": 0.1927,
135
+ "step": 10500
136
+ },
137
+ {
138
+ "epoch": 0.46,
139
+ "learning_rate": 1.693832108661768e-05,
140
+ "loss": 0.1595,
141
+ "step": 11000
142
+ },
143
+ {
144
+ "epoch": 0.48,
145
+ "learning_rate": 1.679915386328212e-05,
146
+ "loss": 0.1724,
147
+ "step": 11500
148
+ },
149
+ {
150
+ "epoch": 0.5,
151
+ "learning_rate": 1.6659986639946563e-05,
152
+ "loss": 0.1884,
153
+ "step": 12000
154
+ },
155
+ {
156
+ "epoch": 0.52,
157
+ "learning_rate": 1.6520819416611002e-05,
158
+ "loss": 0.1607,
159
+ "step": 12500
160
+ },
161
+ {
162
+ "epoch": 0.54,
163
+ "learning_rate": 1.638165219327544e-05,
164
+ "loss": 0.1893,
165
+ "step": 13000
166
+ },
167
+ {
168
+ "epoch": 0.56,
169
+ "learning_rate": 1.624248496993988e-05,
170
+ "loss": 0.1775,
171
+ "step": 13500
172
+ },
173
+ {
174
+ "epoch": 0.58,
175
+ "learning_rate": 1.6103317746604322e-05,
176
+ "loss": 0.1716,
177
+ "step": 14000
178
+ },
179
+ {
180
+ "epoch": 0.61,
181
+ "learning_rate": 1.596415052326876e-05,
182
+ "loss": 0.1563,
183
+ "step": 14500
184
+ },
185
+ {
186
+ "epoch": 0.63,
187
+ "learning_rate": 1.58249832999332e-05,
188
+ "loss": 0.1856,
189
+ "step": 15000
190
+ },
191
+ {
192
+ "epoch": 0.65,
193
+ "learning_rate": 1.5685816076597642e-05,
194
+ "loss": 0.1454,
195
+ "step": 15500
196
+ },
197
+ {
198
+ "epoch": 0.67,
199
+ "learning_rate": 1.554664885326208e-05,
200
+ "loss": 0.1488,
201
+ "step": 16000
202
+ },
203
+ {
204
+ "epoch": 0.69,
205
+ "learning_rate": 1.5407481629926523e-05,
206
+ "loss": 0.1879,
207
+ "step": 16500
208
+ },
209
+ {
210
+ "epoch": 0.71,
211
+ "learning_rate": 1.5268314406590962e-05,
212
+ "loss": 0.1438,
213
+ "step": 17000
214
+ },
215
+ {
216
+ "epoch": 0.73,
217
+ "learning_rate": 1.5129147183255401e-05,
218
+ "loss": 0.1797,
219
+ "step": 17500
220
+ },
221
+ {
222
+ "epoch": 0.75,
223
+ "learning_rate": 1.498997995991984e-05,
224
+ "loss": 0.1457,
225
+ "step": 18000
226
+ },
227
+ {
228
+ "epoch": 0.77,
229
+ "learning_rate": 1.485081273658428e-05,
230
+ "loss": 0.1379,
231
+ "step": 18500
232
+ },
233
+ {
234
+ "epoch": 0.79,
235
+ "learning_rate": 1.471164551324872e-05,
236
+ "loss": 0.1472,
237
+ "step": 19000
238
+ },
239
+ {
240
+ "epoch": 0.81,
241
+ "learning_rate": 1.4572478289913162e-05,
242
+ "loss": 0.1233,
243
+ "step": 19500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "learning_rate": 1.44333110665776e-05,
248
+ "loss": 0.1456,
249
+ "step": 20000
250
+ },
251
+ {
252
+ "epoch": 0.86,
253
+ "learning_rate": 1.4294143843242041e-05,
254
+ "loss": 0.1444,
255
+ "step": 20500
256
+ },
257
+ {
258
+ "epoch": 0.88,
259
+ "learning_rate": 1.415497661990648e-05,
260
+ "loss": 0.1413,
261
+ "step": 21000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "learning_rate": 1.4015809396570921e-05,
266
+ "loss": 0.1428,
267
+ "step": 21500
268
+ },
269
+ {
270
+ "epoch": 0.92,
271
+ "learning_rate": 1.3876642173235361e-05,
272
+ "loss": 0.1305,
273
+ "step": 22000
274
+ },
275
+ {
276
+ "epoch": 0.94,
277
+ "learning_rate": 1.37374749498998e-05,
278
+ "loss": 0.1669,
279
+ "step": 22500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "learning_rate": 1.3598307726564241e-05,
284
+ "loss": 0.1417,
285
+ "step": 23000
286
+ },
287
+ {
288
+ "epoch": 0.98,
289
+ "learning_rate": 1.345914050322868e-05,
290
+ "loss": 0.1494,
291
+ "step": 23500
292
+ },
293
+ {
294
+ "epoch": 1.0,
295
+ "eval_f1": 0.9453072812291249,
296
+ "eval_loss": 0.30735036730766296,
297
+ "eval_runtime": 307.615,
298
+ "eval_samples_per_second": 77.864,
299
+ "eval_steps_per_second": 19.466,
300
+ "step": 23952
301
+ }
302
+ ],
303
+ "logging_steps": 500,
304
+ "max_steps": 71856,
305
+ "num_input_tokens_seen": 0,
306
+ "num_train_epochs": 3,
307
+ "save_steps": 500,
308
+ "total_flos": 2.52073546587648e+16,
309
+ "train_batch_size": 4,
310
+ "trial_name": null,
311
+ "trial_params": null
312
+ }
checkpoint-23952/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5b1005c88e7a47ac094d8a7bd77607b48d53a9b5dd46659d3ae79ec2d04818e
3
+ size 4728
checkpoint-23952/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-47904/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nghuyong/ernie-2.0-base-en",
3
+ "architectures": [
4
+ "ErnieForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "human",
13
+ "1": "machine"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "label2id": {
18
+ "human": 0,
19
+ "machine": 1
20
+ },
21
+ "layer_norm_eps": 1e-05,
22
+ "max_position_embeddings": 512,
23
+ "model_type": "ernie",
24
+ "num_attention_heads": 12,
25
+ "num_hidden_layers": 12,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "problem_type": "single_label_classification",
29
+ "task_type_vocab_size": 3,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.37.2",
32
+ "type_vocab_size": 4,
33
+ "use_cache": true,
34
+ "use_task_id": false,
35
+ "vocab_size": 30522
36
+ }
checkpoint-47904/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b3fcfc54b16d25c52e03fd5bc7e8dfe5b2098fac96e8acf3590872a4dc44304
3
+ size 14244
checkpoint-47904/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24fa720b8cc388929ba1afef71968322b751662d03575e80e0dddc5cb854fd79
3
+ size 1064
checkpoint-47904/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-47904/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-47904/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "full_tokenizer_file": null,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
checkpoint-47904/trainer_state.json ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.24657295644283295,
3
+ "best_model_checkpoint": "nghuyong/ernie-2.0-base-en/subtaskA/0/checkpoint-47904",
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 47904,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "learning_rate": 1.9860832776664444e-05,
14
+ "loss": 0.4761,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.04,
19
+ "learning_rate": 1.9721665553328883e-05,
20
+ "loss": 0.3754,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.06,
25
+ "learning_rate": 1.9582498329993322e-05,
26
+ "loss": 0.3187,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.08,
31
+ "learning_rate": 1.944333110665776e-05,
32
+ "loss": 0.3292,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.1,
37
+ "learning_rate": 1.9304163883322203e-05,
38
+ "loss": 0.269,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.13,
43
+ "learning_rate": 1.9164996659986642e-05,
44
+ "loss": 0.2716,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.15,
49
+ "learning_rate": 1.902582943665108e-05,
50
+ "loss": 0.2496,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.17,
55
+ "learning_rate": 1.888666221331552e-05,
56
+ "loss": 0.2035,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.19,
61
+ "learning_rate": 1.8747494989979962e-05,
62
+ "loss": 0.2654,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.21,
67
+ "learning_rate": 1.86083277666444e-05,
68
+ "loss": 0.2489,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.23,
73
+ "learning_rate": 1.8469160543308843e-05,
74
+ "loss": 0.2162,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.25,
79
+ "learning_rate": 1.832999331997328e-05,
80
+ "loss": 0.2375,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.27,
85
+ "learning_rate": 1.819082609663772e-05,
86
+ "loss": 0.2367,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.29,
91
+ "learning_rate": 1.805165887330216e-05,
92
+ "loss": 0.238,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.31,
97
+ "learning_rate": 1.7912491649966602e-05,
98
+ "loss": 0.2017,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 0.33,
103
+ "learning_rate": 1.777332442663104e-05,
104
+ "loss": 0.2096,
105
+ "step": 8000
106
+ },
107
+ {
108
+ "epoch": 0.35,
109
+ "learning_rate": 1.763415720329548e-05,
110
+ "loss": 0.2028,
111
+ "step": 8500
112
+ },
113
+ {
114
+ "epoch": 0.38,
115
+ "learning_rate": 1.7494989979959922e-05,
116
+ "loss": 0.267,
117
+ "step": 9000
118
+ },
119
+ {
120
+ "epoch": 0.4,
121
+ "learning_rate": 1.735582275662436e-05,
122
+ "loss": 0.2098,
123
+ "step": 9500
124
+ },
125
+ {
126
+ "epoch": 0.42,
127
+ "learning_rate": 1.72166555332888e-05,
128
+ "loss": 0.2167,
129
+ "step": 10000
130
+ },
131
+ {
132
+ "epoch": 0.44,
133
+ "learning_rate": 1.707748830995324e-05,
134
+ "loss": 0.1927,
135
+ "step": 10500
136
+ },
137
+ {
138
+ "epoch": 0.46,
139
+ "learning_rate": 1.693832108661768e-05,
140
+ "loss": 0.1595,
141
+ "step": 11000
142
+ },
143
+ {
144
+ "epoch": 0.48,
145
+ "learning_rate": 1.679915386328212e-05,
146
+ "loss": 0.1724,
147
+ "step": 11500
148
+ },
149
+ {
150
+ "epoch": 0.5,
151
+ "learning_rate": 1.6659986639946563e-05,
152
+ "loss": 0.1884,
153
+ "step": 12000
154
+ },
155
+ {
156
+ "epoch": 0.52,
157
+ "learning_rate": 1.6520819416611002e-05,
158
+ "loss": 0.1607,
159
+ "step": 12500
160
+ },
161
+ {
162
+ "epoch": 0.54,
163
+ "learning_rate": 1.638165219327544e-05,
164
+ "loss": 0.1893,
165
+ "step": 13000
166
+ },
167
+ {
168
+ "epoch": 0.56,
169
+ "learning_rate": 1.624248496993988e-05,
170
+ "loss": 0.1775,
171
+ "step": 13500
172
+ },
173
+ {
174
+ "epoch": 0.58,
175
+ "learning_rate": 1.6103317746604322e-05,
176
+ "loss": 0.1716,
177
+ "step": 14000
178
+ },
179
+ {
180
+ "epoch": 0.61,
181
+ "learning_rate": 1.596415052326876e-05,
182
+ "loss": 0.1563,
183
+ "step": 14500
184
+ },
185
+ {
186
+ "epoch": 0.63,
187
+ "learning_rate": 1.58249832999332e-05,
188
+ "loss": 0.1856,
189
+ "step": 15000
190
+ },
191
+ {
192
+ "epoch": 0.65,
193
+ "learning_rate": 1.5685816076597642e-05,
194
+ "loss": 0.1454,
195
+ "step": 15500
196
+ },
197
+ {
198
+ "epoch": 0.67,
199
+ "learning_rate": 1.554664885326208e-05,
200
+ "loss": 0.1488,
201
+ "step": 16000
202
+ },
203
+ {
204
+ "epoch": 0.69,
205
+ "learning_rate": 1.5407481629926523e-05,
206
+ "loss": 0.1879,
207
+ "step": 16500
208
+ },
209
+ {
210
+ "epoch": 0.71,
211
+ "learning_rate": 1.5268314406590962e-05,
212
+ "loss": 0.1438,
213
+ "step": 17000
214
+ },
215
+ {
216
+ "epoch": 0.73,
217
+ "learning_rate": 1.5129147183255401e-05,
218
+ "loss": 0.1797,
219
+ "step": 17500
220
+ },
221
+ {
222
+ "epoch": 0.75,
223
+ "learning_rate": 1.498997995991984e-05,
224
+ "loss": 0.1457,
225
+ "step": 18000
226
+ },
227
+ {
228
+ "epoch": 0.77,
229
+ "learning_rate": 1.485081273658428e-05,
230
+ "loss": 0.1379,
231
+ "step": 18500
232
+ },
233
+ {
234
+ "epoch": 0.79,
235
+ "learning_rate": 1.471164551324872e-05,
236
+ "loss": 0.1472,
237
+ "step": 19000
238
+ },
239
+ {
240
+ "epoch": 0.81,
241
+ "learning_rate": 1.4572478289913162e-05,
242
+ "loss": 0.1233,
243
+ "step": 19500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "learning_rate": 1.44333110665776e-05,
248
+ "loss": 0.1456,
249
+ "step": 20000
250
+ },
251
+ {
252
+ "epoch": 0.86,
253
+ "learning_rate": 1.4294143843242041e-05,
254
+ "loss": 0.1444,
255
+ "step": 20500
256
+ },
257
+ {
258
+ "epoch": 0.88,
259
+ "learning_rate": 1.415497661990648e-05,
260
+ "loss": 0.1413,
261
+ "step": 21000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "learning_rate": 1.4015809396570921e-05,
266
+ "loss": 0.1428,
267
+ "step": 21500
268
+ },
269
+ {
270
+ "epoch": 0.92,
271
+ "learning_rate": 1.3876642173235361e-05,
272
+ "loss": 0.1305,
273
+ "step": 22000
274
+ },
275
+ {
276
+ "epoch": 0.94,
277
+ "learning_rate": 1.37374749498998e-05,
278
+ "loss": 0.1669,
279
+ "step": 22500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "learning_rate": 1.3598307726564241e-05,
284
+ "loss": 0.1417,
285
+ "step": 23000
286
+ },
287
+ {
288
+ "epoch": 0.98,
289
+ "learning_rate": 1.345914050322868e-05,
290
+ "loss": 0.1494,
291
+ "step": 23500
292
+ },
293
+ {
294
+ "epoch": 1.0,
295
+ "eval_f1": 0.9453072812291249,
296
+ "eval_loss": 0.30735036730766296,
297
+ "eval_runtime": 307.615,
298
+ "eval_samples_per_second": 77.864,
299
+ "eval_steps_per_second": 19.466,
300
+ "step": 23952
301
+ },
302
+ {
303
+ "epoch": 1.0,
304
+ "learning_rate": 1.331997327989312e-05,
305
+ "loss": 0.1028,
306
+ "step": 24000
307
+ },
308
+ {
309
+ "epoch": 1.02,
310
+ "learning_rate": 1.318080605655756e-05,
311
+ "loss": 0.0729,
312
+ "step": 24500
313
+ },
314
+ {
315
+ "epoch": 1.04,
316
+ "learning_rate": 1.3041638833222002e-05,
317
+ "loss": 0.0976,
318
+ "step": 25000
319
+ },
320
+ {
321
+ "epoch": 1.06,
322
+ "learning_rate": 1.290247160988644e-05,
323
+ "loss": 0.0782,
324
+ "step": 25500
325
+ },
326
+ {
327
+ "epoch": 1.09,
328
+ "learning_rate": 1.2763304386550881e-05,
329
+ "loss": 0.0977,
330
+ "step": 26000
331
+ },
332
+ {
333
+ "epoch": 1.11,
334
+ "learning_rate": 1.262413716321532e-05,
335
+ "loss": 0.0837,
336
+ "step": 26500
337
+ },
338
+ {
339
+ "epoch": 1.13,
340
+ "learning_rate": 1.248496993987976e-05,
341
+ "loss": 0.1268,
342
+ "step": 27000
343
+ },
344
+ {
345
+ "epoch": 1.15,
346
+ "learning_rate": 1.23458027165442e-05,
347
+ "loss": 0.0819,
348
+ "step": 27500
349
+ },
350
+ {
351
+ "epoch": 1.17,
352
+ "learning_rate": 1.220663549320864e-05,
353
+ "loss": 0.0932,
354
+ "step": 28000
355
+ },
356
+ {
357
+ "epoch": 1.19,
358
+ "learning_rate": 1.206746826987308e-05,
359
+ "loss": 0.0678,
360
+ "step": 28500
361
+ },
362
+ {
363
+ "epoch": 1.21,
364
+ "learning_rate": 1.192830104653752e-05,
365
+ "loss": 0.0864,
366
+ "step": 29000
367
+ },
368
+ {
369
+ "epoch": 1.23,
370
+ "learning_rate": 1.1789133823201962e-05,
371
+ "loss": 0.0909,
372
+ "step": 29500
373
+ },
374
+ {
375
+ "epoch": 1.25,
376
+ "learning_rate": 1.16499665998664e-05,
377
+ "loss": 0.0942,
378
+ "step": 30000
379
+ },
380
+ {
381
+ "epoch": 1.27,
382
+ "learning_rate": 1.1510799376530842e-05,
383
+ "loss": 0.0872,
384
+ "step": 30500
385
+ },
386
+ {
387
+ "epoch": 1.29,
388
+ "learning_rate": 1.137163215319528e-05,
389
+ "loss": 0.0692,
390
+ "step": 31000
391
+ },
392
+ {
393
+ "epoch": 1.32,
394
+ "learning_rate": 1.1232464929859721e-05,
395
+ "loss": 0.0876,
396
+ "step": 31500
397
+ },
398
+ {
399
+ "epoch": 1.34,
400
+ "learning_rate": 1.109329770652416e-05,
401
+ "loss": 0.079,
402
+ "step": 32000
403
+ },
404
+ {
405
+ "epoch": 1.36,
406
+ "learning_rate": 1.09541304831886e-05,
407
+ "loss": 0.0793,
408
+ "step": 32500
409
+ },
410
+ {
411
+ "epoch": 1.38,
412
+ "learning_rate": 1.081496325985304e-05,
413
+ "loss": 0.0912,
414
+ "step": 33000
415
+ },
416
+ {
417
+ "epoch": 1.4,
418
+ "learning_rate": 1.067579603651748e-05,
419
+ "loss": 0.0853,
420
+ "step": 33500
421
+ },
422
+ {
423
+ "epoch": 1.42,
424
+ "learning_rate": 1.053662881318192e-05,
425
+ "loss": 0.0807,
426
+ "step": 34000
427
+ },
428
+ {
429
+ "epoch": 1.44,
430
+ "learning_rate": 1.039746158984636e-05,
431
+ "loss": 0.0663,
432
+ "step": 34500
433
+ },
434
+ {
435
+ "epoch": 1.46,
436
+ "learning_rate": 1.0258294366510799e-05,
437
+ "loss": 0.1107,
438
+ "step": 35000
439
+ },
440
+ {
441
+ "epoch": 1.48,
442
+ "learning_rate": 1.0119127143175241e-05,
443
+ "loss": 0.0972,
444
+ "step": 35500
445
+ },
446
+ {
447
+ "epoch": 1.5,
448
+ "learning_rate": 9.97995991983968e-06,
449
+ "loss": 0.0696,
450
+ "step": 36000
451
+ },
452
+ {
453
+ "epoch": 1.52,
454
+ "learning_rate": 9.84079269650412e-06,
455
+ "loss": 0.074,
456
+ "step": 36500
457
+ },
458
+ {
459
+ "epoch": 1.54,
460
+ "learning_rate": 9.70162547316856e-06,
461
+ "loss": 0.0865,
462
+ "step": 37000
463
+ },
464
+ {
465
+ "epoch": 1.57,
466
+ "learning_rate": 9.562458249833e-06,
467
+ "loss": 0.1031,
468
+ "step": 37500
469
+ },
470
+ {
471
+ "epoch": 1.59,
472
+ "learning_rate": 9.423291026497439e-06,
473
+ "loss": 0.0582,
474
+ "step": 38000
475
+ },
476
+ {
477
+ "epoch": 1.61,
478
+ "learning_rate": 9.28412380316188e-06,
479
+ "loss": 0.094,
480
+ "step": 38500
481
+ },
482
+ {
483
+ "epoch": 1.63,
484
+ "learning_rate": 9.14495657982632e-06,
485
+ "loss": 0.0607,
486
+ "step": 39000
487
+ },
488
+ {
489
+ "epoch": 1.65,
490
+ "learning_rate": 9.005789356490761e-06,
491
+ "loss": 0.0828,
492
+ "step": 39500
493
+ },
494
+ {
495
+ "epoch": 1.67,
496
+ "learning_rate": 8.8666221331552e-06,
497
+ "loss": 0.083,
498
+ "step": 40000
499
+ },
500
+ {
501
+ "epoch": 1.69,
502
+ "learning_rate": 8.72745490981964e-06,
503
+ "loss": 0.0732,
504
+ "step": 40500
505
+ },
506
+ {
507
+ "epoch": 1.71,
508
+ "learning_rate": 8.588287686484081e-06,
509
+ "loss": 0.064,
510
+ "step": 41000
511
+ },
512
+ {
513
+ "epoch": 1.73,
514
+ "learning_rate": 8.44912046314852e-06,
515
+ "loss": 0.0883,
516
+ "step": 41500
517
+ },
518
+ {
519
+ "epoch": 1.75,
520
+ "learning_rate": 8.30995323981296e-06,
521
+ "loss": 0.0859,
522
+ "step": 42000
523
+ },
524
+ {
525
+ "epoch": 1.77,
526
+ "learning_rate": 8.1707860164774e-06,
527
+ "loss": 0.0704,
528
+ "step": 42500
529
+ },
530
+ {
531
+ "epoch": 1.8,
532
+ "learning_rate": 8.03161879314184e-06,
533
+ "loss": 0.0651,
534
+ "step": 43000
535
+ },
536
+ {
537
+ "epoch": 1.82,
538
+ "learning_rate": 7.892451569806279e-06,
539
+ "loss": 0.0832,
540
+ "step": 43500
541
+ },
542
+ {
543
+ "epoch": 1.84,
544
+ "learning_rate": 7.75328434647072e-06,
545
+ "loss": 0.0834,
546
+ "step": 44000
547
+ },
548
+ {
549
+ "epoch": 1.86,
550
+ "learning_rate": 7.614117123135159e-06,
551
+ "loss": 0.0769,
552
+ "step": 44500
553
+ },
554
+ {
555
+ "epoch": 1.88,
556
+ "learning_rate": 7.474949899799599e-06,
557
+ "loss": 0.0934,
558
+ "step": 45000
559
+ },
560
+ {
561
+ "epoch": 1.9,
562
+ "learning_rate": 7.33578267646404e-06,
563
+ "loss": 0.0711,
564
+ "step": 45500
565
+ },
566
+ {
567
+ "epoch": 1.92,
568
+ "learning_rate": 7.1966154531284795e-06,
569
+ "loss": 0.0766,
570
+ "step": 46000
571
+ },
572
+ {
573
+ "epoch": 1.94,
574
+ "learning_rate": 7.057448229792919e-06,
575
+ "loss": 0.0754,
576
+ "step": 46500
577
+ },
578
+ {
579
+ "epoch": 1.96,
580
+ "learning_rate": 6.91828100645736e-06,
581
+ "loss": 0.0664,
582
+ "step": 47000
583
+ },
584
+ {
585
+ "epoch": 1.98,
586
+ "learning_rate": 6.7791137831218005e-06,
587
+ "loss": 0.0767,
588
+ "step": 47500
589
+ },
590
+ {
591
+ "epoch": 2.0,
592
+ "eval_f1": 0.9613810955243821,
593
+ "eval_loss": 0.24657295644283295,
594
+ "eval_runtime": 308.1161,
595
+ "eval_samples_per_second": 77.737,
596
+ "eval_steps_per_second": 19.434,
597
+ "step": 47904
598
+ }
599
+ ],
600
+ "logging_steps": 500,
601
+ "max_steps": 71856,
602
+ "num_input_tokens_seen": 0,
603
+ "num_train_epochs": 3,
604
+ "save_steps": 500,
605
+ "total_flos": 5.04147093175296e+16,
606
+ "train_batch_size": 4,
607
+ "trial_name": null,
608
+ "trial_params": null
609
+ }
checkpoint-47904/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5b1005c88e7a47ac094d8a7bd77607b48d53a9b5dd46659d3ae79ec2d04818e
3
+ size 4728
checkpoint-47904/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-71856/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nghuyong/ernie-2.0-base-en",
3
+ "architectures": [
4
+ "ErnieForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "human",
13
+ "1": "machine"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "label2id": {
18
+ "human": 0,
19
+ "machine": 1
20
+ },
21
+ "layer_norm_eps": 1e-05,
22
+ "max_position_embeddings": 512,
23
+ "model_type": "ernie",
24
+ "num_attention_heads": 12,
25
+ "num_hidden_layers": 12,
26
+ "pad_token_id": 0,
27
+ "position_embedding_type": "absolute",
28
+ "problem_type": "single_label_classification",
29
+ "task_type_vocab_size": 3,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.37.2",
32
+ "type_vocab_size": 4,
33
+ "use_cache": true,
34
+ "use_task_id": false,
35
+ "vocab_size": 30522
36
+ }
checkpoint-71856/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0924d19ee7039bc9342ebcfcf3bd81d79a39593e804591fa36a0f906a542e8a4
3
+ size 14244
checkpoint-71856/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f19be7b49be8626be5cd18d6c5467894483a7388c74673e9e7257331a73d4db
3
+ size 1064
checkpoint-71856/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint-71856/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-71856/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "full_tokenizer_file": null,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 1000000000000000019884624838656,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
checkpoint-71856/trainer_state.json ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.19777873158454895,
3
+ "best_model_checkpoint": "nghuyong/ernie-2.0-base-en/subtaskA/0/checkpoint-71856",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 71856,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "learning_rate": 1.9860832776664444e-05,
14
+ "loss": 0.4761,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.04,
19
+ "learning_rate": 1.9721665553328883e-05,
20
+ "loss": 0.3754,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.06,
25
+ "learning_rate": 1.9582498329993322e-05,
26
+ "loss": 0.3187,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.08,
31
+ "learning_rate": 1.944333110665776e-05,
32
+ "loss": 0.3292,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.1,
37
+ "learning_rate": 1.9304163883322203e-05,
38
+ "loss": 0.269,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.13,
43
+ "learning_rate": 1.9164996659986642e-05,
44
+ "loss": 0.2716,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.15,
49
+ "learning_rate": 1.902582943665108e-05,
50
+ "loss": 0.2496,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.17,
55
+ "learning_rate": 1.888666221331552e-05,
56
+ "loss": 0.2035,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.19,
61
+ "learning_rate": 1.8747494989979962e-05,
62
+ "loss": 0.2654,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.21,
67
+ "learning_rate": 1.86083277666444e-05,
68
+ "loss": 0.2489,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.23,
73
+ "learning_rate": 1.8469160543308843e-05,
74
+ "loss": 0.2162,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.25,
79
+ "learning_rate": 1.832999331997328e-05,
80
+ "loss": 0.2375,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.27,
85
+ "learning_rate": 1.819082609663772e-05,
86
+ "loss": 0.2367,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.29,
91
+ "learning_rate": 1.805165887330216e-05,
92
+ "loss": 0.238,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 0.31,
97
+ "learning_rate": 1.7912491649966602e-05,
98
+ "loss": 0.2017,
99
+ "step": 7500
100
+ },
101
+ {
102
+ "epoch": 0.33,
103
+ "learning_rate": 1.777332442663104e-05,
104
+ "loss": 0.2096,
105
+ "step": 8000
106
+ },
107
+ {
108
+ "epoch": 0.35,
109
+ "learning_rate": 1.763415720329548e-05,
110
+ "loss": 0.2028,
111
+ "step": 8500
112
+ },
113
+ {
114
+ "epoch": 0.38,
115
+ "learning_rate": 1.7494989979959922e-05,
116
+ "loss": 0.267,
117
+ "step": 9000
118
+ },
119
+ {
120
+ "epoch": 0.4,
121
+ "learning_rate": 1.735582275662436e-05,
122
+ "loss": 0.2098,
123
+ "step": 9500
124
+ },
125
+ {
126
+ "epoch": 0.42,
127
+ "learning_rate": 1.72166555332888e-05,
128
+ "loss": 0.2167,
129
+ "step": 10000
130
+ },
131
+ {
132
+ "epoch": 0.44,
133
+ "learning_rate": 1.707748830995324e-05,
134
+ "loss": 0.1927,
135
+ "step": 10500
136
+ },
137
+ {
138
+ "epoch": 0.46,
139
+ "learning_rate": 1.693832108661768e-05,
140
+ "loss": 0.1595,
141
+ "step": 11000
142
+ },
143
+ {
144
+ "epoch": 0.48,
145
+ "learning_rate": 1.679915386328212e-05,
146
+ "loss": 0.1724,
147
+ "step": 11500
148
+ },
149
+ {
150
+ "epoch": 0.5,
151
+ "learning_rate": 1.6659986639946563e-05,
152
+ "loss": 0.1884,
153
+ "step": 12000
154
+ },
155
+ {
156
+ "epoch": 0.52,
157
+ "learning_rate": 1.6520819416611002e-05,
158
+ "loss": 0.1607,
159
+ "step": 12500
160
+ },
161
+ {
162
+ "epoch": 0.54,
163
+ "learning_rate": 1.638165219327544e-05,
164
+ "loss": 0.1893,
165
+ "step": 13000
166
+ },
167
+ {
168
+ "epoch": 0.56,
169
+ "learning_rate": 1.624248496993988e-05,
170
+ "loss": 0.1775,
171
+ "step": 13500
172
+ },
173
+ {
174
+ "epoch": 0.58,
175
+ "learning_rate": 1.6103317746604322e-05,
176
+ "loss": 0.1716,
177
+ "step": 14000
178
+ },
179
+ {
180
+ "epoch": 0.61,
181
+ "learning_rate": 1.596415052326876e-05,
182
+ "loss": 0.1563,
183
+ "step": 14500
184
+ },
185
+ {
186
+ "epoch": 0.63,
187
+ "learning_rate": 1.58249832999332e-05,
188
+ "loss": 0.1856,
189
+ "step": 15000
190
+ },
191
+ {
192
+ "epoch": 0.65,
193
+ "learning_rate": 1.5685816076597642e-05,
194
+ "loss": 0.1454,
195
+ "step": 15500
196
+ },
197
+ {
198
+ "epoch": 0.67,
199
+ "learning_rate": 1.554664885326208e-05,
200
+ "loss": 0.1488,
201
+ "step": 16000
202
+ },
203
+ {
204
+ "epoch": 0.69,
205
+ "learning_rate": 1.5407481629926523e-05,
206
+ "loss": 0.1879,
207
+ "step": 16500
208
+ },
209
+ {
210
+ "epoch": 0.71,
211
+ "learning_rate": 1.5268314406590962e-05,
212
+ "loss": 0.1438,
213
+ "step": 17000
214
+ },
215
+ {
216
+ "epoch": 0.73,
217
+ "learning_rate": 1.5129147183255401e-05,
218
+ "loss": 0.1797,
219
+ "step": 17500
220
+ },
221
+ {
222
+ "epoch": 0.75,
223
+ "learning_rate": 1.498997995991984e-05,
224
+ "loss": 0.1457,
225
+ "step": 18000
226
+ },
227
+ {
228
+ "epoch": 0.77,
229
+ "learning_rate": 1.485081273658428e-05,
230
+ "loss": 0.1379,
231
+ "step": 18500
232
+ },
233
+ {
234
+ "epoch": 0.79,
235
+ "learning_rate": 1.471164551324872e-05,
236
+ "loss": 0.1472,
237
+ "step": 19000
238
+ },
239
+ {
240
+ "epoch": 0.81,
241
+ "learning_rate": 1.4572478289913162e-05,
242
+ "loss": 0.1233,
243
+ "step": 19500
244
+ },
245
+ {
246
+ "epoch": 0.84,
247
+ "learning_rate": 1.44333110665776e-05,
248
+ "loss": 0.1456,
249
+ "step": 20000
250
+ },
251
+ {
252
+ "epoch": 0.86,
253
+ "learning_rate": 1.4294143843242041e-05,
254
+ "loss": 0.1444,
255
+ "step": 20500
256
+ },
257
+ {
258
+ "epoch": 0.88,
259
+ "learning_rate": 1.415497661990648e-05,
260
+ "loss": 0.1413,
261
+ "step": 21000
262
+ },
263
+ {
264
+ "epoch": 0.9,
265
+ "learning_rate": 1.4015809396570921e-05,
266
+ "loss": 0.1428,
267
+ "step": 21500
268
+ },
269
+ {
270
+ "epoch": 0.92,
271
+ "learning_rate": 1.3876642173235361e-05,
272
+ "loss": 0.1305,
273
+ "step": 22000
274
+ },
275
+ {
276
+ "epoch": 0.94,
277
+ "learning_rate": 1.37374749498998e-05,
278
+ "loss": 0.1669,
279
+ "step": 22500
280
+ },
281
+ {
282
+ "epoch": 0.96,
283
+ "learning_rate": 1.3598307726564241e-05,
284
+ "loss": 0.1417,
285
+ "step": 23000
286
+ },
287
+ {
288
+ "epoch": 0.98,
289
+ "learning_rate": 1.345914050322868e-05,
290
+ "loss": 0.1494,
291
+ "step": 23500
292
+ },
293
+ {
294
+ "epoch": 1.0,
295
+ "eval_f1": 0.9453072812291249,
296
+ "eval_loss": 0.30735036730766296,
297
+ "eval_runtime": 307.615,
298
+ "eval_samples_per_second": 77.864,
299
+ "eval_steps_per_second": 19.466,
300
+ "step": 23952
301
+ },
302
+ {
303
+ "epoch": 1.0,
304
+ "learning_rate": 1.331997327989312e-05,
305
+ "loss": 0.1028,
306
+ "step": 24000
307
+ },
308
+ {
309
+ "epoch": 1.02,
310
+ "learning_rate": 1.318080605655756e-05,
311
+ "loss": 0.0729,
312
+ "step": 24500
313
+ },
314
+ {
315
+ "epoch": 1.04,
316
+ "learning_rate": 1.3041638833222002e-05,
317
+ "loss": 0.0976,
318
+ "step": 25000
319
+ },
320
+ {
321
+ "epoch": 1.06,
322
+ "learning_rate": 1.290247160988644e-05,
323
+ "loss": 0.0782,
324
+ "step": 25500
325
+ },
326
+ {
327
+ "epoch": 1.09,
328
+ "learning_rate": 1.2763304386550881e-05,
329
+ "loss": 0.0977,
330
+ "step": 26000
331
+ },
332
+ {
333
+ "epoch": 1.11,
334
+ "learning_rate": 1.262413716321532e-05,
335
+ "loss": 0.0837,
336
+ "step": 26500
337
+ },
338
+ {
339
+ "epoch": 1.13,
340
+ "learning_rate": 1.248496993987976e-05,
341
+ "loss": 0.1268,
342
+ "step": 27000
343
+ },
344
+ {
345
+ "epoch": 1.15,
346
+ "learning_rate": 1.23458027165442e-05,
347
+ "loss": 0.0819,
348
+ "step": 27500
349
+ },
350
+ {
351
+ "epoch": 1.17,
352
+ "learning_rate": 1.220663549320864e-05,
353
+ "loss": 0.0932,
354
+ "step": 28000
355
+ },
356
+ {
357
+ "epoch": 1.19,
358
+ "learning_rate": 1.206746826987308e-05,
359
+ "loss": 0.0678,
360
+ "step": 28500
361
+ },
362
+ {
363
+ "epoch": 1.21,
364
+ "learning_rate": 1.192830104653752e-05,
365
+ "loss": 0.0864,
366
+ "step": 29000
367
+ },
368
+ {
369
+ "epoch": 1.23,
370
+ "learning_rate": 1.1789133823201962e-05,
371
+ "loss": 0.0909,
372
+ "step": 29500
373
+ },
374
+ {
375
+ "epoch": 1.25,
376
+ "learning_rate": 1.16499665998664e-05,
377
+ "loss": 0.0942,
378
+ "step": 30000
379
+ },
380
+ {
381
+ "epoch": 1.27,
382
+ "learning_rate": 1.1510799376530842e-05,
383
+ "loss": 0.0872,
384
+ "step": 30500
385
+ },
386
+ {
387
+ "epoch": 1.29,
388
+ "learning_rate": 1.137163215319528e-05,
389
+ "loss": 0.0692,
390
+ "step": 31000
391
+ },
392
+ {
393
+ "epoch": 1.32,
394
+ "learning_rate": 1.1232464929859721e-05,
395
+ "loss": 0.0876,
396
+ "step": 31500
397
+ },
398
+ {
399
+ "epoch": 1.34,
400
+ "learning_rate": 1.109329770652416e-05,
401
+ "loss": 0.079,
402
+ "step": 32000
403
+ },
404
+ {
405
+ "epoch": 1.36,
406
+ "learning_rate": 1.09541304831886e-05,
407
+ "loss": 0.0793,
408
+ "step": 32500
409
+ },
410
+ {
411
+ "epoch": 1.38,
412
+ "learning_rate": 1.081496325985304e-05,
413
+ "loss": 0.0912,
414
+ "step": 33000
415
+ },
416
+ {
417
+ "epoch": 1.4,
418
+ "learning_rate": 1.067579603651748e-05,
419
+ "loss": 0.0853,
420
+ "step": 33500
421
+ },
422
+ {
423
+ "epoch": 1.42,
424
+ "learning_rate": 1.053662881318192e-05,
425
+ "loss": 0.0807,
426
+ "step": 34000
427
+ },
428
+ {
429
+ "epoch": 1.44,
430
+ "learning_rate": 1.039746158984636e-05,
431
+ "loss": 0.0663,
432
+ "step": 34500
433
+ },
434
+ {
435
+ "epoch": 1.46,
436
+ "learning_rate": 1.0258294366510799e-05,
437
+ "loss": 0.1107,
438
+ "step": 35000
439
+ },
440
+ {
441
+ "epoch": 1.48,
442
+ "learning_rate": 1.0119127143175241e-05,
443
+ "loss": 0.0972,
444
+ "step": 35500
445
+ },
446
+ {
447
+ "epoch": 1.5,
448
+ "learning_rate": 9.97995991983968e-06,
449
+ "loss": 0.0696,
450
+ "step": 36000
451
+ },
452
+ {
453
+ "epoch": 1.52,
454
+ "learning_rate": 9.84079269650412e-06,
455
+ "loss": 0.074,
456
+ "step": 36500
457
+ },
458
+ {
459
+ "epoch": 1.54,
460
+ "learning_rate": 9.70162547316856e-06,
461
+ "loss": 0.0865,
462
+ "step": 37000
463
+ },
464
+ {
465
+ "epoch": 1.57,
466
+ "learning_rate": 9.562458249833e-06,
467
+ "loss": 0.1031,
468
+ "step": 37500
469
+ },
470
+ {
471
+ "epoch": 1.59,
472
+ "learning_rate": 9.423291026497439e-06,
473
+ "loss": 0.0582,
474
+ "step": 38000
475
+ },
476
+ {
477
+ "epoch": 1.61,
478
+ "learning_rate": 9.28412380316188e-06,
479
+ "loss": 0.094,
480
+ "step": 38500
481
+ },
482
+ {
483
+ "epoch": 1.63,
484
+ "learning_rate": 9.14495657982632e-06,
485
+ "loss": 0.0607,
486
+ "step": 39000
487
+ },
488
+ {
489
+ "epoch": 1.65,
490
+ "learning_rate": 9.005789356490761e-06,
491
+ "loss": 0.0828,
492
+ "step": 39500
493
+ },
494
+ {
495
+ "epoch": 1.67,
496
+ "learning_rate": 8.8666221331552e-06,
497
+ "loss": 0.083,
498
+ "step": 40000
499
+ },
500
+ {
501
+ "epoch": 1.69,
502
+ "learning_rate": 8.72745490981964e-06,
503
+ "loss": 0.0732,
504
+ "step": 40500
505
+ },
506
+ {
507
+ "epoch": 1.71,
508
+ "learning_rate": 8.588287686484081e-06,
509
+ "loss": 0.064,
510
+ "step": 41000
511
+ },
512
+ {
513
+ "epoch": 1.73,
514
+ "learning_rate": 8.44912046314852e-06,
515
+ "loss": 0.0883,
516
+ "step": 41500
517
+ },
518
+ {
519
+ "epoch": 1.75,
520
+ "learning_rate": 8.30995323981296e-06,
521
+ "loss": 0.0859,
522
+ "step": 42000
523
+ },
524
+ {
525
+ "epoch": 1.77,
526
+ "learning_rate": 8.1707860164774e-06,
527
+ "loss": 0.0704,
528
+ "step": 42500
529
+ },
530
+ {
531
+ "epoch": 1.8,
532
+ "learning_rate": 8.03161879314184e-06,
533
+ "loss": 0.0651,
534
+ "step": 43000
535
+ },
536
+ {
537
+ "epoch": 1.82,
538
+ "learning_rate": 7.892451569806279e-06,
539
+ "loss": 0.0832,
540
+ "step": 43500
541
+ },
542
+ {
543
+ "epoch": 1.84,
544
+ "learning_rate": 7.75328434647072e-06,
545
+ "loss": 0.0834,
546
+ "step": 44000
547
+ },
548
+ {
549
+ "epoch": 1.86,
550
+ "learning_rate": 7.614117123135159e-06,
551
+ "loss": 0.0769,
552
+ "step": 44500
553
+ },
554
+ {
555
+ "epoch": 1.88,
556
+ "learning_rate": 7.474949899799599e-06,
557
+ "loss": 0.0934,
558
+ "step": 45000
559
+ },
560
+ {
561
+ "epoch": 1.9,
562
+ "learning_rate": 7.33578267646404e-06,
563
+ "loss": 0.0711,
564
+ "step": 45500
565
+ },
566
+ {
567
+ "epoch": 1.92,
568
+ "learning_rate": 7.1966154531284795e-06,
569
+ "loss": 0.0766,
570
+ "step": 46000
571
+ },
572
+ {
573
+ "epoch": 1.94,
574
+ "learning_rate": 7.057448229792919e-06,
575
+ "loss": 0.0754,
576
+ "step": 46500
577
+ },
578
+ {
579
+ "epoch": 1.96,
580
+ "learning_rate": 6.91828100645736e-06,
581
+ "loss": 0.0664,
582
+ "step": 47000
583
+ },
584
+ {
585
+ "epoch": 1.98,
586
+ "learning_rate": 6.7791137831218005e-06,
587
+ "loss": 0.0767,
588
+ "step": 47500
589
+ },
590
+ {
591
+ "epoch": 2.0,
592
+ "eval_f1": 0.9613810955243821,
593
+ "eval_loss": 0.24657295644283295,
594
+ "eval_runtime": 308.1161,
595
+ "eval_samples_per_second": 77.737,
596
+ "eval_steps_per_second": 19.434,
597
+ "step": 47904
598
+ },
599
+ {
600
+ "epoch": 2.0,
601
+ "learning_rate": 6.63994655978624e-06,
602
+ "loss": 0.0632,
603
+ "step": 48000
604
+ },
605
+ {
606
+ "epoch": 2.02,
607
+ "learning_rate": 6.50077933645068e-06,
608
+ "loss": 0.0302,
609
+ "step": 48500
610
+ },
611
+ {
612
+ "epoch": 2.05,
613
+ "learning_rate": 6.36161211311512e-06,
614
+ "loss": 0.0557,
615
+ "step": 49000
616
+ },
617
+ {
618
+ "epoch": 2.07,
619
+ "learning_rate": 6.22244488977956e-06,
620
+ "loss": 0.0313,
621
+ "step": 49500
622
+ },
623
+ {
624
+ "epoch": 2.09,
625
+ "learning_rate": 6.083277666443999e-06,
626
+ "loss": 0.0411,
627
+ "step": 50000
628
+ },
629
+ {
630
+ "epoch": 2.11,
631
+ "learning_rate": 5.944110443108439e-06,
632
+ "loss": 0.0478,
633
+ "step": 50500
634
+ },
635
+ {
636
+ "epoch": 2.13,
637
+ "learning_rate": 5.80494321977288e-06,
638
+ "loss": 0.0324,
639
+ "step": 51000
640
+ },
641
+ {
642
+ "epoch": 2.15,
643
+ "learning_rate": 5.6657759964373195e-06,
644
+ "loss": 0.0363,
645
+ "step": 51500
646
+ },
647
+ {
648
+ "epoch": 2.17,
649
+ "learning_rate": 5.526608773101759e-06,
650
+ "loss": 0.034,
651
+ "step": 52000
652
+ },
653
+ {
654
+ "epoch": 2.19,
655
+ "learning_rate": 5.387441549766199e-06,
656
+ "loss": 0.0267,
657
+ "step": 52500
658
+ },
659
+ {
660
+ "epoch": 2.21,
661
+ "learning_rate": 5.248274326430639e-06,
662
+ "loss": 0.0418,
663
+ "step": 53000
664
+ },
665
+ {
666
+ "epoch": 2.23,
667
+ "learning_rate": 5.1091071030950786e-06,
668
+ "loss": 0.0288,
669
+ "step": 53500
670
+ },
671
+ {
672
+ "epoch": 2.25,
673
+ "learning_rate": 4.969939879759519e-06,
674
+ "loss": 0.0354,
675
+ "step": 54000
676
+ },
677
+ {
678
+ "epoch": 2.28,
679
+ "learning_rate": 4.830772656423959e-06,
680
+ "loss": 0.0415,
681
+ "step": 54500
682
+ },
683
+ {
684
+ "epoch": 2.3,
685
+ "learning_rate": 4.6916054330883996e-06,
686
+ "loss": 0.026,
687
+ "step": 55000
688
+ },
689
+ {
690
+ "epoch": 2.32,
691
+ "learning_rate": 4.552438209752839e-06,
692
+ "loss": 0.0354,
693
+ "step": 55500
694
+ },
695
+ {
696
+ "epoch": 2.34,
697
+ "learning_rate": 4.41327098641728e-06,
698
+ "loss": 0.0344,
699
+ "step": 56000
700
+ },
701
+ {
702
+ "epoch": 2.36,
703
+ "learning_rate": 4.27410376308172e-06,
704
+ "loss": 0.0238,
705
+ "step": 56500
706
+ },
707
+ {
708
+ "epoch": 2.38,
709
+ "learning_rate": 4.1349365397461595e-06,
710
+ "loss": 0.0382,
711
+ "step": 57000
712
+ },
713
+ {
714
+ "epoch": 2.4,
715
+ "learning_rate": 3.995769316410599e-06,
716
+ "loss": 0.026,
717
+ "step": 57500
718
+ },
719
+ {
720
+ "epoch": 2.42,
721
+ "learning_rate": 3.856602093075039e-06,
722
+ "loss": 0.028,
723
+ "step": 58000
724
+ },
725
+ {
726
+ "epoch": 2.44,
727
+ "learning_rate": 3.717434869739479e-06,
728
+ "loss": 0.0229,
729
+ "step": 58500
730
+ },
731
+ {
732
+ "epoch": 2.46,
733
+ "learning_rate": 3.578267646403919e-06,
734
+ "loss": 0.0372,
735
+ "step": 59000
736
+ },
737
+ {
738
+ "epoch": 2.48,
739
+ "learning_rate": 3.4391004230683596e-06,
740
+ "loss": 0.0344,
741
+ "step": 59500
742
+ },
743
+ {
744
+ "epoch": 2.51,
745
+ "learning_rate": 3.2999331997327993e-06,
746
+ "loss": 0.0179,
747
+ "step": 60000
748
+ },
749
+ {
750
+ "epoch": 2.53,
751
+ "learning_rate": 3.160765976397239e-06,
752
+ "loss": 0.0245,
753
+ "step": 60500
754
+ },
755
+ {
756
+ "epoch": 2.55,
757
+ "learning_rate": 3.0215987530616793e-06,
758
+ "loss": 0.0192,
759
+ "step": 61000
760
+ },
761
+ {
762
+ "epoch": 2.57,
763
+ "learning_rate": 2.882431529726119e-06,
764
+ "loss": 0.0185,
765
+ "step": 61500
766
+ },
767
+ {
768
+ "epoch": 2.59,
769
+ "learning_rate": 2.743264306390559e-06,
770
+ "loss": 0.0372,
771
+ "step": 62000
772
+ },
773
+ {
774
+ "epoch": 2.61,
775
+ "learning_rate": 2.604097083054999e-06,
776
+ "loss": 0.0417,
777
+ "step": 62500
778
+ },
779
+ {
780
+ "epoch": 2.63,
781
+ "learning_rate": 2.464929859719439e-06,
782
+ "loss": 0.0236,
783
+ "step": 63000
784
+ },
785
+ {
786
+ "epoch": 2.65,
787
+ "learning_rate": 2.325762636383879e-06,
788
+ "loss": 0.032,
789
+ "step": 63500
790
+ },
791
+ {
792
+ "epoch": 2.67,
793
+ "learning_rate": 2.186595413048319e-06,
794
+ "loss": 0.0356,
795
+ "step": 64000
796
+ },
797
+ {
798
+ "epoch": 2.69,
799
+ "learning_rate": 2.047428189712759e-06,
800
+ "loss": 0.0369,
801
+ "step": 64500
802
+ },
803
+ {
804
+ "epoch": 2.71,
805
+ "learning_rate": 1.908260966377199e-06,
806
+ "loss": 0.043,
807
+ "step": 65000
808
+ },
809
+ {
810
+ "epoch": 2.73,
811
+ "learning_rate": 1.769093743041639e-06,
812
+ "loss": 0.0304,
813
+ "step": 65500
814
+ },
815
+ {
816
+ "epoch": 2.76,
817
+ "learning_rate": 1.629926519706079e-06,
818
+ "loss": 0.0354,
819
+ "step": 66000
820
+ },
821
+ {
822
+ "epoch": 2.78,
823
+ "learning_rate": 1.4907592963705188e-06,
824
+ "loss": 0.026,
825
+ "step": 66500
826
+ },
827
+ {
828
+ "epoch": 2.8,
829
+ "learning_rate": 1.3515920730349588e-06,
830
+ "loss": 0.0272,
831
+ "step": 67000
832
+ },
833
+ {
834
+ "epoch": 2.82,
835
+ "learning_rate": 1.2124248496993988e-06,
836
+ "loss": 0.0231,
837
+ "step": 67500
838
+ },
839
+ {
840
+ "epoch": 2.84,
841
+ "learning_rate": 1.073257626363839e-06,
842
+ "loss": 0.0277,
843
+ "step": 68000
844
+ },
845
+ {
846
+ "epoch": 2.86,
847
+ "learning_rate": 9.340904030282788e-07,
848
+ "loss": 0.0216,
849
+ "step": 68500
850
+ },
851
+ {
852
+ "epoch": 2.88,
853
+ "learning_rate": 7.949231796927188e-07,
854
+ "loss": 0.0362,
855
+ "step": 69000
856
+ },
857
+ {
858
+ "epoch": 2.9,
859
+ "learning_rate": 6.557559563571589e-07,
860
+ "loss": 0.0167,
861
+ "step": 69500
862
+ },
863
+ {
864
+ "epoch": 2.92,
865
+ "learning_rate": 5.165887330215988e-07,
866
+ "loss": 0.0174,
867
+ "step": 70000
868
+ },
869
+ {
870
+ "epoch": 2.94,
871
+ "learning_rate": 3.7742150968603874e-07,
872
+ "loss": 0.0194,
873
+ "step": 70500
874
+ },
875
+ {
876
+ "epoch": 2.96,
877
+ "learning_rate": 2.3825428635047877e-07,
878
+ "loss": 0.0243,
879
+ "step": 71000
880
+ },
881
+ {
882
+ "epoch": 2.99,
883
+ "learning_rate": 9.908706301491873e-08,
884
+ "loss": 0.0151,
885
+ "step": 71500
886
+ },
887
+ {
888
+ "epoch": 3.0,
889
+ "eval_f1": 0.9735303941215765,
890
+ "eval_loss": 0.19777873158454895,
891
+ "eval_runtime": 307.9685,
892
+ "eval_samples_per_second": 77.774,
893
+ "eval_steps_per_second": 19.444,
894
+ "step": 71856
895
+ }
896
+ ],
897
+ "logging_steps": 500,
898
+ "max_steps": 71856,
899
+ "num_input_tokens_seen": 0,
900
+ "num_train_epochs": 3,
901
+ "save_steps": 500,
902
+ "total_flos": 7.56220639762944e+16,
903
+ "train_batch_size": 4,
904
+ "trial_name": null,
905
+ "trial_params": null
906
+ }
checkpoint-71856/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5b1005c88e7a47ac094d8a7bd77607b48d53a9b5dd46659d3ae79ec2d04818e
3
+ size 4728
checkpoint-71856/vocab.txt ADDED
The diff for this file is too large to render. See raw diff