colerobertson commited on
Commit
bac01b2
1 Parent(s): 02160f9

Training in progress, epoch 1

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. model.safetensors +1 -1
  2. run-4/checkpoint-144/config.json +80 -0
  3. run-4/checkpoint-144/model.safetensors +3 -0
  4. run-4/checkpoint-144/optimizer.pt +3 -0
  5. run-4/checkpoint-144/preprocessor_config.json +9 -0
  6. run-4/checkpoint-144/rng_state.pth +3 -0
  7. run-4/checkpoint-144/scheduler.pt +3 -0
  8. run-4/checkpoint-144/trainer_state.json +247 -0
  9. run-4/checkpoint-144/training_args.bin +3 -0
  10. run-4/checkpoint-192/config.json +80 -0
  11. run-4/checkpoint-192/model.safetensors +3 -0
  12. run-4/checkpoint-192/optimizer.pt +3 -0
  13. run-4/checkpoint-192/preprocessor_config.json +9 -0
  14. run-4/checkpoint-192/rng_state.pth +3 -0
  15. run-4/checkpoint-192/scheduler.pt +3 -0
  16. run-4/checkpoint-192/trainer_state.json +326 -0
  17. run-4/checkpoint-192/training_args.bin +3 -0
  18. run-4/checkpoint-240/config.json +80 -0
  19. run-4/checkpoint-240/model.safetensors +3 -0
  20. run-4/checkpoint-240/optimizer.pt +3 -0
  21. run-4/checkpoint-240/preprocessor_config.json +9 -0
  22. run-4/checkpoint-240/rng_state.pth +3 -0
  23. run-4/checkpoint-240/scheduler.pt +3 -0
  24. run-4/checkpoint-240/trainer_state.json +405 -0
  25. run-4/checkpoint-240/training_args.bin +3 -0
  26. run-4/checkpoint-288/config.json +80 -0
  27. run-4/checkpoint-288/model.safetensors +3 -0
  28. run-4/checkpoint-288/optimizer.pt +3 -0
  29. run-4/checkpoint-288/preprocessor_config.json +9 -0
  30. run-4/checkpoint-288/rng_state.pth +3 -0
  31. run-4/checkpoint-288/scheduler.pt +3 -0
  32. run-4/checkpoint-288/trainer_state.json +477 -0
  33. run-4/checkpoint-288/training_args.bin +3 -0
  34. run-4/checkpoint-336/config.json +80 -0
  35. run-4/checkpoint-336/model.safetensors +3 -0
  36. run-4/checkpoint-336/optimizer.pt +3 -0
  37. run-4/checkpoint-336/preprocessor_config.json +9 -0
  38. run-4/checkpoint-336/rng_state.pth +3 -0
  39. run-4/checkpoint-336/scheduler.pt +3 -0
  40. run-4/checkpoint-336/trainer_state.json +556 -0
  41. run-4/checkpoint-336/training_args.bin +3 -0
  42. run-4/checkpoint-384/config.json +80 -0
  43. run-4/checkpoint-384/model.safetensors +3 -0
  44. run-4/checkpoint-384/optimizer.pt +3 -0
  45. run-4/checkpoint-384/preprocessor_config.json +9 -0
  46. run-4/checkpoint-384/rng_state.pth +3 -0
  47. run-4/checkpoint-384/scheduler.pt +3 -0
  48. run-4/checkpoint-384/trainer_state.json +628 -0
  49. run-4/checkpoint-384/training_args.bin +3 -0
  50. run-4/checkpoint-432/config.json +80 -0
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:018f138040ad7aa88bf94f4b8deb9d4f75616181b7d07008cdde50dd03503a11
3
  size 94763496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1ed6ffdb684cc537f47f54766448db627495bfcb076b58f5bd92f9f75833010
3
  size 94763496
run-4/checkpoint-144/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }
run-4/checkpoint-144/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84be90551bec8e32bb108dda4349a49a0a709d76c0e31826f05686036125618e
3
+ size 94763496
run-4/checkpoint-144/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8185b0c73404b8fffa8583fd719233443888c59c9660c9da10c6e75c9be8a36e
3
+ size 189552570
run-4/checkpoint-144/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
run-4/checkpoint-144/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87b8bdd7b4355fd23f0b8256efb0158e4240e11263e992a13d50944c37692500
3
+ size 14244
run-4/checkpoint-144/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab5c28f22d5574a7076ae76732817bd92c67e71ad78a2880a292f813c3e7a8d1
3
+ size 1064
run-4/checkpoint-144/trainer_state.json ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7326732673267327,
3
+ "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 144,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.2733114957809448,
14
+ "learning_rate": 2.2702186710865246e-07,
15
+ "loss": 0.7025,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.21,
20
+ "grad_norm": 1.243804931640625,
21
+ "learning_rate": 4.5404373421730493e-07,
22
+ "loss": 0.6974,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.31,
27
+ "grad_norm": 1.7711552381515503,
28
+ "learning_rate": 6.810656013259573e-07,
29
+ "loss": 0.696,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.42,
34
+ "grad_norm": 1.1453403234481812,
35
+ "learning_rate": 9.080874684346099e-07,
36
+ "loss": 0.6989,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.52,
41
+ "grad_norm": 1.2729355096817017,
42
+ "learning_rate": 1.1351093355432624e-06,
43
+ "loss": 0.6968,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.62,
48
+ "grad_norm": 1.1592165231704712,
49
+ "learning_rate": 1.3621312026519146e-06,
50
+ "loss": 0.6959,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.73,
55
+ "grad_norm": 1.1798148155212402,
56
+ "learning_rate": 1.589153069760567e-06,
57
+ "loss": 0.6952,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.83,
62
+ "grad_norm": 2.1216671466827393,
63
+ "learning_rate": 1.8161749368692197e-06,
64
+ "loss": 0.6886,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.94,
69
+ "grad_norm": 1.3416370153427124,
70
+ "learning_rate": 2.043196803977872e-06,
71
+ "loss": 0.6864,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_f1": 0.72,
77
+ "eval_loss": 0.688262939453125,
78
+ "eval_runtime": 1.3468,
79
+ "eval_samples_per_second": 47.521,
80
+ "eval_steps_per_second": 5.94,
81
+ "step": 48
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 2.1856281757354736,
86
+ "learning_rate": 2.169320063482679e-06,
87
+ "loss": 0.6917,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4077153205871582,
93
+ "learning_rate": 2.1440954115817176e-06,
94
+ "loss": 0.6884,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.25,
99
+ "grad_norm": 2.1792664527893066,
100
+ "learning_rate": 2.1188707596807562e-06,
101
+ "loss": 0.6668,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 1.35,
106
+ "grad_norm": 1.0386197566986084,
107
+ "learning_rate": 2.093646107779795e-06,
108
+ "loss": 0.6694,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 1.46,
113
+ "grad_norm": 2.0565919876098633,
114
+ "learning_rate": 2.0684214558788335e-06,
115
+ "loss": 0.6561,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "grad_norm": 1.2978509664535522,
121
+ "learning_rate": 2.043196803977872e-06,
122
+ "loss": 0.6789,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 1.67,
127
+ "grad_norm": 2.058328628540039,
128
+ "learning_rate": 2.0179721520769108e-06,
129
+ "loss": 0.6633,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 1.77,
134
+ "grad_norm": 0.6023226976394653,
135
+ "learning_rate": 1.9927475001759494e-06,
136
+ "loss": 0.6655,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 1.88,
141
+ "grad_norm": 0.5510762929916382,
142
+ "learning_rate": 1.967522848274988e-06,
143
+ "loss": 0.6622,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.98,
148
+ "grad_norm": 1.098602533340454,
149
+ "learning_rate": 1.9422981963740267e-06,
150
+ "loss": 0.6633,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_f1": 0.7326732673267327,
156
+ "eval_loss": 0.6816024780273438,
157
+ "eval_runtime": 1.3765,
158
+ "eval_samples_per_second": 46.493,
159
+ "eval_steps_per_second": 5.812,
160
+ "step": 96
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.9589098691940308,
165
+ "learning_rate": 1.9170735444730654e-06,
166
+ "loss": 0.659,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "grad_norm": 1.070695161819458,
172
+ "learning_rate": 1.8918488925721038e-06,
173
+ "loss": 0.6313,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 2.29,
178
+ "grad_norm": 0.9913639426231384,
179
+ "learning_rate": 1.8666242406711424e-06,
180
+ "loss": 0.6652,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 2.4,
185
+ "grad_norm": 1.0632878541946411,
186
+ "learning_rate": 1.841399588770181e-06,
187
+ "loss": 0.673,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 2.5,
192
+ "grad_norm": 2.1036579608917236,
193
+ "learning_rate": 1.8161749368692197e-06,
194
+ "loss": 0.6451,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 2.6,
199
+ "grad_norm": 1.08384108543396,
200
+ "learning_rate": 1.7909502849682583e-06,
201
+ "loss": 0.6322,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 2.71,
206
+ "grad_norm": 0.9407000541687012,
207
+ "learning_rate": 1.765725633067297e-06,
208
+ "loss": 0.6755,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 2.81,
213
+ "grad_norm": 0.9016568660736084,
214
+ "learning_rate": 1.7405009811663356e-06,
215
+ "loss": 0.5985,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 2.92,
220
+ "grad_norm": 1.1134448051452637,
221
+ "learning_rate": 1.7152763292653743e-06,
222
+ "loss": 0.603,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 3.0,
227
+ "eval_f1": 0.7326732673267327,
228
+ "eval_loss": 0.6800689697265625,
229
+ "eval_runtime": 1.3861,
230
+ "eval_samples_per_second": 46.173,
231
+ "eval_steps_per_second": 5.772,
232
+ "step": 144
233
+ }
234
+ ],
235
+ "logging_steps": 5,
236
+ "max_steps": 480,
237
+ "num_input_tokens_seen": 0,
238
+ "num_train_epochs": 10,
239
+ "save_steps": 500,
240
+ "total_flos": 2121874430755872.0,
241
+ "train_batch_size": 4,
242
+ "trial_name": null,
243
+ "trial_params": {
244
+ "learning_rate": 2.1794099242430636e-06,
245
+ "per_device_train_batch_size": 4
246
+ }
247
+ }
run-4/checkpoint-144/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3
3
+ size 4920
run-4/checkpoint-192/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }
run-4/checkpoint-192/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04a662be668d43e34b3ede26defff9905cfe78fa0158e7f3bca9b7849a684f21
3
+ size 94763496
run-4/checkpoint-192/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:989d42785f68e982f9cc1de749a83d539a71c120933d2aae451e2785a66e335b
3
+ size 189552570
run-4/checkpoint-192/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
run-4/checkpoint-192/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5aef020ca2df517540ac9ff4e195e1c41a7b85939e93195d118078f119bc949
3
+ size 14244
run-4/checkpoint-192/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2463c4896d69602722ffa95084240326443a5f0698cb239a50fde88c55bcd421
3
+ size 1064
run-4/checkpoint-192/trainer_state.json ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7326732673267327,
3
+ "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96",
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 192,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.2733114957809448,
14
+ "learning_rate": 2.2702186710865246e-07,
15
+ "loss": 0.7025,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.21,
20
+ "grad_norm": 1.243804931640625,
21
+ "learning_rate": 4.5404373421730493e-07,
22
+ "loss": 0.6974,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.31,
27
+ "grad_norm": 1.7711552381515503,
28
+ "learning_rate": 6.810656013259573e-07,
29
+ "loss": 0.696,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.42,
34
+ "grad_norm": 1.1453403234481812,
35
+ "learning_rate": 9.080874684346099e-07,
36
+ "loss": 0.6989,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.52,
41
+ "grad_norm": 1.2729355096817017,
42
+ "learning_rate": 1.1351093355432624e-06,
43
+ "loss": 0.6968,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.62,
48
+ "grad_norm": 1.1592165231704712,
49
+ "learning_rate": 1.3621312026519146e-06,
50
+ "loss": 0.6959,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.73,
55
+ "grad_norm": 1.1798148155212402,
56
+ "learning_rate": 1.589153069760567e-06,
57
+ "loss": 0.6952,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.83,
62
+ "grad_norm": 2.1216671466827393,
63
+ "learning_rate": 1.8161749368692197e-06,
64
+ "loss": 0.6886,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.94,
69
+ "grad_norm": 1.3416370153427124,
70
+ "learning_rate": 2.043196803977872e-06,
71
+ "loss": 0.6864,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_f1": 0.72,
77
+ "eval_loss": 0.688262939453125,
78
+ "eval_runtime": 1.3468,
79
+ "eval_samples_per_second": 47.521,
80
+ "eval_steps_per_second": 5.94,
81
+ "step": 48
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 2.1856281757354736,
86
+ "learning_rate": 2.169320063482679e-06,
87
+ "loss": 0.6917,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4077153205871582,
93
+ "learning_rate": 2.1440954115817176e-06,
94
+ "loss": 0.6884,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.25,
99
+ "grad_norm": 2.1792664527893066,
100
+ "learning_rate": 2.1188707596807562e-06,
101
+ "loss": 0.6668,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 1.35,
106
+ "grad_norm": 1.0386197566986084,
107
+ "learning_rate": 2.093646107779795e-06,
108
+ "loss": 0.6694,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 1.46,
113
+ "grad_norm": 2.0565919876098633,
114
+ "learning_rate": 2.0684214558788335e-06,
115
+ "loss": 0.6561,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "grad_norm": 1.2978509664535522,
121
+ "learning_rate": 2.043196803977872e-06,
122
+ "loss": 0.6789,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 1.67,
127
+ "grad_norm": 2.058328628540039,
128
+ "learning_rate": 2.0179721520769108e-06,
129
+ "loss": 0.6633,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 1.77,
134
+ "grad_norm": 0.6023226976394653,
135
+ "learning_rate": 1.9927475001759494e-06,
136
+ "loss": 0.6655,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 1.88,
141
+ "grad_norm": 0.5510762929916382,
142
+ "learning_rate": 1.967522848274988e-06,
143
+ "loss": 0.6622,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.98,
148
+ "grad_norm": 1.098602533340454,
149
+ "learning_rate": 1.9422981963740267e-06,
150
+ "loss": 0.6633,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_f1": 0.7326732673267327,
156
+ "eval_loss": 0.6816024780273438,
157
+ "eval_runtime": 1.3765,
158
+ "eval_samples_per_second": 46.493,
159
+ "eval_steps_per_second": 5.812,
160
+ "step": 96
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.9589098691940308,
165
+ "learning_rate": 1.9170735444730654e-06,
166
+ "loss": 0.659,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "grad_norm": 1.070695161819458,
172
+ "learning_rate": 1.8918488925721038e-06,
173
+ "loss": 0.6313,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 2.29,
178
+ "grad_norm": 0.9913639426231384,
179
+ "learning_rate": 1.8666242406711424e-06,
180
+ "loss": 0.6652,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 2.4,
185
+ "grad_norm": 1.0632878541946411,
186
+ "learning_rate": 1.841399588770181e-06,
187
+ "loss": 0.673,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 2.5,
192
+ "grad_norm": 2.1036579608917236,
193
+ "learning_rate": 1.8161749368692197e-06,
194
+ "loss": 0.6451,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 2.6,
199
+ "grad_norm": 1.08384108543396,
200
+ "learning_rate": 1.7909502849682583e-06,
201
+ "loss": 0.6322,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 2.71,
206
+ "grad_norm": 0.9407000541687012,
207
+ "learning_rate": 1.765725633067297e-06,
208
+ "loss": 0.6755,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 2.81,
213
+ "grad_norm": 0.9016568660736084,
214
+ "learning_rate": 1.7405009811663356e-06,
215
+ "loss": 0.5985,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 2.92,
220
+ "grad_norm": 1.1134448051452637,
221
+ "learning_rate": 1.7152763292653743e-06,
222
+ "loss": 0.603,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 3.0,
227
+ "eval_f1": 0.7326732673267327,
228
+ "eval_loss": 0.6800689697265625,
229
+ "eval_runtime": 1.3861,
230
+ "eval_samples_per_second": 46.173,
231
+ "eval_steps_per_second": 5.772,
232
+ "step": 144
233
+ },
234
+ {
235
+ "epoch": 3.02,
236
+ "grad_norm": 0.7627719640731812,
237
+ "learning_rate": 1.6900516773644127e-06,
238
+ "loss": 0.6557,
239
+ "step": 145
240
+ },
241
+ {
242
+ "epoch": 3.12,
243
+ "grad_norm": 0.9291415214538574,
244
+ "learning_rate": 1.6648270254634511e-06,
245
+ "loss": 0.6219,
246
+ "step": 150
247
+ },
248
+ {
249
+ "epoch": 3.23,
250
+ "grad_norm": 0.9248765707015991,
251
+ "learning_rate": 1.6396023735624898e-06,
252
+ "loss": 0.6325,
253
+ "step": 155
254
+ },
255
+ {
256
+ "epoch": 3.33,
257
+ "grad_norm": 0.9842573404312134,
258
+ "learning_rate": 1.6143777216615284e-06,
259
+ "loss": 0.6521,
260
+ "step": 160
261
+ },
262
+ {
263
+ "epoch": 3.44,
264
+ "grad_norm": 0.8689214587211609,
265
+ "learning_rate": 1.589153069760567e-06,
266
+ "loss": 0.5929,
267
+ "step": 165
268
+ },
269
+ {
270
+ "epoch": 3.54,
271
+ "grad_norm": 1.0012000799179077,
272
+ "learning_rate": 1.5639284178596057e-06,
273
+ "loss": 0.584,
274
+ "step": 170
275
+ },
276
+ {
277
+ "epoch": 3.65,
278
+ "grad_norm": 0.7438368797302246,
279
+ "learning_rate": 1.5387037659586443e-06,
280
+ "loss": 0.6813,
281
+ "step": 175
282
+ },
283
+ {
284
+ "epoch": 3.75,
285
+ "grad_norm": 1.8603870868682861,
286
+ "learning_rate": 1.513479114057683e-06,
287
+ "loss": 0.6099,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 3.85,
292
+ "grad_norm": 0.9918416738510132,
293
+ "learning_rate": 1.4882544621567216e-06,
294
+ "loss": 0.6192,
295
+ "step": 185
296
+ },
297
+ {
298
+ "epoch": 3.96,
299
+ "grad_norm": 1.9146322011947632,
300
+ "learning_rate": 1.4630298102557603e-06,
301
+ "loss": 0.6472,
302
+ "step": 190
303
+ },
304
+ {
305
+ "epoch": 4.0,
306
+ "eval_f1": 0.7326732673267327,
307
+ "eval_loss": 0.6818161010742188,
308
+ "eval_runtime": 1.3841,
309
+ "eval_samples_per_second": 46.239,
310
+ "eval_steps_per_second": 5.78,
311
+ "step": 192
312
+ }
313
+ ],
314
+ "logging_steps": 5,
315
+ "max_steps": 480,
316
+ "num_input_tokens_seen": 0,
317
+ "num_train_epochs": 10,
318
+ "save_steps": 500,
319
+ "total_flos": 2891755054954176.0,
320
+ "train_batch_size": 4,
321
+ "trial_name": null,
322
+ "trial_params": {
323
+ "learning_rate": 2.1794099242430636e-06,
324
+ "per_device_train_batch_size": 4
325
+ }
326
+ }
run-4/checkpoint-192/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3
3
+ size 4920
run-4/checkpoint-240/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }
run-4/checkpoint-240/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:336f5153e14e700e4c4e5e048754a4434d79bdee3267bc3a53cbb003b875ea7e
3
+ size 94763496
run-4/checkpoint-240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22b1916abc98dec4fd4f1436e9d8dd462ba5c4401dddd0941a36031f3bc679ff
3
+ size 189552570
run-4/checkpoint-240/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
run-4/checkpoint-240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8eb071c49709b6f4047e7f48105f0dd51daaf73e0a11fd742255aa4c3526f42
3
+ size 14244
run-4/checkpoint-240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2e35d3649cf053f4cc488647dacb8d6e20774271115c5998276bc7752ca7e23
3
+ size 1064
run-4/checkpoint-240/trainer_state.json ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7326732673267327,
3
+ "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96",
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 240,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.2733114957809448,
14
+ "learning_rate": 2.2702186710865246e-07,
15
+ "loss": 0.7025,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.21,
20
+ "grad_norm": 1.243804931640625,
21
+ "learning_rate": 4.5404373421730493e-07,
22
+ "loss": 0.6974,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.31,
27
+ "grad_norm": 1.7711552381515503,
28
+ "learning_rate": 6.810656013259573e-07,
29
+ "loss": 0.696,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.42,
34
+ "grad_norm": 1.1453403234481812,
35
+ "learning_rate": 9.080874684346099e-07,
36
+ "loss": 0.6989,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.52,
41
+ "grad_norm": 1.2729355096817017,
42
+ "learning_rate": 1.1351093355432624e-06,
43
+ "loss": 0.6968,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.62,
48
+ "grad_norm": 1.1592165231704712,
49
+ "learning_rate": 1.3621312026519146e-06,
50
+ "loss": 0.6959,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.73,
55
+ "grad_norm": 1.1798148155212402,
56
+ "learning_rate": 1.589153069760567e-06,
57
+ "loss": 0.6952,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.83,
62
+ "grad_norm": 2.1216671466827393,
63
+ "learning_rate": 1.8161749368692197e-06,
64
+ "loss": 0.6886,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.94,
69
+ "grad_norm": 1.3416370153427124,
70
+ "learning_rate": 2.043196803977872e-06,
71
+ "loss": 0.6864,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_f1": 0.72,
77
+ "eval_loss": 0.688262939453125,
78
+ "eval_runtime": 1.3468,
79
+ "eval_samples_per_second": 47.521,
80
+ "eval_steps_per_second": 5.94,
81
+ "step": 48
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 2.1856281757354736,
86
+ "learning_rate": 2.169320063482679e-06,
87
+ "loss": 0.6917,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4077153205871582,
93
+ "learning_rate": 2.1440954115817176e-06,
94
+ "loss": 0.6884,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.25,
99
+ "grad_norm": 2.1792664527893066,
100
+ "learning_rate": 2.1188707596807562e-06,
101
+ "loss": 0.6668,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 1.35,
106
+ "grad_norm": 1.0386197566986084,
107
+ "learning_rate": 2.093646107779795e-06,
108
+ "loss": 0.6694,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 1.46,
113
+ "grad_norm": 2.0565919876098633,
114
+ "learning_rate": 2.0684214558788335e-06,
115
+ "loss": 0.6561,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "grad_norm": 1.2978509664535522,
121
+ "learning_rate": 2.043196803977872e-06,
122
+ "loss": 0.6789,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 1.67,
127
+ "grad_norm": 2.058328628540039,
128
+ "learning_rate": 2.0179721520769108e-06,
129
+ "loss": 0.6633,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 1.77,
134
+ "grad_norm": 0.6023226976394653,
135
+ "learning_rate": 1.9927475001759494e-06,
136
+ "loss": 0.6655,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 1.88,
141
+ "grad_norm": 0.5510762929916382,
142
+ "learning_rate": 1.967522848274988e-06,
143
+ "loss": 0.6622,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.98,
148
+ "grad_norm": 1.098602533340454,
149
+ "learning_rate": 1.9422981963740267e-06,
150
+ "loss": 0.6633,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_f1": 0.7326732673267327,
156
+ "eval_loss": 0.6816024780273438,
157
+ "eval_runtime": 1.3765,
158
+ "eval_samples_per_second": 46.493,
159
+ "eval_steps_per_second": 5.812,
160
+ "step": 96
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.9589098691940308,
165
+ "learning_rate": 1.9170735444730654e-06,
166
+ "loss": 0.659,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "grad_norm": 1.070695161819458,
172
+ "learning_rate": 1.8918488925721038e-06,
173
+ "loss": 0.6313,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 2.29,
178
+ "grad_norm": 0.9913639426231384,
179
+ "learning_rate": 1.8666242406711424e-06,
180
+ "loss": 0.6652,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 2.4,
185
+ "grad_norm": 1.0632878541946411,
186
+ "learning_rate": 1.841399588770181e-06,
187
+ "loss": 0.673,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 2.5,
192
+ "grad_norm": 2.1036579608917236,
193
+ "learning_rate": 1.8161749368692197e-06,
194
+ "loss": 0.6451,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 2.6,
199
+ "grad_norm": 1.08384108543396,
200
+ "learning_rate": 1.7909502849682583e-06,
201
+ "loss": 0.6322,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 2.71,
206
+ "grad_norm": 0.9407000541687012,
207
+ "learning_rate": 1.765725633067297e-06,
208
+ "loss": 0.6755,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 2.81,
213
+ "grad_norm": 0.9016568660736084,
214
+ "learning_rate": 1.7405009811663356e-06,
215
+ "loss": 0.5985,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 2.92,
220
+ "grad_norm": 1.1134448051452637,
221
+ "learning_rate": 1.7152763292653743e-06,
222
+ "loss": 0.603,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 3.0,
227
+ "eval_f1": 0.7326732673267327,
228
+ "eval_loss": 0.6800689697265625,
229
+ "eval_runtime": 1.3861,
230
+ "eval_samples_per_second": 46.173,
231
+ "eval_steps_per_second": 5.772,
232
+ "step": 144
233
+ },
234
+ {
235
+ "epoch": 3.02,
236
+ "grad_norm": 0.7627719640731812,
237
+ "learning_rate": 1.6900516773644127e-06,
238
+ "loss": 0.6557,
239
+ "step": 145
240
+ },
241
+ {
242
+ "epoch": 3.12,
243
+ "grad_norm": 0.9291415214538574,
244
+ "learning_rate": 1.6648270254634511e-06,
245
+ "loss": 0.6219,
246
+ "step": 150
247
+ },
248
+ {
249
+ "epoch": 3.23,
250
+ "grad_norm": 0.9248765707015991,
251
+ "learning_rate": 1.6396023735624898e-06,
252
+ "loss": 0.6325,
253
+ "step": 155
254
+ },
255
+ {
256
+ "epoch": 3.33,
257
+ "grad_norm": 0.9842573404312134,
258
+ "learning_rate": 1.6143777216615284e-06,
259
+ "loss": 0.6521,
260
+ "step": 160
261
+ },
262
+ {
263
+ "epoch": 3.44,
264
+ "grad_norm": 0.8689214587211609,
265
+ "learning_rate": 1.589153069760567e-06,
266
+ "loss": 0.5929,
267
+ "step": 165
268
+ },
269
+ {
270
+ "epoch": 3.54,
271
+ "grad_norm": 1.0012000799179077,
272
+ "learning_rate": 1.5639284178596057e-06,
273
+ "loss": 0.584,
274
+ "step": 170
275
+ },
276
+ {
277
+ "epoch": 3.65,
278
+ "grad_norm": 0.7438368797302246,
279
+ "learning_rate": 1.5387037659586443e-06,
280
+ "loss": 0.6813,
281
+ "step": 175
282
+ },
283
+ {
284
+ "epoch": 3.75,
285
+ "grad_norm": 1.8603870868682861,
286
+ "learning_rate": 1.513479114057683e-06,
287
+ "loss": 0.6099,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 3.85,
292
+ "grad_norm": 0.9918416738510132,
293
+ "learning_rate": 1.4882544621567216e-06,
294
+ "loss": 0.6192,
295
+ "step": 185
296
+ },
297
+ {
298
+ "epoch": 3.96,
299
+ "grad_norm": 1.9146322011947632,
300
+ "learning_rate": 1.4630298102557603e-06,
301
+ "loss": 0.6472,
302
+ "step": 190
303
+ },
304
+ {
305
+ "epoch": 4.0,
306
+ "eval_f1": 0.7326732673267327,
307
+ "eval_loss": 0.6818161010742188,
308
+ "eval_runtime": 1.3841,
309
+ "eval_samples_per_second": 46.239,
310
+ "eval_steps_per_second": 5.78,
311
+ "step": 192
312
+ },
313
+ {
314
+ "epoch": 4.06,
315
+ "grad_norm": 0.9502781629562378,
316
+ "learning_rate": 1.437805158354799e-06,
317
+ "loss": 0.6447,
318
+ "step": 195
319
+ },
320
+ {
321
+ "epoch": 4.17,
322
+ "grad_norm": 0.8570067286491394,
323
+ "learning_rate": 1.4125805064538375e-06,
324
+ "loss": 0.5306,
325
+ "step": 200
326
+ },
327
+ {
328
+ "epoch": 4.27,
329
+ "grad_norm": 0.8097484111785889,
330
+ "learning_rate": 1.3873558545528762e-06,
331
+ "loss": 0.6202,
332
+ "step": 205
333
+ },
334
+ {
335
+ "epoch": 4.38,
336
+ "grad_norm": 2.0106472969055176,
337
+ "learning_rate": 1.3621312026519146e-06,
338
+ "loss": 0.6705,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 4.48,
343
+ "grad_norm": 1.090775489807129,
344
+ "learning_rate": 1.3369065507509533e-06,
345
+ "loss": 0.6297,
346
+ "step": 215
347
+ },
348
+ {
349
+ "epoch": 4.58,
350
+ "grad_norm": 0.8988145589828491,
351
+ "learning_rate": 1.311681898849992e-06,
352
+ "loss": 0.5896,
353
+ "step": 220
354
+ },
355
+ {
356
+ "epoch": 4.69,
357
+ "grad_norm": 0.9149978756904602,
358
+ "learning_rate": 1.2864572469490305e-06,
359
+ "loss": 0.6156,
360
+ "step": 225
361
+ },
362
+ {
363
+ "epoch": 4.79,
364
+ "grad_norm": 1.9398412704467773,
365
+ "learning_rate": 1.2612325950480692e-06,
366
+ "loss": 0.6305,
367
+ "step": 230
368
+ },
369
+ {
370
+ "epoch": 4.9,
371
+ "grad_norm": 0.9217966794967651,
372
+ "learning_rate": 1.2360079431471078e-06,
373
+ "loss": 0.5943,
374
+ "step": 235
375
+ },
376
+ {
377
+ "epoch": 5.0,
378
+ "grad_norm": 0.9083653688430786,
379
+ "learning_rate": 1.2107832912461465e-06,
380
+ "loss": 0.6386,
381
+ "step": 240
382
+ },
383
+ {
384
+ "epoch": 5.0,
385
+ "eval_f1": 0.7326732673267327,
386
+ "eval_loss": 0.6846389770507812,
387
+ "eval_runtime": 1.4094,
388
+ "eval_samples_per_second": 45.409,
389
+ "eval_steps_per_second": 5.676,
390
+ "step": 240
391
+ }
392
+ ],
393
+ "logging_steps": 5,
394
+ "max_steps": 480,
395
+ "num_input_tokens_seen": 0,
396
+ "num_train_epochs": 10,
397
+ "save_steps": 500,
398
+ "total_flos": 3654362860415712.0,
399
+ "train_batch_size": 4,
400
+ "trial_name": null,
401
+ "trial_params": {
402
+ "learning_rate": 2.1794099242430636e-06,
403
+ "per_device_train_batch_size": 4
404
+ }
405
+ }
run-4/checkpoint-240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3
3
+ size 4920
run-4/checkpoint-288/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }
run-4/checkpoint-288/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be27592b57346e1ba7fb5ac856d5e685d4d7d316009088a8a4a6b512cb22d9a4
3
+ size 94763496
run-4/checkpoint-288/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c627abefd26f649e146eb3f7adc646b23cae944e9576351a11327cb5a485e7d
3
+ size 189552570
run-4/checkpoint-288/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
run-4/checkpoint-288/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9903236b654011babeaee26ea70e1c6278fa670549b900c6df1d64732428a642
3
+ size 14244
run-4/checkpoint-288/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2163f47f6ce948a3a821c2032fa9ddfd17ee0992318acd25664f5287b23dd105
3
+ size 1064
run-4/checkpoint-288/trainer_state.json ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7326732673267327,
3
+ "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96",
4
+ "epoch": 6.0,
5
+ "eval_steps": 500,
6
+ "global_step": 288,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.2733114957809448,
14
+ "learning_rate": 2.2702186710865246e-07,
15
+ "loss": 0.7025,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.21,
20
+ "grad_norm": 1.243804931640625,
21
+ "learning_rate": 4.5404373421730493e-07,
22
+ "loss": 0.6974,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.31,
27
+ "grad_norm": 1.7711552381515503,
28
+ "learning_rate": 6.810656013259573e-07,
29
+ "loss": 0.696,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.42,
34
+ "grad_norm": 1.1453403234481812,
35
+ "learning_rate": 9.080874684346099e-07,
36
+ "loss": 0.6989,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.52,
41
+ "grad_norm": 1.2729355096817017,
42
+ "learning_rate": 1.1351093355432624e-06,
43
+ "loss": 0.6968,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.62,
48
+ "grad_norm": 1.1592165231704712,
49
+ "learning_rate": 1.3621312026519146e-06,
50
+ "loss": 0.6959,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.73,
55
+ "grad_norm": 1.1798148155212402,
56
+ "learning_rate": 1.589153069760567e-06,
57
+ "loss": 0.6952,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.83,
62
+ "grad_norm": 2.1216671466827393,
63
+ "learning_rate": 1.8161749368692197e-06,
64
+ "loss": 0.6886,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.94,
69
+ "grad_norm": 1.3416370153427124,
70
+ "learning_rate": 2.043196803977872e-06,
71
+ "loss": 0.6864,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_f1": 0.72,
77
+ "eval_loss": 0.688262939453125,
78
+ "eval_runtime": 1.3468,
79
+ "eval_samples_per_second": 47.521,
80
+ "eval_steps_per_second": 5.94,
81
+ "step": 48
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 2.1856281757354736,
86
+ "learning_rate": 2.169320063482679e-06,
87
+ "loss": 0.6917,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4077153205871582,
93
+ "learning_rate": 2.1440954115817176e-06,
94
+ "loss": 0.6884,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.25,
99
+ "grad_norm": 2.1792664527893066,
100
+ "learning_rate": 2.1188707596807562e-06,
101
+ "loss": 0.6668,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 1.35,
106
+ "grad_norm": 1.0386197566986084,
107
+ "learning_rate": 2.093646107779795e-06,
108
+ "loss": 0.6694,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 1.46,
113
+ "grad_norm": 2.0565919876098633,
114
+ "learning_rate": 2.0684214558788335e-06,
115
+ "loss": 0.6561,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "grad_norm": 1.2978509664535522,
121
+ "learning_rate": 2.043196803977872e-06,
122
+ "loss": 0.6789,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 1.67,
127
+ "grad_norm": 2.058328628540039,
128
+ "learning_rate": 2.0179721520769108e-06,
129
+ "loss": 0.6633,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 1.77,
134
+ "grad_norm": 0.6023226976394653,
135
+ "learning_rate": 1.9927475001759494e-06,
136
+ "loss": 0.6655,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 1.88,
141
+ "grad_norm": 0.5510762929916382,
142
+ "learning_rate": 1.967522848274988e-06,
143
+ "loss": 0.6622,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.98,
148
+ "grad_norm": 1.098602533340454,
149
+ "learning_rate": 1.9422981963740267e-06,
150
+ "loss": 0.6633,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_f1": 0.7326732673267327,
156
+ "eval_loss": 0.6816024780273438,
157
+ "eval_runtime": 1.3765,
158
+ "eval_samples_per_second": 46.493,
159
+ "eval_steps_per_second": 5.812,
160
+ "step": 96
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.9589098691940308,
165
+ "learning_rate": 1.9170735444730654e-06,
166
+ "loss": 0.659,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "grad_norm": 1.070695161819458,
172
+ "learning_rate": 1.8918488925721038e-06,
173
+ "loss": 0.6313,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 2.29,
178
+ "grad_norm": 0.9913639426231384,
179
+ "learning_rate": 1.8666242406711424e-06,
180
+ "loss": 0.6652,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 2.4,
185
+ "grad_norm": 1.0632878541946411,
186
+ "learning_rate": 1.841399588770181e-06,
187
+ "loss": 0.673,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 2.5,
192
+ "grad_norm": 2.1036579608917236,
193
+ "learning_rate": 1.8161749368692197e-06,
194
+ "loss": 0.6451,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 2.6,
199
+ "grad_norm": 1.08384108543396,
200
+ "learning_rate": 1.7909502849682583e-06,
201
+ "loss": 0.6322,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 2.71,
206
+ "grad_norm": 0.9407000541687012,
207
+ "learning_rate": 1.765725633067297e-06,
208
+ "loss": 0.6755,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 2.81,
213
+ "grad_norm": 0.9016568660736084,
214
+ "learning_rate": 1.7405009811663356e-06,
215
+ "loss": 0.5985,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 2.92,
220
+ "grad_norm": 1.1134448051452637,
221
+ "learning_rate": 1.7152763292653743e-06,
222
+ "loss": 0.603,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 3.0,
227
+ "eval_f1": 0.7326732673267327,
228
+ "eval_loss": 0.6800689697265625,
229
+ "eval_runtime": 1.3861,
230
+ "eval_samples_per_second": 46.173,
231
+ "eval_steps_per_second": 5.772,
232
+ "step": 144
233
+ },
234
+ {
235
+ "epoch": 3.02,
236
+ "grad_norm": 0.7627719640731812,
237
+ "learning_rate": 1.6900516773644127e-06,
238
+ "loss": 0.6557,
239
+ "step": 145
240
+ },
241
+ {
242
+ "epoch": 3.12,
243
+ "grad_norm": 0.9291415214538574,
244
+ "learning_rate": 1.6648270254634511e-06,
245
+ "loss": 0.6219,
246
+ "step": 150
247
+ },
248
+ {
249
+ "epoch": 3.23,
250
+ "grad_norm": 0.9248765707015991,
251
+ "learning_rate": 1.6396023735624898e-06,
252
+ "loss": 0.6325,
253
+ "step": 155
254
+ },
255
+ {
256
+ "epoch": 3.33,
257
+ "grad_norm": 0.9842573404312134,
258
+ "learning_rate": 1.6143777216615284e-06,
259
+ "loss": 0.6521,
260
+ "step": 160
261
+ },
262
+ {
263
+ "epoch": 3.44,
264
+ "grad_norm": 0.8689214587211609,
265
+ "learning_rate": 1.589153069760567e-06,
266
+ "loss": 0.5929,
267
+ "step": 165
268
+ },
269
+ {
270
+ "epoch": 3.54,
271
+ "grad_norm": 1.0012000799179077,
272
+ "learning_rate": 1.5639284178596057e-06,
273
+ "loss": 0.584,
274
+ "step": 170
275
+ },
276
+ {
277
+ "epoch": 3.65,
278
+ "grad_norm": 0.7438368797302246,
279
+ "learning_rate": 1.5387037659586443e-06,
280
+ "loss": 0.6813,
281
+ "step": 175
282
+ },
283
+ {
284
+ "epoch": 3.75,
285
+ "grad_norm": 1.8603870868682861,
286
+ "learning_rate": 1.513479114057683e-06,
287
+ "loss": 0.6099,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 3.85,
292
+ "grad_norm": 0.9918416738510132,
293
+ "learning_rate": 1.4882544621567216e-06,
294
+ "loss": 0.6192,
295
+ "step": 185
296
+ },
297
+ {
298
+ "epoch": 3.96,
299
+ "grad_norm": 1.9146322011947632,
300
+ "learning_rate": 1.4630298102557603e-06,
301
+ "loss": 0.6472,
302
+ "step": 190
303
+ },
304
+ {
305
+ "epoch": 4.0,
306
+ "eval_f1": 0.7326732673267327,
307
+ "eval_loss": 0.6818161010742188,
308
+ "eval_runtime": 1.3841,
309
+ "eval_samples_per_second": 46.239,
310
+ "eval_steps_per_second": 5.78,
311
+ "step": 192
312
+ },
313
+ {
314
+ "epoch": 4.06,
315
+ "grad_norm": 0.9502781629562378,
316
+ "learning_rate": 1.437805158354799e-06,
317
+ "loss": 0.6447,
318
+ "step": 195
319
+ },
320
+ {
321
+ "epoch": 4.17,
322
+ "grad_norm": 0.8570067286491394,
323
+ "learning_rate": 1.4125805064538375e-06,
324
+ "loss": 0.5306,
325
+ "step": 200
326
+ },
327
+ {
328
+ "epoch": 4.27,
329
+ "grad_norm": 0.8097484111785889,
330
+ "learning_rate": 1.3873558545528762e-06,
331
+ "loss": 0.6202,
332
+ "step": 205
333
+ },
334
+ {
335
+ "epoch": 4.38,
336
+ "grad_norm": 2.0106472969055176,
337
+ "learning_rate": 1.3621312026519146e-06,
338
+ "loss": 0.6705,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 4.48,
343
+ "grad_norm": 1.090775489807129,
344
+ "learning_rate": 1.3369065507509533e-06,
345
+ "loss": 0.6297,
346
+ "step": 215
347
+ },
348
+ {
349
+ "epoch": 4.58,
350
+ "grad_norm": 0.8988145589828491,
351
+ "learning_rate": 1.311681898849992e-06,
352
+ "loss": 0.5896,
353
+ "step": 220
354
+ },
355
+ {
356
+ "epoch": 4.69,
357
+ "grad_norm": 0.9149978756904602,
358
+ "learning_rate": 1.2864572469490305e-06,
359
+ "loss": 0.6156,
360
+ "step": 225
361
+ },
362
+ {
363
+ "epoch": 4.79,
364
+ "grad_norm": 1.9398412704467773,
365
+ "learning_rate": 1.2612325950480692e-06,
366
+ "loss": 0.6305,
367
+ "step": 230
368
+ },
369
+ {
370
+ "epoch": 4.9,
371
+ "grad_norm": 0.9217966794967651,
372
+ "learning_rate": 1.2360079431471078e-06,
373
+ "loss": 0.5943,
374
+ "step": 235
375
+ },
376
+ {
377
+ "epoch": 5.0,
378
+ "grad_norm": 0.9083653688430786,
379
+ "learning_rate": 1.2107832912461465e-06,
380
+ "loss": 0.6386,
381
+ "step": 240
382
+ },
383
+ {
384
+ "epoch": 5.0,
385
+ "eval_f1": 0.7326732673267327,
386
+ "eval_loss": 0.6846389770507812,
387
+ "eval_runtime": 1.4094,
388
+ "eval_samples_per_second": 45.409,
389
+ "eval_steps_per_second": 5.676,
390
+ "step": 240
391
+ },
392
+ {
393
+ "epoch": 5.1,
394
+ "grad_norm": 0.9323675036430359,
395
+ "learning_rate": 1.1855586393451851e-06,
396
+ "loss": 0.5779,
397
+ "step": 245
398
+ },
399
+ {
400
+ "epoch": 5.21,
401
+ "grad_norm": 0.7549787163734436,
402
+ "learning_rate": 1.1603339874442238e-06,
403
+ "loss": 0.5948,
404
+ "step": 250
405
+ },
406
+ {
407
+ "epoch": 5.31,
408
+ "grad_norm": 0.8535837531089783,
409
+ "learning_rate": 1.1351093355432624e-06,
410
+ "loss": 0.6928,
411
+ "step": 255
412
+ },
413
+ {
414
+ "epoch": 5.42,
415
+ "grad_norm": 1.2038137912750244,
416
+ "learning_rate": 1.109884683642301e-06,
417
+ "loss": 0.5887,
418
+ "step": 260
419
+ },
420
+ {
421
+ "epoch": 5.52,
422
+ "grad_norm": 0.9501279592514038,
423
+ "learning_rate": 1.0846600317413395e-06,
424
+ "loss": 0.5776,
425
+ "step": 265
426
+ },
427
+ {
428
+ "epoch": 5.62,
429
+ "grad_norm": 0.7421719431877136,
430
+ "learning_rate": 1.0594353798403781e-06,
431
+ "loss": 0.6734,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 5.73,
436
+ "grad_norm": 0.8555863499641418,
437
+ "learning_rate": 1.0342107279394168e-06,
438
+ "loss": 0.6399,
439
+ "step": 275
440
+ },
441
+ {
442
+ "epoch": 5.83,
443
+ "grad_norm": 0.8841156363487244,
444
+ "learning_rate": 1.0089860760384554e-06,
445
+ "loss": 0.6173,
446
+ "step": 280
447
+ },
448
+ {
449
+ "epoch": 5.94,
450
+ "grad_norm": 0.8565478324890137,
451
+ "learning_rate": 9.83761424137494e-07,
452
+ "loss": 0.5537,
453
+ "step": 285
454
+ },
455
+ {
456
+ "epoch": 6.0,
457
+ "eval_f1": 0.7326732673267327,
458
+ "eval_loss": 0.6864242553710938,
459
+ "eval_runtime": 1.3622,
460
+ "eval_samples_per_second": 46.983,
461
+ "eval_steps_per_second": 5.873,
462
+ "step": 288
463
+ }
464
+ ],
465
+ "logging_steps": 5,
466
+ "max_steps": 480,
467
+ "num_input_tokens_seen": 0,
468
+ "num_train_epochs": 10,
469
+ "save_steps": 500,
470
+ "total_flos": 4356539523260784.0,
471
+ "train_batch_size": 4,
472
+ "trial_name": null,
473
+ "trial_params": {
474
+ "learning_rate": 2.1794099242430636e-06,
475
+ "per_device_train_batch_size": 4
476
+ }
477
+ }
run-4/checkpoint-288/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3
3
+ size 4920
run-4/checkpoint-336/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }
run-4/checkpoint-336/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3425a160cf28659111e87b6ab601bd4f64cd88b8baa924505a0ed42513beb151
3
+ size 94763496
run-4/checkpoint-336/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a80e51af62d25c253503b816ec7b98c85cc9ba062954c975249337e8ce646afe
3
+ size 189552570
run-4/checkpoint-336/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
run-4/checkpoint-336/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d12884ae20f0c926a355fda8650edc055a398d4c7c42545ccdb7d60bd202452
3
+ size 14244
run-4/checkpoint-336/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0d819fb0c1d431c458fec913aa5469b8a7ed1313a0add2ae71b76c0b9a0a219
3
+ size 1064
run-4/checkpoint-336/trainer_state.json ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7326732673267327,
3
+ "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96",
4
+ "epoch": 7.0,
5
+ "eval_steps": 500,
6
+ "global_step": 336,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.2733114957809448,
14
+ "learning_rate": 2.2702186710865246e-07,
15
+ "loss": 0.7025,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.21,
20
+ "grad_norm": 1.243804931640625,
21
+ "learning_rate": 4.5404373421730493e-07,
22
+ "loss": 0.6974,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.31,
27
+ "grad_norm": 1.7711552381515503,
28
+ "learning_rate": 6.810656013259573e-07,
29
+ "loss": 0.696,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.42,
34
+ "grad_norm": 1.1453403234481812,
35
+ "learning_rate": 9.080874684346099e-07,
36
+ "loss": 0.6989,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.52,
41
+ "grad_norm": 1.2729355096817017,
42
+ "learning_rate": 1.1351093355432624e-06,
43
+ "loss": 0.6968,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.62,
48
+ "grad_norm": 1.1592165231704712,
49
+ "learning_rate": 1.3621312026519146e-06,
50
+ "loss": 0.6959,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.73,
55
+ "grad_norm": 1.1798148155212402,
56
+ "learning_rate": 1.589153069760567e-06,
57
+ "loss": 0.6952,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.83,
62
+ "grad_norm": 2.1216671466827393,
63
+ "learning_rate": 1.8161749368692197e-06,
64
+ "loss": 0.6886,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.94,
69
+ "grad_norm": 1.3416370153427124,
70
+ "learning_rate": 2.043196803977872e-06,
71
+ "loss": 0.6864,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_f1": 0.72,
77
+ "eval_loss": 0.688262939453125,
78
+ "eval_runtime": 1.3468,
79
+ "eval_samples_per_second": 47.521,
80
+ "eval_steps_per_second": 5.94,
81
+ "step": 48
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 2.1856281757354736,
86
+ "learning_rate": 2.169320063482679e-06,
87
+ "loss": 0.6917,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4077153205871582,
93
+ "learning_rate": 2.1440954115817176e-06,
94
+ "loss": 0.6884,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.25,
99
+ "grad_norm": 2.1792664527893066,
100
+ "learning_rate": 2.1188707596807562e-06,
101
+ "loss": 0.6668,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 1.35,
106
+ "grad_norm": 1.0386197566986084,
107
+ "learning_rate": 2.093646107779795e-06,
108
+ "loss": 0.6694,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 1.46,
113
+ "grad_norm": 2.0565919876098633,
114
+ "learning_rate": 2.0684214558788335e-06,
115
+ "loss": 0.6561,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "grad_norm": 1.2978509664535522,
121
+ "learning_rate": 2.043196803977872e-06,
122
+ "loss": 0.6789,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 1.67,
127
+ "grad_norm": 2.058328628540039,
128
+ "learning_rate": 2.0179721520769108e-06,
129
+ "loss": 0.6633,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 1.77,
134
+ "grad_norm": 0.6023226976394653,
135
+ "learning_rate": 1.9927475001759494e-06,
136
+ "loss": 0.6655,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 1.88,
141
+ "grad_norm": 0.5510762929916382,
142
+ "learning_rate": 1.967522848274988e-06,
143
+ "loss": 0.6622,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.98,
148
+ "grad_norm": 1.098602533340454,
149
+ "learning_rate": 1.9422981963740267e-06,
150
+ "loss": 0.6633,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_f1": 0.7326732673267327,
156
+ "eval_loss": 0.6816024780273438,
157
+ "eval_runtime": 1.3765,
158
+ "eval_samples_per_second": 46.493,
159
+ "eval_steps_per_second": 5.812,
160
+ "step": 96
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.9589098691940308,
165
+ "learning_rate": 1.9170735444730654e-06,
166
+ "loss": 0.659,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "grad_norm": 1.070695161819458,
172
+ "learning_rate": 1.8918488925721038e-06,
173
+ "loss": 0.6313,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 2.29,
178
+ "grad_norm": 0.9913639426231384,
179
+ "learning_rate": 1.8666242406711424e-06,
180
+ "loss": 0.6652,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 2.4,
185
+ "grad_norm": 1.0632878541946411,
186
+ "learning_rate": 1.841399588770181e-06,
187
+ "loss": 0.673,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 2.5,
192
+ "grad_norm": 2.1036579608917236,
193
+ "learning_rate": 1.8161749368692197e-06,
194
+ "loss": 0.6451,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 2.6,
199
+ "grad_norm": 1.08384108543396,
200
+ "learning_rate": 1.7909502849682583e-06,
201
+ "loss": 0.6322,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 2.71,
206
+ "grad_norm": 0.9407000541687012,
207
+ "learning_rate": 1.765725633067297e-06,
208
+ "loss": 0.6755,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 2.81,
213
+ "grad_norm": 0.9016568660736084,
214
+ "learning_rate": 1.7405009811663356e-06,
215
+ "loss": 0.5985,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 2.92,
220
+ "grad_norm": 1.1134448051452637,
221
+ "learning_rate": 1.7152763292653743e-06,
222
+ "loss": 0.603,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 3.0,
227
+ "eval_f1": 0.7326732673267327,
228
+ "eval_loss": 0.6800689697265625,
229
+ "eval_runtime": 1.3861,
230
+ "eval_samples_per_second": 46.173,
231
+ "eval_steps_per_second": 5.772,
232
+ "step": 144
233
+ },
234
+ {
235
+ "epoch": 3.02,
236
+ "grad_norm": 0.7627719640731812,
237
+ "learning_rate": 1.6900516773644127e-06,
238
+ "loss": 0.6557,
239
+ "step": 145
240
+ },
241
+ {
242
+ "epoch": 3.12,
243
+ "grad_norm": 0.9291415214538574,
244
+ "learning_rate": 1.6648270254634511e-06,
245
+ "loss": 0.6219,
246
+ "step": 150
247
+ },
248
+ {
249
+ "epoch": 3.23,
250
+ "grad_norm": 0.9248765707015991,
251
+ "learning_rate": 1.6396023735624898e-06,
252
+ "loss": 0.6325,
253
+ "step": 155
254
+ },
255
+ {
256
+ "epoch": 3.33,
257
+ "grad_norm": 0.9842573404312134,
258
+ "learning_rate": 1.6143777216615284e-06,
259
+ "loss": 0.6521,
260
+ "step": 160
261
+ },
262
+ {
263
+ "epoch": 3.44,
264
+ "grad_norm": 0.8689214587211609,
265
+ "learning_rate": 1.589153069760567e-06,
266
+ "loss": 0.5929,
267
+ "step": 165
268
+ },
269
+ {
270
+ "epoch": 3.54,
271
+ "grad_norm": 1.0012000799179077,
272
+ "learning_rate": 1.5639284178596057e-06,
273
+ "loss": 0.584,
274
+ "step": 170
275
+ },
276
+ {
277
+ "epoch": 3.65,
278
+ "grad_norm": 0.7438368797302246,
279
+ "learning_rate": 1.5387037659586443e-06,
280
+ "loss": 0.6813,
281
+ "step": 175
282
+ },
283
+ {
284
+ "epoch": 3.75,
285
+ "grad_norm": 1.8603870868682861,
286
+ "learning_rate": 1.513479114057683e-06,
287
+ "loss": 0.6099,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 3.85,
292
+ "grad_norm": 0.9918416738510132,
293
+ "learning_rate": 1.4882544621567216e-06,
294
+ "loss": 0.6192,
295
+ "step": 185
296
+ },
297
+ {
298
+ "epoch": 3.96,
299
+ "grad_norm": 1.9146322011947632,
300
+ "learning_rate": 1.4630298102557603e-06,
301
+ "loss": 0.6472,
302
+ "step": 190
303
+ },
304
+ {
305
+ "epoch": 4.0,
306
+ "eval_f1": 0.7326732673267327,
307
+ "eval_loss": 0.6818161010742188,
308
+ "eval_runtime": 1.3841,
309
+ "eval_samples_per_second": 46.239,
310
+ "eval_steps_per_second": 5.78,
311
+ "step": 192
312
+ },
313
+ {
314
+ "epoch": 4.06,
315
+ "grad_norm": 0.9502781629562378,
316
+ "learning_rate": 1.437805158354799e-06,
317
+ "loss": 0.6447,
318
+ "step": 195
319
+ },
320
+ {
321
+ "epoch": 4.17,
322
+ "grad_norm": 0.8570067286491394,
323
+ "learning_rate": 1.4125805064538375e-06,
324
+ "loss": 0.5306,
325
+ "step": 200
326
+ },
327
+ {
328
+ "epoch": 4.27,
329
+ "grad_norm": 0.8097484111785889,
330
+ "learning_rate": 1.3873558545528762e-06,
331
+ "loss": 0.6202,
332
+ "step": 205
333
+ },
334
+ {
335
+ "epoch": 4.38,
336
+ "grad_norm": 2.0106472969055176,
337
+ "learning_rate": 1.3621312026519146e-06,
338
+ "loss": 0.6705,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 4.48,
343
+ "grad_norm": 1.090775489807129,
344
+ "learning_rate": 1.3369065507509533e-06,
345
+ "loss": 0.6297,
346
+ "step": 215
347
+ },
348
+ {
349
+ "epoch": 4.58,
350
+ "grad_norm": 0.8988145589828491,
351
+ "learning_rate": 1.311681898849992e-06,
352
+ "loss": 0.5896,
353
+ "step": 220
354
+ },
355
+ {
356
+ "epoch": 4.69,
357
+ "grad_norm": 0.9149978756904602,
358
+ "learning_rate": 1.2864572469490305e-06,
359
+ "loss": 0.6156,
360
+ "step": 225
361
+ },
362
+ {
363
+ "epoch": 4.79,
364
+ "grad_norm": 1.9398412704467773,
365
+ "learning_rate": 1.2612325950480692e-06,
366
+ "loss": 0.6305,
367
+ "step": 230
368
+ },
369
+ {
370
+ "epoch": 4.9,
371
+ "grad_norm": 0.9217966794967651,
372
+ "learning_rate": 1.2360079431471078e-06,
373
+ "loss": 0.5943,
374
+ "step": 235
375
+ },
376
+ {
377
+ "epoch": 5.0,
378
+ "grad_norm": 0.9083653688430786,
379
+ "learning_rate": 1.2107832912461465e-06,
380
+ "loss": 0.6386,
381
+ "step": 240
382
+ },
383
+ {
384
+ "epoch": 5.0,
385
+ "eval_f1": 0.7326732673267327,
386
+ "eval_loss": 0.6846389770507812,
387
+ "eval_runtime": 1.4094,
388
+ "eval_samples_per_second": 45.409,
389
+ "eval_steps_per_second": 5.676,
390
+ "step": 240
391
+ },
392
+ {
393
+ "epoch": 5.1,
394
+ "grad_norm": 0.9323675036430359,
395
+ "learning_rate": 1.1855586393451851e-06,
396
+ "loss": 0.5779,
397
+ "step": 245
398
+ },
399
+ {
400
+ "epoch": 5.21,
401
+ "grad_norm": 0.7549787163734436,
402
+ "learning_rate": 1.1603339874442238e-06,
403
+ "loss": 0.5948,
404
+ "step": 250
405
+ },
406
+ {
407
+ "epoch": 5.31,
408
+ "grad_norm": 0.8535837531089783,
409
+ "learning_rate": 1.1351093355432624e-06,
410
+ "loss": 0.6928,
411
+ "step": 255
412
+ },
413
+ {
414
+ "epoch": 5.42,
415
+ "grad_norm": 1.2038137912750244,
416
+ "learning_rate": 1.109884683642301e-06,
417
+ "loss": 0.5887,
418
+ "step": 260
419
+ },
420
+ {
421
+ "epoch": 5.52,
422
+ "grad_norm": 0.9501279592514038,
423
+ "learning_rate": 1.0846600317413395e-06,
424
+ "loss": 0.5776,
425
+ "step": 265
426
+ },
427
+ {
428
+ "epoch": 5.62,
429
+ "grad_norm": 0.7421719431877136,
430
+ "learning_rate": 1.0594353798403781e-06,
431
+ "loss": 0.6734,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 5.73,
436
+ "grad_norm": 0.8555863499641418,
437
+ "learning_rate": 1.0342107279394168e-06,
438
+ "loss": 0.6399,
439
+ "step": 275
440
+ },
441
+ {
442
+ "epoch": 5.83,
443
+ "grad_norm": 0.8841156363487244,
444
+ "learning_rate": 1.0089860760384554e-06,
445
+ "loss": 0.6173,
446
+ "step": 280
447
+ },
448
+ {
449
+ "epoch": 5.94,
450
+ "grad_norm": 0.8565478324890137,
451
+ "learning_rate": 9.83761424137494e-07,
452
+ "loss": 0.5537,
453
+ "step": 285
454
+ },
455
+ {
456
+ "epoch": 6.0,
457
+ "eval_f1": 0.7326732673267327,
458
+ "eval_loss": 0.6864242553710938,
459
+ "eval_runtime": 1.3622,
460
+ "eval_samples_per_second": 46.983,
461
+ "eval_steps_per_second": 5.873,
462
+ "step": 288
463
+ },
464
+ {
465
+ "epoch": 6.04,
466
+ "grad_norm": 0.8750139474868774,
467
+ "learning_rate": 9.585367722365327e-07,
468
+ "loss": 0.5531,
469
+ "step": 290
470
+ },
471
+ {
472
+ "epoch": 6.15,
473
+ "grad_norm": 1.0445302724838257,
474
+ "learning_rate": 9.333121203355712e-07,
475
+ "loss": 0.638,
476
+ "step": 295
477
+ },
478
+ {
479
+ "epoch": 6.25,
480
+ "grad_norm": 0.7958914637565613,
481
+ "learning_rate": 9.080874684346099e-07,
482
+ "loss": 0.547,
483
+ "step": 300
484
+ },
485
+ {
486
+ "epoch": 6.35,
487
+ "grad_norm": 0.9992254376411438,
488
+ "learning_rate": 8.828628165336485e-07,
489
+ "loss": 0.6425,
490
+ "step": 305
491
+ },
492
+ {
493
+ "epoch": 6.46,
494
+ "grad_norm": 0.8400682806968689,
495
+ "learning_rate": 8.576381646326871e-07,
496
+ "loss": 0.6955,
497
+ "step": 310
498
+ },
499
+ {
500
+ "epoch": 6.56,
501
+ "grad_norm": 0.742438793182373,
502
+ "learning_rate": 8.324135127317256e-07,
503
+ "loss": 0.6473,
504
+ "step": 315
505
+ },
506
+ {
507
+ "epoch": 6.67,
508
+ "grad_norm": 0.6693254113197327,
509
+ "learning_rate": 8.071888608307642e-07,
510
+ "loss": 0.603,
511
+ "step": 320
512
+ },
513
+ {
514
+ "epoch": 6.77,
515
+ "grad_norm": 1.0816401243209839,
516
+ "learning_rate": 7.819642089298028e-07,
517
+ "loss": 0.6053,
518
+ "step": 325
519
+ },
520
+ {
521
+ "epoch": 6.88,
522
+ "grad_norm": 0.7275277376174927,
523
+ "learning_rate": 7.567395570288415e-07,
524
+ "loss": 0.612,
525
+ "step": 330
526
+ },
527
+ {
528
+ "epoch": 6.98,
529
+ "grad_norm": 0.7834873795509338,
530
+ "learning_rate": 7.315149051278801e-07,
531
+ "loss": 0.55,
532
+ "step": 335
533
+ },
534
+ {
535
+ "epoch": 7.0,
536
+ "eval_f1": 0.7326732673267327,
537
+ "eval_loss": 0.6889228820800781,
538
+ "eval_runtime": 1.3769,
539
+ "eval_samples_per_second": 46.483,
540
+ "eval_steps_per_second": 5.81,
541
+ "step": 336
542
+ }
543
+ ],
544
+ "logging_steps": 5,
545
+ "max_steps": 480,
546
+ "num_input_tokens_seen": 0,
547
+ "num_train_epochs": 10,
548
+ "save_steps": 500,
549
+ "total_flos": 5116387614670704.0,
550
+ "train_batch_size": 4,
551
+ "trial_name": null,
552
+ "trial_params": {
553
+ "learning_rate": 2.1794099242430636e-06,
554
+ "per_device_train_batch_size": 4
555
+ }
556
+ }
run-4/checkpoint-336/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3
3
+ size 4920
run-4/checkpoint-384/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }
run-4/checkpoint-384/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f03572b2dcb34be269d7a8392a1db64aa78ab3c991d84591fe98b5b2f300eea6
3
+ size 94763496
run-4/checkpoint-384/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:370faca0c4d8022d7391f5e21d5c7cf5baa3cc1575da59ece8f1da8e073be1d6
3
+ size 189552570
run-4/checkpoint-384/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
run-4/checkpoint-384/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5326b9611b4fb9dc5dc0b29580e7e48abf50913e44071592799c052bebfbacd7
3
+ size 14244
run-4/checkpoint-384/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53cf1db083d766e2eeb0b68e04895788e327c7d113d516f642c4fc0792596377
3
+ size 1064
run-4/checkpoint-384/trainer_state.json ADDED
@@ -0,0 +1,628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7326732673267327,
3
+ "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96",
4
+ "epoch": 8.0,
5
+ "eval_steps": 500,
6
+ "global_step": 384,
7
+ "is_hyper_param_search": true,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.2733114957809448,
14
+ "learning_rate": 2.2702186710865246e-07,
15
+ "loss": 0.7025,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.21,
20
+ "grad_norm": 1.243804931640625,
21
+ "learning_rate": 4.5404373421730493e-07,
22
+ "loss": 0.6974,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.31,
27
+ "grad_norm": 1.7711552381515503,
28
+ "learning_rate": 6.810656013259573e-07,
29
+ "loss": 0.696,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.42,
34
+ "grad_norm": 1.1453403234481812,
35
+ "learning_rate": 9.080874684346099e-07,
36
+ "loss": 0.6989,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.52,
41
+ "grad_norm": 1.2729355096817017,
42
+ "learning_rate": 1.1351093355432624e-06,
43
+ "loss": 0.6968,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.62,
48
+ "grad_norm": 1.1592165231704712,
49
+ "learning_rate": 1.3621312026519146e-06,
50
+ "loss": 0.6959,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.73,
55
+ "grad_norm": 1.1798148155212402,
56
+ "learning_rate": 1.589153069760567e-06,
57
+ "loss": 0.6952,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.83,
62
+ "grad_norm": 2.1216671466827393,
63
+ "learning_rate": 1.8161749368692197e-06,
64
+ "loss": 0.6886,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.94,
69
+ "grad_norm": 1.3416370153427124,
70
+ "learning_rate": 2.043196803977872e-06,
71
+ "loss": 0.6864,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_f1": 0.72,
77
+ "eval_loss": 0.688262939453125,
78
+ "eval_runtime": 1.3468,
79
+ "eval_samples_per_second": 47.521,
80
+ "eval_steps_per_second": 5.94,
81
+ "step": 48
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 2.1856281757354736,
86
+ "learning_rate": 2.169320063482679e-06,
87
+ "loss": 0.6917,
88
+ "step": 50
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4077153205871582,
93
+ "learning_rate": 2.1440954115817176e-06,
94
+ "loss": 0.6884,
95
+ "step": 55
96
+ },
97
+ {
98
+ "epoch": 1.25,
99
+ "grad_norm": 2.1792664527893066,
100
+ "learning_rate": 2.1188707596807562e-06,
101
+ "loss": 0.6668,
102
+ "step": 60
103
+ },
104
+ {
105
+ "epoch": 1.35,
106
+ "grad_norm": 1.0386197566986084,
107
+ "learning_rate": 2.093646107779795e-06,
108
+ "loss": 0.6694,
109
+ "step": 65
110
+ },
111
+ {
112
+ "epoch": 1.46,
113
+ "grad_norm": 2.0565919876098633,
114
+ "learning_rate": 2.0684214558788335e-06,
115
+ "loss": 0.6561,
116
+ "step": 70
117
+ },
118
+ {
119
+ "epoch": 1.56,
120
+ "grad_norm": 1.2978509664535522,
121
+ "learning_rate": 2.043196803977872e-06,
122
+ "loss": 0.6789,
123
+ "step": 75
124
+ },
125
+ {
126
+ "epoch": 1.67,
127
+ "grad_norm": 2.058328628540039,
128
+ "learning_rate": 2.0179721520769108e-06,
129
+ "loss": 0.6633,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 1.77,
134
+ "grad_norm": 0.6023226976394653,
135
+ "learning_rate": 1.9927475001759494e-06,
136
+ "loss": 0.6655,
137
+ "step": 85
138
+ },
139
+ {
140
+ "epoch": 1.88,
141
+ "grad_norm": 0.5510762929916382,
142
+ "learning_rate": 1.967522848274988e-06,
143
+ "loss": 0.6622,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 1.98,
148
+ "grad_norm": 1.098602533340454,
149
+ "learning_rate": 1.9422981963740267e-06,
150
+ "loss": 0.6633,
151
+ "step": 95
152
+ },
153
+ {
154
+ "epoch": 2.0,
155
+ "eval_f1": 0.7326732673267327,
156
+ "eval_loss": 0.6816024780273438,
157
+ "eval_runtime": 1.3765,
158
+ "eval_samples_per_second": 46.493,
159
+ "eval_steps_per_second": 5.812,
160
+ "step": 96
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.9589098691940308,
165
+ "learning_rate": 1.9170735444730654e-06,
166
+ "loss": 0.659,
167
+ "step": 100
168
+ },
169
+ {
170
+ "epoch": 2.19,
171
+ "grad_norm": 1.070695161819458,
172
+ "learning_rate": 1.8918488925721038e-06,
173
+ "loss": 0.6313,
174
+ "step": 105
175
+ },
176
+ {
177
+ "epoch": 2.29,
178
+ "grad_norm": 0.9913639426231384,
179
+ "learning_rate": 1.8666242406711424e-06,
180
+ "loss": 0.6652,
181
+ "step": 110
182
+ },
183
+ {
184
+ "epoch": 2.4,
185
+ "grad_norm": 1.0632878541946411,
186
+ "learning_rate": 1.841399588770181e-06,
187
+ "loss": 0.673,
188
+ "step": 115
189
+ },
190
+ {
191
+ "epoch": 2.5,
192
+ "grad_norm": 2.1036579608917236,
193
+ "learning_rate": 1.8161749368692197e-06,
194
+ "loss": 0.6451,
195
+ "step": 120
196
+ },
197
+ {
198
+ "epoch": 2.6,
199
+ "grad_norm": 1.08384108543396,
200
+ "learning_rate": 1.7909502849682583e-06,
201
+ "loss": 0.6322,
202
+ "step": 125
203
+ },
204
+ {
205
+ "epoch": 2.71,
206
+ "grad_norm": 0.9407000541687012,
207
+ "learning_rate": 1.765725633067297e-06,
208
+ "loss": 0.6755,
209
+ "step": 130
210
+ },
211
+ {
212
+ "epoch": 2.81,
213
+ "grad_norm": 0.9016568660736084,
214
+ "learning_rate": 1.7405009811663356e-06,
215
+ "loss": 0.5985,
216
+ "step": 135
217
+ },
218
+ {
219
+ "epoch": 2.92,
220
+ "grad_norm": 1.1134448051452637,
221
+ "learning_rate": 1.7152763292653743e-06,
222
+ "loss": 0.603,
223
+ "step": 140
224
+ },
225
+ {
226
+ "epoch": 3.0,
227
+ "eval_f1": 0.7326732673267327,
228
+ "eval_loss": 0.6800689697265625,
229
+ "eval_runtime": 1.3861,
230
+ "eval_samples_per_second": 46.173,
231
+ "eval_steps_per_second": 5.772,
232
+ "step": 144
233
+ },
234
+ {
235
+ "epoch": 3.02,
236
+ "grad_norm": 0.7627719640731812,
237
+ "learning_rate": 1.6900516773644127e-06,
238
+ "loss": 0.6557,
239
+ "step": 145
240
+ },
241
+ {
242
+ "epoch": 3.12,
243
+ "grad_norm": 0.9291415214538574,
244
+ "learning_rate": 1.6648270254634511e-06,
245
+ "loss": 0.6219,
246
+ "step": 150
247
+ },
248
+ {
249
+ "epoch": 3.23,
250
+ "grad_norm": 0.9248765707015991,
251
+ "learning_rate": 1.6396023735624898e-06,
252
+ "loss": 0.6325,
253
+ "step": 155
254
+ },
255
+ {
256
+ "epoch": 3.33,
257
+ "grad_norm": 0.9842573404312134,
258
+ "learning_rate": 1.6143777216615284e-06,
259
+ "loss": 0.6521,
260
+ "step": 160
261
+ },
262
+ {
263
+ "epoch": 3.44,
264
+ "grad_norm": 0.8689214587211609,
265
+ "learning_rate": 1.589153069760567e-06,
266
+ "loss": 0.5929,
267
+ "step": 165
268
+ },
269
+ {
270
+ "epoch": 3.54,
271
+ "grad_norm": 1.0012000799179077,
272
+ "learning_rate": 1.5639284178596057e-06,
273
+ "loss": 0.584,
274
+ "step": 170
275
+ },
276
+ {
277
+ "epoch": 3.65,
278
+ "grad_norm": 0.7438368797302246,
279
+ "learning_rate": 1.5387037659586443e-06,
280
+ "loss": 0.6813,
281
+ "step": 175
282
+ },
283
+ {
284
+ "epoch": 3.75,
285
+ "grad_norm": 1.8603870868682861,
286
+ "learning_rate": 1.513479114057683e-06,
287
+ "loss": 0.6099,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 3.85,
292
+ "grad_norm": 0.9918416738510132,
293
+ "learning_rate": 1.4882544621567216e-06,
294
+ "loss": 0.6192,
295
+ "step": 185
296
+ },
297
+ {
298
+ "epoch": 3.96,
299
+ "grad_norm": 1.9146322011947632,
300
+ "learning_rate": 1.4630298102557603e-06,
301
+ "loss": 0.6472,
302
+ "step": 190
303
+ },
304
+ {
305
+ "epoch": 4.0,
306
+ "eval_f1": 0.7326732673267327,
307
+ "eval_loss": 0.6818161010742188,
308
+ "eval_runtime": 1.3841,
309
+ "eval_samples_per_second": 46.239,
310
+ "eval_steps_per_second": 5.78,
311
+ "step": 192
312
+ },
313
+ {
314
+ "epoch": 4.06,
315
+ "grad_norm": 0.9502781629562378,
316
+ "learning_rate": 1.437805158354799e-06,
317
+ "loss": 0.6447,
318
+ "step": 195
319
+ },
320
+ {
321
+ "epoch": 4.17,
322
+ "grad_norm": 0.8570067286491394,
323
+ "learning_rate": 1.4125805064538375e-06,
324
+ "loss": 0.5306,
325
+ "step": 200
326
+ },
327
+ {
328
+ "epoch": 4.27,
329
+ "grad_norm": 0.8097484111785889,
330
+ "learning_rate": 1.3873558545528762e-06,
331
+ "loss": 0.6202,
332
+ "step": 205
333
+ },
334
+ {
335
+ "epoch": 4.38,
336
+ "grad_norm": 2.0106472969055176,
337
+ "learning_rate": 1.3621312026519146e-06,
338
+ "loss": 0.6705,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 4.48,
343
+ "grad_norm": 1.090775489807129,
344
+ "learning_rate": 1.3369065507509533e-06,
345
+ "loss": 0.6297,
346
+ "step": 215
347
+ },
348
+ {
349
+ "epoch": 4.58,
350
+ "grad_norm": 0.8988145589828491,
351
+ "learning_rate": 1.311681898849992e-06,
352
+ "loss": 0.5896,
353
+ "step": 220
354
+ },
355
+ {
356
+ "epoch": 4.69,
357
+ "grad_norm": 0.9149978756904602,
358
+ "learning_rate": 1.2864572469490305e-06,
359
+ "loss": 0.6156,
360
+ "step": 225
361
+ },
362
+ {
363
+ "epoch": 4.79,
364
+ "grad_norm": 1.9398412704467773,
365
+ "learning_rate": 1.2612325950480692e-06,
366
+ "loss": 0.6305,
367
+ "step": 230
368
+ },
369
+ {
370
+ "epoch": 4.9,
371
+ "grad_norm": 0.9217966794967651,
372
+ "learning_rate": 1.2360079431471078e-06,
373
+ "loss": 0.5943,
374
+ "step": 235
375
+ },
376
+ {
377
+ "epoch": 5.0,
378
+ "grad_norm": 0.9083653688430786,
379
+ "learning_rate": 1.2107832912461465e-06,
380
+ "loss": 0.6386,
381
+ "step": 240
382
+ },
383
+ {
384
+ "epoch": 5.0,
385
+ "eval_f1": 0.7326732673267327,
386
+ "eval_loss": 0.6846389770507812,
387
+ "eval_runtime": 1.4094,
388
+ "eval_samples_per_second": 45.409,
389
+ "eval_steps_per_second": 5.676,
390
+ "step": 240
391
+ },
392
+ {
393
+ "epoch": 5.1,
394
+ "grad_norm": 0.9323675036430359,
395
+ "learning_rate": 1.1855586393451851e-06,
396
+ "loss": 0.5779,
397
+ "step": 245
398
+ },
399
+ {
400
+ "epoch": 5.21,
401
+ "grad_norm": 0.7549787163734436,
402
+ "learning_rate": 1.1603339874442238e-06,
403
+ "loss": 0.5948,
404
+ "step": 250
405
+ },
406
+ {
407
+ "epoch": 5.31,
408
+ "grad_norm": 0.8535837531089783,
409
+ "learning_rate": 1.1351093355432624e-06,
410
+ "loss": 0.6928,
411
+ "step": 255
412
+ },
413
+ {
414
+ "epoch": 5.42,
415
+ "grad_norm": 1.2038137912750244,
416
+ "learning_rate": 1.109884683642301e-06,
417
+ "loss": 0.5887,
418
+ "step": 260
419
+ },
420
+ {
421
+ "epoch": 5.52,
422
+ "grad_norm": 0.9501279592514038,
423
+ "learning_rate": 1.0846600317413395e-06,
424
+ "loss": 0.5776,
425
+ "step": 265
426
+ },
427
+ {
428
+ "epoch": 5.62,
429
+ "grad_norm": 0.7421719431877136,
430
+ "learning_rate": 1.0594353798403781e-06,
431
+ "loss": 0.6734,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 5.73,
436
+ "grad_norm": 0.8555863499641418,
437
+ "learning_rate": 1.0342107279394168e-06,
438
+ "loss": 0.6399,
439
+ "step": 275
440
+ },
441
+ {
442
+ "epoch": 5.83,
443
+ "grad_norm": 0.8841156363487244,
444
+ "learning_rate": 1.0089860760384554e-06,
445
+ "loss": 0.6173,
446
+ "step": 280
447
+ },
448
+ {
449
+ "epoch": 5.94,
450
+ "grad_norm": 0.8565478324890137,
451
+ "learning_rate": 9.83761424137494e-07,
452
+ "loss": 0.5537,
453
+ "step": 285
454
+ },
455
+ {
456
+ "epoch": 6.0,
457
+ "eval_f1": 0.7326732673267327,
458
+ "eval_loss": 0.6864242553710938,
459
+ "eval_runtime": 1.3622,
460
+ "eval_samples_per_second": 46.983,
461
+ "eval_steps_per_second": 5.873,
462
+ "step": 288
463
+ },
464
+ {
465
+ "epoch": 6.04,
466
+ "grad_norm": 0.8750139474868774,
467
+ "learning_rate": 9.585367722365327e-07,
468
+ "loss": 0.5531,
469
+ "step": 290
470
+ },
471
+ {
472
+ "epoch": 6.15,
473
+ "grad_norm": 1.0445302724838257,
474
+ "learning_rate": 9.333121203355712e-07,
475
+ "loss": 0.638,
476
+ "step": 295
477
+ },
478
+ {
479
+ "epoch": 6.25,
480
+ "grad_norm": 0.7958914637565613,
481
+ "learning_rate": 9.080874684346099e-07,
482
+ "loss": 0.547,
483
+ "step": 300
484
+ },
485
+ {
486
+ "epoch": 6.35,
487
+ "grad_norm": 0.9992254376411438,
488
+ "learning_rate": 8.828628165336485e-07,
489
+ "loss": 0.6425,
490
+ "step": 305
491
+ },
492
+ {
493
+ "epoch": 6.46,
494
+ "grad_norm": 0.8400682806968689,
495
+ "learning_rate": 8.576381646326871e-07,
496
+ "loss": 0.6955,
497
+ "step": 310
498
+ },
499
+ {
500
+ "epoch": 6.56,
501
+ "grad_norm": 0.742438793182373,
502
+ "learning_rate": 8.324135127317256e-07,
503
+ "loss": 0.6473,
504
+ "step": 315
505
+ },
506
+ {
507
+ "epoch": 6.67,
508
+ "grad_norm": 0.6693254113197327,
509
+ "learning_rate": 8.071888608307642e-07,
510
+ "loss": 0.603,
511
+ "step": 320
512
+ },
513
+ {
514
+ "epoch": 6.77,
515
+ "grad_norm": 1.0816401243209839,
516
+ "learning_rate": 7.819642089298028e-07,
517
+ "loss": 0.6053,
518
+ "step": 325
519
+ },
520
+ {
521
+ "epoch": 6.88,
522
+ "grad_norm": 0.7275277376174927,
523
+ "learning_rate": 7.567395570288415e-07,
524
+ "loss": 0.612,
525
+ "step": 330
526
+ },
527
+ {
528
+ "epoch": 6.98,
529
+ "grad_norm": 0.7834873795509338,
530
+ "learning_rate": 7.315149051278801e-07,
531
+ "loss": 0.55,
532
+ "step": 335
533
+ },
534
+ {
535
+ "epoch": 7.0,
536
+ "eval_f1": 0.7326732673267327,
537
+ "eval_loss": 0.6889228820800781,
538
+ "eval_runtime": 1.3769,
539
+ "eval_samples_per_second": 46.483,
540
+ "eval_steps_per_second": 5.81,
541
+ "step": 336
542
+ },
543
+ {
544
+ "epoch": 7.08,
545
+ "grad_norm": 1.24147367477417,
546
+ "learning_rate": 7.062902532269188e-07,
547
+ "loss": 0.508,
548
+ "step": 340
549
+ },
550
+ {
551
+ "epoch": 7.19,
552
+ "grad_norm": 1.8932181596755981,
553
+ "learning_rate": 6.810656013259573e-07,
554
+ "loss": 0.6358,
555
+ "step": 345
556
+ },
557
+ {
558
+ "epoch": 7.29,
559
+ "grad_norm": 1.861436128616333,
560
+ "learning_rate": 6.55840949424996e-07,
561
+ "loss": 0.5741,
562
+ "step": 350
563
+ },
564
+ {
565
+ "epoch": 7.4,
566
+ "grad_norm": 0.8429200053215027,
567
+ "learning_rate": 6.306162975240346e-07,
568
+ "loss": 0.5717,
569
+ "step": 355
570
+ },
571
+ {
572
+ "epoch": 7.5,
573
+ "grad_norm": 1.8665741682052612,
574
+ "learning_rate": 6.053916456230732e-07,
575
+ "loss": 0.6992,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 7.6,
580
+ "grad_norm": 2.312748908996582,
581
+ "learning_rate": 5.801669937221119e-07,
582
+ "loss": 0.6151,
583
+ "step": 365
584
+ },
585
+ {
586
+ "epoch": 7.71,
587
+ "grad_norm": 1.1628329753875732,
588
+ "learning_rate": 5.549423418211505e-07,
589
+ "loss": 0.5354,
590
+ "step": 370
591
+ },
592
+ {
593
+ "epoch": 7.81,
594
+ "grad_norm": 1.8674992322921753,
595
+ "learning_rate": 5.297176899201891e-07,
596
+ "loss": 0.6411,
597
+ "step": 375
598
+ },
599
+ {
600
+ "epoch": 7.92,
601
+ "grad_norm": 0.7112137675285339,
602
+ "learning_rate": 5.044930380192277e-07,
603
+ "loss": 0.6063,
604
+ "step": 380
605
+ },
606
+ {
607
+ "epoch": 8.0,
608
+ "eval_f1": 0.7326732673267327,
609
+ "eval_loss": 0.6909217834472656,
610
+ "eval_runtime": 1.3913,
611
+ "eval_samples_per_second": 45.999,
612
+ "eval_steps_per_second": 5.75,
613
+ "step": 384
614
+ }
615
+ ],
616
+ "logging_steps": 5,
617
+ "max_steps": 480,
618
+ "num_input_tokens_seen": 0,
619
+ "num_train_epochs": 10,
620
+ "save_steps": 500,
621
+ "total_flos": 5808783041309760.0,
622
+ "train_batch_size": 4,
623
+ "trial_name": null,
624
+ "trial_params": {
625
+ "learning_rate": 2.1794099242430636e-06,
626
+ "per_device_train_batch_size": 4
627
+ }
628
+ }
run-4/checkpoint-384/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3
3
+ size 4920
run-4/checkpoint-432/config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ntu-spml/distilhubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": false,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": false,
47
+ "final_dropout": 0.0,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "id2label": {
52
+ "0": "NOT_WORD",
53
+ "1": "WORD"
54
+ },
55
+ "initializer_range": 0.02,
56
+ "intermediate_size": 3072,
57
+ "label2id": {
58
+ "NOT_WORD": "0",
59
+ "WORD": "1"
60
+ },
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "hubert",
70
+ "num_attention_heads": 12,
71
+ "num_conv_pos_embedding_groups": 16,
72
+ "num_conv_pos_embeddings": 128,
73
+ "num_feat_extract_layers": 7,
74
+ "num_hidden_layers": 2,
75
+ "pad_token_id": 0,
76
+ "torch_dtype": "float32",
77
+ "transformers_version": "4.38.1",
78
+ "use_weighted_layer_sum": false,
79
+ "vocab_size": 32
80
+ }