colerobertson
committed on
Commit
•
02160f9
1
Parent(s):
a7fc36e
Training in progress, epoch 1
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- model.safetensors +1 -1
- run-3/checkpoint-16/config.json +80 -0
- run-3/checkpoint-16/model.safetensors +3 -0
- run-3/checkpoint-16/optimizer.pt +3 -0
- run-3/checkpoint-16/preprocessor_config.json +9 -0
- run-3/checkpoint-16/rng_state.pth +3 -0
- run-3/checkpoint-16/scheduler.pt +3 -0
- run-3/checkpoint-16/trainer_state.json +63 -0
- run-3/checkpoint-16/training_args.bin +3 -0
- run-3/checkpoint-24/config.json +80 -0
- run-3/checkpoint-24/model.safetensors +3 -0
- run-3/checkpoint-24/optimizer.pt +3 -0
- run-3/checkpoint-24/preprocessor_config.json +9 -0
- run-3/checkpoint-24/rng_state.pth +3 -0
- run-3/checkpoint-24/scheduler.pt +3 -0
- run-3/checkpoint-24/trainer_state.json +79 -0
- run-3/checkpoint-24/training_args.bin +3 -0
- run-3/checkpoint-32/config.json +80 -0
- run-3/checkpoint-32/model.safetensors +3 -0
- run-3/checkpoint-32/optimizer.pt +3 -0
- run-3/checkpoint-32/preprocessor_config.json +9 -0
- run-3/checkpoint-32/rng_state.pth +3 -0
- run-3/checkpoint-32/scheduler.pt +3 -0
- run-3/checkpoint-32/trainer_state.json +102 -0
- run-3/checkpoint-32/training_args.bin +3 -0
- run-3/checkpoint-40/config.json +80 -0
- run-3/checkpoint-40/model.safetensors +3 -0
- run-3/checkpoint-40/optimizer.pt +3 -0
- run-3/checkpoint-40/preprocessor_config.json +9 -0
- run-3/checkpoint-40/rng_state.pth +3 -0
- run-3/checkpoint-40/scheduler.pt +3 -0
- run-3/checkpoint-40/trainer_state.json +125 -0
- run-3/checkpoint-40/training_args.bin +3 -0
- run-3/checkpoint-48/config.json +80 -0
- run-3/checkpoint-48/model.safetensors +3 -0
- run-3/checkpoint-48/optimizer.pt +3 -0
- run-3/checkpoint-48/preprocessor_config.json +9 -0
- run-3/checkpoint-48/rng_state.pth +3 -0
- run-3/checkpoint-48/scheduler.pt +3 -0
- run-3/checkpoint-48/trainer_state.json +141 -0
- run-3/checkpoint-48/training_args.bin +3 -0
- run-3/checkpoint-56/config.json +80 -0
- run-3/checkpoint-56/model.safetensors +3 -0
- run-3/checkpoint-56/optimizer.pt +3 -0
- run-3/checkpoint-56/preprocessor_config.json +9 -0
- run-3/checkpoint-56/rng_state.pth +3 -0
- run-3/checkpoint-56/scheduler.pt +3 -0
- run-3/checkpoint-56/trainer_state.json +164 -0
- run-3/checkpoint-56/training_args.bin +3 -0
- run-3/checkpoint-64/config.json +80 -0
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 94763496
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:018f138040ad7aa88bf94f4b8deb9d4f75616181b7d07008cdde50dd03503a11
|
3 |
size 94763496
|
run-3/checkpoint-16/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|
run-3/checkpoint-16/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a760d27f41290ed80c1206e2480b47d92c8ce2c6ee1f3cd329e186b834a2b29
|
3 |
+
size 94763496
|
run-3/checkpoint-16/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:04b4c7382179a64f9ac6f6bc7805a19064505d76480c4aa8b8391170753af6e7
|
3 |
+
size 189552570
|
run-3/checkpoint-16/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
run-3/checkpoint-16/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0b4c067c009eda80e8baa1203d26c442448a0cbb4afc20c352a7b4f4c31ecfc
|
3 |
+
size 14244
|
run-3/checkpoint-16/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:829126bb0c61bd2bc81a49adbe8fe99f684971512e37328fc21f35f04bc8cdd9
|
3 |
+
size 1064
|
run-3/checkpoint-16/trainer_state.json
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7326732673267327,
|
3 |
+
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-3/checkpoint-8",
|
4 |
+
"epoch": 2.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 16,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.62,
|
13 |
+
"grad_norm": 0.94398033618927,
|
14 |
+
"learning_rate": 7.138113611405621e-06,
|
15 |
+
"loss": 0.6981,
|
16 |
+
"step": 5
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_f1": 0.7326732673267327,
|
21 |
+
"eval_loss": 0.6864166259765625,
|
22 |
+
"eval_runtime": 1.3249,
|
23 |
+
"eval_samples_per_second": 48.305,
|
24 |
+
"eval_steps_per_second": 6.038,
|
25 |
+
"step": 8
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"epoch": 1.25,
|
29 |
+
"grad_norm": 1.5316386222839355,
|
30 |
+
"learning_rate": 1.1103732284408743e-05,
|
31 |
+
"loss": 0.683,
|
32 |
+
"step": 10
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"epoch": 1.88,
|
36 |
+
"grad_norm": 0.5028849840164185,
|
37 |
+
"learning_rate": 1.031060854980812e-05,
|
38 |
+
"loss": 0.6564,
|
39 |
+
"step": 15
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 2.0,
|
43 |
+
"eval_f1": 0.7326732673267327,
|
44 |
+
"eval_loss": 0.680694580078125,
|
45 |
+
"eval_runtime": 1.3565,
|
46 |
+
"eval_samples_per_second": 47.179,
|
47 |
+
"eval_steps_per_second": 5.897,
|
48 |
+
"step": 16
|
49 |
+
}
|
50 |
+
],
|
51 |
+
"logging_steps": 5,
|
52 |
+
"max_steps": 80,
|
53 |
+
"num_input_tokens_seen": 0,
|
54 |
+
"num_train_epochs": 10,
|
55 |
+
"save_steps": 500,
|
56 |
+
"total_flos": 1863816719524704.0,
|
57 |
+
"train_batch_size": 24,
|
58 |
+
"trial_name": null,
|
59 |
+
"trial_params": {
|
60 |
+
"learning_rate": 1.1420981778248994e-05,
|
61 |
+
"per_device_train_batch_size": 24
|
62 |
+
}
|
63 |
+
}
|
run-3/checkpoint-16/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb7020efd76104988408508d9c7e8a6e74af8904f5fd05940d43639ce4c80cf2
|
3 |
+
size 4920
|
run-3/checkpoint-24/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|
run-3/checkpoint-24/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f3f932857479e62a819449b36f4c42e6e648d1fd17c406c7ee35d9c0b7ea261e
|
3 |
+
size 94763496
|
run-3/checkpoint-24/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:461c0fa31756b908c1ff77514b04efa451eae1a56488331e4a4c40726af10ca0
|
3 |
+
size 189552570
|
run-3/checkpoint-24/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
run-3/checkpoint-24/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24d6a900c72e3e0f300fb2a89dab7159acab172bc37646dd605cc8c78374f6f9
|
3 |
+
size 14244
|
run-3/checkpoint-24/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6ccd2ea597a4bea6c5e4d0cece4bbe69ed79e233292bc9053077d461aeb0ba5f
|
3 |
+
size 1064
|
run-3/checkpoint-24/trainer_state.json
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7326732673267327,
|
3 |
+
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-3/checkpoint-8",
|
4 |
+
"epoch": 3.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 24,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.62,
|
13 |
+
"grad_norm": 0.94398033618927,
|
14 |
+
"learning_rate": 7.138113611405621e-06,
|
15 |
+
"loss": 0.6981,
|
16 |
+
"step": 5
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_f1": 0.7326732673267327,
|
21 |
+
"eval_loss": 0.6864166259765625,
|
22 |
+
"eval_runtime": 1.3249,
|
23 |
+
"eval_samples_per_second": 48.305,
|
24 |
+
"eval_steps_per_second": 6.038,
|
25 |
+
"step": 8
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"epoch": 1.25,
|
29 |
+
"grad_norm": 1.5316386222839355,
|
30 |
+
"learning_rate": 1.1103732284408743e-05,
|
31 |
+
"loss": 0.683,
|
32 |
+
"step": 10
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"epoch": 1.88,
|
36 |
+
"grad_norm": 0.5028849840164185,
|
37 |
+
"learning_rate": 1.031060854980812e-05,
|
38 |
+
"loss": 0.6564,
|
39 |
+
"step": 15
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 2.0,
|
43 |
+
"eval_f1": 0.7326732673267327,
|
44 |
+
"eval_loss": 0.680694580078125,
|
45 |
+
"eval_runtime": 1.3565,
|
46 |
+
"eval_samples_per_second": 47.179,
|
47 |
+
"eval_steps_per_second": 5.897,
|
48 |
+
"step": 16
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"epoch": 2.5,
|
52 |
+
"grad_norm": 0.5775301456451416,
|
53 |
+
"learning_rate": 9.517484815207495e-06,
|
54 |
+
"loss": 0.6436,
|
55 |
+
"step": 20
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"epoch": 3.0,
|
59 |
+
"eval_f1": 0.7326732673267327,
|
60 |
+
"eval_loss": 0.6852684020996094,
|
61 |
+
"eval_runtime": 1.3445,
|
62 |
+
"eval_samples_per_second": 47.601,
|
63 |
+
"eval_steps_per_second": 5.95,
|
64 |
+
"step": 24
|
65 |
+
}
|
66 |
+
],
|
67 |
+
"logging_steps": 5,
|
68 |
+
"max_steps": 80,
|
69 |
+
"num_input_tokens_seen": 0,
|
70 |
+
"num_train_epochs": 10,
|
71 |
+
"save_steps": 500,
|
72 |
+
"total_flos": 2484680744556000.0,
|
73 |
+
"train_batch_size": 24,
|
74 |
+
"trial_name": null,
|
75 |
+
"trial_params": {
|
76 |
+
"learning_rate": 1.1420981778248994e-05,
|
77 |
+
"per_device_train_batch_size": 24
|
78 |
+
}
|
79 |
+
}
|
run-3/checkpoint-24/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb7020efd76104988408508d9c7e8a6e74af8904f5fd05940d43639ce4c80cf2
|
3 |
+
size 4920
|
run-3/checkpoint-32/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|
run-3/checkpoint-32/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1454cf89a31d874feb53d9bb87ed0b567c597730d05be3faabbbb19277c03e8
|
3 |
+
size 94763496
|
run-3/checkpoint-32/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e61a24407aa0c80c01a56c0188477f4d5473511e288cab61dfd72dd7238455c9
|
3 |
+
size 189552570
|
run-3/checkpoint-32/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
run-3/checkpoint-32/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:556c72cd347c734d7fdec24637de2c2dcd065c09bcdd940199a24e0091d021c0
|
3 |
+
size 14244
|
run-3/checkpoint-32/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45ca9d40a3c2366bc01ceed0fa797d480ee4f2519ead3847177cdf537dc0ac31
|
3 |
+
size 1064
|
run-3/checkpoint-32/trainer_state.json
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7326732673267327,
|
3 |
+
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-3/checkpoint-8",
|
4 |
+
"epoch": 4.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 32,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.62,
|
13 |
+
"grad_norm": 0.94398033618927,
|
14 |
+
"learning_rate": 7.138113611405621e-06,
|
15 |
+
"loss": 0.6981,
|
16 |
+
"step": 5
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_f1": 0.7326732673267327,
|
21 |
+
"eval_loss": 0.6864166259765625,
|
22 |
+
"eval_runtime": 1.3249,
|
23 |
+
"eval_samples_per_second": 48.305,
|
24 |
+
"eval_steps_per_second": 6.038,
|
25 |
+
"step": 8
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"epoch": 1.25,
|
29 |
+
"grad_norm": 1.5316386222839355,
|
30 |
+
"learning_rate": 1.1103732284408743e-05,
|
31 |
+
"loss": 0.683,
|
32 |
+
"step": 10
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"epoch": 1.88,
|
36 |
+
"grad_norm": 0.5028849840164185,
|
37 |
+
"learning_rate": 1.031060854980812e-05,
|
38 |
+
"loss": 0.6564,
|
39 |
+
"step": 15
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 2.0,
|
43 |
+
"eval_f1": 0.7326732673267327,
|
44 |
+
"eval_loss": 0.680694580078125,
|
45 |
+
"eval_runtime": 1.3565,
|
46 |
+
"eval_samples_per_second": 47.179,
|
47 |
+
"eval_steps_per_second": 5.897,
|
48 |
+
"step": 16
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"epoch": 2.5,
|
52 |
+
"grad_norm": 0.5775301456451416,
|
53 |
+
"learning_rate": 9.517484815207495e-06,
|
54 |
+
"loss": 0.6436,
|
55 |
+
"step": 20
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"epoch": 3.0,
|
59 |
+
"eval_f1": 0.7326732673267327,
|
60 |
+
"eval_loss": 0.6852684020996094,
|
61 |
+
"eval_runtime": 1.3445,
|
62 |
+
"eval_samples_per_second": 47.601,
|
63 |
+
"eval_steps_per_second": 5.95,
|
64 |
+
"step": 24
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"epoch": 3.12,
|
68 |
+
"grad_norm": 0.4549338221549988,
|
69 |
+
"learning_rate": 8.72436108060687e-06,
|
70 |
+
"loss": 0.6098,
|
71 |
+
"step": 25
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"epoch": 3.75,
|
75 |
+
"grad_norm": 0.3741567134857178,
|
76 |
+
"learning_rate": 7.931237346006246e-06,
|
77 |
+
"loss": 0.609,
|
78 |
+
"step": 30
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"epoch": 4.0,
|
82 |
+
"eval_f1": 0.7326732673267327,
|
83 |
+
"eval_loss": 0.6952018737792969,
|
84 |
+
"eval_runtime": 1.3401,
|
85 |
+
"eval_samples_per_second": 47.757,
|
86 |
+
"eval_steps_per_second": 5.97,
|
87 |
+
"step": 32
|
88 |
+
}
|
89 |
+
],
|
90 |
+
"logging_steps": 5,
|
91 |
+
"max_steps": 80,
|
92 |
+
"num_input_tokens_seen": 0,
|
93 |
+
"num_train_epochs": 10,
|
94 |
+
"save_steps": 500,
|
95 |
+
"total_flos": 3687742096503552.0,
|
96 |
+
"train_batch_size": 24,
|
97 |
+
"trial_name": null,
|
98 |
+
"trial_params": {
|
99 |
+
"learning_rate": 1.1420981778248994e-05,
|
100 |
+
"per_device_train_batch_size": 24
|
101 |
+
}
|
102 |
+
}
|
run-3/checkpoint-32/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb7020efd76104988408508d9c7e8a6e74af8904f5fd05940d43639ce4c80cf2
|
3 |
+
size 4920
|
run-3/checkpoint-40/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|
run-3/checkpoint-40/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eb44356d7dd99d76e08bc8fec067d778a70ce340e950085f1f0d5532bcf71429
|
3 |
+
size 94763496
|
run-3/checkpoint-40/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:64d97fad5e3a7a342896071e42b90538b01c4168983183ecb488766c8d1179a3
|
3 |
+
size 189552570
|
run-3/checkpoint-40/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
run-3/checkpoint-40/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f04394838d097ad37a86e783ddc6557596b7da068eb72e2798acacf43f38924
|
3 |
+
size 14244
|
run-3/checkpoint-40/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:857294f0798588b687c4fd66b6e7d0f73c0b0a67baea1b5225ec246eeb80a3ae
|
3 |
+
size 1064
|
run-3/checkpoint-40/trainer_state.json
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7326732673267327,
|
3 |
+
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-3/checkpoint-8",
|
4 |
+
"epoch": 5.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 40,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.62,
|
13 |
+
"grad_norm": 0.94398033618927,
|
14 |
+
"learning_rate": 7.138113611405621e-06,
|
15 |
+
"loss": 0.6981,
|
16 |
+
"step": 5
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_f1": 0.7326732673267327,
|
21 |
+
"eval_loss": 0.6864166259765625,
|
22 |
+
"eval_runtime": 1.3249,
|
23 |
+
"eval_samples_per_second": 48.305,
|
24 |
+
"eval_steps_per_second": 6.038,
|
25 |
+
"step": 8
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"epoch": 1.25,
|
29 |
+
"grad_norm": 1.5316386222839355,
|
30 |
+
"learning_rate": 1.1103732284408743e-05,
|
31 |
+
"loss": 0.683,
|
32 |
+
"step": 10
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"epoch": 1.88,
|
36 |
+
"grad_norm": 0.5028849840164185,
|
37 |
+
"learning_rate": 1.031060854980812e-05,
|
38 |
+
"loss": 0.6564,
|
39 |
+
"step": 15
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 2.0,
|
43 |
+
"eval_f1": 0.7326732673267327,
|
44 |
+
"eval_loss": 0.680694580078125,
|
45 |
+
"eval_runtime": 1.3565,
|
46 |
+
"eval_samples_per_second": 47.179,
|
47 |
+
"eval_steps_per_second": 5.897,
|
48 |
+
"step": 16
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"epoch": 2.5,
|
52 |
+
"grad_norm": 0.5775301456451416,
|
53 |
+
"learning_rate": 9.517484815207495e-06,
|
54 |
+
"loss": 0.6436,
|
55 |
+
"step": 20
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"epoch": 3.0,
|
59 |
+
"eval_f1": 0.7326732673267327,
|
60 |
+
"eval_loss": 0.6852684020996094,
|
61 |
+
"eval_runtime": 1.3445,
|
62 |
+
"eval_samples_per_second": 47.601,
|
63 |
+
"eval_steps_per_second": 5.95,
|
64 |
+
"step": 24
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"epoch": 3.12,
|
68 |
+
"grad_norm": 0.4549338221549988,
|
69 |
+
"learning_rate": 8.72436108060687e-06,
|
70 |
+
"loss": 0.6098,
|
71 |
+
"step": 25
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"epoch": 3.75,
|
75 |
+
"grad_norm": 0.3741567134857178,
|
76 |
+
"learning_rate": 7.931237346006246e-06,
|
77 |
+
"loss": 0.609,
|
78 |
+
"step": 30
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"epoch": 4.0,
|
82 |
+
"eval_f1": 0.7326732673267327,
|
83 |
+
"eval_loss": 0.6952018737792969,
|
84 |
+
"eval_runtime": 1.3401,
|
85 |
+
"eval_samples_per_second": 47.757,
|
86 |
+
"eval_steps_per_second": 5.97,
|
87 |
+
"step": 32
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 4.38,
|
91 |
+
"grad_norm": 0.38969138264656067,
|
92 |
+
"learning_rate": 7.138113611405621e-06,
|
93 |
+
"loss": 0.6091,
|
94 |
+
"step": 35
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"epoch": 5.0,
|
98 |
+
"grad_norm": 0.2588692009449005,
|
99 |
+
"learning_rate": 6.344989876804997e-06,
|
100 |
+
"loss": 0.6043,
|
101 |
+
"step": 40
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"epoch": 5.0,
|
105 |
+
"eval_f1": 0.7326732673267327,
|
106 |
+
"eval_loss": 0.7042198181152344,
|
107 |
+
"eval_runtime": 1.3815,
|
108 |
+
"eval_samples_per_second": 46.326,
|
109 |
+
"eval_steps_per_second": 5.791,
|
110 |
+
"step": 40
|
111 |
+
}
|
112 |
+
],
|
113 |
+
"logging_steps": 5,
|
114 |
+
"max_steps": 80,
|
115 |
+
"num_input_tokens_seen": 0,
|
116 |
+
"num_train_epochs": 10,
|
117 |
+
"save_steps": 500,
|
118 |
+
"total_flos": 5002699790009952.0,
|
119 |
+
"train_batch_size": 24,
|
120 |
+
"trial_name": null,
|
121 |
+
"trial_params": {
|
122 |
+
"learning_rate": 1.1420981778248994e-05,
|
123 |
+
"per_device_train_batch_size": 24
|
124 |
+
}
|
125 |
+
}
|
run-3/checkpoint-40/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb7020efd76104988408508d9c7e8a6e74af8904f5fd05940d43639ce4c80cf2
|
3 |
+
size 4920
|
run-3/checkpoint-48/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|
run-3/checkpoint-48/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3046efc5b477cb286b3e11b8ed1a45a21f1c828411af810695ffc44c6f2d00d9
|
3 |
+
size 94763496
|
run-3/checkpoint-48/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4a85c054ab80630a8ef4559aef6814034b41e2b2616bd905706d9550320a94d
|
3 |
+
size 189552570
|
run-3/checkpoint-48/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
run-3/checkpoint-48/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8eb236be6c806fe9309f674bf7e81272faf7f91242ad8752235487116506f5f
|
3 |
+
size 14244
|
run-3/checkpoint-48/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a43d0fcfc93299b44b2601f4ab406c95df97c50f9a36eb16ee234f5d3816f4a
|
3 |
+
size 1064
|
run-3/checkpoint-48/trainer_state.json
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7326732673267327,
|
3 |
+
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-3/checkpoint-8",
|
4 |
+
"epoch": 6.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 48,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.62,
|
13 |
+
"grad_norm": 0.94398033618927,
|
14 |
+
"learning_rate": 7.138113611405621e-06,
|
15 |
+
"loss": 0.6981,
|
16 |
+
"step": 5
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_f1": 0.7326732673267327,
|
21 |
+
"eval_loss": 0.6864166259765625,
|
22 |
+
"eval_runtime": 1.3249,
|
23 |
+
"eval_samples_per_second": 48.305,
|
24 |
+
"eval_steps_per_second": 6.038,
|
25 |
+
"step": 8
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"epoch": 1.25,
|
29 |
+
"grad_norm": 1.5316386222839355,
|
30 |
+
"learning_rate": 1.1103732284408743e-05,
|
31 |
+
"loss": 0.683,
|
32 |
+
"step": 10
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"epoch": 1.88,
|
36 |
+
"grad_norm": 0.5028849840164185,
|
37 |
+
"learning_rate": 1.031060854980812e-05,
|
38 |
+
"loss": 0.6564,
|
39 |
+
"step": 15
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 2.0,
|
43 |
+
"eval_f1": 0.7326732673267327,
|
44 |
+
"eval_loss": 0.680694580078125,
|
45 |
+
"eval_runtime": 1.3565,
|
46 |
+
"eval_samples_per_second": 47.179,
|
47 |
+
"eval_steps_per_second": 5.897,
|
48 |
+
"step": 16
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"epoch": 2.5,
|
52 |
+
"grad_norm": 0.5775301456451416,
|
53 |
+
"learning_rate": 9.517484815207495e-06,
|
54 |
+
"loss": 0.6436,
|
55 |
+
"step": 20
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"epoch": 3.0,
|
59 |
+
"eval_f1": 0.7326732673267327,
|
60 |
+
"eval_loss": 0.6852684020996094,
|
61 |
+
"eval_runtime": 1.3445,
|
62 |
+
"eval_samples_per_second": 47.601,
|
63 |
+
"eval_steps_per_second": 5.95,
|
64 |
+
"step": 24
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"epoch": 3.12,
|
68 |
+
"grad_norm": 0.4549338221549988,
|
69 |
+
"learning_rate": 8.72436108060687e-06,
|
70 |
+
"loss": 0.6098,
|
71 |
+
"step": 25
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"epoch": 3.75,
|
75 |
+
"grad_norm": 0.3741567134857178,
|
76 |
+
"learning_rate": 7.931237346006246e-06,
|
77 |
+
"loss": 0.609,
|
78 |
+
"step": 30
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"epoch": 4.0,
|
82 |
+
"eval_f1": 0.7326732673267327,
|
83 |
+
"eval_loss": 0.6952018737792969,
|
84 |
+
"eval_runtime": 1.3401,
|
85 |
+
"eval_samples_per_second": 47.757,
|
86 |
+
"eval_steps_per_second": 5.97,
|
87 |
+
"step": 32
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 4.38,
|
91 |
+
"grad_norm": 0.38969138264656067,
|
92 |
+
"learning_rate": 7.138113611405621e-06,
|
93 |
+
"loss": 0.6091,
|
94 |
+
"step": 35
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"epoch": 5.0,
|
98 |
+
"grad_norm": 0.2588692009449005,
|
99 |
+
"learning_rate": 6.344989876804997e-06,
|
100 |
+
"loss": 0.6043,
|
101 |
+
"step": 40
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"epoch": 5.0,
|
105 |
+
"eval_f1": 0.7326732673267327,
|
106 |
+
"eval_loss": 0.7042198181152344,
|
107 |
+
"eval_runtime": 1.3815,
|
108 |
+
"eval_samples_per_second": 46.326,
|
109 |
+
"eval_steps_per_second": 5.791,
|
110 |
+
"step": 40
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"epoch": 5.62,
|
114 |
+
"grad_norm": 0.6705239415168762,
|
115 |
+
"learning_rate": 5.551866142204372e-06,
|
116 |
+
"loss": 0.6093,
|
117 |
+
"step": 45
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 6.0,
|
121 |
+
"eval_f1": 0.7326732673267327,
|
122 |
+
"eval_loss": 0.7079658508300781,
|
123 |
+
"eval_runtime": 1.3745,
|
124 |
+
"eval_samples_per_second": 46.561,
|
125 |
+
"eval_steps_per_second": 5.82,
|
126 |
+
"step": 48
|
127 |
+
}
|
128 |
+
],
|
129 |
+
"logging_steps": 5,
|
130 |
+
"max_steps": 80,
|
131 |
+
"num_input_tokens_seen": 0,
|
132 |
+
"num_train_epochs": 10,
|
133 |
+
"save_steps": 500,
|
134 |
+
"total_flos": 5627200224409632.0,
|
135 |
+
"train_batch_size": 24,
|
136 |
+
"trial_name": null,
|
137 |
+
"trial_params": {
|
138 |
+
"learning_rate": 1.1420981778248994e-05,
|
139 |
+
"per_device_train_batch_size": 24
|
140 |
+
}
|
141 |
+
}
|
run-3/checkpoint-48/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb7020efd76104988408508d9c7e8a6e74af8904f5fd05940d43639ce4c80cf2
|
3 |
+
size 4920
|
run-3/checkpoint-56/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|
run-3/checkpoint-56/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d657c8a9a8d89910fa51b01473cddb30252c72c09d1387d54314134d8223e50e
|
3 |
+
size 94763496
|
run-3/checkpoint-56/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59bed33f570bc4270a85ddd2ac7b86c353888ada5b8f8b946fb1763cc460d2e2
|
3 |
+
size 189552570
|
run-3/checkpoint-56/preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
run-3/checkpoint-56/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:30f64c34451dd549a3dd22bd33aa2eb5ae76a449e28e339f69de985a5a36616a
|
3 |
+
size 14244
|
run-3/checkpoint-56/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02589177af8711d6e7d186b2047b3051489461944bff68810fb8fe0a88b20fcc
|
3 |
+
size 1064
|
run-3/checkpoint-56/trainer_state.json
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7326732673267327,
|
3 |
+
"best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-3/checkpoint-8",
|
4 |
+
"epoch": 7.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 56,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.62,
|
13 |
+
"grad_norm": 0.94398033618927,
|
14 |
+
"learning_rate": 7.138113611405621e-06,
|
15 |
+
"loss": 0.6981,
|
16 |
+
"step": 5
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_f1": 0.7326732673267327,
|
21 |
+
"eval_loss": 0.6864166259765625,
|
22 |
+
"eval_runtime": 1.3249,
|
23 |
+
"eval_samples_per_second": 48.305,
|
24 |
+
"eval_steps_per_second": 6.038,
|
25 |
+
"step": 8
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"epoch": 1.25,
|
29 |
+
"grad_norm": 1.5316386222839355,
|
30 |
+
"learning_rate": 1.1103732284408743e-05,
|
31 |
+
"loss": 0.683,
|
32 |
+
"step": 10
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"epoch": 1.88,
|
36 |
+
"grad_norm": 0.5028849840164185,
|
37 |
+
"learning_rate": 1.031060854980812e-05,
|
38 |
+
"loss": 0.6564,
|
39 |
+
"step": 15
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"epoch": 2.0,
|
43 |
+
"eval_f1": 0.7326732673267327,
|
44 |
+
"eval_loss": 0.680694580078125,
|
45 |
+
"eval_runtime": 1.3565,
|
46 |
+
"eval_samples_per_second": 47.179,
|
47 |
+
"eval_steps_per_second": 5.897,
|
48 |
+
"step": 16
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"epoch": 2.5,
|
52 |
+
"grad_norm": 0.5775301456451416,
|
53 |
+
"learning_rate": 9.517484815207495e-06,
|
54 |
+
"loss": 0.6436,
|
55 |
+
"step": 20
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"epoch": 3.0,
|
59 |
+
"eval_f1": 0.7326732673267327,
|
60 |
+
"eval_loss": 0.6852684020996094,
|
61 |
+
"eval_runtime": 1.3445,
|
62 |
+
"eval_samples_per_second": 47.601,
|
63 |
+
"eval_steps_per_second": 5.95,
|
64 |
+
"step": 24
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"epoch": 3.12,
|
68 |
+
"grad_norm": 0.4549338221549988,
|
69 |
+
"learning_rate": 8.72436108060687e-06,
|
70 |
+
"loss": 0.6098,
|
71 |
+
"step": 25
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"epoch": 3.75,
|
75 |
+
"grad_norm": 0.3741567134857178,
|
76 |
+
"learning_rate": 7.931237346006246e-06,
|
77 |
+
"loss": 0.609,
|
78 |
+
"step": 30
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"epoch": 4.0,
|
82 |
+
"eval_f1": 0.7326732673267327,
|
83 |
+
"eval_loss": 0.6952018737792969,
|
84 |
+
"eval_runtime": 1.3401,
|
85 |
+
"eval_samples_per_second": 47.757,
|
86 |
+
"eval_steps_per_second": 5.97,
|
87 |
+
"step": 32
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"epoch": 4.38,
|
91 |
+
"grad_norm": 0.38969138264656067,
|
92 |
+
"learning_rate": 7.138113611405621e-06,
|
93 |
+
"loss": 0.6091,
|
94 |
+
"step": 35
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"epoch": 5.0,
|
98 |
+
"grad_norm": 0.2588692009449005,
|
99 |
+
"learning_rate": 6.344989876804997e-06,
|
100 |
+
"loss": 0.6043,
|
101 |
+
"step": 40
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"epoch": 5.0,
|
105 |
+
"eval_f1": 0.7326732673267327,
|
106 |
+
"eval_loss": 0.7042198181152344,
|
107 |
+
"eval_runtime": 1.3815,
|
108 |
+
"eval_samples_per_second": 46.326,
|
109 |
+
"eval_steps_per_second": 5.791,
|
110 |
+
"step": 40
|
111 |
+
},
|
112 |
+
{
|
113 |
+
"epoch": 5.62,
|
114 |
+
"grad_norm": 0.6705239415168762,
|
115 |
+
"learning_rate": 5.551866142204372e-06,
|
116 |
+
"loss": 0.6093,
|
117 |
+
"step": 45
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"epoch": 6.0,
|
121 |
+
"eval_f1": 0.7326732673267327,
|
122 |
+
"eval_loss": 0.7079658508300781,
|
123 |
+
"eval_runtime": 1.3745,
|
124 |
+
"eval_samples_per_second": 46.561,
|
125 |
+
"eval_steps_per_second": 5.82,
|
126 |
+
"step": 48
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"epoch": 6.25,
|
130 |
+
"grad_norm": 0.4161934554576874,
|
131 |
+
"learning_rate": 4.758742407603747e-06,
|
132 |
+
"loss": 0.5737,
|
133 |
+
"step": 50
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"epoch": 6.88,
|
137 |
+
"grad_norm": 0.3530200719833374,
|
138 |
+
"learning_rate": 3.965618673003123e-06,
|
139 |
+
"loss": 0.6355,
|
140 |
+
"step": 55
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"epoch": 7.0,
|
144 |
+
"eval_f1": 0.7326732673267327,
|
145 |
+
"eval_loss": 0.7088470458984375,
|
146 |
+
"eval_runtime": 1.3736,
|
147 |
+
"eval_samples_per_second": 46.592,
|
148 |
+
"eval_steps_per_second": 5.824,
|
149 |
+
"step": 56
|
150 |
+
}
|
151 |
+
],
|
152 |
+
"logging_steps": 5,
|
153 |
+
"max_steps": 80,
|
154 |
+
"num_input_tokens_seen": 0,
|
155 |
+
"num_train_epochs": 10,
|
156 |
+
"save_steps": 500,
|
157 |
+
"total_flos": 6905111571067392.0,
|
158 |
+
"train_batch_size": 24,
|
159 |
+
"trial_name": null,
|
160 |
+
"trial_params": {
|
161 |
+
"learning_rate": 1.1420981778248994e-05,
|
162 |
+
"per_device_train_batch_size": 24
|
163 |
+
}
|
164 |
+
}
|
run-3/checkpoint-56/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb7020efd76104988408508d9c7e8a6e74af8904f5fd05940d43639ce4c80cf2
|
3 |
+
size 4920
|
run-3/checkpoint-64/config.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "ntu-spml/distilhubert",
|
3 |
+
"activation_dropout": 0.1,
|
4 |
+
"apply_spec_augment": false,
|
5 |
+
"architectures": [
|
6 |
+
"HubertForSequenceClassification"
|
7 |
+
],
|
8 |
+
"attention_dropout": 0.1,
|
9 |
+
"bos_token_id": 1,
|
10 |
+
"classifier_proj_size": 256,
|
11 |
+
"conv_bias": false,
|
12 |
+
"conv_dim": [
|
13 |
+
512,
|
14 |
+
512,
|
15 |
+
512,
|
16 |
+
512,
|
17 |
+
512,
|
18 |
+
512,
|
19 |
+
512
|
20 |
+
],
|
21 |
+
"conv_kernel": [
|
22 |
+
10,
|
23 |
+
3,
|
24 |
+
3,
|
25 |
+
3,
|
26 |
+
3,
|
27 |
+
2,
|
28 |
+
2
|
29 |
+
],
|
30 |
+
"conv_stride": [
|
31 |
+
5,
|
32 |
+
2,
|
33 |
+
2,
|
34 |
+
2,
|
35 |
+
2,
|
36 |
+
2,
|
37 |
+
2
|
38 |
+
],
|
39 |
+
"ctc_loss_reduction": "sum",
|
40 |
+
"ctc_zero_infinity": false,
|
41 |
+
"do_stable_layer_norm": false,
|
42 |
+
"eos_token_id": 2,
|
43 |
+
"feat_extract_activation": "gelu",
|
44 |
+
"feat_extract_norm": "group",
|
45 |
+
"feat_proj_dropout": 0.0,
|
46 |
+
"feat_proj_layer_norm": false,
|
47 |
+
"final_dropout": 0.0,
|
48 |
+
"hidden_act": "gelu",
|
49 |
+
"hidden_dropout": 0.1,
|
50 |
+
"hidden_size": 768,
|
51 |
+
"id2label": {
|
52 |
+
"0": "NOT_WORD",
|
53 |
+
"1": "WORD"
|
54 |
+
},
|
55 |
+
"initializer_range": 0.02,
|
56 |
+
"intermediate_size": 3072,
|
57 |
+
"label2id": {
|
58 |
+
"NOT_WORD": "0",
|
59 |
+
"WORD": "1"
|
60 |
+
},
|
61 |
+
"layer_norm_eps": 1e-05,
|
62 |
+
"layerdrop": 0.0,
|
63 |
+
"mask_feature_length": 10,
|
64 |
+
"mask_feature_min_masks": 0,
|
65 |
+
"mask_feature_prob": 0.0,
|
66 |
+
"mask_time_length": 10,
|
67 |
+
"mask_time_min_masks": 2,
|
68 |
+
"mask_time_prob": 0.05,
|
69 |
+
"model_type": "hubert",
|
70 |
+
"num_attention_heads": 12,
|
71 |
+
"num_conv_pos_embedding_groups": 16,
|
72 |
+
"num_conv_pos_embeddings": 128,
|
73 |
+
"num_feat_extract_layers": 7,
|
74 |
+
"num_hidden_layers": 2,
|
75 |
+
"pad_token_id": 0,
|
76 |
+
"torch_dtype": "float32",
|
77 |
+
"transformers_version": "4.38.1",
|
78 |
+
"use_weighted_layer_sum": false,
|
79 |
+
"vocab_size": 32
|
80 |
+
}
|