xezpeleta committed on
Commit
cfe270a
1 Parent(s): ea283e7

End of training

.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 wandb/run-20241005_141414-821qpm7o/files/output.log filter=lfs diff=lfs merge=lfs -text
 wandb/run-20241005_141414-821qpm7o/run-821qpm7o.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241007_131849-0rbzerob/run-0rbzerob.wandb filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,22 +1,25 @@
 ---
 library_name: transformers
+language:
+- eu
 license: apache-2.0
 base_model: openai/whisper-large-v3
 tags:
+- whisper-event
 - generated_from_trainer
 datasets:
-- common_voice_17_0
+- mozilla-foundation/common_voice_17_0
 metrics:
 - wer
 model-index:
-- name: openai/whisper-large-v3
+- name: Whisper Large Basque
   results:
   - task:
       name: Automatic Speech Recognition
       type: automatic-speech-recognition
     dataset:
-      name: common_voice_17_0
-      type: common_voice_17_0
+      name: mozilla-foundation/common_voice_17_0 eu
+      type: mozilla-foundation/common_voice_17_0
       config: eu
       split: test
       args: eu
@@ -29,9 +32,9 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
-# openai/whisper-large-v3
+# Whisper Large Basque
 
-This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on the common_voice_17_0 dataset.
+This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on the mozilla-foundation/common_voice_17_0 eu dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.1259
 - Wer: 7.2154
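For context on using the resulting checkpoint, below is a minimal inference sketch with the 🤗 Transformers `pipeline` API. The repository id is a placeholder (this commit page does not state it), and the `language`/`task` generation hints are assumptions that may need adjusting for your Transformers version.

```python
# Minimal sketch: transcribe Basque audio with the fine-tuned checkpoint.
# "your-namespace/whisper-large-v3-eu" is a placeholder; substitute the
# actual repository id of this model.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="your-namespace/whisper-large-v3-eu",  # placeholder repo id
    torch_dtype=torch.float16,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

# Whisper accepts generation hints; "basque"/"transcribe" are assumptions here.
result = asr(
    "sample_eu.wav",  # illustrative audio file path
    generate_kwargs={"language": "basque", "task": "transcribe"},
)
print(result["text"])
```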
all_results.json CHANGED
@@ -1,8 +1,14 @@
 {
-    "eval_loss": 0.9277587532997131,
+    "epoch": 5.048,
+    "eval_loss": 0.12588092684745789,
     "eval_model_preparation_time": 0.0102,
-    "eval_runtime": 4165.1595,
-    "eval_samples_per_second": 3.272,
-    "eval_steps_per_second": 0.409,
-    "eval_wer": 44.29532045879292
+    "eval_runtime": 4097.4891,
+    "eval_samples_per_second": 3.326,
+    "eval_steps_per_second": 0.416,
+    "eval_wer": 7.215361500971087,
+    "total_flos": 4.891718061785088e+20,
+    "train_loss": 0.0,
+    "train_runtime": 289.8068,
+    "train_samples_per_second": 552.092,
+    "train_steps_per_second": 34.506
 }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "eval_loss": 0.9277587532997131,
-    "eval_model_preparation_time": 0.0102,
-    "eval_runtime": 4165.1595,
-    "eval_samples_per_second": 3.272,
-    "eval_steps_per_second": 0.409,
-    "eval_wer": 44.29532045879292
+    "epoch": 5.048,
+    "eval_loss": 0.12588092684745789,
+    "eval_runtime": 4097.4891,
+    "eval_samples_per_second": 3.326,
+    "eval_steps_per_second": 0.416,
+    "eval_wer": 7.215361500971087
 }
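The `eval_wer` above is a word error rate expressed as a percentage. Below is a small sketch of how such a score is commonly computed with the `evaluate` library; the transcript lists are made-up examples, and the percentage scaling is an assumption about how the reported number was derived.

```python
# Sketch: compute a percentage WER in the style of eval_results.json.
# The predictions/references below are illustrative, not from this model.
import evaluate

wer_metric = evaluate.load("wer")

predictions = ["kaixo mundua", "zer moduz zaude"]       # example model transcripts
references = ["kaixo mundua", "zer moduz zaude gaur"]   # example ground truth

# evaluate returns a fraction; multiplying by 100 gives a percentage
# comparable to eval_wer (e.g. 7.215361500971087 above).
wer = 100 * wer_metric.compute(predictions=predictions, references=references)
print(f"WER: {wer:.4f}%")
```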
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 5.048,
+    "total_flos": 4.891718061785088e+20,
+    "train_loss": 0.0,
+    "train_runtime": 289.8068,
+    "train_samples_per_second": 552.092,
+    "train_steps_per_second": 34.506
+}
trainer_state.json ADDED
@@ -0,0 +1,2724 @@
1
+ {
2
+ "best_metric": 7.215361500971087,
3
+ "best_model_checkpoint": "./checkpoint-9000",
4
+ "epoch": 5.048,
5
+ "eval_steps": 500,
6
+ "global_step": 9000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0025,
13
+ "grad_norm": 6.131621360778809,
14
+ "learning_rate": 2.1875e-07,
15
+ "loss": 0.9345,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.005,
20
+ "grad_norm": 6.021520137786865,
21
+ "learning_rate": 4.375e-07,
22
+ "loss": 0.8231,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.0075,
27
+ "grad_norm": 5.526496410369873,
28
+ "learning_rate": 6.5625e-07,
29
+ "loss": 0.5623,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.01,
34
+ "grad_norm": 4.9277825355529785,
35
+ "learning_rate": 8.75e-07,
36
+ "loss": 0.4173,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.0125,
41
+ "grad_norm": 4.292990684509277,
42
+ "learning_rate": 1.09375e-06,
43
+ "loss": 0.385,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.015,
48
+ "grad_norm": 5.749295234680176,
49
+ "learning_rate": 1.3125e-06,
50
+ "loss": 0.3931,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.0175,
55
+ "grad_norm": 3.8306965827941895,
56
+ "learning_rate": 1.5312499999999997e-06,
57
+ "loss": 0.3516,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.02,
62
+ "grad_norm": 4.687748908996582,
63
+ "learning_rate": 1.75e-06,
64
+ "loss": 0.3235,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.0225,
69
+ "grad_norm": 4.232759952545166,
70
+ "learning_rate": 1.96875e-06,
71
+ "loss": 0.3314,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 0.025,
76
+ "grad_norm": 4.185751914978027,
77
+ "learning_rate": 2.1875e-06,
78
+ "loss": 0.309,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 0.0275,
83
+ "grad_norm": 4.818612098693848,
84
+ "learning_rate": 2.40625e-06,
85
+ "loss": 0.2991,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 0.03,
90
+ "grad_norm": 4.171736717224121,
91
+ "learning_rate": 2.625e-06,
92
+ "loss": 0.2832,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 0.0325,
97
+ "grad_norm": 5.217376708984375,
98
+ "learning_rate": 2.8437499999999997e-06,
99
+ "loss": 0.2873,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 0.035,
104
+ "grad_norm": 4.671106815338135,
105
+ "learning_rate": 3.0624999999999995e-06,
106
+ "loss": 0.2957,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.0375,
111
+ "grad_norm": 3.9175262451171875,
112
+ "learning_rate": 3.2812499999999997e-06,
113
+ "loss": 0.2634,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 0.04,
118
+ "grad_norm": 4.647582054138184,
119
+ "learning_rate": 3.5e-06,
120
+ "loss": 0.2541,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 0.0425,
125
+ "grad_norm": 3.25675368309021,
126
+ "learning_rate": 3.7187499999999998e-06,
127
+ "loss": 0.2244,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 0.045,
132
+ "grad_norm": 4.597206115722656,
133
+ "learning_rate": 3.9375e-06,
134
+ "loss": 0.2492,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 0.0475,
139
+ "grad_norm": 4.602332592010498,
140
+ "learning_rate": 4.156249999999999e-06,
141
+ "loss": 0.246,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 0.05,
146
+ "grad_norm": 3.6419622898101807,
147
+ "learning_rate": 4.375e-06,
148
+ "loss": 0.2208,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 0.05,
153
+ "eval_loss": 0.2592349350452423,
154
+ "eval_runtime": 4116.5906,
155
+ "eval_samples_per_second": 3.311,
156
+ "eval_steps_per_second": 0.414,
157
+ "eval_wer": 20.691487412510533,
158
+ "step": 500
159
+ },
160
+ {
161
+ "epoch": 0.0525,
162
+ "grad_norm": 3.6599488258361816,
163
+ "learning_rate": 4.363486842105263e-06,
164
+ "loss": 0.2539,
165
+ "step": 525
166
+ },
167
+ {
168
+ "epoch": 0.055,
169
+ "grad_norm": 3.6934616565704346,
170
+ "learning_rate": 4.351973684210526e-06,
171
+ "loss": 0.2313,
172
+ "step": 550
173
+ },
174
+ {
175
+ "epoch": 0.0575,
176
+ "grad_norm": 3.7546138763427734,
177
+ "learning_rate": 4.340460526315789e-06,
178
+ "loss": 0.2272,
179
+ "step": 575
180
+ },
181
+ {
182
+ "epoch": 0.06,
183
+ "grad_norm": 3.096877098083496,
184
+ "learning_rate": 4.3289473684210525e-06,
185
+ "loss": 0.2373,
186
+ "step": 600
187
+ },
188
+ {
189
+ "epoch": 0.0625,
190
+ "grad_norm": 3.572812795639038,
191
+ "learning_rate": 4.3174342105263155e-06,
192
+ "loss": 0.2285,
193
+ "step": 625
194
+ },
195
+ {
196
+ "epoch": 0.065,
197
+ "grad_norm": 3.3494396209716797,
198
+ "learning_rate": 4.3059210526315785e-06,
199
+ "loss": 0.2293,
200
+ "step": 650
201
+ },
202
+ {
203
+ "epoch": 0.0675,
204
+ "grad_norm": 3.5156869888305664,
205
+ "learning_rate": 4.2944078947368415e-06,
206
+ "loss": 0.2063,
207
+ "step": 675
208
+ },
209
+ {
210
+ "epoch": 0.07,
211
+ "grad_norm": 3.698807716369629,
212
+ "learning_rate": 4.282894736842105e-06,
213
+ "loss": 0.2113,
214
+ "step": 700
215
+ },
216
+ {
217
+ "epoch": 0.0725,
218
+ "grad_norm": 3.716585636138916,
219
+ "learning_rate": 4.271381578947368e-06,
220
+ "loss": 0.2055,
221
+ "step": 725
222
+ },
223
+ {
224
+ "epoch": 0.075,
225
+ "grad_norm": 4.204227924346924,
226
+ "learning_rate": 4.2598684210526314e-06,
227
+ "loss": 0.2114,
228
+ "step": 750
229
+ },
230
+ {
231
+ "epoch": 0.0775,
232
+ "grad_norm": 3.479562282562256,
233
+ "learning_rate": 4.2483552631578944e-06,
234
+ "loss": 0.2224,
235
+ "step": 775
236
+ },
237
+ {
238
+ "epoch": 0.08,
239
+ "grad_norm": 4.5203094482421875,
240
+ "learning_rate": 4.2368421052631575e-06,
241
+ "loss": 0.2523,
242
+ "step": 800
243
+ },
244
+ {
245
+ "epoch": 0.0825,
246
+ "grad_norm": 3.6081738471984863,
247
+ "learning_rate": 4.2253289473684205e-06,
248
+ "loss": 0.2383,
249
+ "step": 825
250
+ },
251
+ {
252
+ "epoch": 0.085,
253
+ "grad_norm": 3.2602758407592773,
254
+ "learning_rate": 4.2138157894736835e-06,
255
+ "loss": 0.1808,
256
+ "step": 850
257
+ },
258
+ {
259
+ "epoch": 0.0875,
260
+ "grad_norm": 3.6786868572235107,
261
+ "learning_rate": 4.202302631578947e-06,
262
+ "loss": 0.1747,
263
+ "step": 875
264
+ },
265
+ {
266
+ "epoch": 0.09,
267
+ "grad_norm": 3.1120803356170654,
268
+ "learning_rate": 4.19078947368421e-06,
269
+ "loss": 0.1662,
270
+ "step": 900
271
+ },
272
+ {
273
+ "epoch": 0.0925,
274
+ "grad_norm": 3.1962203979492188,
275
+ "learning_rate": 4.179276315789473e-06,
276
+ "loss": 0.1771,
277
+ "step": 925
278
+ },
279
+ {
280
+ "epoch": 0.095,
281
+ "grad_norm": 3.172363758087158,
282
+ "learning_rate": 4.167763157894736e-06,
283
+ "loss": 0.1751,
284
+ "step": 950
285
+ },
286
+ {
287
+ "epoch": 0.0975,
288
+ "grad_norm": 2.4304590225219727,
289
+ "learning_rate": 4.156249999999999e-06,
290
+ "loss": 0.1701,
291
+ "step": 975
292
+ },
293
+ {
294
+ "epoch": 0.1,
295
+ "grad_norm": 3.193345308303833,
296
+ "learning_rate": 4.144736842105262e-06,
297
+ "loss": 0.1489,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 0.1,
302
+ "eval_loss": 0.1971057653427124,
303
+ "eval_runtime": 4130.6867,
304
+ "eval_samples_per_second": 3.3,
305
+ "eval_steps_per_second": 0.413,
306
+ "eval_wer": 14.68265601524424,
307
+ "step": 1000
308
+ },
309
+ {
310
+ "epoch": 0.1025,
311
+ "grad_norm": 3.322065591812134,
312
+ "learning_rate": 4.133223684210526e-06,
313
+ "loss": 0.1701,
314
+ "step": 1025
315
+ },
316
+ {
317
+ "epoch": 0.105,
318
+ "grad_norm": 3.5462722778320312,
319
+ "learning_rate": 4.121710526315789e-06,
320
+ "loss": 0.1875,
321
+ "step": 1050
322
+ },
323
+ {
324
+ "epoch": 0.1075,
325
+ "grad_norm": 3.39326810836792,
326
+ "learning_rate": 4.110197368421052e-06,
327
+ "loss": 0.1506,
328
+ "step": 1075
329
+ },
330
+ {
331
+ "epoch": 0.11,
332
+ "grad_norm": 2.9165821075439453,
333
+ "learning_rate": 4.098684210526315e-06,
334
+ "loss": 0.1525,
335
+ "step": 1100
336
+ },
337
+ {
338
+ "epoch": 0.1125,
339
+ "grad_norm": 3.262007236480713,
340
+ "learning_rate": 4.087171052631578e-06,
341
+ "loss": 0.157,
342
+ "step": 1125
343
+ },
344
+ {
345
+ "epoch": 0.115,
346
+ "grad_norm": 2.4523119926452637,
347
+ "learning_rate": 4.075657894736842e-06,
348
+ "loss": 0.1416,
349
+ "step": 1150
350
+ },
351
+ {
352
+ "epoch": 0.1175,
353
+ "grad_norm": 2.7651798725128174,
354
+ "learning_rate": 4.064144736842105e-06,
355
+ "loss": 0.1527,
356
+ "step": 1175
357
+ },
358
+ {
359
+ "epoch": 0.12,
360
+ "grad_norm": 3.609523296356201,
361
+ "learning_rate": 4.052631578947368e-06,
362
+ "loss": 0.1822,
363
+ "step": 1200
364
+ },
365
+ {
366
+ "epoch": 0.1225,
367
+ "grad_norm": 3.8101985454559326,
368
+ "learning_rate": 4.041118421052631e-06,
369
+ "loss": 0.1703,
370
+ "step": 1225
371
+ },
372
+ {
373
+ "epoch": 0.125,
374
+ "grad_norm": 3.8921287059783936,
375
+ "learning_rate": 4.029605263157894e-06,
376
+ "loss": 0.1924,
377
+ "step": 1250
378
+ },
379
+ {
380
+ "epoch": 0.1275,
381
+ "grad_norm": 4.463279724121094,
382
+ "learning_rate": 4.018092105263157e-06,
383
+ "loss": 0.1818,
384
+ "step": 1275
385
+ },
386
+ {
387
+ "epoch": 0.13,
388
+ "grad_norm": 3.6556308269500732,
389
+ "learning_rate": 4.00657894736842e-06,
390
+ "loss": 0.1726,
391
+ "step": 1300
392
+ },
393
+ {
394
+ "epoch": 0.1325,
395
+ "grad_norm": 2.98067569732666,
396
+ "learning_rate": 3.995065789473683e-06,
397
+ "loss": 0.174,
398
+ "step": 1325
399
+ },
400
+ {
401
+ "epoch": 0.135,
402
+ "grad_norm": 2.8287429809570312,
403
+ "learning_rate": 3.983552631578947e-06,
404
+ "loss": 0.1631,
405
+ "step": 1350
406
+ },
407
+ {
408
+ "epoch": 0.1375,
409
+ "grad_norm": 2.6438794136047363,
410
+ "learning_rate": 3.97203947368421e-06,
411
+ "loss": 0.1475,
412
+ "step": 1375
413
+ },
414
+ {
415
+ "epoch": 0.14,
416
+ "grad_norm": 3.513123035430908,
417
+ "learning_rate": 3.960526315789473e-06,
418
+ "loss": 0.1457,
419
+ "step": 1400
420
+ },
421
+ {
422
+ "epoch": 0.1425,
423
+ "grad_norm": 2.4688916206359863,
424
+ "learning_rate": 3.949013157894737e-06,
425
+ "loss": 0.1375,
426
+ "step": 1425
427
+ },
428
+ {
429
+ "epoch": 0.145,
430
+ "grad_norm": 4.005943775177002,
431
+ "learning_rate": 3.9375e-06,
432
+ "loss": 0.1623,
433
+ "step": 1450
434
+ },
435
+ {
436
+ "epoch": 0.1475,
437
+ "grad_norm": 2.91786789894104,
438
+ "learning_rate": 3.925986842105263e-06,
439
+ "loss": 0.1701,
440
+ "step": 1475
441
+ },
442
+ {
443
+ "epoch": 0.15,
444
+ "grad_norm": 3.5332415103912354,
445
+ "learning_rate": 3.914473684210526e-06,
446
+ "loss": 0.1973,
447
+ "step": 1500
448
+ },
449
+ {
450
+ "epoch": 0.15,
451
+ "eval_loss": 0.17469166219234467,
452
+ "eval_runtime": 4132.0041,
453
+ "eval_samples_per_second": 3.299,
454
+ "eval_steps_per_second": 0.412,
455
+ "eval_wer": 12.377697973542453,
456
+ "step": 1500
457
+ },
458
+ {
459
+ "epoch": 0.1525,
460
+ "grad_norm": 4.05070686340332,
461
+ "learning_rate": 3.902960526315789e-06,
462
+ "loss": 0.1796,
463
+ "step": 1525
464
+ },
465
+ {
466
+ "epoch": 0.155,
467
+ "grad_norm": 2.989821195602417,
468
+ "learning_rate": 3.891447368421052e-06,
469
+ "loss": 0.1561,
470
+ "step": 1550
471
+ },
472
+ {
473
+ "epoch": 0.1575,
474
+ "grad_norm": 2.9603219032287598,
475
+ "learning_rate": 3.879934210526315e-06,
476
+ "loss": 0.1609,
477
+ "step": 1575
478
+ },
479
+ {
480
+ "epoch": 0.16,
481
+ "grad_norm": 3.2663583755493164,
482
+ "learning_rate": 3.868421052631579e-06,
483
+ "loss": 0.1833,
484
+ "step": 1600
485
+ },
486
+ {
487
+ "epoch": 0.1625,
488
+ "grad_norm": 3.459775686264038,
489
+ "learning_rate": 3.856907894736842e-06,
490
+ "loss": 0.1727,
491
+ "step": 1625
492
+ },
493
+ {
494
+ "epoch": 0.165,
495
+ "grad_norm": 3.427720069885254,
496
+ "learning_rate": 3.845394736842105e-06,
497
+ "loss": 0.181,
498
+ "step": 1650
499
+ },
500
+ {
501
+ "epoch": 0.1675,
502
+ "grad_norm": 4.471118450164795,
503
+ "learning_rate": 3.833881578947368e-06,
504
+ "loss": 0.1536,
505
+ "step": 1675
506
+ },
507
+ {
508
+ "epoch": 0.17,
509
+ "grad_norm": 3.1428306102752686,
510
+ "learning_rate": 3.822368421052632e-06,
511
+ "loss": 0.1372,
512
+ "step": 1700
513
+ },
514
+ {
515
+ "epoch": 1.0021,
516
+ "grad_norm": 2.8270132541656494,
517
+ "learning_rate": 3.8108552631578944e-06,
518
+ "loss": 0.1454,
519
+ "step": 1725
520
+ },
521
+ {
522
+ "epoch": 1.0046,
523
+ "grad_norm": 3.0873589515686035,
524
+ "learning_rate": 3.799342105263158e-06,
525
+ "loss": 0.1303,
526
+ "step": 1750
527
+ },
528
+ {
529
+ "epoch": 1.0071,
530
+ "grad_norm": 3.187711000442505,
531
+ "learning_rate": 3.787828947368421e-06,
532
+ "loss": 0.1383,
533
+ "step": 1775
534
+ },
535
+ {
536
+ "epoch": 1.0096,
537
+ "grad_norm": 3.1710643768310547,
538
+ "learning_rate": 3.776315789473684e-06,
539
+ "loss": 0.1626,
540
+ "step": 1800
541
+ },
542
+ {
543
+ "epoch": 1.0121,
544
+ "grad_norm": 3.4516818523406982,
545
+ "learning_rate": 3.7648026315789473e-06,
546
+ "loss": 0.1405,
547
+ "step": 1825
548
+ },
549
+ {
550
+ "epoch": 1.0146,
551
+ "grad_norm": 2.930408000946045,
552
+ "learning_rate": 3.7532894736842103e-06,
553
+ "loss": 0.143,
554
+ "step": 1850
555
+ },
556
+ {
557
+ "epoch": 1.0171,
558
+ "grad_norm": 3.066941261291504,
559
+ "learning_rate": 3.7417763157894733e-06,
560
+ "loss": 0.1437,
561
+ "step": 1875
562
+ },
563
+ {
564
+ "epoch": 1.0196,
565
+ "grad_norm": 3.389916181564331,
566
+ "learning_rate": 3.7302631578947363e-06,
567
+ "loss": 0.1289,
568
+ "step": 1900
569
+ },
570
+ {
571
+ "epoch": 1.0221,
572
+ "grad_norm": 3.048574209213257,
573
+ "learning_rate": 3.7187499999999998e-06,
574
+ "loss": 0.1415,
575
+ "step": 1925
576
+ },
577
+ {
578
+ "epoch": 1.0246,
579
+ "grad_norm": 2.5267295837402344,
580
+ "learning_rate": 3.7072368421052628e-06,
581
+ "loss": 0.1386,
582
+ "step": 1950
583
+ },
584
+ {
585
+ "epoch": 1.0271,
586
+ "grad_norm": 3.151757001876831,
587
+ "learning_rate": 3.6957236842105258e-06,
588
+ "loss": 0.1436,
589
+ "step": 1975
590
+ },
591
+ {
592
+ "epoch": 1.0296,
593
+ "grad_norm": 3.629039764404297,
594
+ "learning_rate": 3.684210526315789e-06,
595
+ "loss": 0.1353,
596
+ "step": 2000
597
+ },
598
+ {
599
+ "epoch": 1.0296,
600
+ "eval_loss": 0.1527385264635086,
601
+ "eval_runtime": 4116.9756,
602
+ "eval_samples_per_second": 3.311,
603
+ "eval_steps_per_second": 0.414,
604
+ "eval_wer": 10.719520685990693,
605
+ "step": 2000
606
+ },
607
+ {
608
+ "epoch": 1.0321,
609
+ "grad_norm": 1.8788173198699951,
610
+ "learning_rate": 3.6726973684210522e-06,
611
+ "loss": 0.1322,
612
+ "step": 2025
613
+ },
614
+ {
615
+ "epoch": 1.0346,
616
+ "grad_norm": 2.587233066558838,
617
+ "learning_rate": 3.6611842105263157e-06,
618
+ "loss": 0.1176,
619
+ "step": 2050
620
+ },
621
+ {
622
+ "epoch": 1.0371,
623
+ "grad_norm": 4.001532077789307,
624
+ "learning_rate": 3.6496710526315787e-06,
625
+ "loss": 0.1233,
626
+ "step": 2075
627
+ },
628
+ {
629
+ "epoch": 1.0396,
630
+ "grad_norm": 3.3947739601135254,
631
+ "learning_rate": 3.638157894736842e-06,
632
+ "loss": 0.1188,
633
+ "step": 2100
634
+ },
635
+ {
636
+ "epoch": 1.0421,
637
+ "grad_norm": 3.4743120670318604,
638
+ "learning_rate": 3.626644736842105e-06,
639
+ "loss": 0.1318,
640
+ "step": 2125
641
+ },
642
+ {
643
+ "epoch": 1.0446,
644
+ "grad_norm": 2.9288718700408936,
645
+ "learning_rate": 3.615131578947368e-06,
646
+ "loss": 0.1224,
647
+ "step": 2150
648
+ },
649
+ {
650
+ "epoch": 1.0471,
651
+ "grad_norm": 2.6081368923187256,
652
+ "learning_rate": 3.603618421052631e-06,
653
+ "loss": 0.1232,
654
+ "step": 2175
655
+ },
656
+ {
657
+ "epoch": 1.0496,
658
+ "grad_norm": 2.4068429470062256,
659
+ "learning_rate": 3.5921052631578946e-06,
660
+ "loss": 0.1073,
661
+ "step": 2200
662
+ },
663
+ {
664
+ "epoch": 1.0521,
665
+ "grad_norm": 3.049074411392212,
666
+ "learning_rate": 3.5805921052631576e-06,
667
+ "loss": 0.1071,
668
+ "step": 2225
669
+ },
670
+ {
671
+ "epoch": 1.0546,
672
+ "grad_norm": 2.0809032917022705,
673
+ "learning_rate": 3.5690789473684206e-06,
674
+ "loss": 0.1217,
675
+ "step": 2250
676
+ },
677
+ {
678
+ "epoch": 1.0571,
679
+ "grad_norm": 3.0854332447052,
680
+ "learning_rate": 3.5575657894736836e-06,
681
+ "loss": 0.1332,
682
+ "step": 2275
683
+ },
684
+ {
685
+ "epoch": 1.0596,
686
+ "grad_norm": 3.580145835876465,
687
+ "learning_rate": 3.546052631578947e-06,
688
+ "loss": 0.131,
689
+ "step": 2300
690
+ },
691
+ {
692
+ "epoch": 1.0621,
693
+ "grad_norm": 3.8924479484558105,
694
+ "learning_rate": 3.53453947368421e-06,
695
+ "loss": 0.136,
696
+ "step": 2325
697
+ },
698
+ {
699
+ "epoch": 1.0646,
700
+ "grad_norm": 2.8398871421813965,
701
+ "learning_rate": 3.523026315789473e-06,
702
+ "loss": 0.1081,
703
+ "step": 2350
704
+ },
705
+ {
706
+ "epoch": 1.0671,
707
+ "grad_norm": 3.007026195526123,
708
+ "learning_rate": 3.511513157894737e-06,
709
+ "loss": 0.1115,
710
+ "step": 2375
711
+ },
712
+ {
713
+ "epoch": 1.0695999999999999,
714
+ "grad_norm": 1.5712552070617676,
715
+ "learning_rate": 3.5e-06,
716
+ "loss": 0.1183,
717
+ "step": 2400
718
+ },
719
+ {
720
+ "epoch": 1.0721,
721
+ "grad_norm": 3.844963312149048,
722
+ "learning_rate": 3.488486842105263e-06,
723
+ "loss": 0.113,
724
+ "step": 2425
725
+ },
726
+ {
727
+ "epoch": 1.0746,
728
+ "grad_norm": 2.8939759731292725,
729
+ "learning_rate": 3.476973684210526e-06,
730
+ "loss": 0.1115,
731
+ "step": 2450
732
+ },
733
+ {
734
+ "epoch": 1.0771,
735
+ "grad_norm": 1.8150537014007568,
736
+ "learning_rate": 3.4654605263157894e-06,
737
+ "loss": 0.1117,
738
+ "step": 2475
739
+ },
740
+ {
741
+ "epoch": 1.0796000000000001,
742
+ "grad_norm": 2.839418649673462,
743
+ "learning_rate": 3.4539473684210524e-06,
744
+ "loss": 0.1065,
745
+ "step": 2500
746
+ },
747
+ {
748
+ "epoch": 1.0796000000000001,
749
+ "eval_loss": 0.1456422209739685,
750
+ "eval_runtime": 4133.4016,
751
+ "eval_samples_per_second": 3.298,
752
+ "eval_steps_per_second": 0.412,
753
+ "eval_wer": 9.869361281102277,
754
+ "step": 2500
755
+ },
756
+ {
757
+ "epoch": 1.0821,
758
+ "grad_norm": 3.4274985790252686,
759
+ "learning_rate": 3.4424342105263154e-06,
760
+ "loss": 0.1067,
761
+ "step": 2525
762
+ },
763
+ {
764
+ "epoch": 1.0846,
765
+ "grad_norm": 2.2946057319641113,
766
+ "learning_rate": 3.4309210526315784e-06,
767
+ "loss": 0.1038,
768
+ "step": 2550
769
+ },
770
+ {
771
+ "epoch": 1.0871,
772
+ "grad_norm": 2.5364551544189453,
773
+ "learning_rate": 3.419407894736842e-06,
774
+ "loss": 0.1073,
775
+ "step": 2575
776
+ },
777
+ {
778
+ "epoch": 1.0896,
779
+ "grad_norm": 2.9779515266418457,
780
+ "learning_rate": 3.4083552631578944e-06,
781
+ "loss": 0.1067,
782
+ "step": 2600
783
+ },
784
+ {
785
+ "epoch": 1.0921,
786
+ "grad_norm": 2.502685308456421,
787
+ "learning_rate": 3.3968421052631574e-06,
788
+ "loss": 0.1229,
789
+ "step": 2625
790
+ },
791
+ {
792
+ "epoch": 1.0946,
793
+ "grad_norm": 2.181756019592285,
794
+ "learning_rate": 3.3853289473684205e-06,
795
+ "loss": 0.1071,
796
+ "step": 2650
797
+ },
798
+ {
799
+ "epoch": 1.0971,
800
+ "grad_norm": 2.428738594055176,
801
+ "learning_rate": 3.3738157894736843e-06,
802
+ "loss": 0.101,
803
+ "step": 2675
804
+ },
805
+ {
806
+ "epoch": 1.0996,
807
+ "grad_norm": 3.797952651977539,
808
+ "learning_rate": 3.3623026315789473e-06,
809
+ "loss": 0.1198,
810
+ "step": 2700
811
+ },
812
+ {
813
+ "epoch": 1.1021,
814
+ "grad_norm": 2.9902758598327637,
815
+ "learning_rate": 3.3507894736842103e-06,
816
+ "loss": 0.1013,
817
+ "step": 2725
818
+ },
819
+ {
820
+ "epoch": 1.1046,
821
+ "grad_norm": 3.0514307022094727,
822
+ "learning_rate": 3.3392763157894734e-06,
823
+ "loss": 0.1075,
824
+ "step": 2750
825
+ },
826
+ {
827
+ "epoch": 1.1071,
828
+ "grad_norm": 3.2877554893493652,
829
+ "learning_rate": 3.327763157894737e-06,
830
+ "loss": 0.1059,
831
+ "step": 2775
832
+ },
833
+ {
834
+ "epoch": 1.1096,
835
+ "grad_norm": 2.3952691555023193,
836
+ "learning_rate": 3.31625e-06,
837
+ "loss": 0.0926,
838
+ "step": 2800
839
+ },
840
+ {
841
+ "epoch": 1.1121,
842
+ "grad_norm": 2.2840464115142822,
843
+ "learning_rate": 3.304736842105263e-06,
844
+ "loss": 0.1048,
845
+ "step": 2825
846
+ },
847
+ {
848
+ "epoch": 1.1146,
849
+ "grad_norm": 2.7062416076660156,
850
+ "learning_rate": 3.293223684210526e-06,
851
+ "loss": 0.1049,
852
+ "step": 2850
853
+ },
854
+ {
855
+ "epoch": 1.1171,
856
+ "grad_norm": 2.971315860748291,
857
+ "learning_rate": 3.2817105263157893e-06,
858
+ "loss": 0.1073,
859
+ "step": 2875
860
+ },
861
+ {
862
+ "epoch": 1.1196,
863
+ "grad_norm": 2.8689844608306885,
864
+ "learning_rate": 3.2701973684210523e-06,
865
+ "loss": 0.1141,
866
+ "step": 2900
867
+ },
868
+ {
869
+ "epoch": 1.1221,
870
+ "grad_norm": 3.6150734424591064,
871
+ "learning_rate": 3.2586842105263153e-06,
872
+ "loss": 0.1066,
873
+ "step": 2925
874
+ },
875
+ {
876
+ "epoch": 1.1246,
877
+ "grad_norm": 2.3004024028778076,
878
+ "learning_rate": 3.2471710526315783e-06,
879
+ "loss": 0.1248,
880
+ "step": 2950
881
+ },
882
+ {
883
+ "epoch": 1.1271,
884
+ "grad_norm": 2.5995240211486816,
885
+ "learning_rate": 3.2356578947368417e-06,
886
+ "loss": 0.0972,
887
+ "step": 2975
888
+ },
889
+ {
890
+ "epoch": 1.1296,
891
+ "grad_norm": 2.957960367202759,
892
+ "learning_rate": 3.224144736842105e-06,
893
+ "loss": 0.106,
894
+ "step": 3000
895
+ },
896
+ {
897
+ "epoch": 1.1296,
898
+ "eval_loss": 0.13624447584152222,
899
+ "eval_runtime": 4123.4662,
900
+ "eval_samples_per_second": 3.305,
901
+ "eval_steps_per_second": 0.413,
902
+ "eval_wer": 9.09249148008355,
903
+ "step": 3000
904
+ },
905
+ {
906
+ "epoch": 1.1320999999999999,
907
+ "grad_norm": 2.653007984161377,
908
+ "learning_rate": 3.212631578947368e-06,
909
+ "loss": 0.1083,
910
+ "step": 3025
911
+ },
912
+ {
913
+ "epoch": 1.1346,
914
+ "grad_norm": 2.6895744800567627,
915
+ "learning_rate": 3.2011184210526316e-06,
916
+ "loss": 0.1119,
917
+ "step": 3050
918
+ },
919
+ {
920
+ "epoch": 1.1371,
921
+ "grad_norm": 2.1507463455200195,
922
+ "learning_rate": 3.1896052631578946e-06,
923
+ "loss": 0.0944,
924
+ "step": 3075
925
+ },
926
+ {
927
+ "epoch": 1.1396,
928
+ "grad_norm": 3.61063289642334,
929
+ "learning_rate": 3.1780921052631576e-06,
930
+ "loss": 0.095,
931
+ "step": 3100
932
+ },
933
+ {
934
+ "epoch": 1.1421000000000001,
935
+ "grad_norm": 2.570584774017334,
936
+ "learning_rate": 3.1665789473684206e-06,
937
+ "loss": 0.1076,
938
+ "step": 3125
939
+ },
940
+ {
941
+ "epoch": 1.1446,
942
+ "grad_norm": 3.05507230758667,
943
+ "learning_rate": 3.155065789473684e-06,
944
+ "loss": 0.1175,
945
+ "step": 3150
946
+ },
947
+ {
948
+ "epoch": 1.1471,
949
+ "grad_norm": 2.82817006111145,
950
+ "learning_rate": 3.143552631578947e-06,
951
+ "loss": 0.0965,
952
+ "step": 3175
953
+ },
954
+ {
955
+ "epoch": 1.1496,
956
+ "grad_norm": 2.336517572402954,
957
+ "learning_rate": 3.13203947368421e-06,
958
+ "loss": 0.0955,
959
+ "step": 3200
960
+ },
961
+ {
962
+ "epoch": 1.1521,
963
+ "grad_norm": 3.8640036582946777,
964
+ "learning_rate": 3.120526315789473e-06,
965
+ "loss": 0.1044,
966
+ "step": 3225
967
+ },
968
+ {
969
+ "epoch": 1.1546,
970
+ "grad_norm": 3.7205588817596436,
971
+ "learning_rate": 3.1090131578947366e-06,
972
+ "loss": 0.1013,
973
+ "step": 3250
974
+ },
975
+ {
976
+ "epoch": 1.1571,
977
+ "grad_norm": 2.1962900161743164,
978
+ "learning_rate": 3.0974999999999996e-06,
979
+ "loss": 0.0978,
980
+ "step": 3275
981
+ },
982
+ {
983
+ "epoch": 1.1596,
984
+ "grad_norm": 3.3310599327087402,
985
+ "learning_rate": 3.0859868421052626e-06,
986
+ "loss": 0.1089,
987
+ "step": 3300
988
+ },
989
+ {
990
+ "epoch": 1.1621,
991
+ "grad_norm": 2.699566602706909,
992
+ "learning_rate": 3.074473684210526e-06,
993
+ "loss": 0.1078,
994
+ "step": 3325
995
+ },
996
+ {
997
+ "epoch": 1.1646,
998
+ "grad_norm": 3.79370379447937,
999
+ "learning_rate": 3.0629605263157894e-06,
1000
+ "loss": 0.1118,
1001
+ "step": 3350
1002
+ },
1003
+ {
1004
+ "epoch": 1.1671,
1005
+ "grad_norm": 1.9741384983062744,
1006
+ "learning_rate": 3.0514473684210525e-06,
1007
+ "loss": 0.1119,
1008
+ "step": 3375
1009
+ },
1010
+ {
1011
+ "epoch": 1.1696,
1012
+ "grad_norm": 2.29034686088562,
1013
+ "learning_rate": 3.0399342105263155e-06,
1014
+ "loss": 0.1015,
1015
+ "step": 3400
1016
+ },
1017
+ {
1018
+ "epoch": 2.0017,
1019
+ "grad_norm": 2.011443853378296,
1020
+ "learning_rate": 3.028421052631579e-06,
1021
+ "loss": 0.0708,
1022
+ "step": 3425
1023
+ },
1024
+ {
1025
+ "epoch": 2.0042,
1026
+ "grad_norm": 1.2196134328842163,
1027
+ "learning_rate": 3.016907894736842e-06,
1028
+ "loss": 0.0668,
1029
+ "step": 3450
1030
+ },
1031
+ {
1032
+ "epoch": 2.0067,
1033
+ "grad_norm": 2.863933563232422,
1034
+ "learning_rate": 3.005394736842105e-06,
1035
+ "loss": 0.0673,
1036
+ "step": 3475
1037
+ },
1038
+ {
1039
+ "epoch": 2.0092,
1040
+ "grad_norm": 1.9341013431549072,
1041
+ "learning_rate": 2.9938815789473684e-06,
1042
+ "loss": 0.0718,
1043
+ "step": 3500
1044
+ },
1045
+ {
1046
+ "epoch": 2.0092,
1047
+ "eval_loss": 0.13255682587623596,
1048
+ "eval_runtime": 4133.4892,
1049
+ "eval_samples_per_second": 3.297,
1050
+ "eval_steps_per_second": 0.412,
1051
+ "eval_wer": 8.542819451060867,
1052
+ "step": 3500
1053
+ },
1054
+ {
1055
+ "epoch": 2.0117,
1056
+ "grad_norm": 2.795734405517578,
1057
+ "learning_rate": 2.9823684210526314e-06,
1058
+ "loss": 0.071,
1059
+ "step": 3525
1060
+ },
1061
+ {
1062
+ "epoch": 2.0142,
1063
+ "grad_norm": 1.982479214668274,
1064
+ "learning_rate": 2.9708552631578944e-06,
1065
+ "loss": 0.0629,
1066
+ "step": 3550
1067
+ },
1068
+ {
1069
+ "epoch": 2.0167,
1070
+ "grad_norm": 3.168161630630493,
1071
+ "learning_rate": 2.9593421052631574e-06,
1072
+ "loss": 0.0593,
1073
+ "step": 3575
1074
+ },
1075
+ {
1076
+ "epoch": 2.0192,
1077
+ "grad_norm": 2.259500741958618,
1078
+ "learning_rate": 2.947828947368421e-06,
1079
+ "loss": 0.0696,
1080
+ "step": 3600
1081
+ },
1082
+ {
1083
+ "epoch": 2.0217,
1084
+ "grad_norm": 2.1626062393188477,
1085
+ "learning_rate": 2.936315789473684e-06,
1086
+ "loss": 0.0687,
1087
+ "step": 3625
1088
+ },
1089
+ {
1090
+ "epoch": 2.0242,
1091
+ "grad_norm": 2.4419946670532227,
1092
+ "learning_rate": 2.924802631578947e-06,
1093
+ "loss": 0.0686,
1094
+ "step": 3650
1095
+ },
1096
+ {
1097
+ "epoch": 2.0267,
1098
+ "grad_norm": 2.445758819580078,
1099
+ "learning_rate": 2.9132894736842103e-06,
1100
+ "loss": 0.0631,
1101
+ "step": 3675
1102
+ },
1103
+ {
1104
+ "epoch": 2.0292,
1105
+ "grad_norm": 2.614476442337036,
1106
+ "learning_rate": 2.9017763157894737e-06,
1107
+ "loss": 0.0647,
1108
+ "step": 3700
1109
+ },
1110
+ {
1111
+ "epoch": 2.0317,
1112
+ "grad_norm": 1.4166672229766846,
1113
+ "learning_rate": 2.8902631578947367e-06,
1114
+ "loss": 0.0653,
1115
+ "step": 3725
1116
+ },
1117
+ {
1118
+ "epoch": 2.0342,
1119
+ "grad_norm": 1.8435245752334595,
1120
+ "learning_rate": 2.8787499999999998e-06,
1121
+ "loss": 0.0567,
1122
+ "step": 3750
1123
+ },
1124
+ {
1125
+ "epoch": 2.0367,
1126
+ "grad_norm": 1.8179950714111328,
1127
+ "learning_rate": 2.867236842105263e-06,
1128
+ "loss": 0.0636,
1129
+ "step": 3775
1130
+ },
1131
+ {
1132
+ "epoch": 2.0392,
1133
+ "grad_norm": 1.487122893333435,
1134
+ "learning_rate": 2.855723684210526e-06,
1135
+ "loss": 0.0598,
1136
+ "step": 3800
1137
+ },
1138
+ {
1139
+ "epoch": 2.0417,
1140
+ "grad_norm": 2.9211690425872803,
1141
+ "learning_rate": 2.8442105263157892e-06,
1142
+ "loss": 0.0599,
1143
+ "step": 3825
1144
+ },
1145
+ {
1146
+ "epoch": 2.0442,
1147
+ "grad_norm": 2.5018093585968018,
1148
+ "learning_rate": 2.8326973684210522e-06,
1149
+ "loss": 0.055,
1150
+ "step": 3850
1151
+ },
1152
+ {
1153
+ "epoch": 2.0467,
1154
+ "grad_norm": 2.186502456665039,
1155
+ "learning_rate": 2.8211842105263157e-06,
1156
+ "loss": 0.0533,
1157
+ "step": 3875
1158
+ },
1159
+ {
1160
+ "epoch": 2.0492,
1161
+ "grad_norm": 1.039233922958374,
1162
+ "learning_rate": 2.8096710526315787e-06,
1163
+ "loss": 0.0514,
1164
+ "step": 3900
1165
+ },
1166
+ {
1167
+ "epoch": 2.0517,
1168
+ "grad_norm": 1.871267557144165,
1169
+ "learning_rate": 2.7981578947368417e-06,
1170
+ "loss": 0.0512,
1171
+ "step": 3925
1172
+ },
1173
+ {
1174
+ "epoch": 2.0542,
1175
+ "grad_norm": 2.0849483013153076,
1176
+ "learning_rate": 2.7866447368421047e-06,
1177
+ "loss": 0.0579,
1178
+ "step": 3950
1179
+ },
1180
+ {
1181
+ "epoch": 2.0567,
1182
+ "grad_norm": 1.6887531280517578,
1183
+ "learning_rate": 2.775131578947368e-06,
1184
+ "loss": 0.0575,
1185
+ "step": 3975
1186
+ },
1187
+ {
1188
+ "epoch": 2.0592,
1189
+ "grad_norm": 1.88097083568573,
1190
+ "learning_rate": 2.763618421052631e-06,
1191
+ "loss": 0.0683,
1192
+ "step": 4000
1193
+ },
1194
+ {
1195
+ "epoch": 2.0592,
1196
+ "eval_loss": 0.1342601627111435,
1197
+ "eval_runtime": 4125.8373,
1198
+ "eval_samples_per_second": 3.304,
1199
+ "eval_steps_per_second": 0.413,
1200
+ "eval_wer": 8.485103888013485,
1201
+ "step": 4000
1202
+ },
1203
+ {
1204
+ "epoch": 2.0617,
1205
+ "grad_norm": 2.1877427101135254,
1206
+ "learning_rate": 2.7521052631578946e-06,
1207
+ "loss": 0.0614,
1208
+ "step": 4025
1209
+ },
1210
+ {
1211
+ "epoch": 2.0642,
1212
+ "grad_norm": 1.4176368713378906,
1213
+ "learning_rate": 2.740592105263158e-06,
1214
+ "loss": 0.0559,
1215
+ "step": 4050
1216
+ },
1217
+ {
1218
+ "epoch": 2.0667,
1219
+ "grad_norm": 2.4362101554870605,
1220
+ "learning_rate": 2.729078947368421e-06,
1221
+ "loss": 0.0593,
1222
+ "step": 4075
1223
+ },
1224
+ {
1225
+ "epoch": 2.0692,
1226
+ "grad_norm": 1.8663033246994019,
1227
+ "learning_rate": 2.717565789473684e-06,
1228
+ "loss": 0.0591,
1229
+ "step": 4100
1230
+ },
1231
+ {
1232
+ "epoch": 2.0717,
1233
+ "grad_norm": 1.627626657485962,
1234
+ "learning_rate": 2.706052631578947e-06,
1235
+ "loss": 0.0637,
1236
+ "step": 4125
1237
+ },
1238
+ {
1239
+ "epoch": 2.0742,
1240
+ "grad_norm": 2.2072463035583496,
1241
+ "learning_rate": 2.6945394736842105e-06,
1242
+ "loss": 0.0571,
1243
+ "step": 4150
1244
+ },
1245
+ {
1246
+ "epoch": 2.0767,
1247
+ "grad_norm": 1.7411611080169678,
1248
+ "learning_rate": 2.6830263157894735e-06,
1249
+ "loss": 0.0588,
1250
+ "step": 4175
1251
+ },
1252
+ {
1253
+ "epoch": 2.0792,
1254
+ "grad_norm": 1.324000358581543,
1255
+ "learning_rate": 2.6715131578947365e-06,
1256
+ "loss": 0.0482,
1257
+ "step": 4200
1258
+ },
1259
+ {
1260
+ "epoch": 2.0817,
1261
+ "grad_norm": 1.4138795137405396,
1262
+ "learning_rate": 2.6599999999999995e-06,
1263
+ "loss": 0.0477,
1264
+ "step": 4225
1265
+ },
1266
+ {
1267
+ "epoch": 2.0842,
1268
+ "grad_norm": 2.403547763824463,
1269
+ "learning_rate": 2.648486842105263e-06,
1270
+ "loss": 0.0558,
1271
+ "step": 4250
1272
+ },
1273
+ {
1274
+ "epoch": 2.0867,
1275
+ "grad_norm": 1.3718703985214233,
1276
+ "learning_rate": 2.636973684210526e-06,
1277
+ "loss": 0.0546,
1278
+ "step": 4275
1279
+ },
1280
+ {
1281
+ "epoch": 2.0892,
1282
+ "grad_norm": 2.296445369720459,
1283
+ "learning_rate": 2.625460526315789e-06,
1284
+ "loss": 0.0554,
1285
+ "step": 4300
1286
+ },
1287
+ {
1288
+ "epoch": 2.0917,
1289
+ "grad_norm": 2.3471312522888184,
1290
+ "learning_rate": 2.613947368421052e-06,
1291
+ "loss": 0.051,
1292
+ "step": 4325
1293
+ },
1294
+ {
1295
+ "epoch": 2.0942,
1296
+ "grad_norm": 1.6061975955963135,
1297
+ "learning_rate": 2.602434210526316e-06,
1298
+ "loss": 0.0548,
1299
+ "step": 4350
1300
+ },
1301
+ {
1302
+ "epoch": 2.0967,
1303
+ "grad_norm": 2.979126453399658,
1304
+ "learning_rate": 2.590921052631579e-06,
1305
+ "loss": 0.0492,
1306
+ "step": 4375
1307
+ },
1308
+ {
1309
+ "epoch": 2.0992,
1310
+ "grad_norm": 1.7963169813156128,
1311
+ "learning_rate": 2.579407894736842e-06,
1312
+ "loss": 0.0514,
1313
+ "step": 4400
1314
+ },
1315
+ {
1316
+ "epoch": 2.1017,
1317
+ "grad_norm": 2.4996039867401123,
1318
+ "learning_rate": 2.5678947368421053e-06,
1319
+ "loss": 0.0399,
1320
+ "step": 4425
1321
+ },
1322
+ {
1323
+ "epoch": 2.1042,
1324
+ "grad_norm": 1.7498191595077515,
1325
+ "learning_rate": 2.5563815789473683e-06,
1326
+ "loss": 0.0522,
1327
+ "step": 4450
1328
+ },
1329
+ {
1330
+ "epoch": 2.1067,
1331
+ "grad_norm": 1.413889765739441,
1332
+ "learning_rate": 2.5448684210526313e-06,
1333
+ "loss": 0.0517,
1334
+ "step": 4475
1335
+ },
1336
+ {
1337
+ "epoch": 2.1092,
1338
+ "grad_norm": 2.0956978797912598,
1339
+ "learning_rate": 2.5333552631578943e-06,
1340
+ "loss": 0.0482,
1341
+ "step": 4500
1342
+ },
1343
+ {
1344
+ "epoch": 2.1092,
1345
+ "eval_loss": 0.1336347758769989,
1346
+ "eval_runtime": 4119.9162,
1347
+ "eval_samples_per_second": 3.308,
1348
+ "eval_steps_per_second": 0.414,
1349
+ "eval_wer": 8.104914067939463,
1350
+ "step": 4500
1351
+ },
1352
+ {
1353
+ "epoch": 2.1117,
1354
+ "grad_norm": 3.138298749923706,
1355
+ "learning_rate": 2.5218421052631578e-06,
1356
+ "loss": 0.0568,
1357
+ "step": 4525
1358
+ },
1359
+ {
1360
+ "epoch": 2.1142,
1361
+ "grad_norm": 1.4262772798538208,
1362
+ "learning_rate": 2.510328947368421e-06,
1363
+ "loss": 0.0475,
1364
+ "step": 4550
1365
+ },
1366
+ {
1367
+ "epoch": 2.1167,
1368
+ "grad_norm": 3.3500139713287354,
1369
+ "learning_rate": 2.498815789473684e-06,
1370
+ "loss": 0.0474,
1371
+ "step": 4575
1372
+ },
1373
+ {
1374
+ "epoch": 2.1192,
1375
+ "grad_norm": 4.509912014007568,
1376
+ "learning_rate": 2.4873026315789472e-06,
1377
+ "loss": 0.0586,
1378
+ "step": 4600
1379
+ },
1380
+ {
1381
+ "epoch": 2.1217,
1382
+ "grad_norm": 2.1386468410491943,
1383
+ "learning_rate": 2.4757894736842102e-06,
1384
+ "loss": 0.062,
1385
+ "step": 4625
1386
+ },
1387
+ {
1388
+ "epoch": 2.1242,
1389
+ "grad_norm": 1.1121129989624023,
1390
+ "learning_rate": 2.4642763157894733e-06,
1391
+ "loss": 0.0563,
1392
+ "step": 4650
1393
+ },
1394
+ {
1395
+ "epoch": 2.1267,
1396
+ "grad_norm": 1.677538514137268,
1397
+ "learning_rate": 2.4527631578947363e-06,
1398
+ "loss": 0.0519,
1399
+ "step": 4675
1400
+ },
1401
+ {
1402
+ "epoch": 2.1292,
1403
+ "grad_norm": 1.579513430595398,
1404
+ "learning_rate": 2.44125e-06,
1405
+ "loss": 0.0544,
1406
+ "step": 4700
1407
+ },
1408
+ {
1409
+ "epoch": 2.1317,
1410
+ "grad_norm": 2.1100914478302,
1411
+ "learning_rate": 2.429736842105263e-06,
1412
+ "loss": 0.0578,
1413
+ "step": 4725
1414
+ },
1415
+ {
1416
+ "epoch": 2.1342,
1417
+ "grad_norm": 1.779682993888855,
1418
+ "learning_rate": 2.418223684210526e-06,
1419
+ "loss": 0.0486,
1420
+ "step": 4750
1421
+ },
1422
+ {
1423
+ "epoch": 2.1367,
1424
+ "grad_norm": 1.7443439960479736,
1425
+ "learning_rate": 2.4067105263157896e-06,
1426
+ "loss": 0.0534,
1427
+ "step": 4775
1428
+ },
1429
+ {
1430
+ "epoch": 2.1391999999999998,
1431
+ "grad_norm": 1.9388935565948486,
1432
+ "learning_rate": 2.3951973684210526e-06,
1433
+ "loss": 0.0516,
1434
+ "step": 4800
1435
+ },
1436
+ {
1437
+ "epoch": 2.1417,
1438
+ "grad_norm": 1.82517409324646,
1439
+ "learning_rate": 2.3836842105263156e-06,
1440
+ "loss": 0.0451,
1441
+ "step": 4825
1442
+ },
1443
+ {
1444
+ "epoch": 2.1442,
1445
+ "grad_norm": 1.9101967811584473,
1446
+ "learning_rate": 2.3721710526315786e-06,
1447
+ "loss": 0.0546,
1448
+ "step": 4850
1449
+ },
1450
+ {
1451
+ "epoch": 2.1467,
1452
+ "grad_norm": 1.7242915630340576,
1453
+ "learning_rate": 2.360657894736842e-06,
1454
+ "loss": 0.0495,
1455
+ "step": 4875
1456
+ },
1457
+ {
1458
+ "epoch": 2.1492,
1459
+ "grad_norm": 1.9127079248428345,
1460
+ "learning_rate": 2.349144736842105e-06,
1461
+ "loss": 0.0465,
1462
+ "step": 4900
1463
+ },
1464
+ {
1465
+ "epoch": 2.1517,
1466
+ "grad_norm": 2.7716519832611084,
1467
+ "learning_rate": 2.337631578947368e-06,
1468
+ "loss": 0.0493,
1469
+ "step": 4925
1470
+ },
1471
+ {
1472
+ "epoch": 2.1542,
1473
+ "grad_norm": 3.141706705093384,
1474
+ "learning_rate": 2.326118421052631e-06,
1475
+ "loss": 0.046,
1476
+ "step": 4950
1477
+ },
1478
+ {
1479
+ "epoch": 2.1567,
1480
+ "grad_norm": 2.2624270915985107,
1481
+ "learning_rate": 2.3146052631578945e-06,
1482
+ "loss": 0.0522,
1483
+ "step": 4975
1484
+ },
1485
+ {
1486
+ "epoch": 2.1592000000000002,
1487
+ "grad_norm": 1.2777652740478516,
1488
+ "learning_rate": 2.3030921052631575e-06,
1489
+ "loss": 0.0548,
1490
+ "step": 5000
1491
+ },
1492
+ {
1493
+ "epoch": 2.1592000000000002,
1494
+ "eval_loss": 0.13162237405776978,
1495
+ "eval_runtime": 4127.2085,
1496
+ "eval_samples_per_second": 3.302,
1497
+ "eval_steps_per_second": 0.413,
1498
+ "eval_wer": 7.9244384184103485,
1499
+ "step": 5000
1500
+ },
1501
+ {
1502
+ "epoch": 2.1617,
1503
+ "grad_norm": 2.106818675994873,
1504
+ "learning_rate": 2.2915789473684206e-06,
1505
+ "loss": 0.0527,
1506
+ "step": 5025
1507
+ },
1508
+ {
1509
+ "epoch": 2.1642,
1510
+ "grad_norm": 2.2705554962158203,
1511
+ "learning_rate": 2.2800657894736844e-06,
1512
+ "loss": 0.0483,
1513
+ "step": 5050
1514
+ },
1515
+ {
1516
+ "epoch": 2.1667,
1517
+ "grad_norm": 1.5468271970748901,
1518
+ "learning_rate": 2.2685526315789474e-06,
1519
+ "loss": 0.0516,
1520
+ "step": 5075
1521
+ },
1522
+ {
1523
+ "epoch": 2.1692,
1524
+ "grad_norm": 2.0331270694732666,
1525
+ "learning_rate": 2.2570394736842104e-06,
1526
+ "loss": 0.0551,
1527
+ "step": 5100
1528
+ },
1529
+ {
1530
+ "epoch": 3.0013,
1531
+ "grad_norm": 1.107423186302185,
1532
+ "learning_rate": 2.2455263157894734e-06,
1533
+ "loss": 0.0434,
1534
+ "step": 5125
1535
+ },
1536
+ {
1537
+ "epoch": 3.0038,
1538
+ "grad_norm": 3.9103100299835205,
1539
+ "learning_rate": 2.234013157894737e-06,
1540
+ "loss": 0.0362,
1541
+ "step": 5150
1542
+ },
1543
+ {
1544
+ "epoch": 3.0063,
1545
+ "grad_norm": 1.193088173866272,
1546
+ "learning_rate": 2.2225e-06,
1547
+ "loss": 0.0327,
1548
+ "step": 5175
1549
+ },
1550
+ {
1551
+ "epoch": 3.0088,
1552
+ "grad_norm": 1.0432852506637573,
1553
+ "learning_rate": 2.210986842105263e-06,
1554
+ "loss": 0.0326,
1555
+ "step": 5200
1556
+ },
1557
+ {
1558
+ "epoch": 3.0113,
1559
+ "grad_norm": 0.7116020917892456,
1560
+ "learning_rate": 2.199473684210526e-06,
1561
+ "loss": 0.0296,
1562
+ "step": 5225
1563
+ },
1564
+ {
1565
+ "epoch": 3.0138,
1566
+ "grad_norm": 2.009617805480957,
1567
+ "learning_rate": 2.1879605263157894e-06,
1568
+ "loss": 0.0367,
1569
+ "step": 5250
1570
+ },
1571
+ {
1572
+ "epoch": 3.0163,
1573
+ "grad_norm": 1.9047244787216187,
1574
+ "learning_rate": 2.1764473684210524e-06,
1575
+ "loss": 0.0347,
1576
+ "step": 5275
1577
+ },
1578
+ {
1579
+ "epoch": 3.0188,
1580
+ "grad_norm": 1.630439043045044,
1581
+ "learning_rate": 2.164934210526316e-06,
1582
+ "loss": 0.0291,
1583
+ "step": 5300
1584
+ },
1585
+ {
1586
+ "epoch": 3.0213,
1587
+ "grad_norm": 1.4158824682235718,
1588
+ "learning_rate": 2.153421052631579e-06,
1589
+ "loss": 0.0321,
1590
+ "step": 5325
1591
+ },
1592
+ {
1593
+ "epoch": 3.0238,
1594
+ "grad_norm": 1.2792794704437256,
1595
+ "learning_rate": 2.141907894736842e-06,
1596
+ "loss": 0.0338,
1597
+ "step": 5350
1598
+ },
1599
+ {
1600
+ "epoch": 3.0263,
1601
+ "grad_norm": 1.6505346298217773,
1602
+ "learning_rate": 2.1303947368421053e-06,
1603
+ "loss": 0.0348,
1604
+ "step": 5375
1605
+ },
1606
+ {
1607
+ "epoch": 3.0288,
1608
+ "grad_norm": 1.5343618392944336,
1609
+ "learning_rate": 2.1188815789473683e-06,
1610
+ "loss": 0.0318,
1611
+ "step": 5400
1612
+ },
1613
+ {
1614
+ "epoch": 3.0313,
1615
+ "grad_norm": 1.8325493335723877,
1616
+ "learning_rate": 2.1073684210526313e-06,
1617
+ "loss": 0.0333,
1618
+ "step": 5425
1619
+ },
1620
+ {
1621
+ "epoch": 3.0338,
1622
+ "grad_norm": 1.7224900722503662,
1623
+ "learning_rate": 2.0958552631578943e-06,
1624
+ "loss": 0.0322,
1625
+ "step": 5450
1626
+ },
1627
+ {
1628
+ "epoch": 3.0362999999999998,
1629
+ "grad_norm": 1.3443737030029297,
1630
+ "learning_rate": 2.0843421052631577e-06,
1631
+ "loss": 0.0304,
1632
+ "step": 5475
1633
+ },
1634
+ {
1635
+ "epoch": 3.0388,
1636
+ "grad_norm": 1.3260679244995117,
1637
+ "learning_rate": 2.0728289473684207e-06,
1638
+ "loss": 0.0282,
1639
+ "step": 5500
1640
+ },
1641
+ {
1642
+ "epoch": 3.0388,
1643
+ "eval_loss": 0.13909843564033508,
1644
+ "eval_runtime": 4135.2147,
1645
+ "eval_samples_per_second": 3.296,
1646
+ "eval_steps_per_second": 0.412,
1647
+ "eval_wer": 7.8181684927992965,
1648
+ "step": 5500
1649
+ },
1650
+ {
1651
+ "epoch": 3.0413,
1652
+ "grad_norm": 1.0075204372406006,
1653
+ "learning_rate": 2.061315789473684e-06,
1654
+ "loss": 0.0308,
1655
+ "step": 5525
1656
+ },
1657
+ {
1658
+ "epoch": 3.0438,
1659
+ "grad_norm": 1.0206842422485352,
1660
+ "learning_rate": 2.049802631578947e-06,
1661
+ "loss": 0.0306,
1662
+ "step": 5550
1663
+ },
1664
+ {
1665
+ "epoch": 3.0463,
1666
+ "grad_norm": 1.411301851272583,
1667
+ "learning_rate": 2.03828947368421e-06,
1668
+ "loss": 0.0243,
1669
+ "step": 5575
1670
+ },
1671
+ {
1672
+ "epoch": 3.0488,
1673
+ "grad_norm": 0.959862470626831,
1674
+ "learning_rate": 2.0267763157894732e-06,
1675
+ "loss": 0.0272,
1676
+ "step": 5600
1677
+ },
1678
+ {
1679
+ "epoch": 3.0513,
1680
+ "grad_norm": 2.2999842166900635,
1681
+ "learning_rate": 2.0152631578947367e-06,
1682
+ "loss": 0.0246,
1683
+ "step": 5625
1684
+ },
1685
+ {
1686
+ "epoch": 3.0538,
1687
+ "grad_norm": 2.890066146850586,
1688
+ "learning_rate": 2.00375e-06,
1689
+ "loss": 0.0299,
1690
+ "step": 5650
1691
+ },
1692
+ {
1693
+ "epoch": 3.0563,
1694
+ "grad_norm": 1.7101376056671143,
1695
+ "learning_rate": 1.992236842105263e-06,
1696
+ "loss": 0.0322,
1697
+ "step": 5675
1698
+ },
1699
+ {
1700
+ "epoch": 3.0588,
1701
+ "grad_norm": 1.531943917274475,
1702
+ "learning_rate": 1.980723684210526e-06,
1703
+ "loss": 0.0345,
1704
+ "step": 5700
1705
+ },
1706
+ {
1707
+ "epoch": 3.0613,
1708
+ "grad_norm": 1.6334413290023804,
1709
+ "learning_rate": 1.969210526315789e-06,
1710
+ "loss": 0.032,
1711
+ "step": 5725
1712
+ },
1713
+ {
1714
+ "epoch": 3.0638,
1715
+ "grad_norm": 2.112278461456299,
1716
+ "learning_rate": 1.9576973684210526e-06,
1717
+ "loss": 0.0304,
1718
+ "step": 5750
1719
+ },
1720
+ {
1721
+ "epoch": 3.0663,
1722
+ "grad_norm": 1.7582517862319946,
1723
+ "learning_rate": 1.9461842105263156e-06,
1724
+ "loss": 0.0254,
1725
+ "step": 5775
1726
+ },
1727
+ {
1728
+ "epoch": 3.0688,
1729
+ "grad_norm": 1.3391777276992798,
1730
+ "learning_rate": 1.934671052631579e-06,
1731
+ "loss": 0.0316,
1732
+ "step": 5800
1733
+ },
1734
+ {
1735
+ "epoch": 3.0713,
1736
+ "grad_norm": 0.8350562453269958,
1737
+ "learning_rate": 1.923157894736842e-06,
1738
+ "loss": 0.0329,
1739
+ "step": 5825
1740
+ },
1741
+ {
1742
+ "epoch": 3.0738,
1743
+ "grad_norm": 0.7084619402885437,
1744
+ "learning_rate": 1.911644736842105e-06,
1745
+ "loss": 0.0325,
1746
+ "step": 5850
1747
+ },
1748
+ {
1749
+ "epoch": 3.0763,
1750
+ "grad_norm": 1.2961277961730957,
1751
+ "learning_rate": 1.9001315789473683e-06,
1752
+ "loss": 0.0313,
1753
+ "step": 5875
1754
+ },
1755
+ {
1756
+ "epoch": 3.0788,
1757
+ "grad_norm": 1.032840371131897,
1758
+ "learning_rate": 1.8886184210526315e-06,
1759
+ "loss": 0.0224,
1760
+ "step": 5900
1761
+ },
1762
+ {
1763
+ "epoch": 3.0813,
1764
+ "grad_norm": 1.2073044776916504,
1765
+ "learning_rate": 1.8771052631578945e-06,
1766
+ "loss": 0.0215,
1767
+ "step": 5925
1768
+ },
1769
+ {
1770
+ "epoch": 3.0838,
1771
+ "grad_norm": 0.8210967779159546,
1772
+ "learning_rate": 1.8655921052631577e-06,
1773
+ "loss": 0.0258,
1774
+ "step": 5950
1775
+ },
1776
+ {
1777
+ "epoch": 3.0863,
1778
+ "grad_norm": 1.5273653268814087,
1779
+ "learning_rate": 1.854078947368421e-06,
1780
+ "loss": 0.0254,
1781
+ "step": 5975
1782
+ },
1783
+ {
1784
+ "epoch": 3.0888,
1785
+ "grad_norm": 3.194197177886963,
1786
+ "learning_rate": 1.8425657894736842e-06,
1787
+ "loss": 0.025,
1788
+ "step": 6000
1789
+ },
1790
+ {
1791
+ "epoch": 3.0888,
1792
+ "eval_loss": 0.14247554540634155,
1793
+ "eval_runtime": 4123.5746,
1794
+ "eval_samples_per_second": 3.305,
1795
+ "eval_steps_per_second": 0.413,
1796
+ "eval_wer": 7.940928579281029,
1797
+ "step": 6000
1798
+ },
1799
+ {
1800
+ "epoch": 3.0913,
1801
+ "grad_norm": 2.1373400688171387,
1802
+ "learning_rate": 1.8310526315789472e-06,
1803
+ "loss": 0.031,
1804
+ "step": 6025
1805
+ },
1806
+ {
1807
+ "epoch": 3.0938,
1808
+ "grad_norm": 1.0779415369033813,
1809
+ "learning_rate": 1.8195394736842104e-06,
1810
+ "loss": 0.024,
1811
+ "step": 6050
1812
+ },
1813
+ {
1814
+ "epoch": 3.0963,
1815
+ "grad_norm": 0.9637121558189392,
1816
+ "learning_rate": 1.8080263157894734e-06,
1817
+ "loss": 0.0282,
1818
+ "step": 6075
1819
+ },
1820
+ {
1821
+ "epoch": 3.0987999999999998,
1822
+ "grad_norm": 1.1645703315734863,
1823
+ "learning_rate": 1.7965131578947366e-06,
1824
+ "loss": 0.0278,
1825
+ "step": 6100
1826
+ },
1827
+ {
1828
+ "epoch": 3.1013,
1829
+ "grad_norm": 1.2814173698425293,
1830
+ "learning_rate": 1.7849999999999996e-06,
1831
+ "loss": 0.0199,
1832
+ "step": 6125
1833
+ },
1834
+ {
1835
+ "epoch": 3.1038,
1836
+ "grad_norm": 1.458809494972229,
1837
+ "learning_rate": 1.773486842105263e-06,
1838
+ "loss": 0.0264,
1839
+ "step": 6150
1840
+ },
1841
+ {
1842
+ "epoch": 3.1063,
1843
+ "grad_norm": 1.6669671535491943,
1844
+ "learning_rate": 1.7619736842105263e-06,
1845
+ "loss": 0.0272,
1846
+ "step": 6175
1847
+ },
1848
+ {
1849
+ "epoch": 3.1088,
1850
+ "grad_norm": 1.5049173831939697,
1851
+ "learning_rate": 1.7504605263157893e-06,
1852
+ "loss": 0.0243,
1853
+ "step": 6200
1854
+ },
1855
+ {
1856
+ "epoch": 3.1113,
1857
+ "grad_norm": 0.861107587814331,
1858
+ "learning_rate": 1.7389473684210525e-06,
1859
+ "loss": 0.0274,
1860
+ "step": 6225
1861
+ },
1862
+ {
1863
+ "epoch": 3.1138,
1864
+ "grad_norm": 1.0454998016357422,
1865
+ "learning_rate": 1.7274342105263155e-06,
1866
+ "loss": 0.0258,
1867
+ "step": 6250
1868
+ },
1869
+ {
1870
+ "epoch": 3.1163,
1871
+ "grad_norm": 1.7108014822006226,
1872
+ "learning_rate": 1.7159210526315788e-06,
1873
+ "loss": 0.0259,
1874
+ "step": 6275
1875
+ },
1876
+ {
1877
+ "epoch": 3.1188,
1878
+ "grad_norm": 0.8804712295532227,
1879
+ "learning_rate": 1.704407894736842e-06,
1880
+ "loss": 0.0255,
1881
+ "step": 6300
1882
+ },
1883
+ {
1884
+ "epoch": 3.1213,
1885
+ "grad_norm": 2.0050883293151855,
1886
+ "learning_rate": 1.6928947368421052e-06,
1887
+ "loss": 0.0304,
1888
+ "step": 6325
1889
+ },
1890
+ {
1891
+ "epoch": 3.1238,
1892
+ "grad_norm": 1.4400875568389893,
1893
+ "learning_rate": 1.6813815789473682e-06,
1894
+ "loss": 0.0333,
1895
+ "step": 6350
1896
+ },
1897
+ {
1898
+ "epoch": 3.1263,
1899
+ "grad_norm": 1.4423948526382446,
1900
+ "learning_rate": 1.6698684210526315e-06,
1901
+ "loss": 0.0279,
1902
+ "step": 6375
1903
+ },
1904
+ {
1905
+ "epoch": 3.1288,
1906
+ "grad_norm": 1.3972327709197998,
1907
+ "learning_rate": 1.6583552631578947e-06,
1908
+ "loss": 0.0255,
1909
+ "step": 6400
1910
+ },
1911
+ {
1912
+ "epoch": 3.1313,
1913
+ "grad_norm": 1.6908966302871704,
1914
+ "learning_rate": 1.6468421052631577e-06,
1915
+ "loss": 0.0267,
1916
+ "step": 6425
1917
+ },
1918
+ {
1919
+ "epoch": 3.1338,
1920
+ "grad_norm": 0.9540082216262817,
1921
+ "learning_rate": 1.635328947368421e-06,
1922
+ "loss": 0.0265,
1923
+ "step": 6450
1924
+ },
1925
+ {
1926
+ "epoch": 3.1363,
1927
+ "grad_norm": 1.41488778591156,
1928
+ "learning_rate": 1.6238157894736841e-06,
1929
+ "loss": 0.0224,
1930
+ "step": 6475
1931
+ },
1932
+ {
1933
+ "epoch": 3.1388,
1934
+ "grad_norm": 0.4790860116481781,
1935
+ "learning_rate": 1.6123026315789474e-06,
1936
+ "loss": 0.0274,
1937
+ "step": 6500
1938
+ },
1939
+ {
1940
+ "epoch": 3.1388,
1941
+ "eval_loss": 0.13914132118225098,
1942
+ "eval_runtime": 4133.8202,
1943
+ "eval_samples_per_second": 3.297,
1944
+ "eval_steps_per_second": 0.412,
1945
+ "eval_wer": 7.731137088204039,
1946
+ "step": 6500
1947
+ },
1948
+ {
1949
+ "epoch": 3.1413,
1950
+ "grad_norm": 2.5638585090637207,
1951
+ "learning_rate": 1.6007894736842104e-06,
1952
+ "loss": 0.025,
1953
+ "step": 6525
1954
+ },
1955
+ {
1956
+ "epoch": 3.1438,
1957
+ "grad_norm": 1.8847306966781616,
1958
+ "learning_rate": 1.5892763157894736e-06,
1959
+ "loss": 0.0294,
1960
+ "step": 6550
1961
+ },
1962
+ {
1963
+ "epoch": 3.1463,
1964
+ "grad_norm": 1.0196236371994019,
1965
+ "learning_rate": 1.5777631578947366e-06,
1966
+ "loss": 0.0255,
1967
+ "step": 6575
1968
+ },
1969
+ {
1970
+ "epoch": 3.1488,
1971
+ "grad_norm": 1.0703202486038208,
1972
+ "learning_rate": 1.5662499999999998e-06,
1973
+ "loss": 0.0246,
1974
+ "step": 6600
1975
+ },
1976
+ {
1977
+ "epoch": 3.1513,
1978
+ "grad_norm": 2.646519422531128,
1979
+ "learning_rate": 1.5547368421052628e-06,
1980
+ "loss": 0.0213,
1981
+ "step": 6625
1982
+ },
1983
+ {
1984
+ "epoch": 3.1538,
1985
+ "grad_norm": 1.7430530786514282,
1986
+ "learning_rate": 1.5432236842105263e-06,
1987
+ "loss": 0.0267,
1988
+ "step": 6650
1989
+ },
1990
+ {
1991
+ "epoch": 3.1563,
1992
+ "grad_norm": 1.0606240034103394,
1993
+ "learning_rate": 1.5317105263157895e-06,
1994
+ "loss": 0.0269,
1995
+ "step": 6675
1996
+ },
1997
+ {
1998
+ "epoch": 3.1588,
1999
+ "grad_norm": 1.4670476913452148,
2000
+ "learning_rate": 1.5201973684210525e-06,
2001
+ "loss": 0.0271,
2002
+ "step": 6700
2003
+ },
2004
+ {
2005
+ "epoch": 3.1612999999999998,
2006
+ "grad_norm": 2.345014810562134,
2007
+ "learning_rate": 1.5086842105263157e-06,
2008
+ "loss": 0.0252,
2009
+ "step": 6725
2010
+ },
2011
+ {
2012
+ "epoch": 3.1638,
2013
+ "grad_norm": 2.9098987579345703,
2014
+ "learning_rate": 1.4971710526315787e-06,
2015
+ "loss": 0.0272,
2016
+ "step": 6750
2017
+ },
2018
+ {
2019
+ "epoch": 3.1663,
2020
+ "grad_norm": 0.5682694911956787,
2021
+ "learning_rate": 1.485657894736842e-06,
2022
+ "loss": 0.0237,
2023
+ "step": 6775
2024
+ },
2025
+ {
2026
+ "epoch": 3.1688,
2027
+ "grad_norm": 1.4645904302597046,
2028
+ "learning_rate": 1.4746052631578947e-06,
2029
+ "loss": 0.0303,
2030
+ "step": 6800
2031
+ },
2032
+ {
2033
+ "epoch": 4.0009,
2034
+ "grad_norm": 1.3764489889144897,
2035
+ "learning_rate": 1.4630921052631578e-06,
2036
+ "loss": 0.0242,
2037
+ "step": 6825
2038
+ },
2039
+ {
2040
+ "epoch": 4.0034,
2041
+ "grad_norm": 0.8848748803138733,
2042
+ "learning_rate": 1.451578947368421e-06,
2043
+ "loss": 0.0163,
2044
+ "step": 6850
2045
+ },
2046
+ {
2047
+ "epoch": 4.0059,
2048
+ "grad_norm": 0.619125485420227,
2049
+ "learning_rate": 1.440065789473684e-06,
2050
+ "loss": 0.0188,
2051
+ "step": 6875
2052
+ },
2053
+ {
2054
+ "epoch": 4.0084,
2055
+ "grad_norm": 0.9328649044036865,
2056
+ "learning_rate": 1.4285526315789472e-06,
2057
+ "loss": 0.0173,
2058
+ "step": 6900
2059
+ },
2060
+ {
2061
+ "epoch": 4.0109,
2062
+ "grad_norm": 1.77474045753479,
2063
+ "learning_rate": 1.4170394736842104e-06,
2064
+ "loss": 0.0146,
2065
+ "step": 6925
2066
+ },
2067
+ {
2068
+ "epoch": 4.0134,
2069
+ "grad_norm": 1.3934537172317505,
2070
+ "learning_rate": 1.4055263157894737e-06,
2071
+ "loss": 0.0156,
2072
+ "step": 6950
2073
+ },
2074
+ {
2075
+ "epoch": 4.0159,
2076
+ "grad_norm": 1.2856354713439941,
2077
+ "learning_rate": 1.3940131578947367e-06,
2078
+ "loss": 0.0173,
2079
+ "step": 6975
2080
+ },
2081
+ {
2082
+ "epoch": 4.0184,
2083
+ "grad_norm": 2.1229758262634277,
2084
+ "learning_rate": 1.3824999999999999e-06,
2085
+ "loss": 0.0155,
2086
+ "step": 7000
2087
+ },
2088
+ {
2089
+ "epoch": 4.0184,
2090
+ "eval_loss": 0.14916160702705383,
2091
+ "eval_runtime": 4128.7355,
2092
+ "eval_samples_per_second": 3.301,
2093
+ "eval_steps_per_second": 0.413,
2094
+ "eval_wer": 7.697240646414307,
2095
+ "step": 7000
2096
+ },
2097
+ {
2098
+ "epoch": 4.0209,
2099
+ "grad_norm": 0.44512999057769775,
2100
+ "learning_rate": 1.3709868421052631e-06,
2101
+ "loss": 0.0153,
2102
+ "step": 7025
2103
+ },
2104
+ {
2105
+ "epoch": 4.0234,
2106
+ "grad_norm": 1.8791674375534058,
2107
+ "learning_rate": 1.3594736842105261e-06,
2108
+ "loss": 0.0165,
2109
+ "step": 7050
2110
+ },
2111
+ {
2112
+ "epoch": 4.0259,
2113
+ "grad_norm": 5.244405746459961,
2114
+ "learning_rate": 1.3479605263157894e-06,
2115
+ "loss": 0.0179,
2116
+ "step": 7075
2117
+ },
2118
+ {
2119
+ "epoch": 4.0284,
2120
+ "grad_norm": 1.1926153898239136,
2121
+ "learning_rate": 1.3364473684210526e-06,
2122
+ "loss": 0.0161,
2123
+ "step": 7100
2124
+ },
2125
+ {
2126
+ "epoch": 4.0309,
2127
+ "grad_norm": 1.1147819757461548,
2128
+ "learning_rate": 1.3249342105263158e-06,
2129
+ "loss": 0.015,
2130
+ "step": 7125
2131
+ },
2132
+ {
2133
+ "epoch": 4.0334,
2134
+ "grad_norm": 1.9370721578598022,
2135
+ "learning_rate": 1.3134210526315788e-06,
2136
+ "loss": 0.0142,
2137
+ "step": 7150
2138
+ },
2139
+ {
2140
+ "epoch": 4.0359,
2141
+ "grad_norm": 0.49344903230667114,
2142
+ "learning_rate": 1.301907894736842e-06,
2143
+ "loss": 0.0134,
2144
+ "step": 7175
2145
+ },
2146
+ {
2147
+ "epoch": 4.0384,
2148
+ "grad_norm": 1.8190902471542358,
2149
+ "learning_rate": 1.290394736842105e-06,
2150
+ "loss": 0.0168,
2151
+ "step": 7200
2152
+ },
2153
+ {
2154
+ "epoch": 4.0409,
2155
+ "grad_norm": 0.7560425400733948,
2156
+ "learning_rate": 1.2788815789473683e-06,
2157
+ "loss": 0.0143,
2158
+ "step": 7225
2159
+ },
2160
+ {
2161
+ "epoch": 4.0434,
2162
+ "grad_norm": 1.0451087951660156,
2163
+ "learning_rate": 1.2673684210526313e-06,
2164
+ "loss": 0.0149,
2165
+ "step": 7250
2166
+ },
2167
+ {
2168
+ "epoch": 4.0459,
2169
+ "grad_norm": 1.0334726572036743,
2170
+ "learning_rate": 1.2558552631578947e-06,
2171
+ "loss": 0.0136,
2172
+ "step": 7275
2173
+ },
2174
+ {
2175
+ "epoch": 4.0484,
2176
+ "grad_norm": 0.6531663537025452,
2177
+ "learning_rate": 1.244342105263158e-06,
2178
+ "loss": 0.0137,
2179
+ "step": 7300
2180
+ },
2181
+ {
2182
+ "epoch": 4.0509,
2183
+ "grad_norm": 0.8954887986183167,
2184
+ "learning_rate": 1.232828947368421e-06,
2185
+ "loss": 0.0118,
2186
+ "step": 7325
2187
+ },
2188
+ {
2189
+ "epoch": 4.0534,
2190
+ "grad_norm": 1.0640511512756348,
2191
+ "learning_rate": 1.2213157894736842e-06,
2192
+ "loss": 0.0126,
2193
+ "step": 7350
2194
+ },
2195
+ {
2196
+ "epoch": 4.0559,
2197
+ "grad_norm": 0.2824617922306061,
2198
+ "learning_rate": 1.2098026315789472e-06,
2199
+ "loss": 0.0139,
2200
+ "step": 7375
2201
+ },
2202
+ {
2203
+ "epoch": 4.0584,
2204
+ "grad_norm": 1.0095443725585938,
2205
+ "learning_rate": 1.1982894736842104e-06,
2206
+ "loss": 0.018,
2207
+ "step": 7400
2208
+ },
2209
+ {
2210
+ "epoch": 4.0609,
2211
+ "grad_norm": 1.1475225687026978,
2212
+ "learning_rate": 1.1867763157894734e-06,
2213
+ "loss": 0.0133,
2214
+ "step": 7425
2215
+ },
2216
+ {
2217
+ "epoch": 4.0634,
2218
+ "grad_norm": 1.5951991081237793,
2219
+ "learning_rate": 1.1752631578947369e-06,
2220
+ "loss": 0.013,
2221
+ "step": 7450
2222
+ },
2223
+ {
2224
+ "epoch": 4.0659,
2225
+ "grad_norm": 0.3482917249202728,
2226
+ "learning_rate": 1.1637499999999999e-06,
2227
+ "loss": 0.0154,
2228
+ "step": 7475
2229
+ },
2230
+ {
2231
+ "epoch": 4.0684,
2232
+ "grad_norm": 1.1572391986846924,
2233
+ "learning_rate": 1.152236842105263e-06,
2234
+ "loss": 0.0189,
2235
+ "step": 7500
2236
+ },
2237
+ {
2238
+ "epoch": 4.0684,
2239
+ "eval_loss": 0.15172211825847626,
2240
+ "eval_runtime": 4117.5679,
2241
+ "eval_samples_per_second": 3.31,
2242
+ "eval_steps_per_second": 0.414,
2243
+ "eval_wer": 7.656931364285977,
2244
+ "step": 7500
2245
+ },
2246
+ {
2247
+ "epoch": 4.0709,
2248
+ "grad_norm": 1.3942557573318481,
2249
+ "learning_rate": 1.140723684210526e-06,
2250
+ "loss": 0.0143,
2251
+ "step": 7525
2252
+ },
2253
+ {
2254
+ "epoch": 4.0734,
2255
+ "grad_norm": 0.8097572326660156,
2256
+ "learning_rate": 1.1292105263157893e-06,
2257
+ "loss": 0.0127,
2258
+ "step": 7550
2259
+ },
2260
+ {
2261
+ "epoch": 4.0759,
2262
+ "grad_norm": 0.740375816822052,
2263
+ "learning_rate": 1.1176973684210526e-06,
2264
+ "loss": 0.0124,
2265
+ "step": 7575
2266
+ },
2267
+ {
2268
+ "epoch": 4.0784,
2269
+ "grad_norm": 0.8702480792999268,
2270
+ "learning_rate": 1.1061842105263156e-06,
2271
+ "loss": 0.0137,
2272
+ "step": 7600
2273
+ },
2274
+ {
2275
+ "epoch": 4.0809,
2276
+ "grad_norm": 1.223105788230896,
2277
+ "learning_rate": 1.094671052631579e-06,
2278
+ "loss": 0.0137,
2279
+ "step": 7625
2280
+ },
2281
+ {
2282
+ "epoch": 4.0834,
2283
+ "grad_norm": 0.43614983558654785,
2284
+ "learning_rate": 1.083157894736842e-06,
2285
+ "loss": 0.0109,
2286
+ "step": 7650
2287
+ },
2288
+ {
2289
+ "epoch": 4.0859,
2290
+ "grad_norm": 1.0974986553192139,
2291
+ "learning_rate": 1.0716447368421052e-06,
2292
+ "loss": 0.0118,
2293
+ "step": 7675
2294
+ },
2295
+ {
2296
+ "epoch": 4.0884,
2297
+ "grad_norm": 0.7234652042388916,
2298
+ "learning_rate": 1.0601315789473682e-06,
2299
+ "loss": 0.0125,
2300
+ "step": 7700
2301
+ },
2302
+ {
2303
+ "epoch": 4.0909,
2304
+ "grad_norm": 0.7752431035041809,
2305
+ "learning_rate": 1.0486184210526315e-06,
2306
+ "loss": 0.0135,
2307
+ "step": 7725
2308
+ },
2309
+ {
2310
+ "epoch": 4.0934,
2311
+ "grad_norm": 0.8796952366828918,
2312
+ "learning_rate": 1.0371052631578947e-06,
2313
+ "loss": 0.0158,
2314
+ "step": 7750
2315
+ },
2316
+ {
2317
+ "epoch": 4.0959,
2318
+ "grad_norm": 3.9135661125183105,
2319
+ "learning_rate": 1.0255921052631577e-06,
2320
+ "loss": 0.0139,
2321
+ "step": 7775
2322
+ },
2323
+ {
2324
+ "epoch": 4.0984,
2325
+ "grad_norm": 0.4837290942668915,
2326
+ "learning_rate": 1.014078947368421e-06,
2327
+ "loss": 0.0103,
2328
+ "step": 7800
2329
+ },
2330
+ {
2331
+ "epoch": 4.1009,
2332
+ "grad_norm": 1.1155998706817627,
2333
+ "learning_rate": 1.0025657894736842e-06,
2334
+ "loss": 0.0106,
2335
+ "step": 7825
2336
+ },
2337
+ {
2338
+ "epoch": 4.1034,
2339
+ "grad_norm": 2.628676652908325,
2340
+ "learning_rate": 9.910526315789474e-07,
2341
+ "loss": 0.0089,
2342
+ "step": 7850
2343
+ },
2344
+ {
2345
+ "epoch": 4.1059,
2346
+ "grad_norm": 1.716665506362915,
2347
+ "learning_rate": 9.795394736842104e-07,
2348
+ "loss": 0.0132,
2349
+ "step": 7875
2350
+ },
2351
+ {
2352
+ "epoch": 4.1084,
2353
+ "grad_norm": 1.6751716136932373,
2354
+ "learning_rate": 9.680263157894736e-07,
2355
+ "loss": 0.0137,
2356
+ "step": 7900
2357
+ },
2358
+ {
2359
+ "epoch": 4.1109,
2360
+ "grad_norm": 0.9773244261741638,
2361
+ "learning_rate": 9.565131578947368e-07,
2362
+ "loss": 0.0111,
2363
+ "step": 7925
2364
+ },
2365
+ {
2366
+ "epoch": 4.1134,
2367
+ "grad_norm": 1.44219172000885,
2368
+ "learning_rate": 9.45e-07,
2369
+ "loss": 0.0139,
2370
+ "step": 7950
2371
+ },
2372
+ {
2373
+ "epoch": 4.1159,
2374
+ "grad_norm": 0.8723123073577881,
2375
+ "learning_rate": 9.334868421052631e-07,
2376
+ "loss": 0.0117,
2377
+ "step": 7975
2378
+ },
2379
+ {
2380
+ "epoch": 4.1184,
2381
+ "grad_norm": 0.6484673023223877,
2382
+ "learning_rate": 9.219736842105263e-07,
2383
+ "loss": 0.0139,
2384
+ "step": 8000
2385
+ },
2386
+ {
2387
+ "epoch": 4.1184,
2388
+ "eval_loss": 0.15393850207328796,
2389
+ "eval_runtime": 4128.9341,
2390
+ "eval_samples_per_second": 3.301,
2391
+ "eval_steps_per_second": 0.413,
2392
+ "eval_wer": 7.626699402689728,
2393
+ "step": 8000
2394
+ },
2395
+ {
2396
+ "epoch": 4.1209,
2397
+ "grad_norm": 1.3702197074890137,
2398
+ "learning_rate": 9.104605263157894e-07,
2399
+ "loss": 0.0158,
2400
+ "step": 8025
2401
+ },
2402
+ {
2403
+ "epoch": 4.1234,
2404
+ "grad_norm": 1.425645351409912,
2405
+ "learning_rate": 8.989473684210525e-07,
2406
+ "loss": 0.0117,
2407
+ "step": 8050
2408
+ },
2409
+ {
2410
+ "epoch": 4.1259,
2411
+ "grad_norm": 1.4255399703979492,
2412
+ "learning_rate": 8.874342105263158e-07,
2413
+ "loss": 0.015,
2414
+ "step": 8075
2415
+ },
2416
+ {
2417
+ "epoch": 4.1284,
2418
+ "grad_norm": 0.6988621950149536,
2419
+ "learning_rate": 8.759210526315789e-07,
2420
+ "loss": 0.0141,
2421
+ "step": 8100
2422
+ },
2423
+ {
2424
+ "epoch": 4.1309,
2425
+ "grad_norm": 1.1563546657562256,
2426
+ "learning_rate": 8.64407894736842e-07,
2427
+ "loss": 0.0122,
2428
+ "step": 8125
2429
+ },
2430
+ {
2431
+ "epoch": 4.1334,
2432
+ "grad_norm": 1.2023714780807495,
2433
+ "learning_rate": 8.528947368421051e-07,
2434
+ "loss": 0.013,
2435
+ "step": 8150
2436
+ },
2437
+ {
2438
+ "epoch": 4.1359,
2439
+ "grad_norm": 0.9450110197067261,
2440
+ "learning_rate": 8.413815789473683e-07,
2441
+ "loss": 0.0123,
2442
+ "step": 8175
2443
+ },
2444
+ {
2445
+ "epoch": 4.1384,
2446
+ "grad_norm": 0.9265995621681213,
2447
+ "learning_rate": 8.298684210526316e-07,
2448
+ "loss": 0.0114,
2449
+ "step": 8200
2450
+ },
2451
+ {
2452
+ "epoch": 4.1409,
2453
+ "grad_norm": 0.4234980046749115,
2454
+ "learning_rate": 8.183552631578947e-07,
2455
+ "loss": 0.0085,
2456
+ "step": 8225
2457
+ },
2458
+ {
2459
+ "epoch": 4.1434,
2460
+ "grad_norm": 1.3323073387145996,
2461
+ "learning_rate": 8.068421052631579e-07,
2462
+ "loss": 0.014,
2463
+ "step": 8250
2464
+ },
2465
+ {
2466
+ "epoch": 4.1459,
2467
+ "grad_norm": 1.2050007581710815,
2468
+ "learning_rate": 7.95328947368421e-07,
2469
+ "loss": 0.0106,
2470
+ "step": 8275
2471
+ },
2472
+ {
2473
+ "epoch": 4.1484,
2474
+ "grad_norm": 1.261042594909668,
2475
+ "learning_rate": 7.838157894736841e-07,
2476
+ "loss": 0.0107,
2477
+ "step": 8300
2478
+ },
2479
+ {
2480
+ "epoch": 4.1509,
2481
+ "grad_norm": 1.2892303466796875,
2482
+ "learning_rate": 7.723026315789474e-07,
2483
+ "loss": 0.0145,
2484
+ "step": 8325
2485
+ },
2486
+ {
2487
+ "epoch": 4.1534,
2488
+ "grad_norm": 1.1626112461090088,
2489
+ "learning_rate": 7.607894736842105e-07,
2490
+ "loss": 0.0139,
2491
+ "step": 8350
2492
+ },
2493
+ {
2494
+ "epoch": 4.1559,
2495
+ "grad_norm": 1.0547322034835815,
2496
+ "learning_rate": 7.492763157894736e-07,
2497
+ "loss": 0.0154,
2498
+ "step": 8375
2499
+ },
2500
+ {
2501
+ "epoch": 4.1584,
2502
+ "grad_norm": 0.44805532693862915,
2503
+ "learning_rate": 7.377631578947367e-07,
2504
+ "loss": 0.0109,
2505
+ "step": 8400
2506
+ },
2507
+ {
2508
+ "epoch": 4.1609,
2509
+ "grad_norm": 0.7095866203308105,
2510
+ "learning_rate": 7.262499999999999e-07,
2511
+ "loss": 0.0114,
2512
+ "step": 8425
2513
+ },
2514
+ {
2515
+ "epoch": 4.1634,
2516
+ "grad_norm": 1.4220194816589355,
2517
+ "learning_rate": 7.14736842105263e-07,
2518
+ "loss": 0.0134,
2519
+ "step": 8450
2520
+ },
2521
+ {
2522
+ "epoch": 4.1659,
2523
+ "grad_norm": 1.0814168453216553,
2524
+ "learning_rate": 7.032236842105263e-07,
2525
+ "loss": 0.0142,
2526
+ "step": 8475
2527
+ },
2528
+ {
2529
+ "epoch": 4.1684,
2530
+ "grad_norm": 0.7026916146278381,
2531
+ "learning_rate": 6.917105263157895e-07,
2532
+ "loss": 0.0141,
2533
+ "step": 8500
2534
+ },
2535
+ {
2536
+ "epoch": 4.1684,
2537
+ "eval_loss": 0.15496784448623657,
2538
+ "eval_runtime": 4124.1829,
2539
+ "eval_samples_per_second": 3.305,
2540
+ "eval_steps_per_second": 0.413,
2541
+ "eval_wer": 7.542416358239584,
2542
+ "step": 8500
2543
+ },
2544
+ {
2545
+ "epoch": 5.0005,
2546
+ "grad_norm": 4.648550033569336,
2547
+ "learning_rate": 6.801973684210526e-07,
2548
+ "loss": 0.0285,
2549
+ "step": 8525
2550
+ },
2551
+ {
2552
+ "epoch": 5.003,
2553
+ "grad_norm": 1.9204503297805786,
2554
+ "learning_rate": 6.691447368421053e-07,
2555
+ "loss": 0.0761,
2556
+ "step": 8550
2557
+ },
2558
+ {
2559
+ "epoch": 5.0055,
2560
+ "grad_norm": 1.7285746335983276,
2561
+ "learning_rate": 6.576315789473684e-07,
2562
+ "loss": 0.0602,
2563
+ "step": 8575
2564
+ },
2565
+ {
2566
+ "epoch": 5.008,
2567
+ "grad_norm": 1.1516830921173096,
2568
+ "learning_rate": 6.461184210526315e-07,
2569
+ "loss": 0.0585,
2570
+ "step": 8600
2571
+ },
2572
+ {
2573
+ "epoch": 5.0105,
2574
+ "grad_norm": 3.3867828845977783,
2575
+ "learning_rate": 6.346052631578947e-07,
2576
+ "loss": 0.0656,
2577
+ "step": 8625
2578
+ },
2579
+ {
2580
+ "epoch": 5.013,
2581
+ "grad_norm": 4.064920902252197,
2582
+ "learning_rate": 6.230921052631579e-07,
2583
+ "loss": 0.0683,
2584
+ "step": 8650
2585
+ },
2586
+ {
2587
+ "epoch": 5.0155,
2588
+ "grad_norm": 3.695047378540039,
2589
+ "learning_rate": 6.11578947368421e-07,
2590
+ "loss": 0.0659,
2591
+ "step": 8675
2592
+ },
2593
+ {
2594
+ "epoch": 5.018,
2595
+ "grad_norm": 2.9087939262390137,
2596
+ "learning_rate": 6.000657894736842e-07,
2597
+ "loss": 0.0611,
2598
+ "step": 8700
2599
+ },
2600
+ {
2601
+ "epoch": 5.0205,
2602
+ "grad_norm": 3.368290424346924,
2603
+ "learning_rate": 5.885526315789473e-07,
2604
+ "loss": 0.0603,
2605
+ "step": 8725
2606
+ },
2607
+ {
2608
+ "epoch": 5.023,
2609
+ "grad_norm": 3.7565319538116455,
2610
+ "learning_rate": 5.770394736842104e-07,
2611
+ "loss": 0.0614,
2612
+ "step": 8750
2613
+ },
2614
+ {
2615
+ "epoch": 5.0255,
2616
+ "grad_norm": 2.4887771606445312,
2617
+ "learning_rate": 5.655263157894735e-07,
2618
+ "loss": 0.0497,
2619
+ "step": 8775
2620
+ },
2621
+ {
2622
+ "epoch": 5.028,
2623
+ "grad_norm": 2.1670076847076416,
2624
+ "learning_rate": 5.540131578947369e-07,
2625
+ "loss": 0.0662,
2626
+ "step": 8800
2627
+ },
2628
+ {
2629
+ "epoch": 5.0305,
2630
+ "grad_norm": 1.3746148347854614,
2631
+ "learning_rate": 5.425e-07,
2632
+ "loss": 0.0507,
2633
+ "step": 8825
2634
+ },
2635
+ {
2636
+ "epoch": 5.033,
2637
+ "grad_norm": 1.8274154663085938,
2638
+ "learning_rate": 5.309868421052631e-07,
2639
+ "loss": 0.0449,
2640
+ "step": 8850
2641
+ },
2642
+ {
2643
+ "epoch": 5.0355,
2644
+ "grad_norm": 2.9424078464508057,
2645
+ "learning_rate": 5.194736842105262e-07,
2646
+ "loss": 0.0529,
2647
+ "step": 8875
2648
+ },
2649
+ {
2650
+ "epoch": 5.038,
2651
+ "grad_norm": 2.457754611968994,
2652
+ "learning_rate": 5.079605263157895e-07,
2653
+ "loss": 0.042,
2654
+ "step": 8900
2655
+ },
2656
+ {
2657
+ "epoch": 5.0405,
2658
+ "grad_norm": 2.208768606185913,
2659
+ "learning_rate": 4.964473684210526e-07,
2660
+ "loss": 0.0407,
2661
+ "step": 8925
2662
+ },
2663
+ {
2664
+ "epoch": 5.043,
2665
+ "grad_norm": 1.9554438591003418,
2666
+ "learning_rate": 4.849342105263158e-07,
2667
+ "loss": 0.0465,
2668
+ "step": 8950
2669
+ },
2670
+ {
2671
+ "epoch": 5.0455,
2672
+ "grad_norm": 1.1464567184448242,
2673
+ "learning_rate": 4.734210526315789e-07,
2674
+ "loss": 0.0537,
2675
+ "step": 8975
2676
+ },
2677
+ {
2678
+ "epoch": 5.048,
2679
+ "grad_norm": 3.1216509342193604,
2680
+ "learning_rate": 4.6190789473684203e-07,
2681
+ "loss": 0.0368,
2682
+ "step": 9000
2683
+ },
2684
+ {
2685
+ "epoch": 5.048,
2686
+ "eval_loss": 0.12588092684745789,
2687
+ "eval_runtime": 4149.257,
2688
+ "eval_samples_per_second": 3.285,
2689
+ "eval_steps_per_second": 0.411,
2690
+ "eval_wer": 7.215361500971087,
2691
+ "step": 9000
2692
+ },
2693
+ {
2694
+ "epoch": 5.048,
2695
+ "step": 9000,
2696
+ "total_flos": 4.891718061785088e+20,
2697
+ "train_loss": 0.0,
2698
+ "train_runtime": 289.8068,
2699
+ "train_samples_per_second": 552.092,
2700
+ "train_steps_per_second": 34.506
2701
+ }
2702
+ ],
2703
+ "logging_steps": 25,
2704
+ "max_steps": 10000,
2705
+ "num_input_tokens_seen": 0,
2706
+ "num_train_epochs": 9223372036854775807,
2707
+ "save_steps": 1000,
2708
+ "stateful_callbacks": {
2709
+ "TrainerControl": {
2710
+ "args": {
2711
+ "should_epoch_stop": false,
2712
+ "should_evaluate": false,
2713
+ "should_log": false,
2714
+ "should_save": true,
2715
+ "should_training_stop": false
2716
+ },
2717
+ "attributes": {}
2718
+ }
2719
+ },
2720
+ "total_flos": 4.891718061785088e+20,
2721
+ "train_batch_size": 16,
2722
+ "trial_name": null,
2723
+ "trial_params": null
2724
+ }
wandb/run-20241007_131849-0rbzerob/files/output.log CHANGED
The diff for this file is too large to render.
 
wandb/run-20241007_131849-0rbzerob/run-0rbzerob.wandb CHANGED
Binary files a/wandb/run-20241007_131849-0rbzerob/run-0rbzerob.wandb and b/wandb/run-20241007_131849-0rbzerob/run-0rbzerob.wandb differ