huseinzol05 committed on
Commit
d754ac0
•
1 Parent(s): 724e1d9

improve score

Browse files
Files changed (1) hide show
  1. app.py +77 -24
app.py CHANGED
@@ -9,81 +9,134 @@ INTRODUCTION_TEXT = """
9
 
10
  ## Dataset
11
 
12
- 📈 We evaluate models based on 2 datasets,
13
 
14
- 1. Fleurs MY-MS test set, Malay language, https://huggingface.co/datasets/malaysia-ai/fleurs-my-ms
15
- 2. IMDA TTS first 700 audio files, English language but with Manglish slang, https://huggingface.co/datasets/mesolitica/IMDA-TTS
 
16
 
17
- During test we,
18
 
19
- 1. Lowercase.
20
- 2. Remove punctuations.
 
21
  """
22
 
23
  open_source = [
24
  {
25
  'model': 'openai/whisper-large-v3',
26
- 'Fleurs MY-MS CER': 0.027414635425413655,
27
- 'Fleurs MY-MS WER': 0.0912705436045907,
 
 
 
28
  'IMDA TTS CER': 0.016648493852990828,
29
  'IMDA TTS WER': 0.0386282289139432,
30
  },
31
  {
32
  'model': 'openai/whisper-medium',
33
- 'Fleurs MY-MS CER': 0.045260198639505075,
34
- 'Fleurs MY-MS WER': 0.14913723876746685,
 
 
 
35
  'IMDA TTS CER': 0.02065587879424904,
36
  'IMDA TTS WER': 0.047277690563404855,
37
  },
38
  {
39
  'model': 'openai/whisper-small',
40
- 'Fleurs MY-MS CER': 0.07028889922090295,
41
- 'Fleurs MY-MS WER': 0.2327510905228186,
 
 
 
42
  'IMDA TTS CER': 0.024812471688517194,
43
  'IMDA TTS WER': 0.058901277294134434,
44
  },
45
  {
46
  'model': 'openai/whisper-base',
47
- 'Fleurs MY-MS CER': 0.24820848114299138,
48
- 'Fleurs MY-MS WER': 0.5164123884823085,
 
 
 
49
  'IMDA TTS CER': 0.03914533450681607,
50
  'IMDA TTS WER': 0.08951682444539587,
51
  },
52
  {
53
  'model': 'openai/whisper-tiny',
54
- 'Fleurs MY-MS CER': 0.4569030231808184,
55
- 'Fleurs MY-MS WER': 0.7505351570430122,
 
 
 
56
  'IMDA TTS CER': 0.048805770734828904,
57
  'IMDA TTS WER': 0.11150629529200957,
58
  },
59
  {
60
  'model': 'mesolitica/malaysian-whisper-medium',
61
- 'Fleurs MY-MS CER': 0.029782543838559473,
62
- 'Fleurs MY-MS WER': 0.09812949956440954,
63
- 'IMDA TTS CER': 0.01932580931010918,
64
- 'IMDA TTS WER': 0.045691970727685015,
 
 
 
65
  },
66
  {
67
  'model': 'mesolitica/malaysian-whisper-small',
68
- 'Fleurs MY-MS CER': 0.03596621199151582,
69
- 'Fleurs MY-MS WER': 0.12024457480764372,
 
 
 
70
  'IMDA TTS CER': 0.024228721439634855,
71
  'IMDA TTS WER': 0.05546294182008469,
72
  },
73
  {
74
  'model': 'mesolitica/malaysian-whisper-base',
75
- 'Fleurs MY-MS CER': 0.07478803508650385,
76
- 'Fleurs MY-MS WER': 0.21823941044294087,
 
 
 
77
  'IMDA TTS CER': 0.03982418421412676,
78
  'IMDA TTS WER': 0.08917690642690643,
79
  },
80
  {
81
  'model': 'mesolitica/malaysian-whisper-tiny',
 
 
 
82
  'Fleurs MY-MS CER': 0.13390519685940314,
83
  'Fleurs MY-MS WER': 0.3461808122686204,
84
  'IMDA TTS CER': 0.07957313474501154,
85
  'IMDA TTS WER': 0.1421708648494363,
86
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  ]
88
 
89
  data = pd.DataFrame(open_source)
 
9
 
10
  ## Dataset
11
 
12
+ 📈 We evaluate models based on 3 datasets,
13
 
14
+ 1. Malaya-Speech test set, Malay language, https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/tree/main/malaya-speech
15
+ 2. Fleurs MS-MY test set, Malay language, https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/tree/main/fleurs-ms-my
16
+ 3. IMDA TTS first 700 audio files, English language but with Manglish slang, https://huggingface.co/datasets/mesolitica/IMDA-TTS
17
 
18
+ ## Heavy postprocess test set
19
 
20
+ 1. We filtered test set that contain numbers because malaya-speech transducer trained on normalized numbers.
21
+ 2. We lower case because malaya-speech transducer trained on lower case.
22
+ 3. We removed punctuation because malaya-speech transducer trained without punctuation.
23
  """
24
 
25
  open_source = [
26
  {
27
  'model': 'openai/whisper-large-v3',
28
+ 'model size FP16 (MB)': 3090,
29
+ 'Malaya-Speech test CER': 0.0349251317825172,
30
+ 'Malaya-Speech test WER': 0.1032828282828283,
31
+ 'Fleurs MY-MS CER': 0.026055551396846878,
32
+ 'Fleurs MY-MS WER': 0.07652049926522007,
33
  'IMDA TTS CER': 0.016648493852990828,
34
  'IMDA TTS WER': 0.0386282289139432,
35
  },
36
  {
37
  'model': 'openai/whisper-medium',
38
+ 'model size FP16 (MB)': 1530,
39
+ 'Malaya-Speech test CER': 0.05064920144820262,
40
+ 'Malaya-Speech test WER': 0.17534205321090568,
41
+ 'Fleurs MY-MS CER': 0.04366882208520179,
42
+ 'Fleurs MY-MS WER': 0.13546055192128273,
43
  'IMDA TTS CER': 0.02065587879424904,
44
  'IMDA TTS WER': 0.047277690563404855,
45
  },
46
  {
47
  'model': 'openai/whisper-small',
48
+ 'model size FP16 (MB)': 483.5,
49
+ 'Malaya-Speech test CER': 0.07485209857268262,
50
+ 'Malaya-Speech test WER': 0.25748516055893106,
51
+ 'Fleurs MY-MS CER': 0.06781078047622793,
52
+ 'Fleurs MY-MS WER': 0.21953142859857497,
53
  'IMDA TTS CER': 0.024812471688517194,
54
  'IMDA TTS WER': 0.058901277294134434,
55
  },
56
  {
57
  'model': 'openai/whisper-base',
58
+ 'model size FP16 (MB)': 145,
59
+ 'Malaya-Speech test CER': 0.3574879236610538,
60
+ 'Malaya-Speech test WER': 0.8303456599563157,
61
+ 'Fleurs MY-MS CER': 0.1319124653794061,
62
+ 'Fleurs MY-MS WER': 0.40499286081235003,
63
  'IMDA TTS CER': 0.03914533450681607,
64
  'IMDA TTS WER': 0.08951682444539587,
65
  },
66
  {
67
  'model': 'openai/whisper-tiny',
68
+ 'model size FP16 (MB)': 75.5,
69
+ 'Malaya-Speech test CER': 0.26941094281472105,
70
+ 'Malaya-Speech test WER': 0.7414099751189915,
71
+ 'Fleurs MY-MS CER': 0.38749733168917505,
72
+ 'Fleurs MY-MS WER': 0.812253445128297,
73
  'IMDA TTS CER': 0.048805770734828904,
74
  'IMDA TTS WER': 0.11150629529200957,
75
  },
76
  {
77
  'model': 'mesolitica/malaysian-whisper-medium',
78
+ 'model size FP16 (MB)': 1530,
79
+ 'Malaya-Speech test CER': 0.05622483776367814,
80
+ 'Malaya-Speech test WER': 0.14406629724252673,
81
+ 'Fleurs MY-MS CER': 0.025543266604368554,
82
+ 'Fleurs MY-MS WER': 0.07940219915492629,
83
+ 'IMDA TTS CER': 0.01971214262944062,
84
+ 'IMDA TTS WER': 0.047223078508792794,
85
  },
86
  {
87
  'model': 'mesolitica/malaysian-whisper-small',
88
+ 'model size FP16 (MB)': 483.5,
89
+ 'Malaya-Speech test CER': 0.049162419174983304,
90
+ 'Malaya-Speech test WER': 0.15926901346983313,
91
+ 'Fleurs MY-MS CER': 0.035517572531147,
92
+ 'Fleurs MY-MS WER': 0.10938718963023729,
93
  'IMDA TTS CER': 0.024228721439634855,
94
  'IMDA TTS WER': 0.05546294182008469,
95
  },
96
  {
97
  'model': 'mesolitica/malaysian-whisper-base',
98
+ 'model size FP16 (MB)': 145,
99
+ 'Malaya-Speech test CER': 0.07242006488452603,
100
+ 'Malaya-Speech test WER': 0.22081683495617924,
101
+ 'Fleurs MY-MS CER': 0.06639564802362424,
102
+ 'Fleurs MY-MS WER': 0.19675812232021192,
103
  'IMDA TTS CER': 0.03982418421412676,
104
  'IMDA TTS WER': 0.08917690642690643,
105
  },
106
  {
107
  'model': 'mesolitica/malaysian-whisper-tiny',
108
+ 'model size FP16 (MB)': 75.5,
109
+ 'Malaya-Speech test CER': 0.09423990117534763,
110
+ 'Malaya-Speech test WER': 0.295029492365558,
111
  'Fleurs MY-MS CER': 0.13390519685940314,
112
  'Fleurs MY-MS WER': 0.3461808122686204,
113
  'IMDA TTS CER': 0.07957313474501154,
114
  'IMDA TTS WER': 0.1421708648494363,
115
  },
116
+ {
117
+ 'model': 'mesolitica/conformer-large-malay-whisper',
118
+ 'model size FP16 (MB)': 206.5,
119
+ 'Malaya-Speech test CER': 0.025933167255719317,
120
+ 'Malaya-Speech test WER': 0.0912131356803488,
121
+ 'Fleurs MY-MS CER': 0.02548791948171514,
122
+ 'Fleurs MY-MS WER': 0.08376713097429746,
123
+ },
124
+ {
125
+ 'model': 'mesolitica/conformer-medium-malay-whisper',
126
+ 'model size FP16 (MB)': 121.5,
127
+ 'Malaya-Speech test CER': 0.024955598713609053,
128
+ 'Malaya-Speech test WER': 0.09315638444736804,
129
+ 'Fleurs MY-MS CER': 0.029205645523910067,
130
+ 'Fleurs MY-MS WER': 0.09253131557833799,
131
+ },
132
+ {
133
+ 'model': 'mesolitica/conformer-medium-mixed',
134
+ 'model size FP16 (MB)': 121.5,
135
+ 'Malaya-Speech test CER': 0.034618711056551774,
136
+ 'Malaya-Speech test WER': 0.11179440626161938,
137
+ 'Fleurs MY-MS CER': 0.032894184549728075,
138
+ 'Fleurs MY-MS WER': 0.1026977414887425,
139
+ },
140
  ]
141
 
142
  data = pd.DataFrame(open_source)