Spaces:
Runtime error
Runtime error
huseinzol05
committed on
Commit
•
d754ac0
1
Parent(s):
724e1d9
improve score
Browse files
app.py
CHANGED
@@ -9,81 +9,134 @@ INTRODUCTION_TEXT = """
|
|
9 |
|
10 |
## Dataset
|
11 |
|
12 |
-
π We evaluate models based on
|
13 |
|
14 |
-
1.
|
15 |
-
2.
|
|
|
16 |
|
17 |
-
|
18 |
|
19 |
-
1.
|
20 |
-
2.
|
|
|
21 |
"""
|
22 |
|
23 |
open_source = [
|
24 |
{
|
25 |
'model': 'openai/whisper-large-v3',
|
26 |
-
'
|
27 |
-
'
|
|
|
|
|
|
|
28 |
'IMDA TTS CER': 0.016648493852990828,
|
29 |
'IMDA TTS WER': 0.0386282289139432,
|
30 |
},
|
31 |
{
|
32 |
'model': 'openai/whisper-medium',
|
33 |
-
'
|
34 |
-
'
|
|
|
|
|
|
|
35 |
'IMDA TTS CER': 0.02065587879424904,
|
36 |
'IMDA TTS WER': 0.047277690563404855,
|
37 |
},
|
38 |
{
|
39 |
'model': 'openai/whisper-small',
|
40 |
-
'
|
41 |
-
'
|
|
|
|
|
|
|
42 |
'IMDA TTS CER': 0.024812471688517194,
|
43 |
'IMDA TTS WER': 0.058901277294134434,
|
44 |
},
|
45 |
{
|
46 |
'model': 'openai/whisper-base',
|
47 |
-
'
|
48 |
-
'
|
|
|
|
|
|
|
49 |
'IMDA TTS CER': 0.03914533450681607,
|
50 |
'IMDA TTS WER': 0.08951682444539587,
|
51 |
},
|
52 |
{
|
53 |
'model': 'openai/whisper-tiny',
|
54 |
-
'
|
55 |
-
'
|
|
|
|
|
|
|
56 |
'IMDA TTS CER': 0.048805770734828904,
|
57 |
'IMDA TTS WER': 0.11150629529200957,
|
58 |
},
|
59 |
{
|
60 |
'model': 'mesolitica/malaysian-whisper-medium',
|
61 |
-
'
|
62 |
-
'
|
63 |
-
'
|
64 |
-
'
|
|
|
|
|
|
|
65 |
},
|
66 |
{
|
67 |
'model': 'mesolitica/malaysian-whisper-small',
|
68 |
-
'
|
69 |
-
'
|
|
|
|
|
|
|
70 |
'IMDA TTS CER': 0.024228721439634855,
|
71 |
'IMDA TTS WER': 0.05546294182008469,
|
72 |
},
|
73 |
{
|
74 |
'model': 'mesolitica/malaysian-whisper-base',
|
75 |
-
'
|
76 |
-
'
|
|
|
|
|
|
|
77 |
'IMDA TTS CER': 0.03982418421412676,
|
78 |
'IMDA TTS WER': 0.08917690642690643,
|
79 |
},
|
80 |
{
|
81 |
'model': 'mesolitica/malaysian-whisper-tiny',
|
|
|
|
|
|
|
82 |
'Fleurs MY-MS CER': 0.13390519685940314,
|
83 |
'Fleurs MY-MS WER': 0.3461808122686204,
|
84 |
'IMDA TTS CER': 0.07957313474501154,
|
85 |
'IMDA TTS WER': 0.1421708648494363,
|
86 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
]
|
88 |
|
89 |
data = pd.DataFrame(open_source)
|
|
|
9 |
|
10 |
## Dataset
|
11 |
|
12 |
+
π We evaluate models based on 3 datasets,
|
13 |
|
14 |
+
1. Malaya-Speech test set, Malay language, https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/tree/main/malaya-speech
|
15 |
+
2. Fleurs MS-MY test set, Malay language, https://huggingface.co/datasets/huseinzol05/malaya-speech-stt-test-set/tree/main/fleurs-ms-my
|
16 |
+
3. IMDA TTS first 700 audio files, English language but with Manglish slang, https://huggingface.co/datasets/mesolitica/IMDA-TTS
|
17 |
|
18 |
+
## Heavy postprocess test set
|
19 |
|
20 |
+
1. We filtered test set that contain numbers because malaya-speech transducer trained on normalized numbers.
|
21 |
+
2. We lower case because malaya-speech transducer trained on lower case.
|
22 |
+
3. We removed punctuation because malaya-speech transducer trained without punctuation.
|
23 |
"""
|
24 |
|
25 |
open_source = [
|
26 |
{
|
27 |
'model': 'openai/whisper-large-v3',
|
28 |
+
'model size FP16 (MB)': 3090,
|
29 |
+
'Malaya-Speech test CER': 0.0349251317825172,
|
30 |
+
'Malaya-Speech test WER': 0.1032828282828283,
|
31 |
+
'Fleurs MY-MS CER': 0.026055551396846878,
|
32 |
+
'Fleurs MY-MS WER': 0.07652049926522007,
|
33 |
'IMDA TTS CER': 0.016648493852990828,
|
34 |
'IMDA TTS WER': 0.0386282289139432,
|
35 |
},
|
36 |
{
|
37 |
'model': 'openai/whisper-medium',
|
38 |
+
'model size FP16 (MB)': 1530,
|
39 |
+
'Malaya-Speech test CER': 0.05064920144820262,
|
40 |
+
'Malaya-Speech test WER': 0.17534205321090568,
|
41 |
+
'Fleurs MY-MS CER': 0.04366882208520179,
|
42 |
+
'Fleurs MY-MS WER': 0.13546055192128273,
|
43 |
'IMDA TTS CER': 0.02065587879424904,
|
44 |
'IMDA TTS WER': 0.047277690563404855,
|
45 |
},
|
46 |
{
|
47 |
'model': 'openai/whisper-small',
|
48 |
+
'model size FP16 (MB)': 483.5,
|
49 |
+
'Malaya-Speech test CER': 0.07485209857268262,
|
50 |
+
'Malaya-Speech test WER': 0.25748516055893106,
|
51 |
+
'Fleurs MY-MS CER': 0.06781078047622793,
|
52 |
+
'Fleurs MY-MS WER': 0.21953142859857497,
|
53 |
'IMDA TTS CER': 0.024812471688517194,
|
54 |
'IMDA TTS WER': 0.058901277294134434,
|
55 |
},
|
56 |
{
|
57 |
'model': 'openai/whisper-base',
|
58 |
+
'model size FP16 (MB)': 145,
|
59 |
+
'Malaya-Speech test CER': 0.3574879236610538,
|
60 |
+
'Malaya-Speech test WER': 0.8303456599563157,
|
61 |
+
'Fleurs MY-MS CER': 0.1319124653794061,
|
62 |
+
'Fleurs MY-MS WER': 0.40499286081235003,
|
63 |
'IMDA TTS CER': 0.03914533450681607,
|
64 |
'IMDA TTS WER': 0.08951682444539587,
|
65 |
},
|
66 |
{
|
67 |
'model': 'openai/whisper-tiny',
|
68 |
+
'model size FP16 (MB)': 75.5,
|
69 |
+
'Malaya-Speech test CER': 0.26941094281472105,
|
70 |
+
'Malaya-Speech test WER': 0.7414099751189915,
|
71 |
+
'Fleurs MY-MS CER': 0.38749733168917505,
|
72 |
+
'Fleurs MY-MS WER': 0.812253445128297,
|
73 |
'IMDA TTS CER': 0.048805770734828904,
|
74 |
'IMDA TTS WER': 0.11150629529200957,
|
75 |
},
|
76 |
{
|
77 |
'model': 'mesolitica/malaysian-whisper-medium',
|
78 |
+
'model size FP16 (MB)': 1530,
|
79 |
+
'Malaya-Speech test CER': 0.05622483776367814,
|
80 |
+
'Malaya-Speech test WER': 0.14406629724252673,
|
81 |
+
'Fleurs MY-MS CER': 0.025543266604368554,
|
82 |
+
'Fleurs MY-MS WER': 0.07940219915492629,
|
83 |
+
'IMDA TTS CER': 0.01971214262944062,
|
84 |
+
'IMDA TTS WER': 0.047223078508792794,
|
85 |
},
|
86 |
{
|
87 |
'model': 'mesolitica/malaysian-whisper-small',
|
88 |
+
'model size FP16 (MB)': 483.5,
|
89 |
+
'Malaya-Speech test CER': 0.049162419174983304,
|
90 |
+
'Malaya-Speech test WER': 0.15926901346983313,
|
91 |
+
'Fleurs MY-MS CER': 0.035517572531147,
|
92 |
+
'Fleurs MY-MS WER': 0.10938718963023729,
|
93 |
'IMDA TTS CER': 0.024228721439634855,
|
94 |
'IMDA TTS WER': 0.05546294182008469,
|
95 |
},
|
96 |
{
|
97 |
'model': 'mesolitica/malaysian-whisper-base',
|
98 |
+
'model size FP16 (MB)': 145,
|
99 |
+
'Malaya-Speech test CER': 0.07242006488452603,
|
100 |
+
'Malaya-Speech test WER': 0.22081683495617924,
|
101 |
+
'Fleurs MY-MS CER': 0.06639564802362424,
|
102 |
+
'Fleurs MY-MS WER': 0.19675812232021192,
|
103 |
'IMDA TTS CER': 0.03982418421412676,
|
104 |
'IMDA TTS WER': 0.08917690642690643,
|
105 |
},
|
106 |
{
|
107 |
'model': 'mesolitica/malaysian-whisper-tiny',
|
108 |
+
'model size FP16 (MB)': 75.5,
|
109 |
+
'Malaya-Speech test CER': 0.09423990117534763,
|
110 |
+
'Malaya-Speech test WER': 0.295029492365558,
|
111 |
'Fleurs MY-MS CER': 0.13390519685940314,
|
112 |
'Fleurs MY-MS WER': 0.3461808122686204,
|
113 |
'IMDA TTS CER': 0.07957313474501154,
|
114 |
'IMDA TTS WER': 0.1421708648494363,
|
115 |
},
|
116 |
+
{
|
117 |
+
'model': 'mesolitica/conformer-large-malay-whisper',
|
118 |
+
'model size FP16 (MB)': 206.5,
|
119 |
+
'Malaya-Speech test CER': 0.025933167255719317,
|
120 |
+
'Malaya-Speech test WER': 0.0912131356803488,
|
121 |
+
'Fleurs MY-MS CER': 0.02548791948171514,
|
122 |
+
'Fleurs MY-MS WER': 0.08376713097429746,
|
123 |
+
},
|
124 |
+
{
|
125 |
+
'model': 'mesolitica/conformer-medium-malay-whisper',
|
126 |
+
'model size FP16 (MB)': 121.5,
|
127 |
+
'Malaya-Speech test CER': 0.024955598713609053,
|
128 |
+
'Malaya-Speech test WER': 0.09315638444736804,
|
129 |
+
'Fleurs MY-MS CER': 0.029205645523910067,
|
130 |
+
'Fleurs MY-MS WER': 0.09253131557833799,
|
131 |
+
},
|
132 |
+
{
|
133 |
+
'model': 'mesolitica/conformer-medium-mixed',
|
134 |
+
'model size FP16 (MB)': 121.5,
|
135 |
+
'Malaya-Speech test CER': 0.034618711056551774,
|
136 |
+
'Malaya-Speech test WER': 0.11179440626161938,
|
137 |
+
'Fleurs MY-MS CER': 0.032894184549728075,
|
138 |
+
'Fleurs MY-MS WER': 0.1026977414887425,
|
139 |
+
},
|
140 |
]
|
141 |
|
142 |
data = pd.DataFrame(open_source)
|