Mostafa Shahin committed
Commit 4c01711
1 parent: fbc2e1b

First Commit

Phonemize.py ADDED
@@ -0,0 +1,104 @@
from datasets import load_from_disk
from dp.phonemizer import Phonemizer
from speechbrain.pretrained import GraphemeToPhoneme
import cmudict
import re
import fire
import torch
from os.path import join

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.multiprocessing.set_start_method('spawn')


class phonemization:
    def __init__(self):
        # Hyphen is escaped so the character class does not form an unintended range.
        self.chars_to_ignore_regex = r'[,?.!\-;:"]'
        self.dp_phonemizer_model_path = join('models', 'd_phonemizer', 'en_us_cmudict_forward.pt')
        self.sb_phonemizer_model_path = join('models', 'sb_phonemizer')

        self.cmu_dict = cmudict.dict()
        self.dp_phonemizer = Phonemizer.from_checkpoint(self.dp_phonemizer_model_path)
        if torch.cuda.is_available():
            self.sb_phonemizer = GraphemeToPhoneme.from_hparams(self.sb_phonemizer_model_path, run_opts={"device": "cuda"})
        else:
            self.sb_phonemizer = GraphemeToPhoneme.from_hparams(self.sb_phonemizer_model_path)
        self.normalize = False

    def dp_phonemize(self, text):
        # The DeepPhonemizer output wraps phonemes in square brackets; strip them and split into a list.
        return self.dp_phonemizer(text, lang='en_us', expand_acronyms=False).replace('[', ' ').replace(']', ' ').split()

    def cmu_phonemize(self, text, fallback_phonemizer=dp_phonemize):
        # Look each word up in the CMU dictionary (stress digits removed); unknown words
        # are delegated to the fallback phonemizer.
        phoneme_lst = []
        for word in text.split():
            if word in self.cmu_dict:
                phoneme_lst.extend(re.sub('[0-9]', '', ' '.join(self.cmu_dict.get(word)[0])).split())
            else:
                phoneme_lst.extend(fallback_phonemizer(self, word))
        phoneme_lst = [p.lower() for p in phoneme_lst]
        return phoneme_lst

    def sb_phonemize(self, text):
        return self.sb_phonemizer(text)

    def remove_special_characters(self, text):
        #print(text)
        return re.sub(self.chars_to_ignore_regex, ' ', text).lower() + " "

    def replace_multiple_spaces_with_single_space(self, input_string):
        """Replace multiple spaces with a single space."""
        return re.sub(r'\s+', ' ', input_string)

    def phonemize_batch(self, batch, phonamizer_fn=dp_phonemize, suffix=''):
        if self.normalize:
            text = batch['text_norm'].lower()
        else:
            text = batch['text'].lower()
        phoneme_str = ' '.join(phonamizer_fn(text))
        phoneme_str = phoneme_str.lower()
        phoneme_str = self.replace_multiple_spaces_with_single_space(phoneme_str)
        batch[f'phoneme{suffix}'] = phoneme_str.strip()
        return batch

    def remove_special_characters_batch(self, batch):
        batch["text_norm"] = self.remove_special_characters(batch["text"])
        return batch

    def run(self, dataset_path, output_path, phonemizers='dp,sb,cmu', normalize=True, nproc=1):
        data = load_from_disk(dataset_path)

        if normalize:
            data = data.map(self.remove_special_characters_batch, num_proc=nproc)
        self.normalize = normalize  # make phonemize_batch read the normalized text when requested
        for phonemizer in phonemizers.split(','):
            if phonemizer == 'cmu':
                data = data.map(self.phonemize_batch, fn_kwargs={'phonamizer_fn': self.cmu_phonemize, 'suffix': '_cmu'}, num_proc=nproc)
            if phonemizer == 'dp':
                data = data.map(self.phonemize_batch, fn_kwargs={'phonamizer_fn': self.dp_phonemize, 'suffix': '_dp'}, num_proc=nproc)
            if phonemizer == 'sb':
                if torch.cuda.is_available():
                    nproc = torch.cuda.device_count()
                data = data.map(self.phonemize_batch, fn_kwargs={'phonamizer_fn': self.sb_phonemize, 'suffix': '_sb'}, num_proc=nproc, cache_file_name='/g/data/iv96/mostafa/cache_sb', load_from_cache_file=True)
        data.save_to_disk(output_path)


if __name__ == '__main__':
    fire.Fire(phonemization)
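The run method is the fire entry point, so each phonemizer can be applied to a dataset saved with datasets.save_to_disk straight from the command line, and the class can also be used interactively. A minimal sketch, assuming the model files listed later in this commit are in place; paths and example words are illustrative:

# Command line (via fire), paths illustrative:
#   python Phonemize.py run --dataset_path=/path/to/dataset --output_path=/path/to/output --phonemizers=dp,cmu --nproc=4
from Phonemize import phonemization

p = phonemization()
print(p.cmu_phonemize('top'))    # e.g. ['t', 'aa', 'p'] from the CMU dictionary
print(p.dp_phonemize('hello'))   # words outside the dictionary fall back to the DeepPhonemizer checkpoint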
__pycache__/Phonemize.cpython-312.pyc ADDED
Binary file (6.31 kB).
 
__pycache__/transcriber.cpython-312.pyc ADDED
Binary file (20.6 kB).
 
app.py ADDED
@@ -0,0 +1,167 @@
import random
import Phonemize
from Levenshtein import editops
from gradio.components import Audio, Dropdown, Textbox, Image
import gradio as gr
import transcriber
import json
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram
import numpy as np


engine = transcriber.transcribe_SA(model_path='models/SA', verbose=0)
phonemizer = Phonemize.phonemization()

prompts = np.loadtxt('data/prompts.txt', dtype=str)

Attributes = engine.att_list
df_output = None


def select_prompt():
    return random.choice(prompts)


def phonemize_prompt(prompt):
    return ' '.join(phonemizer.cmu_phonemize(prompt)).lower()


def diff_fn():
    return [('H', '+'), ('E', '-'), ('N', None), ('\n', None), ('F', '-'), ('Fgo', '-'), ('M', '+')]


def recognizeAudio(audio_file, attributes):
    #print(','.join(attributes))
    global df_output
    output = engine.transcribe(audio_file, attributes=tuple(attributes), phonological_matrix_file='data/p2att_en_us-arpa.csv', human_readable=False)
    records = []
    d = json.loads(output)
    records.append(['Phoneme'] + d['Phoneme']['symbols'])
    for att in d['Attributes']:
        records.append([att['Name']] + att['Pattern'])
    df = pd.DataFrame.from_records(records)
    df.fillna('', inplace=True)
    df_output = df
    return df.to_html(header=False, index=False)


# Get error by matching the expected sequence with the recognized one and return the output
# in a format that can be visualized by the gradio HighlightedText box
def get_error(exp_list, rec_list):
    exp_list = list(exp_list)
    rec_list = list(rec_list)
    vocab = set(exp_list + rec_list)
    w2c = dict(zip(vocab, range(len(vocab))))

    exp_out = [[a, None] for a in exp_list]
    rec_out = [[a, None] for a in rec_list]
    exp_enc = ''.join([chr(w2c[c]) for c in exp_list])
    rec_enc = ''.join([chr(w2c[c]) for c in rec_list])

    for op, exp_i, rec_i in editops(exp_enc, rec_enc):
        if op == 'replace':
            exp_out[exp_i][1] = 'S'
            rec_out[rec_i][1] = 'S'
        elif op == 'insert':
            rec_out[rec_i][1] = 'I'
        elif op == 'delete':
            exp_out[exp_i][1] = 'D'

    diff_list = [['Expected:\t', None]] + exp_out + [['\n', None]] + [['Recognized:\t', None]] + rec_out
    return diff_list


def scale_vector(vector, new_min, new_max):
    min_val = min(vector)
    max_val = max(vector)
    scaled_vector = []
    for val in vector:
        scaled_val = ((val - min_val) * (new_max - new_min) / (max_val - min_val)) + new_min
        scaled_vector.append(scaled_val)
    return scaled_vector


def create_spectrogram_with_att(wav_file, att_contour, att):
    # Read the WAV file
    sampling_rate, data = wavfile.read(wav_file)

    # Calculate the spectrogram
    f, t, Sxx = spectrogram(data, fs=sampling_rate)
    fig, ax = plt.subplots(figsize=(10, 5))

    # Plot the spectrogram
    ax.pcolormesh(t, f, 10 * np.log10(Sxx), shading='gouraud')
    ax.set_ylabel('Frequency (Hz)')
    ax.set_xlabel('Time (s)')
    ax.set_title(f'Spectrogram with {att} Contour')
    ax.set_ylim(0, 8000)  # Adjust the frequency range if necessary

    # Plot the att contour
    time_pitch = np.arange(0, len(att_contour) * 0.02, 0.02)  # Assuming pitch_contour is sampled every 20 ms
    ax.plot(time_pitch, att_contour, color='blue', label=f'{att} Contour')
    ax.legend()

    return fig


def plot_contour(audio_file, att):
    indx = engine.processor.tokenizer.convert_tokens_to_ids([f'p_{att}'])
    att_contour = engine.logits.squeeze()[:, indx]
    att_contour = scale_vector(att_contour, 0, 6000)
    fig = create_spectrogram_with_att(audio_file, att_contour, att)
    return fig


with gr.Blocks() as gui:
    with gr.Tab("Main"):
        prompt = gr.Textbox(label='Prompt', value=select_prompt)
        get_prompt = gr.Button("Get Prompt")
        get_prompt.click(fn=select_prompt, outputs=prompt)

        prompt_phonemes = gr.Textbox(label="Expected Phonemes", interactive=False)
        get_phoneme = gr.Button("Get Phonemes")
        get_phoneme.click(fn=phonemize_prompt, inputs=prompt, outputs=prompt_phonemes)

        record_audio = gr.Audio(sources=["microphone", "upload"], type="filepath")
        att_list = gr.Dropdown(label="Select Attributes", choices=Attributes, value=['vowel', 'voiced', 'consonant'], multiselect=True)
        process = gr.Button("Process Audio")

        recognition = gr.HTML(label='Output')

        process.click(fn=recognizeAudio, inputs=[record_audio, att_list], outputs=recognition)

    with gr.Tab("Assessment"):
        assess = gr.Button("Assessment")
        diff = []
        for i in range(len(Attributes) + 1):
            diff.append(gr.HighlightedText(
                combine_adjacent=False,
                show_legend=True,
                color_map={"S": "red", "I": "green", "D": "blue"}, visible=False))

        def get_assessment(prompt_phonemes):  #, recognized_phonemes, recognized_attributes):
            outputs = [gr.HighlightedText(visible=False)] * (len(Attributes) + 1)
            outputs[0] = gr.HighlightedText(label="Phoneme Assessment",
                                            value=get_error(prompt_phonemes.split(), df_output.iloc[0].values[1:]),
                                            visible=True)
            i = 1
            for i, r in df_output.iloc[1:].iterrows():
                convert = lambda ph: '-' if f'n_{att}' in engine.p2att_map[ph] else '+'
                att = r.iloc[0]
                exp_att = [convert(ph) for ph in prompt_phonemes.split()]
                rec_att = r.iloc[1:].values
                outputs[i] = gr.HighlightedText(label=f"{att} Assessment",
                                                value=get_error(exp_att, rec_att),
                                                visible=True)
                i += 1

            return outputs

        assess.click(fn=get_assessment, inputs=[prompt_phonemes], outputs=diff)

    with gr.Tab("Analysis"):
        selected_att = gr.Dropdown(Attributes, label="Select an Attribute to plot", value='voiced', interactive=True)
        do_plot = gr.Button('Plot')
        plot_block = gr.Plot(label='Spectrogram with Attribute Contour')
        do_plot.click(plot_contour, inputs=[record_audio, selected_att], outputs=plot_block)

gui.launch()
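get_error drives the HighlightedText boxes in the Assessment tab: it encodes each phoneme or attribute symbol as a single character, aligns the expected and recognized strings with Levenshtein.editops, and marks substitutions, insertions, and deletions as 'S', 'I', and 'D'. A small sketch of the underlying alignment, using toy sequences rather than repository data:

from Levenshtein import editops  # same dependency as app.py

# 'tap' vs 'tbpc': position 1 is a substitution, position 3 an insertion.
print(editops('tap', 'tbpc'))  # [('replace', 1, 1), ('insert', 3, 3)]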
data/p2att_en_us-arpa.csv ADDED
@@ -0,0 +1,42 @@
Phoneme_arpa,alveolar,palatal,dental,glottal,labial,velar,anterior,posterior,retroflex,high,low,mid,front,back,central,consonant,sonorant,long,short,vowel,semivowel,fricative,nasal,stop,approximant,affricate,liquid,continuant,monophthong,diphthong,round,voiced,labiodental,obstruent,bilabial,coronal,dorsal
aa,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ae,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ah,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ao,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
aw,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0
ay,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
eh,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
er,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ey,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
ih,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
iy,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ow,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0
oy,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0
uh,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
uw,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
b,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0
ch,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
d,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0
dh,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0
f,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0
g,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
hh,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1
jh,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0
k,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1
l,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0
m,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
n,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
nd,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
ng,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1
p,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
r,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0
s,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
sh,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
sil,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
t,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
th,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
v,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0
w,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0
y,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0
z,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0
zh,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0
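Each row of this matrix is the binary phonological feature vector of one ARPAbet phoneme; transcriber.read_phoneme2att turns a 1 into the positive token p_<attribute> and a 0 into the negative token n_<attribute> used in the model vocabulary. A minimal sketch of reading one row with pandas; the printed list is simply what the 'b' row above encodes:

import pandas as pd

# Load the phoneme-to-attribute matrix; the first column (Phoneme_arpa) is the index.
df = pd.read_csv('data/p2att_en_us-arpa.csv', index_col=0)

# Convert one row into the p_/n_ attribute tokens, mirroring transcriber.read_phoneme2att.
row = df.loc['b']
tokens = [f'p_{att}' if v == 1 else f'n_{att}' for att, v in row.items()]
print([t for t in tokens if t.startswith('p_')])
# ['p_labial', 'p_anterior', 'p_consonant', 'p_stop', 'p_voiced', 'p_obstruent', 'p_bilabial']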
data/prompts.txt ADDED
@@ -0,0 +1,44 @@
Top
Cop
Tight
Kite
Torn
Corn
Tame
Came
Tall
Call
Tail
Kale
Bat
Back
Pit
Pick
Ate
Ache
But
Buck
Sit
Sick
Rate
Rake
Date
Gate
Deer
Gear
Drip
Grip
Down
Gown
Doe
Go
Bid
Big
Led
Leg
Mud
Mug
Bud
Bug
Bed
Beg
models/SA/added_tokens.json ADDED
@@ -0,0 +1,5 @@
{
  "</s>": 72,
  "<s>": 71,
  "<unk>": 73
}
models/SA/config.json ADDED
@@ -0,0 +1,109 @@
1
+ {
2
+ "_name_or_path": "/g/data/iv96/mostafa/Speech-Attribute-Transcription/models/wav2vec2-large-robust/",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 768,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_dropout_prob": 0.1,
59
+ "hidden_size": 1024,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 4096,
62
+ "layer_norm_eps": 1e-05,
63
+ "layerdrop": 0.1,
64
+ "mask_feature_length": 10,
65
+ "mask_feature_min_masks": 0,
66
+ "mask_feature_prob": 0.0,
67
+ "mask_time_length": 10,
68
+ "mask_time_min_masks": 2,
69
+ "mask_time_prob": 0.05,
70
+ "model_type": "wav2vec2",
71
+ "num_adapter_layers": 3,
72
+ "num_attention_heads": 16,
73
+ "num_codevector_groups": 2,
74
+ "num_codevectors_per_group": 320,
75
+ "num_conv_pos_embedding_groups": 16,
76
+ "num_conv_pos_embeddings": 128,
77
+ "num_feat_extract_layers": 7,
78
+ "num_hidden_layers": 24,
79
+ "num_negatives": 100,
80
+ "output_hidden_size": 1024,
81
+ "pad_token_id": 0,
82
+ "proj_codevector_dim": 768,
83
+ "tdnn_dilation": [
84
+ 1,
85
+ 2,
86
+ 3,
87
+ 1,
88
+ 1
89
+ ],
90
+ "tdnn_dim": [
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 1500
96
+ ],
97
+ "tdnn_kernel": [
98
+ 5,
99
+ 3,
100
+ 3,
101
+ 1,
102
+ 1
103
+ ],
104
+ "torch_dtype": "float32",
105
+ "transformers_version": "4.37.2",
106
+ "use_weighted_layer_sum": false,
107
+ "vocab_size": 71,
108
+ "xvector_output_dim": 512
109
+ }
models/SA/model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:31862db1655be478b59e480e490165c7109e8b659277b43ee3fcc3fff772fea0
size 1262098580
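This entry, like the .pt and .ckpt entries below, is a Git LFS pointer rather than the weights themselves: the commit records only the object hash and size. After cloning, the actual binaries are fetched with git lfs install followed by git lfs pull, assuming git-lfs is available on the machine.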
models/SA/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}
models/SA/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "pad_token": "<pad>",
  "unk_token": "<unk>"
}
models/SA/tokenizer_config.json ADDED
@@ -0,0 +1,608 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "p_alveolar",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "n_alveolar",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "p_palatal",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "4": {
36
+ "content": "n_palatal",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": true,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "5": {
44
+ "content": "p_dental",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": true,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "6": {
52
+ "content": "n_dental",
53
+ "lstrip": true,
54
+ "normalized": false,
55
+ "rstrip": true,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "7": {
60
+ "content": "p_glottal",
61
+ "lstrip": true,
62
+ "normalized": false,
63
+ "rstrip": true,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "8": {
68
+ "content": "n_glottal",
69
+ "lstrip": true,
70
+ "normalized": false,
71
+ "rstrip": true,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "9": {
76
+ "content": "p_labial",
77
+ "lstrip": true,
78
+ "normalized": false,
79
+ "rstrip": true,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "10": {
84
+ "content": "n_labial",
85
+ "lstrip": true,
86
+ "normalized": false,
87
+ "rstrip": true,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "11": {
92
+ "content": "p_velar",
93
+ "lstrip": true,
94
+ "normalized": false,
95
+ "rstrip": true,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "12": {
100
+ "content": "n_velar",
101
+ "lstrip": true,
102
+ "normalized": false,
103
+ "rstrip": true,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "13": {
108
+ "content": "p_anterior",
109
+ "lstrip": true,
110
+ "normalized": false,
111
+ "rstrip": true,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "14": {
116
+ "content": "n_anterior",
117
+ "lstrip": true,
118
+ "normalized": false,
119
+ "rstrip": true,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "15": {
124
+ "content": "p_posterior",
125
+ "lstrip": true,
126
+ "normalized": false,
127
+ "rstrip": true,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "16": {
132
+ "content": "n_posterior",
133
+ "lstrip": true,
134
+ "normalized": false,
135
+ "rstrip": true,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "17": {
140
+ "content": "p_retroflex",
141
+ "lstrip": true,
142
+ "normalized": false,
143
+ "rstrip": true,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "18": {
148
+ "content": "n_retroflex",
149
+ "lstrip": true,
150
+ "normalized": false,
151
+ "rstrip": true,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "19": {
156
+ "content": "p_mid",
157
+ "lstrip": true,
158
+ "normalized": false,
159
+ "rstrip": true,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "20": {
164
+ "content": "n_mid",
165
+ "lstrip": true,
166
+ "normalized": false,
167
+ "rstrip": true,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "21": {
172
+ "content": "p_high",
173
+ "lstrip": true,
174
+ "normalized": false,
175
+ "rstrip": true,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "22": {
180
+ "content": "n_high",
181
+ "lstrip": true,
182
+ "normalized": false,
183
+ "rstrip": true,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "23": {
188
+ "content": "p_low",
189
+ "lstrip": true,
190
+ "normalized": false,
191
+ "rstrip": true,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "24": {
196
+ "content": "n_low",
197
+ "lstrip": true,
198
+ "normalized": false,
199
+ "rstrip": true,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "25": {
204
+ "content": "p_front",
205
+ "lstrip": true,
206
+ "normalized": false,
207
+ "rstrip": true,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "26": {
212
+ "content": "n_front",
213
+ "lstrip": true,
214
+ "normalized": false,
215
+ "rstrip": true,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "27": {
220
+ "content": "p_back",
221
+ "lstrip": true,
222
+ "normalized": false,
223
+ "rstrip": true,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "28": {
228
+ "content": "n_back",
229
+ "lstrip": true,
230
+ "normalized": false,
231
+ "rstrip": true,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "29": {
236
+ "content": "p_central",
237
+ "lstrip": true,
238
+ "normalized": false,
239
+ "rstrip": true,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "30": {
244
+ "content": "n_central",
245
+ "lstrip": true,
246
+ "normalized": false,
247
+ "rstrip": true,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "31": {
252
+ "content": "p_consonant",
253
+ "lstrip": true,
254
+ "normalized": false,
255
+ "rstrip": true,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "32": {
260
+ "content": "n_consonant",
261
+ "lstrip": true,
262
+ "normalized": false,
263
+ "rstrip": true,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "33": {
268
+ "content": "p_sonorant",
269
+ "lstrip": true,
270
+ "normalized": false,
271
+ "rstrip": true,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "34": {
276
+ "content": "n_sonorant",
277
+ "lstrip": true,
278
+ "normalized": false,
279
+ "rstrip": true,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "35": {
284
+ "content": "p_long",
285
+ "lstrip": true,
286
+ "normalized": false,
287
+ "rstrip": true,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "36": {
292
+ "content": "n_long",
293
+ "lstrip": true,
294
+ "normalized": false,
295
+ "rstrip": true,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "37": {
300
+ "content": "p_short",
301
+ "lstrip": true,
302
+ "normalized": false,
303
+ "rstrip": true,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "38": {
308
+ "content": "n_short",
309
+ "lstrip": true,
310
+ "normalized": false,
311
+ "rstrip": true,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "39": {
316
+ "content": "p_vowel",
317
+ "lstrip": true,
318
+ "normalized": false,
319
+ "rstrip": true,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "40": {
324
+ "content": "n_vowel",
325
+ "lstrip": true,
326
+ "normalized": false,
327
+ "rstrip": true,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "41": {
332
+ "content": "p_semivowel",
333
+ "lstrip": true,
334
+ "normalized": false,
335
+ "rstrip": true,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "42": {
340
+ "content": "n_semivowel",
341
+ "lstrip": true,
342
+ "normalized": false,
343
+ "rstrip": true,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "43": {
348
+ "content": "p_fricative",
349
+ "lstrip": true,
350
+ "normalized": false,
351
+ "rstrip": true,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "44": {
356
+ "content": "n_fricative",
357
+ "lstrip": true,
358
+ "normalized": false,
359
+ "rstrip": true,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "45": {
364
+ "content": "p_nasal",
365
+ "lstrip": true,
366
+ "normalized": false,
367
+ "rstrip": true,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "46": {
372
+ "content": "n_nasal",
373
+ "lstrip": true,
374
+ "normalized": false,
375
+ "rstrip": true,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "47": {
380
+ "content": "p_stop",
381
+ "lstrip": true,
382
+ "normalized": false,
383
+ "rstrip": true,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "48": {
388
+ "content": "n_stop",
389
+ "lstrip": true,
390
+ "normalized": false,
391
+ "rstrip": true,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "49": {
396
+ "content": "p_approximant",
397
+ "lstrip": true,
398
+ "normalized": false,
399
+ "rstrip": true,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "50": {
404
+ "content": "n_approximant",
405
+ "lstrip": true,
406
+ "normalized": false,
407
+ "rstrip": true,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "51": {
412
+ "content": "p_affricate",
413
+ "lstrip": true,
414
+ "normalized": false,
415
+ "rstrip": true,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "52": {
420
+ "content": "n_affricate",
421
+ "lstrip": true,
422
+ "normalized": false,
423
+ "rstrip": true,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "53": {
428
+ "content": "p_liquid",
429
+ "lstrip": true,
430
+ "normalized": false,
431
+ "rstrip": true,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "54": {
436
+ "content": "n_liquid",
437
+ "lstrip": true,
438
+ "normalized": false,
439
+ "rstrip": true,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "55": {
444
+ "content": "p_continuant",
445
+ "lstrip": true,
446
+ "normalized": false,
447
+ "rstrip": true,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "56": {
452
+ "content": "n_continuant",
453
+ "lstrip": true,
454
+ "normalized": false,
455
+ "rstrip": true,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "57": {
460
+ "content": "p_monophthong",
461
+ "lstrip": true,
462
+ "normalized": false,
463
+ "rstrip": true,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "58": {
468
+ "content": "n_monophthong",
469
+ "lstrip": true,
470
+ "normalized": false,
471
+ "rstrip": true,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "59": {
476
+ "content": "p_diphthong",
477
+ "lstrip": true,
478
+ "normalized": false,
479
+ "rstrip": true,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "60": {
484
+ "content": "n_diphthong",
485
+ "lstrip": true,
486
+ "normalized": false,
487
+ "rstrip": true,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "61": {
492
+ "content": "p_round",
493
+ "lstrip": true,
494
+ "normalized": false,
495
+ "rstrip": true,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "62": {
500
+ "content": "n_round",
501
+ "lstrip": true,
502
+ "normalized": false,
503
+ "rstrip": true,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "63": {
508
+ "content": "p_voiced",
509
+ "lstrip": true,
510
+ "normalized": false,
511
+ "rstrip": true,
512
+ "single_word": false,
513
+ "special": false
514
+ },
515
+ "64": {
516
+ "content": "n_voiced",
517
+ "lstrip": true,
518
+ "normalized": false,
519
+ "rstrip": true,
520
+ "single_word": false,
521
+ "special": false
522
+ },
523
+ "65": {
524
+ "content": "p_bilabial",
525
+ "lstrip": true,
526
+ "normalized": false,
527
+ "rstrip": true,
528
+ "single_word": false,
529
+ "special": false
530
+ },
531
+ "66": {
532
+ "content": "n_bilabial",
533
+ "lstrip": true,
534
+ "normalized": false,
535
+ "rstrip": true,
536
+ "single_word": false,
537
+ "special": false
538
+ },
539
+ "67": {
540
+ "content": "p_coronal",
541
+ "lstrip": true,
542
+ "normalized": false,
543
+ "rstrip": true,
544
+ "single_word": false,
545
+ "special": false
546
+ },
547
+ "68": {
548
+ "content": "n_coronal",
549
+ "lstrip": true,
550
+ "normalized": false,
551
+ "rstrip": true,
552
+ "single_word": false,
553
+ "special": false
554
+ },
555
+ "69": {
556
+ "content": "p_dorsal",
557
+ "lstrip": true,
558
+ "normalized": false,
559
+ "rstrip": true,
560
+ "single_word": false,
561
+ "special": false
562
+ },
563
+ "70": {
564
+ "content": "n_dorsal",
565
+ "lstrip": true,
566
+ "normalized": false,
567
+ "rstrip": true,
568
+ "single_word": false,
569
+ "special": false
570
+ },
571
+ "71": {
572
+ "content": "<s>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "72": {
580
+ "content": "</s>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "73": {
588
+ "content": "<unk>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ }
595
+ },
596
+ "bos_token": "<s>",
597
+ "clean_up_tokenization_spaces": true,
598
+ "do_lower_case": false,
599
+ "eos_token": "</s>",
600
+ "model_max_length": 1000000000000000019884624838656,
601
+ "pad_token": "<pad>",
602
+ "processor_class": "Wav2Vec2Processor",
603
+ "replace_word_delimiter_char": " ",
604
+ "target_lang": null,
605
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
606
+ "unk_token": "<unk>",
607
+ "word_delimiter_token": ""
608
+ }
models/SA/vocab.json ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "<pad>": 0,
3
+ "n_affricate": 52,
4
+ "n_alveolar": 2,
5
+ "n_anterior": 14,
6
+ "n_approximant": 50,
7
+ "n_back": 28,
8
+ "n_bilabial": 66,
9
+ "n_central": 30,
10
+ "n_consonant": 32,
11
+ "n_continuant": 56,
12
+ "n_coronal": 68,
13
+ "n_dental": 6,
14
+ "n_diphthong": 60,
15
+ "n_dorsal": 70,
16
+ "n_fricative": 44,
17
+ "n_front": 26,
18
+ "n_glottal": 8,
19
+ "n_high": 22,
20
+ "n_labial": 10,
21
+ "n_liquid": 54,
22
+ "n_long": 36,
23
+ "n_low": 24,
24
+ "n_mid": 20,
25
+ "n_monophthong": 58,
26
+ "n_nasal": 46,
27
+ "n_palatal": 4,
28
+ "n_posterior": 16,
29
+ "n_retroflex": 18,
30
+ "n_round": 62,
31
+ "n_semivowel": 42,
32
+ "n_short": 38,
33
+ "n_sonorant": 34,
34
+ "n_stop": 48,
35
+ "n_velar": 12,
36
+ "n_voiced": 64,
37
+ "n_vowel": 40,
38
+ "p_affricate": 51,
39
+ "p_alveolar": 1,
40
+ "p_anterior": 13,
41
+ "p_approximant": 49,
42
+ "p_back": 27,
43
+ "p_bilabial": 65,
44
+ "p_central": 29,
45
+ "p_consonant": 31,
46
+ "p_continuant": 55,
47
+ "p_coronal": 67,
48
+ "p_dental": 5,
49
+ "p_diphthong": 59,
50
+ "p_dorsal": 69,
51
+ "p_fricative": 43,
52
+ "p_front": 25,
53
+ "p_glottal": 7,
54
+ "p_high": 21,
55
+ "p_labial": 9,
56
+ "p_liquid": 53,
57
+ "p_long": 35,
58
+ "p_low": 23,
59
+ "p_mid": 19,
60
+ "p_monophthong": 57,
61
+ "p_nasal": 45,
62
+ "p_palatal": 3,
63
+ "p_posterior": 15,
64
+ "p_retroflex": 17,
65
+ "p_round": 61,
66
+ "p_semivowel": 41,
67
+ "p_short": 37,
68
+ "p_sonorant": 33,
69
+ "p_stop": 47,
70
+ "p_velar": 11,
71
+ "p_voiced": 63,
72
+ "p_vowel": 39
73
+ }
models/d_phonemizer/en_us_cmudict_forward.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2e1fb223d7e027bf7b33052540c6f71d19db6d7fd87ab8671152b8b114501c2
size 66725366
models/sb_phonemizer/config.json ADDED
@@ -0,0 +1,3 @@
{
  "speechbrain_interface": "GraphemeToPhoneme"
}
models/sb_phonemizer/ctc_lin.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c72639caba01630cf5ccc9b287b6eb7b79acc2276aa6f5cc23640640ac8f7ee
size 177319
models/sb_phonemizer/hyperparams.yaml ADDED
@@ -0,0 +1,507 @@
1
+ # Generated 2022-07-09 from:
2
+ # /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
3
+ # yamllint disable
4
+ # ################################
5
+ # Model: LSTM (encoder) + GRU (decoder) (tokenized)
6
+ # Authors:
7
+ # Loren Lugosch & Mirco Ravanelli 2020
8
+ # Artem Ploujnikov 2021
9
+ # ################################
10
+
11
+ # Seed needs to be set at top of yaml, before objects with parameters are made
12
+ seed: 1234
13
+ __set_seed: !apply:torch.manual_seed [1234]
14
+
15
+
16
+ # Tokenizers
17
+ char_tokenize: false
18
+ char_token_type: unigram # ["unigram", "bpe", "char"]
19
+ char_token_output: 512
20
+ char_token_wordwise: true
21
+ phn_tokenize: false
22
+ phn_token_type: unigram # ["unigram", "bpe", "char"]
23
+ phn_token_output: 512 # index(blank/eos/bos/unk) = 0
24
+ phn_token_wordwise: true
25
+ character_coverage: 1.0
26
+
27
+
28
+ phonemes_count: 43
29
+ graphemes_count: 31
30
+ phonemes_enable_space: true
31
+
32
+ # Training Parameters
33
+ lexicon_epochs: 50
34
+ lexicon_ctc_epochs: 10
35
+ lexicon_limit_to_stop: 50 # No stopping by default, can override
36
+ lexicon_limit_warmup: 50 # No stopping by default, can override
37
+ sentence_epochs: 13
38
+ sentence_ctc_epochs: 10
39
+ sentence_limit_to_stop: 3
40
+ sentence_limit_warmup: 3
41
+ homograph_epochs: 50
42
+ homograph_ctc_epochs: 10
43
+ homograph_limit_to_stop: 5
44
+ homograph_limit_warmup: 10
45
+ lexicon_batch_size: 1024
46
+ sentence_batch_size: 32
47
+ homograph_batch_size: 32
48
+ ctc_weight: 0.5
49
+ homograph_loss_weight: 2.0
50
+ lr: 0.002
51
+ save_for_pretrained: true
52
+
53
+ # Model parameters
54
+ output_neurons: &id004 !apply:speechbrain.utils.hparams.choice
55
+
56
+ value: false
57
+ choices:
58
+ true: 513
59
+ false: 43
60
+
61
+ enc_num_embeddings: &id005 !apply:speechbrain.utils.hparams.choice
62
+ value: false
63
+ choices:
64
+ true: 513
65
+ false: 31
66
+
67
+ enc_dropout: 0.5
68
+ enc_neurons: 512
69
+ enc_num_layers: 4
70
+ dec_dropout: 0.5
71
+ dec_neurons: 512
72
+ dec_att_neurons: 256
73
+ dec_num_layers: 4
74
+ embedding_dim: 512
75
+
76
+ # Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
77
+ # Available modes:
78
+ # raw: no BOS/EOS tokens are added
79
+ # bos: a beginning-of-sequence token is added
80
+ # eos: an end-of-sequence token is added
81
+ grapheme_sequence_mode: bos
82
+ phoneme_sequence_mode: bos
83
+
84
+
85
+ # Special Token information
86
+ bos_index: 0
87
+ eos_index: 1
88
+ blank_index: 2
89
+ unk_index: 2
90
+ token_space_index: 512
91
+
92
+
93
+ # Language Model
94
+ lm_emb_dim: 256 # dimension of the embeddings
95
+ lm_rnn_size: 512 # dimension of hidden layers
96
+ lm_layers: 2 # number of hidden layers
97
+ lm_output_neurons: 43
98
+
99
+ # Beam Searcher
100
+ use_language_model: false
101
+ beam_search_min_decode_ratio: 0
102
+ beam_search_max_decode_ratio: 1.0
103
+ beam_search_beam_size: 16
104
+ beam_search_beam_size_valid: 16
105
+ beam_search_eos_threshold: 10.0
106
+ beam_search_using_max_attn_shift: false
107
+ beam_search_max_attn_shift: 10
108
+ beam_search_coverage_penalty: 5.0
109
+ beam_search_lm_weight: 0.5
110
+ beam_search_ctc_weight_decode: 0.4
111
+ beam_search_temperature: 1.25
112
+ beam_search_temperature_lm: 1.0
113
+
114
+ # Word embeddings
115
+ use_word_emb: true
116
+ word_emb_model: bert-base-uncased
117
+ word_emb_dim: 768
118
+ word_emb_enc_dim: 256
119
+ word_emb_norm_type: batch
120
+
121
+ graphemes: &id028
122
+ - A
123
+ - B
124
+ - C
125
+ - D
126
+ - E
127
+ - F
128
+ - G
129
+ - H
130
+ - I
131
+ - J
132
+ - K
133
+ - L
134
+ - M
135
+ - N
136
+ - O
137
+ - P
138
+ - Q
139
+ - R
140
+ - S
141
+ - T
142
+ - U
143
+ - V
144
+ - W
145
+ - X
146
+ - Y
147
+ - Z
148
+ - "'"
149
+ - ' '
150
+
151
+ phonemes: &id001
152
+
153
+
154
+ - AA
155
+ - AE
156
+ - AH
157
+ - AO
158
+ - AW
159
+ - AY
160
+ - B
161
+ - CH
162
+ - D
163
+ - DH
164
+ - EH
165
+ - ER
166
+ - EY
167
+ - F
168
+ - G
169
+ - HH
170
+ - IH
171
+ - IY
172
+ - JH
173
+ - K
174
+ - L
175
+ - M
176
+ - N
177
+ - NG
178
+ - OW
179
+ - OY
180
+ - P
181
+ - R
182
+ - S
183
+ - SH
184
+ - T
185
+ - TH
186
+ - UH
187
+ - UW
188
+ - V
189
+ - W
190
+ - Y
191
+ - Z
192
+ - ZH
193
+ - ' '
194
+
195
+ enc_input_dim: &id003 !apply:speechbrain.lobes.models.g2p.model.input_dim
196
+ use_word_emb: true
197
+ word_emb_enc_dim: 256
198
+ embedding_dim: 512
199
+
200
+
201
+ phn_char_map: &id002 !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
202
+
203
+
204
+ # Models
205
+ tokens: *id001
206
+ char_phn_map: &id023 !apply:speechbrain.lobes.models.g2p.dataio.flip_map
207
+ map_dict: *id002
208
+ enc: &id006 !new:speechbrain.nnet.RNN.LSTM
209
+ input_shape: [null, null, *id003]
210
+ bidirectional: true
211
+ hidden_size: 512
212
+ num_layers: 4
213
+ dropout: 0.5
214
+
215
+ lin: &id010 !new:speechbrain.nnet.linear.Linear
216
+ input_size: 512
217
+ n_neurons: *id004
218
+ bias: false
219
+
220
+ ctc_lin: &id013 !new:speechbrain.nnet.linear.Linear
221
+ input_size: 1024
222
+ n_neurons: *id004
223
+ encoder_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
224
+ num_embeddings: *id005
225
+ embedding_dim: 512
226
+
227
+ emb: &id008 !new:speechbrain.nnet.embedding.Embedding
228
+ num_embeddings: *id004
229
+ embedding_dim: 512
230
+
231
+ dec: &id009 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
232
+ enc_dim: 1024
233
+ input_size: 512
234
+ rnn_type: gru
235
+ attn_type: content
236
+ dropout: 0.5
237
+ hidden_size: 512
238
+ attn_dim: 256
239
+ num_layers: 4
240
+
241
+ word_emb_enc: &id012 !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
242
+
243
+ word_emb_dim: 768
244
+ word_emb_enc_dim: 256
245
+ norm_type: batch
246
+
247
+ word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
248
+ init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
249
+ model: bert-base-uncased
250
+
251
+ log_softmax: &id011 !new:speechbrain.nnet.activations.Softmax
252
+ apply_log: true
253
+
254
+ modules:
255
+ model: &id014 !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
256
+ enc: *id006
257
+ encoder_emb: *id007
258
+ emb: *id008
259
+ dec: *id009
260
+ lin: *id010
261
+ out: *id011
262
+ use_word_emb: true
263
+ word_emb_enc: *id012
264
+ enc: *id006
265
+ encoder_emb: *id007
266
+ emb: *id008
267
+ dec: *id009
268
+ lin: *id010
269
+ ctc_lin: *id013
270
+ out: *id011
271
+ word_emb:
272
+ word_emb_enc: *id012
273
+ model: *id014
274
+ lm_model: &id015 !new:speechbrain.lobes.models.RNNLM.RNNLM
275
+ embedding_dim: 256
276
+ rnn_layers: 2
277
+ rnn_neurons: 512
278
+ output_neurons: 43
279
+ return_hidden: true
280
+
281
+ opt_class: !name:torch.optim.Adam
282
+ lr: 0.002
283
+
284
+ beam_searcher: &id029 !new:speechbrain.decoders.S2SRNNBeamSearcher
285
+ embedding: *id008
286
+ decoder: *id009
287
+ linear: *id010
288
+ ctc_linear: *id013
289
+ bos_index: 0
290
+ eos_index: 1
291
+ blank_index: 2
292
+ min_decode_ratio: 0
293
+ max_decode_ratio: 1.0
294
+ beam_size: 16
295
+ eos_threshold: 10.0
296
+ using_max_attn_shift: false
297
+ max_attn_shift: 10
298
+ coverage_penalty: 5.0
299
+ ctc_weight: 0.4
300
+
301
+ beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
302
+ embedding: *id008
303
+ decoder: *id009
304
+ linear: *id010
305
+ ctc_linear: *id013
306
+ bos_index: 0
307
+ eos_index: 1
308
+ blank_index: 2
309
+ min_decode_ratio: 0
310
+ max_decode_ratio: 1.0
311
+ beam_size: 16
312
+ eos_threshold: 10.0
313
+ using_max_attn_shift: false
314
+ max_attn_shift: 10
315
+ coverage_penalty: 5.0
316
+ ctc_weight: 0.4
317
+
318
+ beam_searcher_lm: !new:speechbrain.decoders.seq2seq.S2SRNNBeamSearchLM
319
+ embedding: *id008
320
+ decoder: *id009
321
+ linear: *id010
322
+ ctc_linear: *id013
323
+ language_model: *id015
324
+ bos_index: 0
325
+ eos_index: 1
326
+ blank_index: 2
327
+ min_decode_ratio: 0
328
+ max_decode_ratio: 1.0
329
+ beam_size: 16
330
+ eos_threshold: 10.0
331
+ using_max_attn_shift: false
332
+ max_attn_shift: 10
333
+ coverage_penalty: 5.0
334
+ ctc_weight: 0.4
335
+ lm_weight: 0.5
336
+ temperature: 1.25
337
+ temperature_lm: 1.0
338
+
339
+
340
+ lr_annealing: &id018 !new:speechbrain.nnet.schedulers.NewBobScheduler
341
+ initial_value: 0.002
342
+ improvement_threshold: 0.0
343
+ annealing_factor: 0.8
344
+ patient: 0
345
+
346
+ homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
347
+
348
+ seq_cost: &id016 !name:speechbrain.nnet.losses.nll_loss
349
+
350
+ label_smoothing: 0.1
351
+
352
+ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
353
+ blank_index: 2
354
+
355
+ seq_cost_metric: &id017 !name:speechbrain.nnet.losses.nll_loss
356
+
357
+ label_smoothing: 0.1
358
+ reduction: batch
359
+
360
+ homograph_cost: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceLoss
361
+ seq_cost: *id016
362
+ seq_stats: !name:speechbrain.utils.metric_stats.MetricStats
363
+ metric: *id017
364
+ seq_stats_homograph: !name:speechbrain.utils.metric_stats.MetricStats
365
+ metric: *id017
366
+ classification_stats_homograph: !name:speechbrain.utils.metric_stats.ClassificationStats
367
+
368
+ per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
369
+ per_stats_homograph: !name:speechbrain.utils.metric_stats.ErrorRateStats
370
+
371
+
372
+ model_output_keys:
373
+ - p_seq
374
+ - char_lens
375
+ - encoder_out
376
+
377
+ grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
378
+ phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
379
+
380
+
381
+ grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
382
+ init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
383
+ model_dir: grapheme_tokenizer
384
+ bos_id: 0
385
+ eos_id: 1
386
+ unk_id: 2
387
+ vocab_size: 512
388
+ annotation_train: tokenizer_annotation_train.json
389
+ annotation_read: char
390
+ model_type: unigram # ["unigram", "bpe", "char"]
391
+ character_coverage: 1.0
392
+ annotation_format: json
393
+ text_file: grapheme_annotations.txt
394
+
395
+ phoneme_tokenizer: &id022 !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
396
+ init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
397
+ model_dir: phoneme_tokenizer
398
+ bos_id: 0
399
+ eos_id: 1
400
+ unk_id: 2
401
+ vocab_size: 512
402
+ annotation_train: tokenizer_annotation_train.json
403
+ annotation_read: phn
404
+ model_type: unigram # ["unigram", "bpe", "char"]
405
+ character_coverage: 1.0
406
+ annotation_list_to_check: [tokenizer_annotation_valid.json]
407
+ annotation_format: json
408
+ text_file: phoneme_annotations.txt
409
+
410
+ out_phoneme_decoder_tok: &id025 !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
411
+ tokenizer: *id022
412
+ char_map: *id023
413
+ token_space_index: 512
414
+ wordwise: true
415
+
416
+ out_phoneme_decoder_raw: &id026 !name:speechbrain.lobes.models.g2p.dataio.text_decode
417
+
418
+ encoder: *id024
419
+ out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
420
+ value: false
421
+ choices:
422
+ true: *id025
423
+ false: *id026
424
+ encode_pipeline:
425
+ batch: false
426
+ use_padded_data: true
427
+ output_keys:
428
+ - grapheme_list
429
+ - grapheme_encoded_list
430
+ - grapheme_encoded
431
+ - word_emb
432
+ init:
433
+ - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
434
+ encoder: *id027
435
+ tokens: *id028
436
+ bos_index: 0
437
+ eos_index: 1
438
+ - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
439
+ encoder: *id024
440
+ tokens: *id001
441
+ bos_index: 0
442
+ eos_index: 1
443
+ steps:
444
+ - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
445
+ graphemes: *id028
446
+ takes: txt
447
+ provides: txt_cleaned
448
+ - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
449
+ grapheme_encoder: *id027
450
+ takes: txt_cleaned
451
+ provides:
452
+ - grapheme_list
453
+ - grapheme_encoded_list
454
+ - grapheme_encoded_raw
455
+
456
+ - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
457
+ encoder: *id027
458
+ takes: grapheme_encoded_list
459
+ provides:
460
+ - grapheme_encoded
461
+ - grapheme_len
462
+ - grapheme_encoded_eos
463
+ - grapheme_len_eos
464
+ - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
465
+ word_emb: !ref <word_emb>
466
+ grapheme_encoder: !ref <grapheme_encoder>
467
+ use_word_emb: !ref <use_word_emb>
468
+ takes:
469
+ - txt
470
+ - grapheme_encoded
471
+ - grapheme_len
472
+ provides: word_emb
473
+
474
+ decode_pipeline:
475
+ batch: true
476
+ output_keys:
477
+ - phonemes
478
+ steps:
479
+ - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
480
+ beam_searcher: *id029
481
+ takes:
482
+ - char_lens
483
+ - encoder_out
484
+ provides:
485
+ - hyps
486
+ - scores
487
+ - func: !apply:speechbrain.utils.hparams.choice
488
+ value: false
489
+ choices:
490
+ true: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
491
+ tokenizer: *id022
492
+ char_map: *id023
493
+ token_space_index: 512
494
+ wordwise: true
495
+ false: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
496
+ phoneme_encoder: *id024
497
+ takes:
498
+ - hyps
499
+ provides:
500
+ - phonemes
501
+
502
+
503
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
504
+ loadables:
505
+ model: *id014
506
+ ctc_lin: *id013
507
+
models/sb_phonemizer/model.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:71bf7a7b290f88de5fdd7364fa4ab249bdd94a29e6cdc742ee6f69edeae64f61
size 128643257
phoneme_vocab.json ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "aa": 1, "ae": 2, "ah": 3, "ao": 4, "aw": 5, "ay": 6, "eh": 7, "er": 8, "ey": 9, "ih": 10, "iy": 11, "ow": 12, "oy": 13, "uh": 14, "uw": 15, "b": 16, "ch": 17, "d": 18, "dh": 19, "f": 20, "g": 21, "hh": 22, "jh": 23, "k": 24, "l": 25, "m": 26, "n": 27, "nd": 28, "ng": 29, "p": 30, "r": 31, "s": 32, "sh": 33, "sil": 34, "t": 35, "th": 36, "v": 37, "w": 38, "y": 39, "z": 40, "zh": 41}
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/ctc_lin.ckpt ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/ctc_lin.ckpt
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/custom.py ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/custom.py
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/hyperparams.yaml ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/hyperparams.yaml
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/model.ckpt ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/model.ckpt
requirements.txt ADDED
@@ -0,0 +1,8 @@
datasets==2.16.1
deep-phonemizer==0.0.19
speechbrain==0.5.16
cmudict==1.0.22
fire==0.6.0
python-Levenshtein==0.25.0
librosa==0.10.1
transformers==4.37.2
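The pinned packages are installed with pip install -r requirements.txt; torch, numpy, scipy, and pandas come in as transitive dependencies of these pins, but gradio and matplotlib, which app.py imports, do not appear in this list and would need to be installed separately.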
transcriber.py ADDED
@@ -0,0 +1,283 @@
1
+ import fire
2
+ import logging
3
+ import sys, os
4
+ import yaml
5
+ import json
6
+ import torch
7
+ import librosa
8
+ from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC
9
+ import transformers
10
+ import pandas as pd
11
+
12
+ logger = logging.getLogger(__name__)
13
+ # Setup logging
14
+ logger.setLevel(logging.ERROR)
15
+ console_handler = logging.StreamHandler()
16
+ formater = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
17
+ datefmt="%m/%d/%Y %H:%M:%S",)
18
+ console_handler.setFormatter(formater)
19
+ console_handler.setLevel(logging.ERROR)
20
+
21
+ logger.addHandler(console_handler)
22
+
23
+
24
+ class transcribe_SA():
25
+ def __init__(self, model_path, verbose=0):
26
+ if verbose == 0:
27
+ logger.setLevel(logging.ERROR)
28
+ transformers.logging.set_verbosity_error()
29
+ #console_handler.setLevel(logging.ERROR)
30
+ elif verbose == 1:
31
+ logger.setLevel(logging.WARNING)
32
+ transformers.logging.set_verbosity_warning()
33
+ #console_handler.setLevel(logging.WARNING)
34
+ else:
35
+ logger.setLevel(logging.INFO)
36
+ transformers.logging.set_verbosity_info()
37
+ #console_handler.setLevel(logging.INFO)
38
+ # Read YAML file
39
+ logger.info('Init Object')
40
+ if torch.cuda.is_available():
41
+ self.accelerate = True
42
+ self.device = torch.device('cuda')
43
+ self.n_devices = torch.cuda.device_count()
44
+ assert self.n_devices == 1, 'Support only single GPU. Please use CUDA_VISIBLE_DEVICES=gpu_index if you have multiple gpus' #Currently support only single gpu
45
+ else:
46
+ self.device = torch.device('cpu')
47
+ self.n_devices = 1
48
+ self.model_path = model_path
49
+ self.load_model()
50
+ self.get_available_attributes()
51
+ self.get_att_binary_group_indexs()
52
+
53
+ def load_model(self):
54
+ if not os.path.exists(self.model_path):
55
+ logger.error(f'Model file {self.model_path} is not exist')
56
+ raise FileNotFoundError
57
+
58
+ self.processor = Wav2Vec2Processor.from_pretrained(self.model_path)
59
+ self.model = Wav2Vec2ForCTC.from_pretrained(self.model_path)
60
+ self.pad_token_id = self.processor.tokenizer.pad_token_id
61
+ self.sampling_rate = self.processor.feature_extractor.sampling_rate
62
+
63
+ def get_available_attributes(self):
64
+ if not hasattr(self, 'model'):
65
+ logger.error('model not loaded, call load_model first!')
66
+ raise AttributeError("model not defined")
67
+ att_list = set(self.processor.tokenizer.get_vocab().keys()) - set(self.processor.tokenizer.all_special_tokens)
68
+ att_list = [p.replace('p_','') for p in att_list if p[0]=='p']
69
+ self.att_list = att_list
70
+
71
+ def print_availabel_attributes(self):
72
+ print(self.att_list)
73
+
74
+
75
+ def get_att_binary_group_indexs(self):
76
+ self.group_ids = [] #Each group contains the token_ids of [<PAD>, n_att, p_att] sorted by their token ids
77
+ for i, att in enumerate(self.att_list):
78
+ n_indx = self.processor.tokenizer.convert_tokens_to_ids(f'n_{att}')
79
+ p_indx = self.processor.tokenizer.convert_tokens_to_ids(f'p_{att}')
80
+ self.group_ids.append(sorted([self.pad_token_id, n_indx, p_indx]))
81
+
82
+ def decode_att(self, logits, att): #Need to lowercase when first read from the user
83
+ mask = torch.zeros(logits.size()[2], dtype = torch.bool)
84
+ try:
85
+ i = self.att_list.index(att)
86
+ except ValueError:
87
+ logger.error(f'The given attribute {att} not supported in the given model {self.model_path}')
88
+ raise
89
+ mask[self.group_ids[i]] = True
90
+ logits_g = logits[:,:,mask]
91
+ pred_ids = torch.argmax(logits_g,dim=-1)
92
+ pred_ids = pred_ids.cpu().apply_(lambda x: self.group_ids[i][x])
93
+ pred = self.processor.batch_decode(pred_ids,spaces_between_special_tokens=True)[0].split()
94
+ return list(map(lambda x:{f'p_{att}':'+',f'n_{att}':'-'}[x], pred))
95
+
96
+ def read_audio_file(self, audio_file):
97
+ if not os.path.exists(audio_file):
98
+ logger.error(f'Audio file {audio_file} is not exist')
99
+ raise FileNotFoundError
100
+ y, _ = librosa.load(audio_file, sr=self.sampling_rate)
101
+
102
+ return y
103
+
104
+
105
+ def get_logits(self, y):
106
+
107
+ input_values = self.processor(audio=y, sampling_rate=self.sampling_rate, return_tensors="pt").input_values
108
+
109
+ with torch.no_grad():
110
+ logits = self.model(input_values).logits
111
+
112
+ return logits
113
+
114
+
115
+ def check_identical_phonemes(self, df_p2att):
+ identical_phonemes = []
+ for index,row in df_p2att.iterrows():
+ mask = df_p2att.eq(row).all(axis=1)
+ indexes = df_p2att[mask].index.values
+ if len(indexes) > 1:
+ identical_phonemes.append(tuple(indexes))
+ if identical_phonemes:
+ logger.warning('The following phonemes have identical phonological features given the attributes used in the model. If a fixed-weight layer is used, these phonemes will be confused with each other')
+ identical_phonemes = set(identical_phonemes)
+ for x in identical_phonemes:
+ logger.warning(f"{','.join(x)}")
+
+ def read_phoneme2att(self,p2att_file):
+
+ if not os.path.exists(p2att_file):
+ logger.error(f'Phonological matrix file {p2att_file} does not exist')
+ raise FileNotFoundError(f'{p2att_file}')
+
+ df_p2att = pd.read_csv(p2att_file, index_col=0)
+
+ self.check_identical_phonemes(df_p2att)
+ not_supported = set(df_p2att.columns) - set(self.att_list)
+ if not_supported:
+ logger.warning(f"Attribute(s) {','.join(not_supported)} are not supported by the model {self.model_path} and will be ignored. To get the available attributes of the selected model run transcribe --model_path=/path/to/model print_available_attributes")
+ df_p2att = df_p2att.drop(columns=not_supported)
+
+ self.phoneme_list = df_p2att.index.values
+ self.p2att_map = {}
+ for i, r in df_p2att.iterrows():
+ phoneme = i
+ self.p2att_map[phoneme] = []
+ for att in r.index.values:
+ if f'p_{att}' not in self.processor.tokenizer.vocab:
+ logger.warning(f'Attribute {att} is not supported by the model {self.model_path} and will be ignored. To get the available attributes of the selected model run transcribe --model_path=/path/to/model print_available_attributes')
+ continue
+ value = r[att]
+ if value == 0:
+ self.p2att_map[phoneme].append(f'n_{att}')
+ elif value == 1:
+ self.p2att_map[phoneme].append(f'p_{att}')
+ else:
+ logger.error(f'Invalid value of {value} for attribute {att} of phoneme {phoneme}. Values in the phoneme-to-attribute map should be either 0 or 1')
+ raise ValueError(f'{value} should be 0 or 1')
+
+
+ def create_phoneme_tokenizer(self):
+ vocab_list = self.phoneme_list
+ vocab_dict = {v: k+1 for k, v in enumerate(vocab_list)}
+ vocab_dict['<pad>'] = 0
+ vocab_dict = dict(sorted(vocab_dict.items(), key=lambda x: x[1]))
+ vocab_file = 'phoneme_vocab.json'
+ with open(vocab_file, 'w') as f:
+ json.dump(vocab_dict, f)
+ #Build processor
+ self.phoneme_tokenizer = Wav2Vec2CTCTokenizer(vocab_file, pad_token="<pad>", word_delimiter_token="")
+
+ def create_phonological_matrix(self):
+ self.phonological_matrix = torch.zeros((self.phoneme_tokenizer.vocab_size, self.processor.tokenizer.vocab_size)).type(torch.FloatTensor)
+ self.phonological_matrix[self.phoneme_tokenizer.pad_token_id, self.processor.tokenizer.pad_token_id] = 1
+ for p in self.phoneme_list:
+ for att in self.p2att_map[p]:
+ self.phonological_matrix[self.phoneme_tokenizer.convert_tokens_to_ids(p), self.processor.tokenizer.convert_tokens_to_ids(att)] = 1
+
+
+ #This function takes the attribute logits from the output layer and converts them to phonemes
+ #Input is a sequence of logits (one vector per frame); output is a phoneme sequence
+ #Note that this is CTC, so the number of output phonemes is not equal to the number of input frames
+ def decode_phoneme(self,logits):
+ def masked_log_softmax(vector: torch.Tensor, mask: torch.Tensor, dim: int = -1) -> torch.Tensor:
+ if mask is not None:
+ mask = mask.float()
+ while mask.dim() < vector.dim():
+ mask = mask.unsqueeze(1)
+ # vector + mask.log() is an easy way to zero out masked elements in logspace, but it
+ # results in nans when the whole vector is masked. We need a very small value instead of a
+ # zero in the mask for these cases. log(1 + 1e-45) is still basically 0, so we can safely
+ # just add 1e-45 before calling mask.log(). We use 1e-45 because 1e-46 is so small it
+ # becomes 0 - this is just the smallest value we can actually use.
+ vector = vector + (mask + 1e-45).log()
+ return torch.nn.functional.log_softmax(vector, dim=dim)
+
+ log_probs_all_masked = []
+ for i in range(len(self.att_list)):
+ mask = torch.zeros(logits.size()[2], dtype = torch.bool)
+ mask[self.group_ids[i]] = True
+ mask.unsqueeze_(0).unsqueeze_(0)
+ log_probs = masked_log_softmax(vector=logits, mask=mask, dim=-1).masked_fill(~mask,0)
+ log_probs_all_masked.append(log_probs)
+ log_probs_cat = torch.stack(log_probs_all_masked, dim=0).sum(dim=0)
+ log_probs_phoneme = torch.matmul(self.phonological_matrix,log_probs_cat.transpose(1,2)).transpose(1,2).type(torch.FloatTensor)
+ pred_ids = torch.argmax(log_probs_phoneme,dim=-1)
+ pred = self.phoneme_tokenizer.batch_decode(pred_ids,spaces_between_special_tokens=True)[0]
+ return pred
+
+
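To make the mapping above concrete, here is a small self-contained sketch (an illustration, not part of the commit) of the same idea: per-attribute masked log-softmax scores are summed and projected onto phonemes through a binary phoneme-to-attribute matrix. Attribute names, token ids, and shapes are invented for the example.

    import torch

    # Toy vocabulary: <pad>=0, n_voiced=1, p_voiced=2, n_nasal=3, p_nasal=4
    vocab_size = 5
    group_ids = [[0, 1, 2], [0, 3, 4]]       # one group per attribute
    logits = torch.randn(1, 6, vocab_size)   # (batch, frames, vocab) from the acoustic model

    # Per-attribute log-softmax restricted to that attribute's three tokens
    log_probs_sum = torch.zeros_like(logits)
    for ids in group_ids:
        mask = torch.zeros(vocab_size, dtype=torch.bool)
        mask[ids] = True
        masked = logits.masked_fill(~mask, float('-inf'))
        log_probs_sum += torch.log_softmax(masked, dim=-1).masked_fill(~mask, 0)

    # Binary phoneme rows: <pad> keeps <pad>, /m/ is p_voiced + p_nasal, /s/ is n_voiced + n_nasal
    phonological_matrix = torch.tensor([
        [1., 0., 0., 0., 0.],   # <pad>
        [0., 0., 1., 0., 1.],   # m
        [0., 1., 0., 1., 0.],   # s
    ])

    phoneme_scores = phonological_matrix @ log_probs_sum.transpose(1, 2)   # (batch, phonemes, frames)
    pred_ids = phoneme_scores.transpose(1, 2).argmax(dim=-1)               # frame-level phoneme ids
    print(pred_ids)   # CTC-style collapsing would follow, as batch_decode does in decode_phoneme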
+ def print_human_readable(self, output, with_phoneme = False):
+ column_widths = []
+ rows = []
+ if with_phoneme:
+ column_widths.append(max([len(att['Name']) for att in output['Attributes']]+[len('Phoneme')]))
+ column_widths.extend([5]*max([len(att['Pattern']) for att in output['Attributes']]+[len(output['Phoneme']['symbols'])]))
+ rows.append(('Phoneme'.center(column_widths[0]), *[s.center(column_widths[j+1]) for j,s in enumerate(output['Phoneme']['symbols'])]))
+ else:
+ column_widths.append(max([len(att['Name']) for att in output['Attributes']]))
+ column_widths.extend([5]*max([len(att['Pattern']) for att in output['Attributes']]))
+ for i in range(len(output['Attributes'])):
+ att = output['Attributes'][i]
+ rows.append((att['Name'].center(column_widths[0]), *[s.center(column_widths[j+1]) for j,s in enumerate(att['Pattern'])]))
+ out_string = ''
+ for row in rows:
+ out_string += '|'.join(row)
+ out_string += '\n'
+ return out_string
+
+ def transcribe(self, audio_file,
+ attributes='all',
+ phonological_matrix_file = None,
+ human_readable = True):
+
+
+ output = {}
+ output['wav_file_path'] = audio_file
+ output['Attributes'] = []
+ output['Phoneme'] = {}
+
+ #Initiate the model
+ #self.load_model()
+ #self.get_available_attributes()
+ #self.get_att_binary_group_indexs()
+
+ if attributes == 'all':
+ target_attributes = self.att_list
+ else:
+ attributes = attributes if isinstance(attributes,tuple) else (attributes,)
+ target_attributes = [att.lower() for att in attributes if att.lower() in self.att_list]
+
+ if not target_attributes:
+ logger.error(f'None of the given attributes is supported by the model {self.model_path}. To get the available attributes of the selected model run transcribe --model_path=/path/to/model print_available_attributes')
+ raise ValueError("Invalid attributes")
+
+ #Process audio
+ y = self.read_audio_file(audio_file)
+ self.logits = self.get_logits(y)
+
+ for att in target_attributes:
+ output['Attributes'].append({'Name':att, 'Pattern' : self.decode_att(self.logits, att)})
+
+ if phonological_matrix_file:
+ self.read_phoneme2att(phonological_matrix_file)
+ self.create_phoneme_tokenizer()
+ self.create_phonological_matrix()
+ output['Phoneme']['symbols'] = self.decode_phoneme(self.logits).split()
+
+
+
+ json_string = json.dumps(output, indent=4)
+ if human_readable:
+ return self.print_human_readable(output, phonological_matrix_file is not None)
+ else:
+ return json_string
+
+
+ def main():
+ fire.Fire(transcribe_SA)
+
+ if __name__ == '__main__':
+ main()
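Because the class is exposed through fire.Fire, the transcriber can be driven from the command line. A minimal usage sketch, assuming the script is saved as transcribe.py and that the constructor accepts model_path as shown above; the file names and paths are placeholders, not part of the commit:

    python transcribe.py --model_path=models/sa_model print_available_attributes
    python transcribe.py --model_path=models/sa_model transcribe sample.wav --attributes=all
    python transcribe.py --model_path=models/sa_model transcribe sample.wav --phonological_matrix_file=p2att.csv --human_readable=False

With python-fire, the flags before the method name are forwarded to the constructor, the method name selects transcribe or print_available_attributes, and the remaining arguments are passed to that method.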