Mostafa Shahin committed
Commit 4c01711
1 parent: fbc2e1b

First Commit

Phonemize.py ADDED
@@ -0,0 +1,104 @@
from datasets import load_from_disk
from dp.phonemizer import Phonemizer
from speechbrain.pretrained import GraphemeToPhoneme
import cmudict
import re
import fire
import torch
from os.path import join

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.multiprocessing.set_start_method('spawn')


class phonemization:
    def __init__(self):
        # Hyphen is escaped so the character class does not form an unintended range.
        self.chars_to_ignore_regex = r'[,?.!\-;:"]'
        self.dp_phonemizer_model_path = join('models', 'd_phonemizer', 'en_us_cmudict_forward.pt')
        self.sb_phonemizer_model_path = join('models', 'sb_phonemizer')

        self.cmu_dict = cmudict.dict()
        self.dp_phonemizer = Phonemizer.from_checkpoint(self.dp_phonemizer_model_path)
        if torch.cuda.is_available():
            self.sb_phonemizer = GraphemeToPhoneme.from_hparams(self.sb_phonemizer_model_path, run_opts={"device": "cuda"})
        else:
            self.sb_phonemizer = GraphemeToPhoneme.from_hparams(self.sb_phonemizer_model_path)
        self.normalize = False

    def dp_phonemize(self, text):
        # The DeepPhonemizer output wraps phonemes in square brackets; strip them and split into a list.
        return self.dp_phonemizer(text, lang='en_us', expand_acronyms=False).replace('[', ' ').replace(']', ' ').split()

    def cmu_phonemize(self, text, fallback_phonemizer=dp_phonemize):
        # Look each word up in the CMU dictionary (stress digits removed); unknown words
        # are delegated to the fallback phonemizer.
        phoneme_lst = []
        for word in text.split():
            if word in self.cmu_dict:
                phoneme_lst.extend(re.sub('[0-9]', '', ' '.join(self.cmu_dict.get(word)[0])).split())
            else:
                phoneme_lst.extend(fallback_phonemizer(self, word))
        phoneme_lst = [p.lower() for p in phoneme_lst]
        return phoneme_lst

    def sb_phonemize(self, text):
        return self.sb_phonemizer(text)

    def remove_special_characters(self, text):
        #print(text)
        return re.sub(self.chars_to_ignore_regex, ' ', text).lower() + " "

    def replace_multiple_spaces_with_single_space(self, input_string):
        """Replace multiple spaces with a single space."""
        return re.sub(r'\s+', ' ', input_string)

    def phonemize_batch(self, batch, phonamizer_fn=dp_phonemize, suffix=''):
        if self.normalize:
            text = batch['text_norm'].lower()
        else:
            text = batch['text'].lower()
        phoneme_str = ' '.join(phonamizer_fn(text))
        phoneme_str = phoneme_str.lower()
        phoneme_str = self.replace_multiple_spaces_with_single_space(phoneme_str)
        batch[f'phoneme{suffix}'] = phoneme_str.strip()
        return batch

    def remove_special_characters_batch(self, batch):
        batch["text_norm"] = self.remove_special_characters(batch["text"])
        return batch

    def run(self, dataset_path, output_path, phonemizers='dp,sb,cmu', normalize=True, nproc=1):
        data = load_from_disk(dataset_path)

        if normalize:
            data = data.map(self.remove_special_characters_batch, num_proc=nproc)
        self.normalize = normalize  # make phonemize_batch read the normalized text when requested
        for phonemizer in phonemizers.split(','):
            if phonemizer == 'cmu':
                data = data.map(self.phonemize_batch, fn_kwargs={'phonamizer_fn': self.cmu_phonemize, 'suffix': '_cmu'}, num_proc=nproc)
            if phonemizer == 'dp':
                data = data.map(self.phonemize_batch, fn_kwargs={'phonamizer_fn': self.dp_phonemize, 'suffix': '_dp'}, num_proc=nproc)
            if phonemizer == 'sb':
                if torch.cuda.is_available():
                    nproc = torch.cuda.device_count()
                data = data.map(self.phonemize_batch, fn_kwargs={'phonamizer_fn': self.sb_phonemize, 'suffix': '_sb'}, num_proc=nproc, cache_file_name='/g/data/iv96/mostafa/cache_sb', load_from_cache_file=True)
        data.save_to_disk(output_path)


if __name__ == '__main__':
    fire.Fire(phonemization)
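The run method is the fire entry point, so each phonemizer can be applied to a dataset saved with datasets.save_to_disk straight from the command line, and the class can also be used interactively. A minimal sketch, assuming the model files listed later in this commit are in place; paths and example words are illustrative:

# Command line (via fire), paths illustrative:
#   python Phonemize.py run --dataset_path=/path/to/dataset --output_path=/path/to/output --phonemizers=dp,cmu --nproc=4
from Phonemize import phonemization

p = phonemization()
print(p.cmu_phonemize('top'))    # e.g. ['t', 'aa', 'p'] from the CMU dictionary
print(p.dp_phonemize('hello'))   # words outside the dictionary fall back to the DeepPhonemizer checkpoint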
__pycache__/Phonemize.cpython-312.pyc ADDED
Binary file (6.31 kB).
 
__pycache__/transcriber.cpython-312.pyc ADDED
Binary file (20.6 kB).
 
app.py ADDED
@@ -0,0 +1,167 @@
import random
import Phonemize
from Levenshtein import editops
from gradio.components import Audio, Dropdown, Textbox, Image
import gradio as gr
import transcriber
import json
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram
import numpy as np


engine = transcriber.transcribe_SA(model_path='models/SA', verbose=0)
phonemizer = Phonemize.phonemization()

prompts = np.loadtxt('data/prompts.txt', dtype=str)

Attributes = engine.att_list
df_output = None


def select_prompt():
    return random.choice(prompts)


def phonemize_prompt(prompt):
    return ' '.join(phonemizer.cmu_phonemize(prompt)).lower()


def diff_fn():
    return [('H', '+'), ('E', '-'), ('N', None), ('\n', None), ('F', '-'), ('Fgo', '-'), ('M', '+')]


def recognizeAudio(audio_file, attributes):
    #print(','.join(attributes))
    global df_output
    output = engine.transcribe(audio_file, attributes=tuple(attributes), phonological_matrix_file='data/p2att_en_us-arpa.csv', human_readable=False)
    records = []
    d = json.loads(output)
    records.append(['Phoneme'] + d['Phoneme']['symbols'])
    for att in d['Attributes']:
        records.append([att['Name']] + att['Pattern'])
    df = pd.DataFrame.from_records(records)
    df.fillna('', inplace=True)
    df_output = df
    return df.to_html(header=False, index=False)


# Get error by matching the expected sequence with the recognized one and return the output
# in a format that can be visualized by the gradio HighlightedText box
def get_error(exp_list, rec_list):
    exp_list = list(exp_list)
    rec_list = list(rec_list)
    vocab = set(exp_list + rec_list)
    w2c = dict(zip(vocab, range(len(vocab))))

    exp_out = [[a, None] for a in exp_list]
    rec_out = [[a, None] for a in rec_list]
    exp_enc = ''.join([chr(w2c[c]) for c in exp_list])
    rec_enc = ''.join([chr(w2c[c]) for c in rec_list])

    for op, exp_i, rec_i in editops(exp_enc, rec_enc):
        if op == 'replace':
            exp_out[exp_i][1] = 'S'
            rec_out[rec_i][1] = 'S'
        elif op == 'insert':
            rec_out[rec_i][1] = 'I'
        elif op == 'delete':
            exp_out[exp_i][1] = 'D'

    diff_list = [['Expected:\t', None]] + exp_out + [['\n', None]] + [['Recognized:\t', None]] + rec_out
    return diff_list


def scale_vector(vector, new_min, new_max):
    min_val = min(vector)
    max_val = max(vector)
    scaled_vector = []
    for val in vector:
        scaled_val = ((val - min_val) * (new_max - new_min) / (max_val - min_val)) + new_min
        scaled_vector.append(scaled_val)
    return scaled_vector


def create_spectrogram_with_att(wav_file, att_contour, att):
    # Read the WAV file
    sampling_rate, data = wavfile.read(wav_file)

    # Calculate the spectrogram
    f, t, Sxx = spectrogram(data, fs=sampling_rate)
    fig, ax = plt.subplots(figsize=(10, 5))

    # Plot the spectrogram
    ax.pcolormesh(t, f, 10 * np.log10(Sxx), shading='gouraud')
    ax.set_ylabel('Frequency (Hz)')
    ax.set_xlabel('Time (s)')
    ax.set_title(f'Spectrogram with {att} Contour')
    ax.set_ylim(0, 8000)  # Adjust the frequency range if necessary

    # Plot the att contour
    time_pitch = np.arange(0, len(att_contour) * 0.02, 0.02)  # Assuming pitch_contour is sampled every 20 ms
    ax.plot(time_pitch, att_contour, color='blue', label=f'{att} Contour')
    ax.legend()

    return fig


def plot_contour(audio_file, att):
    indx = engine.processor.tokenizer.convert_tokens_to_ids([f'p_{att}'])
    att_contour = engine.logits.squeeze()[:, indx]
    att_contour = scale_vector(att_contour, 0, 6000)
    fig = create_spectrogram_with_att(audio_file, att_contour, att)
    return fig


with gr.Blocks() as gui:
    with gr.Tab("Main"):
        prompt = gr.Textbox(label='Prompt', value=select_prompt)
        get_prompt = gr.Button("Get Prompt")
        get_prompt.click(fn=select_prompt, outputs=prompt)

        prompt_phonemes = gr.Textbox(label="Expected Phonemes", interactive=False)
        get_phoneme = gr.Button("Get Phonemes")
        get_phoneme.click(fn=phonemize_prompt, inputs=prompt, outputs=prompt_phonemes)

        record_audio = gr.Audio(sources=["microphone", "upload"], type="filepath")
        att_list = gr.Dropdown(label="Select Attributes", choices=Attributes, value=['vowel', 'voiced', 'consonant'], multiselect=True)
        process = gr.Button("Process Audio")

        recognition = gr.HTML(label='Output')

        process.click(fn=recognizeAudio, inputs=[record_audio, att_list], outputs=recognition)

    with gr.Tab("Assessment"):
        assess = gr.Button("Assessment")
        diff = []
        for i in range(len(Attributes) + 1):
            diff.append(gr.HighlightedText(
                combine_adjacent=False,
                show_legend=True,
                color_map={"S": "red", "I": "green", "D": "blue"}, visible=False))

        def get_assessment(prompt_phonemes):  #, recognized_phonemes, recognized_attributes):
            outputs = [gr.HighlightedText(visible=False)] * (len(Attributes) + 1)
            outputs[0] = gr.HighlightedText(label="Phoneme Assessment",
                                            value=get_error(prompt_phonemes.split(), df_output.iloc[0].values[1:]),
                                            visible=True)
            i = 1
            for i, r in df_output.iloc[1:].iterrows():
                convert = lambda ph: '-' if f'n_{att}' in engine.p2att_map[ph] else '+'
                att = r.iloc[0]
                exp_att = [convert(ph) for ph in prompt_phonemes.split()]
                rec_att = r.iloc[1:].values
                outputs[i] = gr.HighlightedText(label=f"{att} Assessment",
                                                value=get_error(exp_att, rec_att),
                                                visible=True)
                i += 1

            return outputs

        assess.click(fn=get_assessment, inputs=[prompt_phonemes], outputs=diff)

    with gr.Tab("Analysis"):
        selected_att = gr.Dropdown(Attributes, label="Select an Attribute to plot", value='voiced', interactive=True)
        do_plot = gr.Button('Plot')
        plot_block = gr.Plot(label='Spectrogram with Attribute Contour')
        do_plot.click(plot_contour, inputs=[record_audio, selected_att], outputs=plot_block)

gui.launch()
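get_error drives the HighlightedText boxes in the Assessment tab: it encodes each phoneme or attribute symbol as a single character, aligns the expected and recognized strings with Levenshtein.editops, and marks substitutions, insertions, and deletions as 'S', 'I', and 'D'. A small sketch of the underlying alignment, using toy sequences rather than repository data:

from Levenshtein import editops  # same dependency as app.py

# 'tap' vs 'tbpc': position 1 is a substitution, position 3 an insertion.
print(editops('tap', 'tbpc'))  # [('replace', 1, 1), ('insert', 3, 3)]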
data/p2att_en_us-arpa.csv ADDED
@@ -0,0 +1,42 @@
Phoneme_arpa,alveolar,palatal,dental,glottal,labial,velar,anterior,posterior,retroflex,high,low,mid,front,back,central,consonant,sonorant,long,short,vowel,semivowel,fricative,nasal,stop,approximant,affricate,liquid,continuant,monophthong,diphthong,round,voiced,labiodental,obstruent,bilabial,coronal,dorsal
aa,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ae,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ah,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ao,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
aw,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0
ay,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
eh,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
er,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ey,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0
ih,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
iy,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0
ow,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0
oy,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0
uh,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
uw,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
b,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0
ch,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
d,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0
dh,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0
f,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0
g,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
hh,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1
jh,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0
k,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1
l,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0
m,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
n,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
nd,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
ng,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1
p,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
r,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0
s,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
sh,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
sil,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
t,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
th,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
v,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0
w,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0
y,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0
z,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0
zh,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0
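Each row of this matrix is the binary phonological feature vector of one ARPAbet phoneme; transcriber.read_phoneme2att turns a 1 into the positive token p_<attribute> and a 0 into the negative token n_<attribute> used in the model vocabulary. A minimal sketch of reading one row with pandas; the printed list is simply what the 'b' row above encodes:

import pandas as pd

# Load the phoneme-to-attribute matrix; the first column (Phoneme_arpa) is the index.
df = pd.read_csv('data/p2att_en_us-arpa.csv', index_col=0)

# Convert one row into the p_/n_ attribute tokens, mirroring transcriber.read_phoneme2att.
row = df.loc['b']
tokens = [f'p_{att}' if v == 1 else f'n_{att}' for att, v in row.items()]
print([t for t in tokens if t.startswith('p_')])
# ['p_labial', 'p_anterior', 'p_consonant', 'p_stop', 'p_voiced', 'p_obstruent', 'p_bilabial']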
data/prompts.txt ADDED
@@ -0,0 +1,44 @@
Top
Cop
Tight
Kite
Torn
Corn
Tame
Came
Tall
Call
Tail
Kale
Bat
Back
Pit
Pick
Ate
Ache
But
Buck
Sit
Sick
Rate
Rake
Date
Gate
Deer
Gear
Drip
Grip
Down
Gown
Doe
Go
Bid
Big
Led
Leg
Mud
Mug
Bud
Bug
Bed
Beg
models/SA/added_tokens.json ADDED
@@ -0,0 +1,5 @@
{
  "</s>": 72,
  "<s>": 71,
  "<unk>": 73
}
models/SA/config.json ADDED
@@ -0,0 +1,109 @@
1
+ {
2
+ "_name_or_path": "/g/data/iv96/mostafa/Speech-Attribute-Transcription/models/wav2vec2-large-robust/",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 768,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_dropout_prob": 0.1,
59
+ "hidden_size": 1024,
60
+ "initializer_range": 0.02,
61
+ "intermediate_size": 4096,
62
+ "layer_norm_eps": 1e-05,
63
+ "layerdrop": 0.1,
64
+ "mask_feature_length": 10,
65
+ "mask_feature_min_masks": 0,
66
+ "mask_feature_prob": 0.0,
67
+ "mask_time_length": 10,
68
+ "mask_time_min_masks": 2,
69
+ "mask_time_prob": 0.05,
70
+ "model_type": "wav2vec2",
71
+ "num_adapter_layers": 3,
72
+ "num_attention_heads": 16,
73
+ "num_codevector_groups": 2,
74
+ "num_codevectors_per_group": 320,
75
+ "num_conv_pos_embedding_groups": 16,
76
+ "num_conv_pos_embeddings": 128,
77
+ "num_feat_extract_layers": 7,
78
+ "num_hidden_layers": 24,
79
+ "num_negatives": 100,
80
+ "output_hidden_size": 1024,
81
+ "pad_token_id": 0,
82
+ "proj_codevector_dim": 768,
83
+ "tdnn_dilation": [
84
+ 1,
85
+ 2,
86
+ 3,
87
+ 1,
88
+ 1
89
+ ],
90
+ "tdnn_dim": [
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 1500
96
+ ],
97
+ "tdnn_kernel": [
98
+ 5,
99
+ 3,
100
+ 3,
101
+ 1,
102
+ 1
103
+ ],
104
+ "torch_dtype": "float32",
105
+ "transformers_version": "4.37.2",
106
+ "use_weighted_layer_sum": false,
107
+ "vocab_size": 71,
108
+ "xvector_output_dim": 512
109
+ }
models/SA/model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:31862db1655be478b59e480e490165c7109e8b659277b43ee3fcc3fff772fea0
size 1262098580
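This entry, like the .pt and .ckpt entries below, is a Git LFS pointer rather than the weights themselves: the commit records only the object hash and size. After cloning, the actual binaries are fetched with git lfs install followed by git lfs pull, assuming git-lfs is available on the machine.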
models/SA/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}
models/SA/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "pad_token": "<pad>",
  "unk_token": "<unk>"
}
models/SA/tokenizer_config.json ADDED
@@ -0,0 +1,608 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "p_alveolar",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "n_alveolar",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "p_palatal",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "4": {
36
+ "content": "n_palatal",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": true,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "5": {
44
+ "content": "p_dental",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": true,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "6": {
52
+ "content": "n_dental",
53
+ "lstrip": true,
54
+ "normalized": false,
55
+ "rstrip": true,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "7": {
60
+ "content": "p_glottal",
61
+ "lstrip": true,
62
+ "normalized": false,
63
+ "rstrip": true,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "8": {
68
+ "content": "n_glottal",
69
+ "lstrip": true,
70
+ "normalized": false,
71
+ "rstrip": true,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "9": {
76
+ "content": "p_labial",
77
+ "lstrip": true,
78
+ "normalized": false,
79
+ "rstrip": true,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "10": {
84
+ "content": "n_labial",
85
+ "lstrip": true,
86
+ "normalized": false,
87
+ "rstrip": true,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "11": {
92
+ "content": "p_velar",
93
+ "lstrip": true,
94
+ "normalized": false,
95
+ "rstrip": true,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "12": {
100
+ "content": "n_velar",
101
+ "lstrip": true,
102
+ "normalized": false,
103
+ "rstrip": true,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "13": {
108
+ "content": "p_anterior",
109
+ "lstrip": true,
110
+ "normalized": false,
111
+ "rstrip": true,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "14": {
116
+ "content": "n_anterior",
117
+ "lstrip": true,
118
+ "normalized": false,
119
+ "rstrip": true,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "15": {
124
+ "content": "p_posterior",
125
+ "lstrip": true,
126
+ "normalized": false,
127
+ "rstrip": true,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "16": {
132
+ "content": "n_posterior",
133
+ "lstrip": true,
134
+ "normalized": false,
135
+ "rstrip": true,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "17": {
140
+ "content": "p_retroflex",
141
+ "lstrip": true,
142
+ "normalized": false,
143
+ "rstrip": true,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "18": {
148
+ "content": "n_retroflex",
149
+ "lstrip": true,
150
+ "normalized": false,
151
+ "rstrip": true,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "19": {
156
+ "content": "p_mid",
157
+ "lstrip": true,
158
+ "normalized": false,
159
+ "rstrip": true,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "20": {
164
+ "content": "n_mid",
165
+ "lstrip": true,
166
+ "normalized": false,
167
+ "rstrip": true,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "21": {
172
+ "content": "p_high",
173
+ "lstrip": true,
174
+ "normalized": false,
175
+ "rstrip": true,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "22": {
180
+ "content": "n_high",
181
+ "lstrip": true,
182
+ "normalized": false,
183
+ "rstrip": true,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "23": {
188
+ "content": "p_low",
189
+ "lstrip": true,
190
+ "normalized": false,
191
+ "rstrip": true,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "24": {
196
+ "content": "n_low",
197
+ "lstrip": true,
198
+ "normalized": false,
199
+ "rstrip": true,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "25": {
204
+ "content": "p_front",
205
+ "lstrip": true,
206
+ "normalized": false,
207
+ "rstrip": true,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "26": {
212
+ "content": "n_front",
213
+ "lstrip": true,
214
+ "normalized": false,
215
+ "rstrip": true,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "27": {
220
+ "content": "p_back",
221
+ "lstrip": true,
222
+ "normalized": false,
223
+ "rstrip": true,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "28": {
228
+ "content": "n_back",
229
+ "lstrip": true,
230
+ "normalized": false,
231
+ "rstrip": true,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "29": {
236
+ "content": "p_central",
237
+ "lstrip": true,
238
+ "normalized": false,
239
+ "rstrip": true,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "30": {
244
+ "content": "n_central",
245
+ "lstrip": true,
246
+ "normalized": false,
247
+ "rstrip": true,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "31": {
252
+ "content": "p_consonant",
253
+ "lstrip": true,
254
+ "normalized": false,
255
+ "rstrip": true,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "32": {
260
+ "content": "n_consonant",
261
+ "lstrip": true,
262
+ "normalized": false,
263
+ "rstrip": true,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "33": {
268
+ "content": "p_sonorant",
269
+ "lstrip": true,
270
+ "normalized": false,
271
+ "rstrip": true,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "34": {
276
+ "content": "n_sonorant",
277
+ "lstrip": true,
278
+ "normalized": false,
279
+ "rstrip": true,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "35": {
284
+ "content": "p_long",
285
+ "lstrip": true,
286
+ "normalized": false,
287
+ "rstrip": true,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "36": {
292
+ "content": "n_long",
293
+ "lstrip": true,
294
+ "normalized": false,
295
+ "rstrip": true,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "37": {
300
+ "content": "p_short",
301
+ "lstrip": true,
302
+ "normalized": false,
303
+ "rstrip": true,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "38": {
308
+ "content": "n_short",
309
+ "lstrip": true,
310
+ "normalized": false,
311
+ "rstrip": true,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "39": {
316
+ "content": "p_vowel",
317
+ "lstrip": true,
318
+ "normalized": false,
319
+ "rstrip": true,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "40": {
324
+ "content": "n_vowel",
325
+ "lstrip": true,
326
+ "normalized": false,
327
+ "rstrip": true,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "41": {
332
+ "content": "p_semivowel",
333
+ "lstrip": true,
334
+ "normalized": false,
335
+ "rstrip": true,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "42": {
340
+ "content": "n_semivowel",
341
+ "lstrip": true,
342
+ "normalized": false,
343
+ "rstrip": true,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "43": {
348
+ "content": "p_fricative",
349
+ "lstrip": true,
350
+ "normalized": false,
351
+ "rstrip": true,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "44": {
356
+ "content": "n_fricative",
357
+ "lstrip": true,
358
+ "normalized": false,
359
+ "rstrip": true,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "45": {
364
+ "content": "p_nasal",
365
+ "lstrip": true,
366
+ "normalized": false,
367
+ "rstrip": true,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "46": {
372
+ "content": "n_nasal",
373
+ "lstrip": true,
374
+ "normalized": false,
375
+ "rstrip": true,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "47": {
380
+ "content": "p_stop",
381
+ "lstrip": true,
382
+ "normalized": false,
383
+ "rstrip": true,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "48": {
388
+ "content": "n_stop",
389
+ "lstrip": true,
390
+ "normalized": false,
391
+ "rstrip": true,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "49": {
396
+ "content": "p_approximant",
397
+ "lstrip": true,
398
+ "normalized": false,
399
+ "rstrip": true,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "50": {
404
+ "content": "n_approximant",
405
+ "lstrip": true,
406
+ "normalized": false,
407
+ "rstrip": true,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "51": {
412
+ "content": "p_affricate",
413
+ "lstrip": true,
414
+ "normalized": false,
415
+ "rstrip": true,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "52": {
420
+ "content": "n_affricate",
421
+ "lstrip": true,
422
+ "normalized": false,
423
+ "rstrip": true,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "53": {
428
+ "content": "p_liquid",
429
+ "lstrip": true,
430
+ "normalized": false,
431
+ "rstrip": true,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "54": {
436
+ "content": "n_liquid",
437
+ "lstrip": true,
438
+ "normalized": false,
439
+ "rstrip": true,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "55": {
444
+ "content": "p_continuant",
445
+ "lstrip": true,
446
+ "normalized": false,
447
+ "rstrip": true,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "56": {
452
+ "content": "n_continuant",
453
+ "lstrip": true,
454
+ "normalized": false,
455
+ "rstrip": true,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "57": {
460
+ "content": "p_monophthong",
461
+ "lstrip": true,
462
+ "normalized": false,
463
+ "rstrip": true,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "58": {
468
+ "content": "n_monophthong",
469
+ "lstrip": true,
470
+ "normalized": false,
471
+ "rstrip": true,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "59": {
476
+ "content": "p_diphthong",
477
+ "lstrip": true,
478
+ "normalized": false,
479
+ "rstrip": true,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "60": {
484
+ "content": "n_diphthong",
485
+ "lstrip": true,
486
+ "normalized": false,
487
+ "rstrip": true,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "61": {
492
+ "content": "p_round",
493
+ "lstrip": true,
494
+ "normalized": false,
495
+ "rstrip": true,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "62": {
500
+ "content": "n_round",
501
+ "lstrip": true,
502
+ "normalized": false,
503
+ "rstrip": true,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "63": {
508
+ "content": "p_voiced",
509
+ "lstrip": true,
510
+ "normalized": false,
511
+ "rstrip": true,
512
+ "single_word": false,
513
+ "special": false
514
+ },
515
+ "64": {
516
+ "content": "n_voiced",
517
+ "lstrip": true,
518
+ "normalized": false,
519
+ "rstrip": true,
520
+ "single_word": false,
521
+ "special": false
522
+ },
523
+ "65": {
524
+ "content": "p_bilabial",
525
+ "lstrip": true,
526
+ "normalized": false,
527
+ "rstrip": true,
528
+ "single_word": false,
529
+ "special": false
530
+ },
531
+ "66": {
532
+ "content": "n_bilabial",
533
+ "lstrip": true,
534
+ "normalized": false,
535
+ "rstrip": true,
536
+ "single_word": false,
537
+ "special": false
538
+ },
539
+ "67": {
540
+ "content": "p_coronal",
541
+ "lstrip": true,
542
+ "normalized": false,
543
+ "rstrip": true,
544
+ "single_word": false,
545
+ "special": false
546
+ },
547
+ "68": {
548
+ "content": "n_coronal",
549
+ "lstrip": true,
550
+ "normalized": false,
551
+ "rstrip": true,
552
+ "single_word": false,
553
+ "special": false
554
+ },
555
+ "69": {
556
+ "content": "p_dorsal",
557
+ "lstrip": true,
558
+ "normalized": false,
559
+ "rstrip": true,
560
+ "single_word": false,
561
+ "special": false
562
+ },
563
+ "70": {
564
+ "content": "n_dorsal",
565
+ "lstrip": true,
566
+ "normalized": false,
567
+ "rstrip": true,
568
+ "single_word": false,
569
+ "special": false
570
+ },
571
+ "71": {
572
+ "content": "<s>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "72": {
580
+ "content": "</s>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "73": {
588
+ "content": "<unk>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ }
595
+ },
596
+ "bos_token": "<s>",
597
+ "clean_up_tokenization_spaces": true,
598
+ "do_lower_case": false,
599
+ "eos_token": "</s>",
600
+ "model_max_length": 1000000000000000019884624838656,
601
+ "pad_token": "<pad>",
602
+ "processor_class": "Wav2Vec2Processor",
603
+ "replace_word_delimiter_char": " ",
604
+ "target_lang": null,
605
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
606
+ "unk_token": "<unk>",
607
+ "word_delimiter_token": ""
608
+ }
models/SA/vocab.json ADDED
@@ -0,0 +1,73 @@
1
+ {
2
+ "<pad>": 0,
3
+ "n_affricate": 52,
4
+ "n_alveolar": 2,
5
+ "n_anterior": 14,
6
+ "n_approximant": 50,
7
+ "n_back": 28,
8
+ "n_bilabial": 66,
9
+ "n_central": 30,
10
+ "n_consonant": 32,
11
+ "n_continuant": 56,
12
+ "n_coronal": 68,
13
+ "n_dental": 6,
14
+ "n_diphthong": 60,
15
+ "n_dorsal": 70,
16
+ "n_fricative": 44,
17
+ "n_front": 26,
18
+ "n_glottal": 8,
19
+ "n_high": 22,
20
+ "n_labial": 10,
21
+ "n_liquid": 54,
22
+ "n_long": 36,
23
+ "n_low": 24,
24
+ "n_mid": 20,
25
+ "n_monophthong": 58,
26
+ "n_nasal": 46,
27
+ "n_palatal": 4,
28
+ "n_posterior": 16,
29
+ "n_retroflex": 18,
30
+ "n_round": 62,
31
+ "n_semivowel": 42,
32
+ "n_short": 38,
33
+ "n_sonorant": 34,
34
+ "n_stop": 48,
35
+ "n_velar": 12,
36
+ "n_voiced": 64,
37
+ "n_vowel": 40,
38
+ "p_affricate": 51,
39
+ "p_alveolar": 1,
40
+ "p_anterior": 13,
41
+ "p_approximant": 49,
42
+ "p_back": 27,
43
+ "p_bilabial": 65,
44
+ "p_central": 29,
45
+ "p_consonant": 31,
46
+ "p_continuant": 55,
47
+ "p_coronal": 67,
48
+ "p_dental": 5,
49
+ "p_diphthong": 59,
50
+ "p_dorsal": 69,
51
+ "p_fricative": 43,
52
+ "p_front": 25,
53
+ "p_glottal": 7,
54
+ "p_high": 21,
55
+ "p_labial": 9,
56
+ "p_liquid": 53,
57
+ "p_long": 35,
58
+ "p_low": 23,
59
+ "p_mid": 19,
60
+ "p_monophthong": 57,
61
+ "p_nasal": 45,
62
+ "p_palatal": 3,
63
+ "p_posterior": 15,
64
+ "p_retroflex": 17,
65
+ "p_round": 61,
66
+ "p_semivowel": 41,
67
+ "p_short": 37,
68
+ "p_sonorant": 33,
69
+ "p_stop": 47,
70
+ "p_velar": 11,
71
+ "p_voiced": 63,
72
+ "p_vowel": 39
73
+ }
models/d_phonemizer/en_us_cmudict_forward.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2e1fb223d7e027bf7b33052540c6f71d19db6d7fd87ab8671152b8b114501c2
size 66725366
models/sb_phonemizer/config.json ADDED
@@ -0,0 +1,3 @@
{
  "speechbrain_interface": "GraphemeToPhoneme"
}
models/sb_phonemizer/ctc_lin.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c72639caba01630cf5ccc9b287b6eb7b79acc2276aa6f5cc23640640ac8f7ee
size 177319
models/sb_phonemizer/hyperparams.yaml ADDED
@@ -0,0 +1,507 @@
1
+ # Generated 2022-07-09 from:
2
+ # /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
3
+ # yamllint disable
4
+ # ################################
5
+ # Model: LSTM (encoder) + GRU (decoder) (tokenized)
6
+ # Authors:
7
+ # Loren Lugosch & Mirco Ravanelli 2020
8
+ # Artem Ploujnikov 2021
9
+ # ################################
10
+
11
+ # Seed needs to be set at top of yaml, before objects with parameters are made
12
+ seed: 1234
13
+ __set_seed: !apply:torch.manual_seed [1234]
14
+
15
+
16
+ # Tokenizers
17
+ char_tokenize: false
18
+ char_token_type: unigram # ["unigram", "bpe", "char"]
19
+ char_token_output: 512
20
+ char_token_wordwise: true
21
+ phn_tokenize: false
22
+ phn_token_type: unigram # ["unigram", "bpe", "char"]
23
+ phn_token_output: 512 # index(blank/eos/bos/unk) = 0
24
+ phn_token_wordwise: true
25
+ character_coverage: 1.0
26
+
27
+
28
+ phonemes_count: 43
29
+ graphemes_count: 31
30
+ phonemes_enable_space: true
31
+
32
+ # Training Parameters
33
+ lexicon_epochs: 50
34
+ lexicon_ctc_epochs: 10
35
+ lexicon_limit_to_stop: 50 # No stopping by default, can override
36
+ lexicon_limit_warmup: 50 # No stopping by default, can override
37
+ sentence_epochs: 13
38
+ sentence_ctc_epochs: 10
39
+ sentence_limit_to_stop: 3
40
+ sentence_limit_warmup: 3
41
+ homograph_epochs: 50
42
+ homograph_ctc_epochs: 10
43
+ homograph_limit_to_stop: 5
44
+ homograph_limit_warmup: 10
45
+ lexicon_batch_size: 1024
46
+ sentence_batch_size: 32
47
+ homograph_batch_size: 32
48
+ ctc_weight: 0.5
49
+ homograph_loss_weight: 2.0
50
+ lr: 0.002
51
+ save_for_pretrained: true
52
+
53
+ # Model parameters
54
+ output_neurons: &id004 !apply:speechbrain.utils.hparams.choice
55
+
56
+ value: false
57
+ choices:
58
+ true: 513
59
+ false: 43
60
+
61
+ enc_num_embeddings: &id005 !apply:speechbrain.utils.hparams.choice
62
+ value: false
63
+ choices:
64
+ true: 513
65
+ false: 31
66
+
67
+ enc_dropout: 0.5
68
+ enc_neurons: 512
69
+ enc_num_layers: 4
70
+ dec_dropout: 0.5
71
+ dec_neurons: 512
72
+ dec_att_neurons: 256
73
+ dec_num_layers: 4
74
+ embedding_dim: 512
75
+
76
+ # Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
77
+ # Available modes:
78
+ # raw: no BOS/EOS tokens are added
79
+ # bos: a beginning-of-sequence token is added
80
+ # eos: an end-of-sequence token is added
81
+ grapheme_sequence_mode: bos
82
+ phoneme_sequence_mode: bos
83
+
84
+
85
+ # Special Token information
86
+ bos_index: 0
87
+ eos_index: 1
88
+ blank_index: 2
89
+ unk_index: 2
90
+ token_space_index: 512
91
+
92
+
93
+ # Language Model
94
+ lm_emb_dim: 256 # dimension of the embeddings
95
+ lm_rnn_size: 512 # dimension of hidden layers
96
+ lm_layers: 2 # number of hidden layers
97
+ lm_output_neurons: 43
98
+
99
+ # Beam Searcher
100
+ use_language_model: false
101
+ beam_search_min_decode_ratio: 0
102
+ beam_search_max_decode_ratio: 1.0
103
+ beam_search_beam_size: 16
104
+ beam_search_beam_size_valid: 16
105
+ beam_search_eos_threshold: 10.0
106
+ beam_search_using_max_attn_shift: false
107
+ beam_search_max_attn_shift: 10
108
+ beam_search_coverage_penalty: 5.0
109
+ beam_search_lm_weight: 0.5
110
+ beam_search_ctc_weight_decode: 0.4
111
+ beam_search_temperature: 1.25
112
+ beam_search_temperature_lm: 1.0
113
+
114
+ # Word embeddings
115
+ use_word_emb: true
116
+ word_emb_model: bert-base-uncased
117
+ word_emb_dim: 768
118
+ word_emb_enc_dim: 256
119
+ word_emb_norm_type: batch
120
+
121
+ graphemes: &id028
122
+ - A
123
+ - B
124
+ - C
125
+ - D
126
+ - E
127
+ - F
128
+ - G
129
+ - H
130
+ - I
131
+ - J
132
+ - K
133
+ - L
134
+ - M
135
+ - N
136
+ - O
137
+ - P
138
+ - Q
139
+ - R
140
+ - S
141
+ - T
142
+ - U
143
+ - V
144
+ - W
145
+ - X
146
+ - Y
147
+ - Z
148
+ - "'"
149
+ - ' '
150
+
151
+ phonemes: &id001
152
+
153
+
154
+ - AA
155
+ - AE
156
+ - AH
157
+ - AO
158
+ - AW
159
+ - AY
160
+ - B
161
+ - CH
162
+ - D
163
+ - DH
164
+ - EH
165
+ - ER
166
+ - EY
167
+ - F
168
+ - G
169
+ - HH
170
+ - IH
171
+ - IY
172
+ - JH
173
+ - K
174
+ - L
175
+ - M
176
+ - N
177
+ - NG
178
+ - OW
179
+ - OY
180
+ - P
181
+ - R
182
+ - S
183
+ - SH
184
+ - T
185
+ - TH
186
+ - UH
187
+ - UW
188
+ - V
189
+ - W
190
+ - Y
191
+ - Z
192
+ - ZH
193
+ - ' '
194
+
195
+ enc_input_dim: &id003 !apply:speechbrain.lobes.models.g2p.model.input_dim
196
+ use_word_emb: true
197
+ word_emb_enc_dim: 256
198
+ embedding_dim: 512
199
+
200
+
201
+ phn_char_map: &id002 !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
202
+
203
+
204
+ # Models
205
+ tokens: *id001
206
+ char_phn_map: &id023 !apply:speechbrain.lobes.models.g2p.dataio.flip_map
207
+ map_dict: *id002
208
+ enc: &id006 !new:speechbrain.nnet.RNN.LSTM
209
+ input_shape: [null, null, *id003]
210
+ bidirectional: true
211
+ hidden_size: 512
212
+ num_layers: 4
213
+ dropout: 0.5
214
+
215
+ lin: &id010 !new:speechbrain.nnet.linear.Linear
216
+ input_size: 512
217
+ n_neurons: *id004
218
+ bias: false
219
+
220
+ ctc_lin: &id013 !new:speechbrain.nnet.linear.Linear
221
+ input_size: 1024
222
+ n_neurons: *id004
223
+ encoder_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
224
+ num_embeddings: *id005
225
+ embedding_dim: 512
226
+
227
+ emb: &id008 !new:speechbrain.nnet.embedding.Embedding
228
+ num_embeddings: *id004
229
+ embedding_dim: 512
230
+
231
+ dec: &id009 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
232
+ enc_dim: 1024
233
+ input_size: 512
234
+ rnn_type: gru
235
+ attn_type: content
236
+ dropout: 0.5
237
+ hidden_size: 512
238
+ attn_dim: 256
239
+ num_layers: 4
240
+
241
+ word_emb_enc: &id012 !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
242
+
243
+ word_emb_dim: 768
244
+ word_emb_enc_dim: 256
245
+ norm_type: batch
246
+
247
+ word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
248
+ init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
249
+ model: bert-base-uncased
250
+
251
+ log_softmax: &id011 !new:speechbrain.nnet.activations.Softmax
252
+ apply_log: true
253
+
254
+ modules:
255
+ model: &id014 !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
256
+ enc: *id006
257
+ encoder_emb: *id007
258
+ emb: *id008
259
+ dec: *id009
260
+ lin: *id010
261
+ out: *id011
262
+ use_word_emb: true
263
+ word_emb_enc: *id012
264
+ enc: *id006
265
+ encoder_emb: *id007
266
+ emb: *id008
267
+ dec: *id009
268
+ lin: *id010
269
+ ctc_lin: *id013
270
+ out: *id011
271
+ word_emb:
272
+ word_emb_enc: *id012
273
+ model: *id014
274
+ lm_model: &id015 !new:speechbrain.lobes.models.RNNLM.RNNLM
275
+ embedding_dim: 256
276
+ rnn_layers: 2
277
+ rnn_neurons: 512
278
+ output_neurons: 43
279
+ return_hidden: true
280
+
281
+ opt_class: !name:torch.optim.Adam
282
+ lr: 0.002
283
+
284
+ beam_searcher: &id029 !new:speechbrain.decoders.S2SRNNBeamSearcher
285
+ embedding: *id008
286
+ decoder: *id009
287
+ linear: *id010
288
+ ctc_linear: *id013
289
+ bos_index: 0
290
+ eos_index: 1
291
+ blank_index: 2
292
+ min_decode_ratio: 0
293
+ max_decode_ratio: 1.0
294
+ beam_size: 16
295
+ eos_threshold: 10.0
296
+ using_max_attn_shift: false
297
+ max_attn_shift: 10
298
+ coverage_penalty: 5.0
299
+ ctc_weight: 0.4
300
+
301
+ beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
302
+ embedding: *id008
303
+ decoder: *id009
304
+ linear: *id010
305
+ ctc_linear: *id013
306
+ bos_index: 0
307
+ eos_index: 1
308
+ blank_index: 2
309
+ min_decode_ratio: 0
310
+ max_decode_ratio: 1.0
311
+ beam_size: 16
312
+ eos_threshold: 10.0
313
+ using_max_attn_shift: false
314
+ max_attn_shift: 10
315
+ coverage_penalty: 5.0
316
+ ctc_weight: 0.4
317
+
318
+ beam_searcher_lm: !new:speechbrain.decoders.seq2seq.S2SRNNBeamSearchLM
319
+ embedding: *id008
320
+ decoder: *id009
321
+ linear: *id010
322
+ ctc_linear: *id013
323
+ language_model: *id015
324
+ bos_index: 0
325
+ eos_index: 1
326
+ blank_index: 2
327
+ min_decode_ratio: 0
328
+ max_decode_ratio: 1.0
329
+ beam_size: 16
330
+ eos_threshold: 10.0
331
+ using_max_attn_shift: false
332
+ max_attn_shift: 10
333
+ coverage_penalty: 5.0
334
+ ctc_weight: 0.4
335
+ lm_weight: 0.5
336
+ temperature: 1.25
337
+ temperature_lm: 1.0
338
+
339
+
340
+ lr_annealing: &id018 !new:speechbrain.nnet.schedulers.NewBobScheduler
341
+ initial_value: 0.002
342
+ improvement_threshold: 0.0
343
+ annealing_factor: 0.8
344
+ patient: 0
345
+
346
+ homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
347
+
348
+ seq_cost: &id016 !name:speechbrain.nnet.losses.nll_loss
349
+
350
+ label_smoothing: 0.1
351
+
352
+ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
353
+ blank_index: 2
354
+
355
+ seq_cost_metric: &id017 !name:speechbrain.nnet.losses.nll_loss
356
+
357
+ label_smoothing: 0.1
358
+ reduction: batch
359
+
360
+ homograph_cost: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceLoss
361
+ seq_cost: *id016
362
+ seq_stats: !name:speechbrain.utils.metric_stats.MetricStats
363
+ metric: *id017
364
+ seq_stats_homograph: !name:speechbrain.utils.metric_stats.MetricStats
365
+ metric: *id017
366
+ classification_stats_homograph: !name:speechbrain.utils.metric_stats.ClassificationStats
367
+
368
+ per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
369
+ per_stats_homograph: !name:speechbrain.utils.metric_stats.ErrorRateStats
370
+
371
+
372
+ model_output_keys:
373
+ - p_seq
374
+ - char_lens
375
+ - encoder_out
376
+
377
+ grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
378
+ phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
379
+
380
+
381
+ grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
382
+ init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
383
+ model_dir: grapheme_tokenizer
384
+ bos_id: 0
385
+ eos_id: 1
386
+ unk_id: 2
387
+ vocab_size: 512
388
+ annotation_train: tokenizer_annotation_train.json
389
+ annotation_read: char
390
+ model_type: unigram # ["unigram", "bpe", "char"]
391
+ character_coverage: 1.0
392
+ annotation_format: json
393
+ text_file: grapheme_annotations.txt
394
+
395
+ phoneme_tokenizer: &id022 !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
396
+ init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
397
+ model_dir: phoneme_tokenizer
398
+ bos_id: 0
399
+ eos_id: 1
400
+ unk_id: 2
401
+ vocab_size: 512
402
+ annotation_train: tokenizer_annotation_train.json
403
+ annotation_read: phn
404
+ model_type: unigram # ["unigram", "bpe", "char"]
405
+ character_coverage: 1.0
406
+ annotation_list_to_check: [tokenizer_annotation_valid.json]
407
+ annotation_format: json
408
+ text_file: phoneme_annotations.txt
409
+
410
+ out_phoneme_decoder_tok: &id025 !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
411
+ tokenizer: *id022
412
+ char_map: *id023
413
+ token_space_index: 512
414
+ wordwise: true
415
+
416
+ out_phoneme_decoder_raw: &id026 !name:speechbrain.lobes.models.g2p.dataio.text_decode
417
+
418
+ encoder: *id024
419
+ out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
420
+ value: false
421
+ choices:
422
+ true: *id025
423
+ false: *id026
424
+ encode_pipeline:
425
+ batch: false
426
+ use_padded_data: true
427
+ output_keys:
428
+ - grapheme_list
429
+ - grapheme_encoded_list
430
+ - grapheme_encoded
431
+ - word_emb
432
+ init:
433
+ - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
434
+ encoder: *id027
435
+ tokens: *id028
436
+ bos_index: 0
437
+ eos_index: 1
438
+ - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
439
+ encoder: *id024
440
+ tokens: *id001
441
+ bos_index: 0
442
+ eos_index: 1
443
+ steps:
444
+ - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
445
+ graphemes: *id028
446
+ takes: txt
447
+ provides: txt_cleaned
448
+ - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
449
+ grapheme_encoder: *id027
450
+ takes: txt_cleaned
451
+ provides:
452
+ - grapheme_list
453
+ - grapheme_encoded_list
454
+ - grapheme_encoded_raw
455
+
456
+ - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
457
+ encoder: *id027
458
+ takes: grapheme_encoded_list
459
+ provides:
460
+ - grapheme_encoded
461
+ - grapheme_len
462
+ - grapheme_encoded_eos
463
+ - grapheme_len_eos
464
+ - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
465
+ word_emb: !ref <word_emb>
466
+ grapheme_encoder: !ref <grapheme_encoder>
467
+ use_word_emb: !ref <use_word_emb>
468
+ takes:
469
+ - txt
470
+ - grapheme_encoded
471
+ - grapheme_len
472
+ provides: word_emb
473
+
474
+ decode_pipeline:
475
+ batch: true
476
+ output_keys:
477
+ - phonemes
478
+ steps:
479
+ - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
480
+ beam_searcher: *id029
481
+ takes:
482
+ - char_lens
483
+ - encoder_out
484
+ provides:
485
+ - hyps
486
+ - scores
487
+ - func: !apply:speechbrain.utils.hparams.choice
488
+ value: false
489
+ choices:
490
+ true: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
491
+ tokenizer: *id022
492
+ char_map: *id023
493
+ token_space_index: 512
494
+ wordwise: true
495
+ false: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
496
+ phoneme_encoder: *id024
497
+ takes:
498
+ - hyps
499
+ provides:
500
+ - phonemes
501
+
502
+
503
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
504
+ loadables:
505
+ model: *id014
506
+ ctc_lin: *id013
507
+
models/sb_phonemizer/model.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:71bf7a7b290f88de5fdd7364fa4ab249bdd94a29e6cdc742ee6f69edeae64f61
size 128643257
phoneme_vocab.json ADDED
@@ -0,0 +1 @@
{"<pad>": 0, "aa": 1, "ae": 2, "ah": 3, "ao": 4, "aw": 5, "ay": 6, "eh": 7, "er": 8, "ey": 9, "ih": 10, "iy": 11, "ow": 12, "oy": 13, "uh": 14, "uw": 15, "b": 16, "ch": 17, "d": 18, "dh": 19, "f": 20, "g": 21, "hh": 22, "jh": 23, "k": 24, "l": 25, "m": 26, "n": 27, "nd": 28, "ng": 29, "p": 30, "r": 31, "s": 32, "sh": 33, "sil": 34, "t": 35, "th": 36, "v": 37, "w": 38, "y": 39, "z": 40, "zh": 41}
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/ctc_lin.ckpt ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/ctc_lin.ckpt
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/custom.py ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/custom.py
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/hyperparams.yaml ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/hyperparams.yaml
pretrained_models/GraphemeToPhoneme-f9e3219c75cc17c936d5a85994b73823/model.ckpt ADDED
@@ -0,0 +1 @@
/Users/z5173707/root/projects/phonological/Demo/Phone-aid/models/sb_phonemizer/model.ckpt
requirements.txt ADDED
@@ -0,0 +1,8 @@
datasets==2.16.1
deep-phonemizer==0.0.19
speechbrain==0.5.16
cmudict==1.0.22
fire==0.6.0
python-Levenshtein==0.25.0
librosa==0.10.1
transformers==4.37.2
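The pinned packages are installed with pip install -r requirements.txt; torch, numpy, scipy, and pandas come in as transitive dependencies of these pins, but gradio and matplotlib, which app.py imports, do not appear in this list and would need to be installed separately.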
transcriber.py ADDED
@@ -0,0 +1,283 @@
1
+ import fire
2
+ import logging
3
+ import sys, os
4
+ import yaml
5
+ import json
6
+ import torch
7
+ import librosa
8
+ from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor, Wav2Vec2ForCTC
9
+ import transformers
10
+ import pandas as pd
11
+
12
+ logger = logging.getLogger(__name__)
13
+ # Setup logging
14
+ logger.setLevel(logging.ERROR)
15
+ console_handler = logging.StreamHandler()
16
+ formater = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
17
+ datefmt="%m/%d/%Y %H:%M:%S",)
18
+ console_handler.setFormatter(formater)
19
+ console_handler.setLevel(logging.ERROR)
20
+
21
+ logger.addHandler(console_handler)
22
+
23
+
24
+ class transcribe_SA():
25
+ def __init__(self, model_path, verbose=0):
26
+ if verbose == 0:
27
+ logger.setLevel(logging.ERROR)
28
+ transformers.logging.set_verbosity_error()
29
+ #console_handler.setLevel(logging.ERROR)
30
+ elif verbose == 1:
31
+ logger.setLevel(logging.WARNING)
32
+ transformers.logging.set_verbosity_warning()
33
+ #console_handler.setLevel(logging.WARNING)
34
+ else:
35
+ logger.setLevel(logging.INFO)
36
+ transformers.logging.set_verbosity_info()
37
+ #console_handler.setLevel(logging.INFO)
38
+ # Read YAML file
39
+ logger.info('Init Object')
40
+ if torch.cuda.is_available():
41
+ self.accelerate = True
42
+ self.device = torch.device('cuda')
43
+ self.n_devices = torch.cuda.device_count()
44
+ assert self.n_devices == 1, 'Support only single GPU. Please use CUDA_VISIBLE_DEVICES=gpu_index if you have multiple gpus' #Currently support only single gpu
45
+ else:
46
+ self.device = torch.device('cpu')
47
+ self.n_devices = 1
48
+ self.model_path = model_path
49
+ self.load_model()
50
+ self.get_available_attributes()
51
+ self.get_att_binary_group_indexs()
52
+
53
+ def load_model(self):
54
+ if not os.path.exists(self.model_path):
55
+ logger.error(f'Model file {self.model_path} is not exist')
56
+ raise FileNotFoundError
57
+
58
+ self.processor = Wav2Vec2Processor.from_pretrained(self.model_path)
59
+ self.model = Wav2Vec2ForCTC.from_pretrained(self.model_path)
60
+ self.pad_token_id = self.processor.tokenizer.pad_token_id
61
+ self.sampling_rate = self.processor.feature_extractor.sampling_rate
62
+
63
+ def get_available_attributes(self):
64
+ if not hasattr(self, 'model'):
65
+ logger.error('model not loaded, call load_model first!')
66
+ raise AttributeError("model not defined")
67
+ att_list = set(self.processor.tokenizer.get_vocab().keys()) - set(self.processor.tokenizer.all_special_tokens)
68
+ att_list = [p.replace('p_','') for p in att_list if p[0]=='p']
69
+ self.att_list = att_list
70
+
71
+ def print_availabel_attributes(self):
72
+ print(self.att_list)
73
+
74
+
75
+ def get_att_binary_group_indexs(self):
76
+ self.group_ids = [] #Each group contains the token_ids of [<PAD>, n_att, p_att] sorted by their token ids
77
+ for i, att in enumerate(self.att_list):
78
+ n_indx = self.processor.tokenizer.convert_tokens_to_ids(f'n_{att}')
79
+ p_indx = self.processor.tokenizer.convert_tokens_to_ids(f'p_{att}')
80
+ self.group_ids.append(sorted([self.pad_token_id, n_indx, p_indx]))
81
+
82
+ def decode_att(self, logits, att): #Need to lowercase when first read from the user
83
+ mask = torch.zeros(logits.size()[2], dtype = torch.bool)
84
+ try:
85
+ i = self.att_list.index(att)
86
+ except ValueError:
87
+ logger.error(f'The given attribute {att} not supported in the given model {self.model_path}')
88
+ raise
89
+ mask[self.group_ids[i]] = True
90
+ logits_g = logits[:,:,mask]
91
+ pred_ids = torch.argmax(logits_g,dim=-1)
92
+ pred_ids = pred_ids.cpu().apply_(lambda x: self.group_ids[i][x])
93
+ pred = self.processor.batch_decode(pred_ids,spaces_between_special_tokens=True)[0].split()
94
+ return list(map(lambda x:{f'p_{att}':'+',f'n_{att}':'-'}[x], pred))
95
+
96
+ def read_audio_file(self, audio_file):
97
+ if not os.path.exists(audio_file):
98
+ logger.error(f'Audio file {audio_file} is not exist')
99
+ raise FileNotFoundError
100
+ y, _ = librosa.load(audio_file, sr=self.sampling_rate)
101
+
102
+ return y
103
+
104
+
105
+ def get_logits(self, y):
106
+
107
+ input_values = self.processor(audio=y, sampling_rate=self.sampling_rate, return_tensors="pt").input_values
108
+
109
+ with torch.no_grad():
110
+ logits = self.model(input_values).logits
111
+
112
+ return logits
113
+
114
+
115
+ def check_identical_phonemes(self, df_p2att):
+ identical_phonemes = []
+ for index,row in df_p2att.iterrows():
+ mask = df_p2att.eq(row).all(axis=1)
+ indexes = df_p2att[mask].index.values
+ if len(indexes) > 1:
+ identical_phonemes.append(tuple(indexes))
+ if identical_phonemes:
+ logger.warning('The following phonemes have identical phonological features given the attributes used in the model. If a fixed-weight layer is used, these phonemes will be confused with each other')
+ identical_phonemes = set(identical_phonemes)
+ for x in identical_phonemes:
+ logger.warning(f"{','.join(x)}")
+
+ def read_phoneme2att(self,p2att_file):
+
+ if not os.path.exists(p2att_file):
+ logger.error(f'Phonological matrix file {p2att_file} does not exist')
+ raise FileNotFoundError(f'{p2att_file}')
+
+ df_p2att = pd.read_csv(p2att_file, index_col=0)
+
+ self.check_identical_phonemes(df_p2att)
+ not_supported = set(df_p2att.columns) - set(self.att_list)
+ if not_supported:
+ logger.warning(f"Attribute(s) {','.join(not_supported)} are not supported by the model {self.model_path} and will be ignored. To get the available attributes of the selected model run transcribe --model_path=/path/to/model print_available_attributes")
+ df_p2att = df_p2att.drop(columns=not_supported)
+
+ self.phoneme_list = df_p2att.index.values
+ self.p2att_map = {}
+ for i, r in df_p2att.iterrows():
+ phoneme = i
+ self.p2att_map[phoneme] = []
+ for att in r.index.values:
+ if f'p_{att}' not in self.processor.tokenizer.vocab:
+ logger.warning(f'Attribute {att} is not supported by the model {self.model_path} and will be ignored. To get the available attributes of the selected model run transcribe --model_path=/path/to/model print_available_attributes')
+ continue
+ value = r[att]
+ if value == 0:
+ self.p2att_map[phoneme].append(f'n_{att}')
+ elif value == 1:
+ self.p2att_map[phoneme].append(f'p_{att}')
+ else:
+ logger.error(f'Invalid value of {value} for attribute {att} of phoneme {phoneme}. Values in the phoneme-to-attribute map should be either 0 or 1')
+ raise ValueError(f'{value} should be 0 or 1')
+
+
+ def create_phoneme_tokenizer(self):
+ vocab_list = self.phoneme_list
+ vocab_dict = {v: k+1 for k, v in enumerate(vocab_list)}
+ vocab_dict['<pad>'] = 0
+ vocab_dict = dict(sorted(vocab_dict.items(), key=lambda x: x[1]))
+ vocab_file = 'phoneme_vocab.json'
+ with open(vocab_file, 'w') as f:
+ json.dump(vocab_dict, f)
+ #Build processor
+ self.phoneme_tokenizer = Wav2Vec2CTCTokenizer(vocab_file, pad_token="<pad>", word_delimiter_token="")
+
+ def create_phonological_matrix(self):
+ self.phonological_matrix = torch.zeros((self.phoneme_tokenizer.vocab_size, self.processor.tokenizer.vocab_size)).type(torch.FloatTensor)
+ self.phonological_matrix[self.phoneme_tokenizer.pad_token_id, self.processor.tokenizer.pad_token_id] = 1
+ for p in self.phoneme_list:
+ for att in self.p2att_map[p]:
+ self.phonological_matrix[self.phoneme_tokenizer.convert_tokens_to_ids(p), self.processor.tokenizer.convert_tokens_to_ids(att)] = 1
+
+
+ #This function takes the attribute logits from the output layer and converts them to phonemes
+ #Input is a sequence of logits (one vector per frame); output is a phoneme sequence
+ #Note that this is CTC, so the number of output phonemes is not equal to the number of input frames
+ def decode_phoneme(self,logits):
+ def masked_log_softmax(vector: torch.Tensor, mask: torch.Tensor, dim: int = -1) -> torch.Tensor:
+ if mask is not None:
+ mask = mask.float()
+ while mask.dim() < vector.dim():
+ mask = mask.unsqueeze(1)
+ # vector + mask.log() is an easy way to zero out masked elements in logspace, but it
+ # results in nans when the whole vector is masked. We need a very small value instead of a
+ # zero in the mask for these cases. log(1 + 1e-45) is still basically 0, so we can safely
+ # just add 1e-45 before calling mask.log(). We use 1e-45 because 1e-46 is so small it
+ # becomes 0 - this is just the smallest value we can actually use.
+ vector = vector + (mask + 1e-45).log()
+ return torch.nn.functional.log_softmax(vector, dim=dim)
+
+ log_probs_all_masked = []
+ for i in range(len(self.att_list)):
+ mask = torch.zeros(logits.size()[2], dtype = torch.bool)
+ mask[self.group_ids[i]] = True
+ mask.unsqueeze_(0).unsqueeze_(0)
+ log_probs = masked_log_softmax(vector=logits, mask=mask, dim=-1).masked_fill(~mask,0)
+ log_probs_all_masked.append(log_probs)
+ log_probs_cat = torch.stack(log_probs_all_masked, dim=0).sum(dim=0)
+ log_probs_phoneme = torch.matmul(self.phonological_matrix,log_probs_cat.transpose(1,2)).transpose(1,2).type(torch.FloatTensor)
+ pred_ids = torch.argmax(log_probs_phoneme,dim=-1)
+ pred = self.phoneme_tokenizer.batch_decode(pred_ids,spaces_between_special_tokens=True)[0]
+ return pred
+
+
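To make the mapping above concrete, here is a small self-contained sketch (an illustration, not part of the commit) of the same idea: per-attribute masked log-softmax scores are summed and projected onto phonemes through a binary phoneme-to-attribute matrix. Attribute names, token ids, and shapes are invented for the example.

    import torch

    # Toy vocabulary: <pad>=0, n_voiced=1, p_voiced=2, n_nasal=3, p_nasal=4
    vocab_size = 5
    group_ids = [[0, 1, 2], [0, 3, 4]]       # one group per attribute
    logits = torch.randn(1, 6, vocab_size)   # (batch, frames, vocab) from the acoustic model

    # Per-attribute log-softmax restricted to that attribute's three tokens
    log_probs_sum = torch.zeros_like(logits)
    for ids in group_ids:
        mask = torch.zeros(vocab_size, dtype=torch.bool)
        mask[ids] = True
        masked = logits.masked_fill(~mask, float('-inf'))
        log_probs_sum += torch.log_softmax(masked, dim=-1).masked_fill(~mask, 0)

    # Binary phoneme rows: <pad> keeps <pad>, /m/ is p_voiced + p_nasal, /s/ is n_voiced + n_nasal
    phonological_matrix = torch.tensor([
        [1., 0., 0., 0., 0.],   # <pad>
        [0., 0., 1., 0., 1.],   # m
        [0., 1., 0., 1., 0.],   # s
    ])

    phoneme_scores = phonological_matrix @ log_probs_sum.transpose(1, 2)   # (batch, phonemes, frames)
    pred_ids = phoneme_scores.transpose(1, 2).argmax(dim=-1)               # frame-level phoneme ids
    print(pred_ids)   # CTC-style collapsing would follow, as batch_decode does in decode_phoneme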
+ def print_human_readable(self, output, with_phoneme = False):
+ column_widths = []
+ rows = []
+ if with_phoneme:
+ column_widths.append(max([len(att['Name']) for att in output['Attributes']]+[len('Phoneme')]))
+ column_widths.extend([5]*max([len(att['Pattern']) for att in output['Attributes']]+[len(output['Phoneme']['symbols'])]))
+ rows.append(('Phoneme'.center(column_widths[0]), *[s.center(column_widths[j+1]) for j,s in enumerate(output['Phoneme']['symbols'])]))
+ else:
+ column_widths.append(max([len(att['Name']) for att in output['Attributes']]))
+ column_widths.extend([5]*max([len(att['Pattern']) for att in output['Attributes']]))
+ for i in range(len(output['Attributes'])):
+ att = output['Attributes'][i]
+ rows.append((att['Name'].center(column_widths[0]), *[s.center(column_widths[j+1]) for j,s in enumerate(att['Pattern'])]))
+ out_string = ''
+ for row in rows:
+ out_string += '|'.join(row)
+ out_string += '\n'
+ return out_string
+
+ def transcribe(self, audio_file,
+ attributes='all',
+ phonological_matrix_file = None,
+ human_readable = True):
+
+
+ output = {}
+ output['wav_file_path'] = audio_file
+ output['Attributes'] = []
+ output['Phoneme'] = {}
+
+ #Initiate the model
+ #self.load_model()
+ #self.get_available_attributes()
+ #self.get_att_binary_group_indexs()
+
+ if attributes == 'all':
+ target_attributes = self.att_list
+ else:
+ attributes = attributes if isinstance(attributes,tuple) else (attributes,)
+ target_attributes = [att.lower() for att in attributes if att.lower() in self.att_list]
+
+ if not target_attributes:
+ logger.error(f'None of the given attributes is supported by the model {self.model_path}. To get the available attributes of the selected model run transcribe --model_path=/path/to/model print_available_attributes')
+ raise ValueError("Invalid attributes")
+
+ #Process audio
+ y = self.read_audio_file(audio_file)
+ self.logits = self.get_logits(y)
+
+ for att in target_attributes:
+ output['Attributes'].append({'Name':att, 'Pattern' : self.decode_att(self.logits, att)})
+
+ if phonological_matrix_file:
+ self.read_phoneme2att(phonological_matrix_file)
+ self.create_phoneme_tokenizer()
+ self.create_phonological_matrix()
+ output['Phoneme']['symbols'] = self.decode_phoneme(self.logits).split()
+
+
+
+ json_string = json.dumps(output, indent=4)
+ if human_readable:
+ return self.print_human_readable(output, phonological_matrix_file is not None)
+ else:
+ return json_string
+
+
+ def main():
+ fire.Fire(transcribe_SA)
+
+ if __name__ == '__main__':
+ main()
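Because the class is exposed through fire.Fire, the transcriber can be driven from the command line. A minimal usage sketch, assuming the script is saved as transcribe.py and that the constructor accepts model_path as shown above; the file names and paths are placeholders, not part of the commit:

    python transcribe.py --model_path=models/sa_model print_available_attributes
    python transcribe.py --model_path=models/sa_model transcribe sample.wav --attributes=all
    python transcribe.py --model_path=models/sa_model transcribe sample.wav --phonological_matrix_file=p2att.csv --human_readable=False

With python-fire, the flags before the method name are forwarded to the constructor, the method name selects transcribe or print_available_attributes, and the remaining arguments are passed to that method.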