benjipeng committed on
Commit b96156b
1 Parent(s): aa22bc0

Update app.py

Files changed (1)
  1. app.py +173 -37
app.py CHANGED
@@ -1,60 +1,195 @@
 import gradio as gr
 import numpy as np
 import torch
-
-from transformers import (
-    VitsModel,
-    VitsTokenizer,
-    pipeline
-)
+from datasets import load_dataset
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # load speech translation checkpoint
-asr_pipe = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-base",
-    device=device
-)
-
-model = VitsModel.from_pretrained("Matthijs/mms-tts-deu")
-tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu")
-
-
-def translate(audio):
-    outputs = asr_pipe(
-        audio,
-        max_new_tokens=256,
-        generate_kwargs={"task": "transcribe", "language": "de"}
-    )
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
+greek_translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-el")
+
+# load text-to-speech checkpoint and speaker embeddings
+model_id = "microsoft/speecht5_tts"  # update with your model id
+# pipe = pipeline("automatic-speech-recognition", model=model_id)
+model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
+
+processor = SpeechT5Processor.from_pretrained(model_id)
+
+model_id_greek = "Sandiago21/speecht5_finetuned_google_fleurs_greek"
+model_greek = SpeechT5ForTextToSpeech.from_pretrained(model_id_greek)
+processor_greek = SpeechT5Processor.from_pretrained(model_id_greek)
+
+replacements = [
+    ("á", "a"),
+    ("â", "a"),
+    ("ã", "a"),
+    ("í", "i"),
+    ("á", "a"),
+    ("í", "i"),
+    ("ñ", "n"),
+    ("ó", "o"),
+    ("ú", "u"),
+    ("ü", "u"),
+    ("á", "a"),
+    ("ç", "c"),
+    ("è", "e"),
+    ("ì", "i"),
+    ("í", "i"),
+    ("ò", "o"),
+    ("ó", "o"),
+    ("ù", "u"),
+    ("ú", "u"),
+    ("š", "s"),
+    ("ï", "i"),
+    ("à", "a"),
+    ("â", "a"),
+    ("ç", "c"),
+    ("è", "e"),
+    ("ë", "e"),
+    ("î", "i"),
+    ("ï", "i"),
+    ("ô", "o"),
+    ("ù", "u"),
+    ("û", "u"),
+    ("ü", "u"),
+    ("ου", "u"),
+    ("αυ", "af"),
+    ("ευ", "ef"),
+    ("ει", "i"),
+    ("οι", "i"),
+    ("αι", "e"),
+    ("ού", "u"),
+    ("εί", "i"),
+    ("οί", "i"),
+    ("αί", "e"),
+    ("Ά", "A"),
+    ("Έ", "E"),
+    ("Ή", "H"),
+    ("Ί", "I"),
+    ("Ό", "O"),
+    ("Ύ", "Y"),
+    ("Ώ", "O"),
+    ("ΐ", "i"),
+    ("Α", "A"),
+    ("Β", "B"),
+    ("Γ", "G"),
+    ("Δ", "L"),
+    ("Ε", "Ε"),
+    ("Ζ", "Z"),
+    ("Η", "I"),
+    ("Θ", "Th"),
+    ("Ι", "I"),
+    ("Κ", "K"),
+    ("Λ", "L"),
+    ("Μ", "M"),
+    ("Ν", "N"),
+    ("Ξ", "Ks"),
+    ("Ο", "O"),
+    ("Π", "P"),
+    ("Ρ", "R"),
+    ("Σ", "S"),
+    ("Τ", "T"),
+    ("Υ", "Y"),
+    ("Φ", "F"),
+    ("Χ", "X"),
+    ("Ω", "O"),
+    ("ά", "a"),
+    ("έ", "e"),
+    ("ή", "i"),
+    ("ί", "i"),
+    ("α", "a"),
+    ("β", "v"),
+    ("γ", "g"),
+    ("δ", "d"),
+    ("ε", "e"),
+    ("ζ", "z"),
+    ("η", "i"),
+    ("θ", "th"),
+    ("ι", "i"),
+    ("κ", "k"),
+    ("λ", "l"),
+    ("μ", "m"),
+    ("ν", "n"),
+    ("ξ", "ks"),
+    ("ο", "o"),
+    ("π", "p"),
+    ("ρ", "r"),
+    ("ς", "s"),
+    ("σ", "s"),
+    ("τ", "t"),
+    ("υ", "i"),
+    ("φ", "f"),
+    ("χ", "h"),
+    ("ψ", "ps"),
+    ("ω", "o"),
+    ("ϊ", "i"),
+    ("ϋ", "i"),
+    ("ό", "o"),
+    ("ύ", "i"),
+    ("ώ", "o"),
+    ("í", "i"),
+    ("õ", "o"),
+    ("Ε", "E"),
+    ("Ψ", "Ps"),
+]
+
+def cleanup_text(text):
+    for src, dst in replacements:
+        text = text.replace(src, dst)
+    return text
+
+
+def synthesize_speech(text):
+    text = cleanup_text(text)
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+
+    return gr.Audio.update(value=(16000, speech.cpu().numpy()))
+
+
+def translate_to_english(audio):
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "english"})
     return outputs["text"]
 
 
-def synthesise(text):
-    if len(text.strip()) == 0:
-        return (16000, np.zeros(0).astype(np.int16))
+def synthesise_from_english(text):
+    text = cleanup_text(text)
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    return speech.cpu().numpy()
+
 
-    inputs = tokenizer(text, return_tensors="pt")
-    input_ids = inputs["input_ids"]
+def translate_from_english_to_greek(text):
+    return greek_translation_pipe(text)[0]["translation_text"]
 
-    with torch.no_grad():
-        outputs = model(input_ids)
 
-    speech = outputs.audio[0]
+def synthesise_from_greek(text):
+    text = cleanup_text(text)
+    inputs = processor_greek(text=text, return_tensors="pt")
+    speech = model_greek.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 
 
 def speech_to_speech_translation(audio):
-    translated_text = translate(audio)
-    synthesised_speech = synthesise(translated_text)
+    translated_text = translate_to_english(audio)
+    translated_text = translate_from_english_to_greek(translated_text)
+    # synthesised_speech = synthesise_from_english(translated_text)
+    # translated_text = translate_from_english_to_greek(synthesised_speech)
+    synthesised_speech = synthesise_from_greek(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-    return 16000, synthesised_speech
+    return ((16000, synthesised_speech), translated_text)
+
 
 title = "Cascaded STST"
 description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
-[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Greek. Demo uses OpenAI's [Whisper Large v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and the [Sandiago21/speecht5_finetuned_google_fleurs_greek](https://huggingface.co/Sandiago21/speecht5_finetuned_google_fleurs_greek) checkpoint for text-to-speech, which is based on Microsoft's
+[SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model, fine-tuned on a Greek audio dataset:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 
@@ -63,14 +198,15 @@ demo = gr.Blocks()
 mic_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
     title=title,
     description=description,
 )
+
 file_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
     examples=[["./example.wav"]],
     title=title,
     description=description,
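
For reference, the cascade this commit wires up can be exercised outside Gradio. The sketch below is illustrative rather than part of the commit: run_cascade is a hypothetical driver name, it assumes the same Hugging Face checkpoints as app.py, and it omits the cleanup_text transliteration that app.py applies to the Greek text before synthesis.

import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

# Stage 1: Whisper's built-in "translate" task maps source speech in any language to English text.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")

# Stage 2: a MarianMT checkpoint maps the English text to Greek text.
mt_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-el")

# Stage 3: the Greek-fine-tuned SpeechT5 checkpoint plus the HiFi-GAN vocoder map Greek text
# to a waveform, conditioned on a fixed x-vector speaker embedding (index 7440, as in app.py).
tts_id = "Sandiago21/speecht5_finetuned_google_fleurs_greek"
processor = SpeechT5Processor.from_pretrained(tts_id)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_id)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)


def run_cascade(audio_path):
    # Hypothetical glue for illustration; app.py additionally runs cleanup_text() over the
    # Greek text to transliterate characters the SpeechT5 tokenizer may not cover.
    english = asr_pipe(audio_path, max_new_tokens=256,
                       generate_kwargs={"task": "translate", "language": "english"})["text"]
    greek = mt_pipe(english)[0]["translation_text"]
    inputs = processor(text=greek, return_tensors="pt")
    speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    return 16000, speech.numpy()  # SpeechT5 generates 16 kHz audio

Called as run_cascade("./example.wav"), this returns the same (sampling_rate, waveform) pair that the app's numpy-typed gr.Audio output expects.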