Update synthesise function again
app.py CHANGED
@@ -73,24 +73,20 @@ def translate(audio):
     return translated_text
 
 def synthesise(text):
-    #
+    # Properly tokenize the translated text for the VITS model
     inputs = tts_tokenizer(text, return_tensors="pt").to(device)
 
     with torch.no_grad():
         output = tts_model(**inputs)
     print(f"TTS Model Output: {output}")
-
-
-
-
-
-
-
-    #
-    speech_numpy = (speech.squeeze().cpu().numpy() * 32767).astype(np.int16)
-
-    return sampling_rate, speech_numpy
-
+
+    # Accessing the correct key for the waveform
+    speech = output['model_outputs']  # Correct key access for the synthesized waveform
+
+    # Convert to numpy format suitable for audio output
+    speech_numpy = speech.squeeze().cpu().numpy()  # Remove batch and channel dimensions if necessary
+
+    return (speech_numpy * 32767).astype(np.int16)  # Ensure correct format for audio output
 
 # Normalize audio
 def normalize_audio(audio):
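For reference, below is a minimal, self-contained sketch of the updated synthesise() step. It assumes the TTS model is a transformers VitsModel (the diff only says "VITS model"); with that class the waveform is exposed as output.waveform rather than an output['model_outputs'] key, so the key access shown in the diff is swapped for the attribute here. The checkpoint name, and the tts_model / tts_tokenizer / device names reused from the diff, are assumptions, not confirmed by this commit.

import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumed checkpoint for illustration; swap in whatever app.py actually loads
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng").to(device)
tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")

def synthesise(text):
    # Tokenize the translated text for the VITS model
    inputs = tts_tokenizer(text, return_tensors="pt").to(device)

    with torch.no_grad():
        output = tts_model(**inputs)

    # VitsModel returns the synthesized waveform as a float tensor in [-1, 1]
    speech = output.waveform

    # Drop the batch dimension and move the tensor to a CPU numpy array
    speech_numpy = speech.squeeze().cpu().numpy()

    # Scale to 16-bit PCM range, matching the diff's output format
    return (speech_numpy * 32767).astype(np.int16)

Note that the old version returned a (sampling_rate, speech_numpy) tuple while the new one returns only the array; if the caller (e.g. a Gradio Audio output) still needs the rate, tts_model.config.sampling_rate should provide it for a VitsModel checkpoint.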