speech-to-speech-translation

Running

dmcartor committed on 10 days ago

Commit

f5b2e41

•

1 Parent(s): 7477f26

Modifying synthesise function again

Files changed (1) hide show

app.py CHANGED Viewed

@@ -76,17 +76,20 @@ def synthesise(text):
     # Properly tokenize the translated text for the VITS model
     inputs = tts_tokenizer(text, return_tensors="pt").to(device)
     with torch.no_grad():
         output = tts_model(**inputs)
-        print(f"TTS Model Output: {output}")
-        # Accessing the correct key for the waveform
-        speech = output['model_outputs']  # Correct key access for the synthesized waveform
-        # Convert to numpy format suitable for audio output
-        speech_numpy = speech.squeeze().cpu().numpy()  # Remove batch and channel dimensions if necessary
-    return (speech_numpy * 32767).astype(np.int16)  # Ensure correct format for audio output
 # Normalize audio
 def normalize_audio(audio):

     # Properly tokenize the translated text for the VITS model
     inputs = tts_tokenizer(text, return_tensors="pt").to(device)
+    # Run the model to generate the waveform
     with torch.no_grad():
         output = tts_model(**inputs)
+    # Check the output and access the waveform
+    print(f"TTS Model Output: {output}")
+    # Access the synthesized waveform from the model output
+    speech = output.audio  # The waveform is stored in the 'audio' key
+    # Convert to numpy format suitable for audio output
+    speech_numpy = (speech.squeeze().cpu().numpy() * 32767).astype(np.int16)
+    return speech_numpy
 # Normalize audio
 def normalize_audio(audio):