"""Gradio demo: text-to-speech for technical English using a fine-tuned SpeechT5 model."""

import os

import gradio as gr
import numpy as np
import torch
from datasets import Audio, load_dataset
from speechbrain.inference import EncoderClassifier
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sampling rate used both for the reference audio and for the audio handed to Gradio.
SAMPLE_RATE = 16000

# Load models and processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(
    "tdnathmlenthusiast/speecht5_finetuned_techical_dataset_en"
)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load speaker encoder
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb"),
)

# Characters the model should hear as words. Each digit is padded with spaces
# so that neighbouring digits/letters do not fuse into one unpronounceable
# token ("42" -> "four two", not "fourtwo"); surplus spaces are collapsed later.
_SPOKEN_FORMS = str.maketrans(
    {
        "0": " zero ",
        "1": " one ",
        "2": " two ",
        "3": " three ",
        "4": " four ",
        "5": " five ",
        "6": " six ",
        "7": " seven ",
        "8": " eight ",
        "9": " nine ",
        "_": " ",
    }
)


def create_speaker_embedding(waveform) -> np.ndarray:
    """Compute an L2-normalised speaker embedding for one waveform.

    Args:
        waveform: Audio samples accepted by ``torch.tensor`` (here: the
            ``array`` field of a decoded dataset audio sample).

    Returns:
        The embedding as a NumPy array with singleton dimensions squeezed out.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


# Load a sample from the dataset for speaker embedding
try:
    # NOTE(review): trust_remote_code=True executes code shipped with the
    # dataset repository -- confirm this dataset is trusted.
    dataset = load_dataset(
        "Yassmen/TTS_English_Technical_data", split="train", trust_remote_code=True
    )
    dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
    sample = dataset[50]
    # generate_speech needs a torch tensor with a leading batch dimension,
    # the same layout as the random fallback below; the helper returns a
    # squeezed NumPy vector, so convert and re-add the batch axis here.
    speaker_embedding = torch.tensor(
        create_speaker_embedding(sample["audio"]["array"])
    ).unsqueeze(0)
except Exception as e:  # best-effort: the demo should still start without the dataset
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)


def _normalize_text(text: str) -> str:
    """Spell out digits, turn underscores into spaces and collapse whitespace."""
    return " ".join(text.translate(_SPOKEN_FORMS).split())


def text_to_speech(text: str):
    """Synthesise speech for ``text`` with the module-level speaker embedding.

    Args:
        text: Input text from the Gradio textbox.

    Returns:
        ``(sample_rate, samples)`` for Gradio's audio output, or ``None`` when
        the text is empty after normalisation (nothing to synthesise).
    """
    text = _normalize_text(text or "")
    if not text:
        return None

    inputs = processor(text=text, return_tensors="pt")
    # Limit the input length to what the model's text encoder supports.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    return (SAMPLE_RATE, speech.cpu().numpy())


iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Technical English Text-to-Speech",
    description="Enter english text to convert to speech",
)

if __name__ == "__main__":
    iface.launch()