Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 15,396 Bytes
fb95943 3561130 f60cca2 c86d965 ebe377d c86d965 dde297c c2546a5 ebe377d c2546a5 f60cca2 c2546a5 f60cca2 7dce1b0 c2546a5 ebe377d fb95943 c2546a5 3561130 fb95943 c86d965 fb95943 c86d965 3561130 fb95943 3561130 8c486cf 7f6e9a8 7dce1b0 60e18c0 3561130 60e18c0 3561130 7b1c05b 60e18c0 3561130 7dce1b0 8c486cf 3561130 7dce1b0 60e18c0 3561130 c2546a5 58493c0 1276f85 58493c0 7b1c05b 58493c0 8c486cf 58493c0 8c486cf 58493c0 fb95943 c86d965 fb95943 1a40474 c2546a5 fb95943 8c486cf fb95943 8c486cf 58493c0 1a40474 8935672 e0cfda2 8935672 8c486cf 3561130 58493c0 fb95943 58493c0 8c486cf c2546a5 c580b60 c2546a5 1f9f8f9 fb95943 c2546a5 3561130 58493c0 fb95943 3561130 e0cfda2 1a40474 e0cfda2 1a40474 fb95943 35b7ab4 3561130 7f6e9a8 35b7ab4 fb95943 3561130 60e18c0 fb95943 60e18c0 fb95943 7ad7315 1b4fbf7 fb95943 1b4fbf7 7ad7315 fb95943 7ad7315 35b7ab4 f60cca2 c2546a5 3561130 58493c0 3561130 e0cfda2 1a40474 e0cfda2 3561130 e0cfda2 7b1c05b 3561130 60e18c0 58493c0 1276f85 58493c0 e0cfda2 58493c0 e0cfda2 58493c0 1276f85 7b1c05b 1276f85 7b1c05b 58493c0 1276f85 7ad7315 fb95943 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 |
import argparse
import json
import os
import re
import tempfile
import librosa
import numpy as np
import torch
from torch import no_grad, LongTensor
import commons
import utils
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
from mel_processing import spectrogram_torch
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
def audio_postprocess(self, y):
if y is None:
return None
if gr_utils.validate_url(y):
file = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
elif isinstance(y, tuple):
sample_rate, data = y
file = tempfile.NamedTemporaryFile(
suffix=".wav", dir=self.temp_dir, delete=False
)
gr_processing_utils.audio_to_file(sample_rate, data, file.name)
else:
file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)
return gr_processing_utils.encode_url_or_file_to_base64(file.name)
gr.Audio.postprocess = audio_postprocess
def get_text(text, hps, is_symbol):
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
if hps.data.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = LongTensor(text_norm)
return text_norm
def create_tts_fn(model, hps, speaker_ids):
def tts_fn(text, speaker, speed, is_symbol):
if limitation:
text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
max_len = 150
if is_symbol:
max_len *= 3
if text_len > max_len:
return "Error: Text is too long", None
speaker_id = speaker_ids[speaker]
stn_tst = get_text(text, hps, is_symbol)
with no_grad():
x_tst = stn_tst.unsqueeze(0).to(device)
x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
sid = LongTensor([speaker_id]).to(device)
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
del stn_tst, x_tst, x_tst_lengths, sid
return "Success", (hps.data.sampling_rate, audio)
return tts_fn
def create_vc_fn(model, hps, speaker_ids):
def vc_fn(original_speaker, target_speaker, input_audio):
if input_audio is None:
return "You need to upload an audio", None
sampling_rate, audio = input_audio
duration = audio.shape[0] / sampling_rate
if limitation and duration > 30:
return "Error: Audio is too long", None
original_speaker_id = speaker_ids[original_speaker]
target_speaker_id = speaker_ids[target_speaker]
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != hps.data.sampling_rate:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
with no_grad():
y = torch.FloatTensor(audio)
y = y.unsqueeze(0)
spec = spectrogram_torch(y, hps.data.filter_length,
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
center=False).to(device)
spec_lengths = LongTensor([spec.size(-1)]).to(device)
sid_src = LongTensor([original_speaker_id]).to(device)
sid_tgt = LongTensor([target_speaker_id]).to(device)
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
0, 0].data.cpu().float().numpy()
del y, spec, spec_lengths, sid_src, sid_tgt
return "Success", (hps.data.sampling_rate, audio)
return vc_fn
def create_soft_vc_fn(model, hps, speaker_ids):
def soft_vc_fn(target_speaker, input_audio1, input_audio2):
input_audio = input_audio1
if input_audio is None:
input_audio = input_audio2
if input_audio is None:
return "You need to upload an audio", None
sampling_rate, audio = input_audio
duration = audio.shape[0] / sampling_rate
if limitation and duration > 30:
return "Error: Audio is too long", None
target_speaker_id = speaker_ids[target_speaker]
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
if len(audio.shape) > 1:
audio = librosa.to_mono(audio.transpose(1, 0))
if sampling_rate != 16000:
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
with torch.inference_mode():
units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0).to(device))
with no_grad():
unit_lengths = LongTensor([units.size(1)]).to(device)
sid = LongTensor([target_speaker_id]).to(device)
audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
del units, unit_lengths, sid
return "Success", (hps.data.sampling_rate, audio)
return soft_vc_fn
def create_to_symbol_fn(hps):
def to_symbol_fn(is_symbol_input, input_text, temp_text):
return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
else (temp_text, temp_text)
return to_symbol_fn
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cpu')
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
args = parser.parse_args()
device = torch.device(args.device)
models_tts = []
models_vc = []
models_soft_vc = []
with open("saved_model/info.json", "r", encoding="utf-8") as f:
models_info = json.load(f)
for i, info in models_info.items():
name = info["title"]
lang = info["lang"]
example = info["example"]
config_path = f"saved_model/{i}/config.json"
model_path = f"saved_model/{i}/model.pth"
cover = info["cover"]
cover_path = f"saved_model/{i}/{cover}" if cover else None
hps = utils.get_hparams_from_file(config_path)
model = SynthesizerTrn(
len(hps.symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model)
utils.load_checkpoint(model_path, model, None)
model.eval().to(device)
speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
t = info["type"]
if t == "vits":
models_tts.append((name, cover_path, speakers, lang, example,
hps.symbols, create_tts_fn(model, hps, speaker_ids),
create_to_symbol_fn(hps)))
models_vc.append((name, cover_path, speakers, create_vc_fn(model, hps, speaker_ids)))
elif t == "soft-vits-vc":
models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
app = gr.Blocks()
with app:
gr.Markdown("# Moe TTS And Voice Conversion Using VITS Model\n\n"
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=skytnt.moegoe)\n\n"
"[Open In Colab]"
"(https://colab.research.google.com/drive/14Pb8lpmwZL-JI5Ub6jpG4sz2-8KS0kbS?usp=sharing)"
" without queue and length limitation")
with gr.Tabs():
with gr.TabItem("TTS"):
with gr.Tabs():
for i, (name, cover_path, speakers, lang, example, symbols, tts_fn,
to_symbol_fn) in enumerate(models_tts):
with gr.TabItem(f"model{i}"):
with gr.Column():
cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
gr.Markdown(f"## {name}\n\n"
f"{cover_markdown}"
f"lang: {lang}")
tts_input1 = gr.TextArea(label="Text (150 words limitation)", value=example,
elem_id=f"tts-input{i}")
tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
type="index", value=speakers[0])
tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
with gr.Accordion(label="Advanced Options", open=False):
temp_text_var = gr.Variable()
symbol_input = gr.Checkbox(value=False, label="Symbol input")
symbol_list = gr.Dataset(label="Symbol list", components=[tts_input1],
samples=[[x] for x in symbols],
elem_id=f"symbol-list{i}")
symbol_list_json = gr.Json(value=symbols, visible=False)
tts_submit = gr.Button("Generate", variant="primary")
tts_output1 = gr.Textbox(label="Output Message")
tts_output2 = gr.Audio(label="Output Audio")
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, symbol_input],
[tts_output1, tts_output2])
symbol_input.change(to_symbol_fn,
[symbol_input, tts_input1, temp_text_var],
[tts_input1, temp_text_var])
symbol_list.click(None, [symbol_list, symbol_list_json], [],
_js=f"""
(i,symbols) => {{
let root = document.querySelector("body > gradio-app");
if (root.shadowRoot != null)
root = root.shadowRoot;
let text_input = root.querySelector("#tts-input{i}").querySelector("textarea");
let startPos = text_input.selectionStart;
let endPos = text_input.selectionEnd;
let oldTxt = text_input.value;
let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
text_input.value = result;
let x = window.scrollX, y = window.scrollY;
text_input.focus();
text_input.selectionStart = startPos + symbols[i].length;
text_input.selectionEnd = startPos + symbols[i].length;
text_input.blur();
window.scrollTo(x, y);
return [];
}}""")
with gr.TabItem("Voice Conversion"):
with gr.Tabs():
for i, (name, cover_path, speakers, vc_fn) in enumerate(models_vc):
with gr.TabItem(f"model{i}"):
cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
gr.Markdown(f"## {name}\n\n"
f"{cover_markdown}")
vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
value=speakers[0])
vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
value=speakers[min(len(speakers) - 1, 1)])
vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
vc_submit = gr.Button("Convert", variant="primary")
vc_output1 = gr.Textbox(label="Output Message")
vc_output2 = gr.Audio(label="Output Audio")
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
with gr.TabItem("Soft Voice Conversion"):
with gr.Tabs():
for i, (name, cover_path, speakers, soft_vc_fn) in enumerate(models_soft_vc):
with gr.TabItem(f"model{i}"):
cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
gr.Markdown(f"## {name}\n\n"
f"{cover_markdown}")
vc_input1 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
value=speakers[0])
source_tabs = gr.Tabs()
with source_tabs:
with gr.TabItem("microphone"):
vc_input2 = gr.Audio(label="Input Audio (30s limitation)", source="microphone")
with gr.TabItem("upload"):
vc_input3 = gr.Audio(label="Input Audio (30s limitation)", source="upload")
vc_submit = gr.Button("Convert", variant="primary")
vc_output1 = gr.Textbox(label="Output Message")
vc_output2 = gr.Audio(label="Output Audio")
# clear inputs
source_tabs.set_event_trigger("change", None, [], [vc_input2, vc_input3],
js="()=>[null,null]")
vc_submit.click(soft_vc_fn, [vc_input1, vc_input2, vc_input3],
[vc_output1, vc_output2])
gr.Markdown(
"unofficial demo for \n\n"
"- [https://github.com/CjangCjengh/MoeGoe](https://github.com/CjangCjengh/MoeGoe)\n"
"- [https://github.com/Francis-Komizu/VITS](https://github.com/Francis-Komizu/VITS)\n"
"- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)\n"
"- [https://github.com/Francis-Komizu/Sovits](https://github.com/Francis-Komizu/Sovits)"
)
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|