Spaces:

zhuowen999
/

rvc-tts-yutou

Runtime error

App Files Files Community

rvc-tts-yutou / app.py

zhuowen999

Update app.py

27ffa6a verified 6 months ago

raw

history blame contribute delete

13.6 kB

	import asyncio
	import datetime
	import logging
	import os
	import time
	import traceback
	import shutil
	import urllib.request
	import zipfile
	import gdown
	from argparse import ArgumentParser
	import requests
	import random
	# Fetch the HuBERT / RMVPE checkpoints and the voice weights with aria2c
	# before the code further down tries to load them.  Each entry is
	# (url, destination directory, output file name); the rendered command line
	# is identical to the one the script has always executed.
	_ARIA2C_TEMPLATE = (
	    "aria2c --console-log-level=error -c -x 16 -s 16 -k 1M {url} -d {dest} -o {name}"
	)
	_STARTUP_DOWNLOADS = (
	    ("https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt", ".", "hubert_base.pt"),
	    ("https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt", ".", "rmvpe.pt"),
	    # Disabled voices, kept for reference:
	    # ("https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.pth", "./weights/yoimiya", "yoimiya.pth"),
	    # ("https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.index", "./weights/yoimiya", "yoimiya.index"),
	    # ("https://huggingface.co/sail-rvc/hitzeed-ch/resolve/main/model.pth", "./weights/hitzeed", "hitzeed.pth"),
	    # ("https://huggingface.co/sail-rvc/hitzeed-ch/resolve/main/model.index", "./weights/hitzeed", "hitzeed.index"),
	    # NOTE(review): the two Eminem entries look wrong as written (a "blob"
	    # URL instead of "resolve", and the index comes from hitzeed-ch) --
	    # verify before re-enabling them.
	    # ("https://huggingface.co/sail-rvc/Eminem2333333/blob/main/model.pth", "./weights/Eminem", "Eminem.pth"),
	    # ("https://huggingface.co/sail-rvc/hitzeed-ch/resolve/main/model.index", "./weights/Eminem", "Eminem.index"),
	    ("https://huggingface.co/zhuowen999/yutou/resolve/main/yutou.index", "./weights/yutou", "yutou.index"),
	    ("https://huggingface.co/zhuowen999/yutou/resolve/main/yutou.pth", "./weights/yutou", "yutou.pth"),
	)
	for _url, _dest, _name in _STARTUP_DOWNLOADS:
	    _status = os.system(_ARIA2C_TEMPLATE.format(url=_url, dest=_dest, name=_name))
	    if _status != 0:
	        # os.system never raises on a failing command, so surface the
	        # failure here; startup still continues exactly as before.
	        print(f"Warning: download of {_url} failed (aria2c status {_status})")

	# Parent of the directory that contains this file.
	BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

	# NOTE(review): the downloads above write to ./weights relative to the
	# working directory, while this path is rooted one level above the script's
	# directory -- confirm the two are meant to differ.
	rvc_models_dir = os.path.join(BASE_DIR, 'weights')



	import edge_tts
	import gradio as gr
	import librosa
	import torch
	from fairseq import checkpoint_utils

	from config import Config
	from lib.infer_pack.models import (
	SynthesizerTrnMs256NSFsid,
	SynthesizerTrnMs256NSFsid_nono,
	SynthesizerTrnMs768NSFsid,
	SynthesizerTrnMs768NSFsid_nono,
	)
	from rmvpe import RMVPE
	from vc_infer_pipeline import VC

	# Raise the level of chatty third-party loggers to WARNING.
	for _noisy_logger in ("fairseq", "numba", "markdown_it", "urllib3", "matplotlib"):
	    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

	# True only when the SYSTEM environment variable is exactly "spaces";
	# tts() uses it to switch on the text-length and audio-duration limits.
	limitation = os.getenv("SYSTEM") == "spaces"

	config = Config()

	# Path of the intermediate MP3 written by the text-to-speech step.
	edge_output_filename = "edge_output.mp3"
	tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
	# Voices offered in the UI dropdown.
	tts_voices = [
	    'zh-CN-XiaoxiaoMultilingualNeural',
	    'zh-CN-YunyiMultilingualNeural',
	    'zh-CN-XiaoyuMultilingualNeural',
	    'zh-CN-XiaochenMultilingualNeural',
	]

	# Every sub-directory of `weights` counts as one selectable model.
	model_root = "weights"
	models = sorted(
	    entry
	    for entry in os.listdir(model_root)
	    if os.path.isdir(os.path.join(model_root, entry))
	)
	if not models:
	    raise ValueError("No model found in `weights` folder")

	def tts_new(text, path, voice='zh-CN-XiaoxiaoMultilingualNeural', rate=-8, timeout=30):
	    """Synthesize *text* via text-to-speech.cn and save the MP3 to *path*.

	    The service is called in two steps: a form POST that returns JSON
	    containing a ``download`` URL, then a GET of that URL whose body is
	    written to *path*.

	    Args:
	        text: Text to speak.
	        path: Destination file for the downloaded audio (opened in "wb").
	        voice: Voice identifier sent in the form's ``voice`` field.
	        rate: Value sent in the form's ``rate`` field.
	        timeout: Seconds passed as the ``timeout`` of both HTTP requests,
	            so a stalled service cannot hang the caller indefinitely.

	    Raises:
	        requests.RequestException: On connection errors, timeouts, or a
	            non-2xx HTTP status from either request.
	        ValueError: If the JSON reply carries no ``download`` URL.
	    """
	    url = "https://www.text-to-speech.cn/getSpeek.php"

	    payload = {
	        "user_id": str(random.randint(120100, 2000000)),
	        "language": "中文（普通话，简体）",
	        "voice": voice,
	        "text": text,
	        "role": "0",
	        "style": "0",
	        "styledegree": "1",
	        "volume": "75",
	        "predict": "0",
	        "rate": rate,
	        "pitch": "0",
	        "kbitrate": "audio-16khz-128kbitrate-mono-mp3",
	        "silence": "",
	        "yzm": "^\""
	    }
	    headers = {
	        "Referer": "https://www.text-to-speech.cn/",
	        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
	        # NOTE(review): "/" is not a valid media range; this was probably
	        # "*/*" before the file was copied -- confirm before changing.
	        "accept": "/",
	        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
	        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
	        "origin": "https://www.text-to-speech.cn",
	        "referer": "https://www.text-to-speech.cn/"
	    }

	    response = requests.post(url, data=payload, headers=headers, timeout=timeout)
	    response.raise_for_status()
	    # Parse once and log before inspecting, so a rejected request still
	    # leaves the service's reply in the log.
	    result = response.json()
	    print(result)
	    download_url = result.get("download") if isinstance(result, dict) else None
	    if not download_url:
	        raise ValueError(f"text-to-speech.cn returned no download URL: {result}")

	    mp3 = requests.get(download_url, timeout=timeout)
	    mp3.raise_for_status()
	    with open(path, "wb") as f:
	        f.write(mp3.content)

	def model_data(model_name):
	    """Load the voice model stored in ``model_root/model_name``.

	    The first ``.pth`` file found in the directory is loaded on the CPU, the
	    synthesizer class is picked from the checkpoint's ``version`` and ``f0``
	    entries, and the first ``.index`` file (if any) is reported.

	    Returns:
	        Tuple ``(tgt_sr, net_g, vc, version, index_file, if_f0)``;
	        ``index_file`` is ``""`` when the directory holds no index file.

	    Raises:
	        ValueError: If no ``.pth`` file exists or the checkpoint version is
	            neither "v1" nor "v2".
	    """
	    model_dir = os.path.join(model_root, model_name)

	    def _files_with_suffix(suffix):
	        # Full paths of directory entries ending in `suffix`, in listdir order.
	        return [
	            os.path.join(model_dir, entry)
	            for entry in os.listdir(model_dir)
	            if entry.endswith(suffix)
	        ]

	    checkpoints = _files_with_suffix(".pth")
	    if not checkpoints:
	        raise ValueError(f"No pth file found in {model_root}/{model_name}")
	    pth_path = checkpoints[0]
	    print(f"Loading {pth_path}")
	    cpt = torch.load(pth_path, map_location="cpu")
	    tgt_sr = cpt["config"][-1]
	    # Overwrite the third-from-last config entry with the row count of the
	    # speaker embedding stored in the weights (n_spk).
	    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
	    if_f0 = cpt.get("f0", 1)
	    version = cpt.get("version", "v1")

	    # version -> (class used when f0 == 1, class used otherwise)
	    synthesizers = {
	        "v1": (SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono),
	        "v2": (SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono),
	    }
	    if version not in synthesizers:
	        raise ValueError("Unknown version")
	    with_f0_cls, without_f0_cls = synthesizers[version]
	    if if_f0 == 1:
	        net_g = with_f0_cls(*cpt["config"], is_half=config.is_half)
	    else:
	        net_g = without_f0_cls(*cpt["config"])

	    # enc_q is removed before the weights are loaded; strict=False lets
	    # load_state_dict tolerate keys that no longer match.
	    del net_g.enc_q
	    net_g.load_state_dict(cpt["weight"], strict=False)
	    print("Model loaded")
	    net_g.eval().to(config.device)
	    net_g = net_g.half() if config.is_half else net_g.float()
	    vc = VC(tgt_sr, config)

	    index_candidates = _files_with_suffix(".index")
	    if index_candidates:
	        index_file = index_candidates[0]
	        print(f"Index file found: {index_file}")
	    else:
	        print("No index file found")
	        index_file = ""

	    return tgt_sr, net_g, vc, version, index_file, if_f0


	def load_hubert():
	    """Load ``hubert_base.pt`` and publish it as the global ``hubert_model``.

	    The first model of the loaded ensemble is moved to ``config.device`` and
	    cast to half or full precision according to ``config.is_half``.

	    Returns:
	        The result of calling ``eval()`` on the loaded model.
	    """
	    global hubert_model
	    ensemble, _saved_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
	        ["hubert_base.pt"],
	        suffix="",
	    )
	    hubert_model = ensemble[0]
	    hubert_model = hubert_model.to(config.device)
	    hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
	    return hubert_model.eval()


	# Load the HuBERT model once at start-up.  load_hubert() also assigns the
	# module-level `hubert_model` itself, so this rebinding is to its return value.
	print("Loading hubert model...")
	hubert_model = load_hubert()
	print("Hubert model loaded.")

	# Load the RMVPE model from rmvpe.pt; tts() attaches it to the VC pipeline
	# when the selected f0 method is "rmvpe".
	print("Loading rmvpe model...")
	rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
	print("rmvpe model loaded.")


	def tts(
	    model_name,
	    speed,
	    tts_text,
	    tts_voice,
	    f0_up_key,
	    f0_method,
	    index_rate,
	    protect,
	    filter_radius=3,
	    resample_sr=0,
	    rms_mix_rate=0.25,
	):
	    """Turn *tts_text* into speech, then run it through the chosen RVC model.

	    The text is first synthesized to ``edge_output_filename`` by
	    ``tts_new`` (with *speed* forwarded as its rate), loaded at 16 kHz mono,
	    and passed through ``vc.pipeline`` of the model named *model_name*.

	    Args:
	        model_name: Sub-directory of the weights folder to load.
	        speed: Speech rate forwarded to ``tts_new``.
	        tts_text: Text to speak.
	        tts_voice: Voice identifier forwarded to ``tts_new``.
	        f0_up_key: Transpose amount; converted with ``int()``.
	        f0_method: Pitch extraction method name; "rmvpe" attaches the
	            preloaded RMVPE model to the pipeline.
	        index_rate, protect, filter_radius, resample_sr, rms_mix_rate:
	            Passed through to ``vc.pipeline`` unchanged.

	    Returns:
	        Tuple ``(info, speech_path, converted)``.  ``info`` is a status or
	        error message, ``speech_path`` is the intermediate MP3 path or
	        ``None``, and ``converted`` is ``(sample_rate, audio)`` or ``None``.
	        Errors are reported through ``info`` rather than raised.
	    """
	    print("------------------")
	    print(datetime.datetime.now())
	    print("tts_text:")
	    print(tts_text)
	    print(f"tts_voice: {tts_voice}")
	    print(f"Model name: {model_name}")
	    print(f"F0: {f0_method}, Key: {f0_up_key}, Index: {index_rate}, Protect: {protect}")
	    try:
	        # Nothing to speak: answer locally instead of loading a model and
	        # calling the remote service with an empty request.
	        if not tts_text or not tts_text.strip():
	            print("Error: Empty text")
	            return "Input text is empty; please enter some text to convert.", None, None
	        if limitation and len(tts_text) > 280:
	            print("Error: Text too long")
	            return (
	                f"Text characters should be at most 280 in this huggingface space, but got {len(tts_text)} characters.",
	                None,
	                None,
	            )
	        tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)

	        # Time only the text-to-speech request.
	        t0 = time.time()
	        tts_new(tts_text, edge_output_filename, tts_voice, speed)
	        t1 = time.time()
	        edge_time = t1 - t0

	        audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
	        duration = len(audio) / sr
	        print(f"Audio duration: {duration}s")
	        if limitation and duration >= 20:
	            print("Error: Audio too long")
	            return (
	                f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
	                edge_output_filename,
	                None,
	            )

	        f0_up_key = int(f0_up_key)

	        if hubert_model is None:
	            load_hubert()
	        if f0_method == "rmvpe":
	            vc.model_rmvpe = rmvpe_model
	        # The pipeline fills this list in place; the three entries are
	        # reported below as the npy, f0 and infer timings.
	        times = [0, 0, 0]
	        audio_opt = vc.pipeline(
	            hubert_model,
	            net_g,
	            0,
	            audio,
	            edge_output_filename,
	            times,
	            f0_up_key,
	            f0_method,
	            index_file,
	            index_rate,
	            if_f0,
	            filter_radius,
	            tgt_sr,
	            resample_sr,
	            rms_mix_rate,
	            version,
	            protect,
	            None,
	        )
	        # Report the resample rate as the output rate only when resampling
	        # was requested (>= 16000) and it differs from the model's rate.
	        if resample_sr >= 16000 and tgt_sr != resample_sr:
	            tgt_sr = resample_sr
	        info = f"Success. Time: edge-tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
	        print(info)
	        return (
	            info,
	            edge_output_filename,
	            (tgt_sr, audio_opt),
	        )
	    except EOFError:
	        info = (
	            "It seems that the edge-tts output is not valid. "
	            "This may occur when the input text and the speaker do not match. "
	            "For example, maybe you entered Japanese (without alphabets) text but chose non-Japanese speaker?"
	        )
	        print(info)
	        return info, None, None
	    except Exception:
	        # Show the traceback in the UI for ordinary failures, but let
	        # KeyboardInterrupt / SystemExit propagate.
	        info = traceback.format_exc()
	        print(info)
	        return info, None, None







	# Markdown rendered at the top of the page.
	# NOTE(review): the badge alt text reads "clab" (probably "colab"), and the
	# flow below still names edge-tts although tts() synthesizes via tts_new().
	initial_md = """
	# RVC TTS HF 🤗


	[![open in clab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Blane187/rvc-tts/blob/main/rvc_tts.ipynb)


	This is a text-to-speech webui of RVC models.

	Input text ➡[(edge-tts)](https://github.com/rany2/edge-tts)➡ Speech mp3 file ➡[(RVC)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)➡ Final output
	"""

	# Second Markdown block: a link to the project's GitHub repository.
	Another_md = """

	RVC TTS → [🌐 Github](https://github.com/Blane187/rvc-tts.git)

	"""

	# Gradio UI: model / pitch settings in the first row, voice, text, the
	# convert button and the two audio outputs in the second, examples last.
	# NOTE(review): the nesting of the Column blocks was reconstructed from a
	# copy of this file that had lost its indentation -- confirm the layout.
	app = gr.Blocks(title="RVC-TTS")
	with app:
	    gr.Markdown(initial_md)
	    gr.Markdown(Another_md)

	    with gr.Row():
	        with gr.Column():
	            model_name = gr.Dropdown(label="Model", choices=models, value=models[0])
	            f0_key_up = gr.Number(
	                label="Transpose (the best value depends on the models and speakers)",
	                value=0,
	            )
	        with gr.Column():
	            f0_method = gr.Radio(
	                label="Pitch extraction method (Rmvpe is default)",
	                choices=["rmvpe", "crepe"],  # harvest is too slow
	                value="rmvpe",
	                interactive=True,
	            )
	            index_rate = gr.Slider(
	                minimum=0,
	                maximum=1,
	                label="Index rate",
	                value=1,
	                interactive=True,
	            )
	            protect0 = gr.Slider(
	                minimum=0,
	                maximum=0.5,
	                label="Protect",
	                value=0.33,
	                step=0.01,
	                interactive=True,
	            )
	    with gr.Row():
	        with gr.Column():
	            tts_voice = gr.Dropdown(
	                label="Edge-tts speaker (format: language-Country-Name-Gender)",
	                choices=tts_voices,
	                allow_custom_value=False,
	                value="zh-CN-XiaoxiaoMultilingualNeural",
	            )
	            speed = gr.Slider(
	                minimum=-100,
	                maximum=100,
	                label="Speech speed (%)",
	                value=0,
	                step=10,
	                interactive=True,
	            )
	        with gr.Column():
	            tts_text = gr.Textbox(label="Input Text", value="很高兴见到你")
	        with gr.Column():
	            but0 = gr.Button("Convert", variant="primary")
	            info_text = gr.Textbox(label="Output info")
	        with gr.Column():
	            edge_tts_output = gr.Audio(label="Edge Voice", type="filepath")
	        with gr.Column():
	            tts_output = gr.Audio(label="Result")

	    # Input order must match the positional parameters of tts().
	    but0.click(
	        tts,
	        [
	            model_name,
	            speed,
	            tts_text,
	            tts_voice,
	            f0_key_up,
	            f0_method,
	            index_rate,
	            protect0,
	        ],
	        [info_text, edge_tts_output, tts_output],
	    )
	    with gr.Row():
	        # Every example voice must be one of `tts_voices`: the dropdown is
	        # created with allow_custom_value=False, so the former
	        # "ja-JP-NanamiNeural-Female" / "en-US-AriaNeural-Female" rows
	        # selected values the dropdown does not offer.
	        examples = gr.Examples(
	            examples_per_page=100,
	            examples=[
	                ["很高兴见到你", "zh-CN-XiaoxiaoMultilingualNeural"],
	                ["これは日本語テキストから音声への変換デモです。", "zh-CN-XiaoxiaoMultilingualNeural"],
	                [
	                    "This is an English text to speech conversation demo.",
	                    "zh-CN-YunyiMultilingualNeural",
	                ],
	            ],
	            inputs=[tts_text, tts_voice],
	        )

	app.launch()