# TokaiTeio-SVC / inference_main.py
# Hugging Face file-viewer header (uploader: Junity; last commit "finished trans", 51345f0; 5.27 kB).
import io
import logging
import time
from pathlib import Path
import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
from inference import infer_tool
from inference import slicer
from inference.infer_tool import Svc
# Raise the 'numba' logger threshold to WARNING so its lower-severity records
# are not emitted while this module runs.
_numba_logger = logging.getLogger('numba')
_numba_logger.setLevel(logging.WARNING)

# Module-level state read once at import time from the temp JSON file.
# NOTE(review): presumably a cache of previously computed slice chunks --
# confirm against infer_tool.read_temp; it is not used by infer() below.
chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
def infer(file_path, spk_list=('tokaiteio',), trans=(0,), config_path="configs/config.json", device="cpu",
          cluster_model_path="logs/44k/kmeans_10000.pt", slice_db=-40, wav_format='flac',
          auto_predict_f0=False, cluster_infer_ratio=0, noice_scale=0.4, pad_seconds=0.5,
          model_path="logs/44k/G_318400.pth"):
    """Convert one audio file to each requested speaker and write the results to disk.

    The input is sliced into segments with ``slicer.cut`` / ``slicer.chunks2audio``.
    Segments flagged by the slicer are replaced by zeros; every other segment is
    padded with silence, run through ``Svc.infer`` and trimmed again.  One output
    file per speaker is written to
    ``results/<input stem>_<speaker>_<key>[_<cluster ratio>].<wav_format>``.

    Parameter notes marked "CLI help" are translated from the help texts of the
    command-line interface this function replaced.

    Args:
        file_path: Path of the input audio as a string.  ``.wav`` is appended
            when the file name has no extension.
        spk_list: Sequence of target speaker names (CLI help: target speaker
            name for synthesis).  Must not be empty.
        trans: Sequence whose FIRST element is the pitch shift handed to the
            model (CLI help: pitch adjustment in semitones, positive or
            negative).  Remaining elements are ignored.
        config_path: Config file path passed to ``Svc``.
        device: Device passed to ``Svc``.
        cluster_model_path: Cluster model path passed to ``Svc`` (CLI help:
            any value will do if no cluster model was trained).
        slice_db: Threshold passed to ``slicer.cut`` as ``db_thresh`` (CLI
            help: default -40; noisy audio can use -30, dry vocals that should
            keep breaths can use -50).
        wav_format: Output format and file extension given to ``soundfile.write``.
        auto_predict_f0: Forwarded to ``Svc.infer`` (CLI help: automatic pitch
            prediction for speech; do not enable for singing, it goes badly
            off-key).  When true, the output name uses ``auto`` as the key.
        cluster_infer_ratio: Forwarded to ``Svc.infer`` (CLI help: share of the
            cluster scheme, range 0-1; use 0 without a trained cluster model).
        noice_scale: Forwarded to ``Svc.infer`` (CLI help: noise level; affects
            articulation and audio quality).
        pad_seconds: Seconds of silence added before and after each segment
            prior to inference and removed afterwards (CLI help: avoids
            artifacts at segment start and end).
        model_path: Model checkpoint path passed to ``Svc``.

    Returns:
        The path of the last result file written (the one for the final
        speaker in ``spk_list``).

    Raises:
        ValueError: If ``spk_list`` is empty.
        IndexError: If ``trans`` is empty.
    """
    # Validate cheap preconditions before the model is constructed.
    spk_list = list(spk_list)
    if not spk_list:
        raise ValueError("spk_list must contain at least one speaker name")
    tran = trans[0]

    svc_model = Svc(model_path, config_path, device, cluster_model_path)

    # Look at the file name's suffix rather than at any dot in the whole path,
    # so that inputs such as "./raw/song" still get ".wav" appended.
    if not Path(file_path).suffix:
        file_path += ".wav"
    # NOTE(review): format_wav presumably writes a .wav sibling for non-wav
    # input, which is why the .wav path is used from here on -- confirm.
    infer_tool.format_wav(file_path)
    wav_path = Path(file_path).with_suffix('.wav')
    chunks = slicer.cut(wav_path, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)

    # soundfile.write does not create missing directories.
    Path("results").mkdir(parents=True, exist_ok=True)

    res_path = None
    for spk in spk_list:
        audio = []
        for (slice_tag, data) in audio_data:
            print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
            # Expected number of samples of this segment at the model's sample rate.
            length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
            if slice_tag:
                print('jump empty segment')
                _audio = np.zeros(length)
            else:
                # Pad with silence at the input sample rate before inference.
                pad_len = int(audio_sr * pad_seconds)
                data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
                raw_path = io.BytesIO()
                soundfile.write(raw_path, data, audio_sr, format="wav")
                raw_path.seek(0)
                out_audio, _ = svc_model.infer(spk, tran, raw_path,
                                               cluster_infer_ratio=cluster_infer_ratio,
                                               auto_predict_f0=auto_predict_f0,
                                               noice_scale=noice_scale
                                               )
                _audio = out_audio.cpu().numpy()
                # Remove the padding again, now at the output sample rate.  The
                # end index is computed explicitly because `x[n:-n]` is empty
                # when n == 0 (e.g. pad_seconds=0).
                pad_len = int(svc_model.target_sample * pad_seconds)
                _audio = _audio[pad_len:len(_audio) - pad_len]
            audio.extend(infer_tool.pad_array(_audio, length))

        key = "auto" if auto_predict_f0 else f"{tran}key"
        cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
        res_path = f"results/{wav_path.stem}_{spk}_{key}{cluster_name}.{wav_format}"
        soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)

    # All speakers are processed; the path of the last written file is returned.
    return res_path