diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..6c73e337d2806394fafdc3cd4152afbe38bf2b2d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+userdic.csv filter=lfs diff=lfs merge=lfs -text
+TEMP/jieba.cache filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f8ad61c3af5ab894c5b6fd58e7476c921682068c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.dic
\ No newline at end of file
diff --git a/TEMP/jieba.cache b/TEMP/jieba.cache
new file mode 100644
index 0000000000000000000000000000000000000000..4d60dddc16d24e3a042279ea1ebb87c82a2884a0
--- /dev/null
+++ b/TEMP/jieba.cache
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94dd7ac7210667800791cb5ced1f6c0744ae1806ba5b7fdadfd7973cc4bdacf3
+size 9254935
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d005a40fd349176b1957dacfbcebae2f58df696
--- /dev/null
+++ b/app.py
@@ -0,0 +1,384 @@
+import os
+import sys
+# to avoid the modified user.pth file
+cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+os.environ["version"] = 'v2'
+now_dir = os.getcwd()
+sys.path.insert(0, now_dir)
+import gradio as gr
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+import numpy as np
+from pathlib import Path
+import os,librosa,torch, audiosegment
+from scipy.io.wavfile import write as wavwrite
+from GPT_SoVITS.feature_extractor import cnhubert
+cnhubert.cnhubert_base_path=cnhubert_base_path
+from GPT_SoVITS.module.models import SynthesizerTrn
+from GPT_SoVITS.AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from GPT_SoVITS.text import cleaned_text_to_sequence
+from GPT_SoVITS.text.cleaner import clean_text
+from time import time as ttime
+from GPT_SoVITS.module.mel_processing import spectrogram_torch
+import tempfile
+from tools.my_utils import load_audio
+import os
+import json
+
+################ End strange import and user.pth modification ################
+
+# import pyopenjtalk
+# cwd = os.getcwd()
+# if os.path.exists(os.path.join(cwd,'user.dic')):
+# pyopenjtalk.update_global_jtalk_with_user_dict(os.path.join(cwd, 'user.dic'))
+
+
+import logging
+logging.getLogger('httpx').setLevel(logging.WARNING)
+logging.getLogger('httpcore').setLevel(logging.WARNING)
+logging.getLogger('multipart').setLevel(logging.WARNING)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+#device = "cpu"
+is_half = False
+
+tokenizer = AutoTokenizer.from_pretrained(bert_path)
+bert_model=AutoModelForMaskedLM.from_pretrained(bert_path)
+if(is_half==True):bert_model=bert_model.half().to(device)
+else:bert_model=bert_model.to(device)
+# bert_model=bert_model.to(device)
+def get_bert_feature(text, word2ph): # Bert(不是HuBERT的特征计算)
+ with torch.no_grad():
+ inputs = tokenizer(text, return_tensors="pt")
+ for i in inputs:
+ inputs[i] = inputs[i].to(device)#####输入是long不用管精度问题,精度随bert_model
+ res = bert_model(**inputs, output_hidden_states=True)
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
+ assert len(word2ph) == len(text)
+ phone_level_feature = []
+ for i in range(len(word2ph)):
+ repeat_feature = res[i].repeat(word2ph[i], 1)
+ phone_level_feature.append(repeat_feature)
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
+ # if(is_half==True):phone_level_feature=phone_level_feature.half()
+ return phone_level_feature.T
+
+loaded_sovits_model = [] # [(path, dict, model)]
+loaded_gpt_model = []
+ssl_model = cnhubert.get_model()
+if (is_half == True):
+ ssl_model = ssl_model.half().to(device)
+else:
+ ssl_model = ssl_model.to(device)
+
+
+def load_model(sovits_path, gpt_path):
+ global ssl_model
+ global loaded_sovits_model
+ global loaded_gpt_model
+ vq_model = None
+ t2s_model = None
+ dict_s2 = None
+ dict_s1 = None
+ hps = None
+ for path, dict_s2_, model in loaded_sovits_model:
+ if path == sovits_path:
+ vq_model = model
+ dict_s2 = dict_s2_
+ break
+ for path, dict_s1_, model in loaded_gpt_model:
+ if path == gpt_path:
+ t2s_model = model
+ dict_s1 = dict_s1_
+ break
+
+ if dict_s2 is None:
+ dict_s2 = torch.load(sovits_path, map_location="cpu")
+ hps = dict_s2["config"]
+
+ if dict_s1 is None:
+ dict_s1 = torch.load(gpt_path, map_location="cpu")
+ config = dict_s1["config"]
+ class DictToAttrRecursive:
+ def __init__(self, input_dict):
+ for key, value in input_dict.items():
+ if isinstance(value, dict):
+ # 如果值是字典,递归调用构造函数
+ setattr(self, key, DictToAttrRecursive(value))
+ else:
+ setattr(self, key, value)
+
+ hps = DictToAttrRecursive(hps)
+ hps.model.semantic_frame_rate = "25hz"
+
+
+ if not vq_model:
+ vq_model = SynthesizerTrn(
+ hps.data.filter_length // 2 + 1,
+ hps.train.segment_size // hps.data.hop_length,
+ n_speakers=hps.data.n_speakers,
+ **hps.model)
+ if (is_half == True):
+ vq_model = vq_model.half().to(device)
+ else:
+ vq_model = vq_model.to(device)
+ vq_model.eval()
+ vq_model.load_state_dict(dict_s2["weight"], strict=False)
+ loaded_sovits_model.append((sovits_path, dict_s2, vq_model))
+ hz = 50
+ max_sec = config['data']['max_sec']
+ if not t2s_model:
+ t2s_model = Text2SemanticLightningModule(config, "ojbk", is_train=False)
+ t2s_model.load_state_dict(dict_s1["weight"])
+ if (is_half == True): t2s_model = t2s_model.half()
+ t2s_model = t2s_model.to(device)
+ t2s_model.eval()
+ total = sum([param.nelement() for param in t2s_model.parameters()])
+ loaded_gpt_model.append((gpt_path, dict_s1, t2s_model))
+ return vq_model, ssl_model, t2s_model, hps, config, hz, max_sec
+
+
+def get_spepc(hps, filename):
+ audio=load_audio(filename,int(hps.data.sampling_rate))
+ audio = audio / np.max(np.abs(audio))
+ audio=torch.FloatTensor(audio)
+ audio_norm = audio
+ # audio_norm = audio / torch.max(torch.abs(audio))
+ audio_norm = audio_norm.unsqueeze(0)
+ spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
+ return spec
+
+def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
+ def tts_fn(ref_wav_path, prompt_text, prompt_language, target_phone, text_language, target_text = None):
+ t0 = ttime()
+ prompt_text=prompt_text.strip()
+ prompt_language=prompt_language
+ with torch.no_grad():
+ wav16k, sr = librosa.load(ref_wav_path, sr=16000, mono=False)
+ direction = np.array([1,1])
+ if wav16k.ndim == 2:
+ power = np.sum(np.abs(wav16k) ** 2, axis=1)
+ direction = power / np.sum(power)
+ wav16k = (wav16k[0] + wav16k[1]) / 2
+ #
+ # maxx=0.95
+ # tmp_max = np.abs(wav16k).max()
+ # alpha=0.5
+ # wav16k = (wav16k / tmp_max * (maxx * alpha*32768)) + ((1 - alpha)*32768) * wav16k
+ #在这里归一化
+ #print(max(np.abs(wav16k)))
+ #wav16k = wav16k / np.max(np.abs(wav16k))
+ #print(max(np.abs(wav16k)))
+ # 添加0.3s的静音
+            wav16k = np.concatenate([wav16k, np.zeros(int(16000 * 0.3)),])
+ wav16k = torch.from_numpy(wav16k)
+ wav16k = wav16k.float()
+ if(is_half==True):wav16k=wav16k.half().to(device)
+ else:wav16k=wav16k.to(device)
+ ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
+ codes = vq_model.extract_latent(ssl_content)
+ prompt_semantic = codes[0, 0]
+ t1 = ttime()
+ phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
+ phones1=cleaned_text_to_sequence(phones1)
+ #texts=text.split("\n")
+ audio_opt = []
+ zero_wav=np.zeros((2, int(hps.data.sampling_rate*0.3)),dtype=np.float16 if is_half==True else np.float32)
+ phones = get_phone_from_str_list(target_phone, text_language)
+ for phones2 in phones:
+ if(len(phones2) == 0):
+ continue
+ if(len(phones2) == 1 and phones2[0] == ""):
+ continue
+ #phones2, word2ph2, norm_text2 = clean_text(text, text_language)
+ phones2 = cleaned_text_to_sequence(phones2)
+ #if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
+ bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
+ #if(text_language=="zh"):bert2 = get_bert_feature(norm_text2, word2ph2).to(device)
+ bert2 = torch.zeros((1024, len(phones2))).to(bert1)
+ bert = torch.cat([bert1, bert2], 1)
+
+ all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
+ bert = bert.to(device).unsqueeze(0)
+ all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+ prompt = prompt_semantic.unsqueeze(0).to(device)
+ t2 = ttime()
+ idx = 0
+ cnt = 0
+ while idx == 0 and cnt < 2:
+ with torch.no_grad():
+ # pred_semantic = t2s_model.model.infer
+ pred_semantic,idx = t2s_model.model.infer_panel(
+ all_phoneme_ids,
+ all_phoneme_len,
+ prompt,
+ bert,
+ # prompt_phone_len=ph_offset,
+ top_k=config['inference']['top_k'],
+ early_stop_num=hz * max_sec)
+ t3 = ttime()
+ cnt+=1
+ if idx == 0:
+                    return "Error: Generation failure: bad zero prediction.", None, None
+ pred_semantic = pred_semantic[:,-idx:].unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次
+ refer = get_spepc(hps, ref_wav_path)#.to(device)
+ if(is_half==True):refer=refer.half().to(device)
+ else:refer=refer.to(device)
+ # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+ audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer).detach().cpu().numpy()[0, 0]###试试重建不带上prompt部分
+ # direction乘上,变双通道
+ # 强制0.5
+ direction = np.array([1, 1])
+ audio = np.expand_dims(audio, 0) * direction[:, np.newaxis]
+ audio_opt.append(audio)
+ audio_opt.append(zero_wav)
+ t4 = ttime()
+
+ audio = (hps.data.sampling_rate,(np.concatenate(audio_opt, axis=1)*32768).astype(np.int16).T)
+ prefix_1 = prompt_text[:8].replace(" ", "_").replace("\n", "_").replace("?","_").replace("!","_").replace(",","_")
+ prefix_2 = target_text[:8].replace(" ", "_").replace("\n", "_").replace("?","_").replace("!","_").replace(",","_")
+ filename = tempfile.mktemp(suffix=".wav",prefix=f"{prefix_1}_{prefix_2}_")
+ #audiosegment.from_numpy_array(audio[1].T, framerate=audio[0]).export(filename, format="WAV")
+ wavwrite(filename, audio[0], audio[1])
+ return "Success", audio, filename
+ return tts_fn
+
+
+def get_str_list_from_phone(text, text_language):
+ # raw文本过g2p得到音素列表,再转成字符串
+ # 注意,这里的text是一个段落,可能包含多个句子
+ # 段落间\n分割,音素间空格分割
+ print(text)
+ texts=text.split("\n")
+ phone_list = []
+ for text in texts:
+ phones2, word2ph2, norm_text2 = clean_text(text, text_language)
+ phone_list.append(" ".join(phones2))
+ return "\n".join(phone_list)
+
+def get_phone_from_str_list(str_list:str, language:str = 'ja'):
+ # 从音素字符串中得到音素列表
+ # 注意,这里的text是一个段落,可能包含多个句子
+ # 段落间\n分割,音素间空格分割
+ sentences = str_list.split("\n")
+ phones = []
+ for sentence in sentences:
+ phones.append(sentence.split(" "))
+ return phones
+
+splits={",","。","?","!",",",".","?","!","~",":",":","—","…",}#不考虑省略号
+def split(todo_text):
+ todo_text = todo_text.replace("……", "。").replace("——", ",")
+ if (todo_text[-1] not in splits): todo_text += "。"
+ i_split_head = i_split_tail = 0
+ len_text = len(todo_text)
+ todo_texts = []
+ while (1):
+ if (i_split_head >= len_text): break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入
+ if (todo_text[i_split_head] in splits):
+ i_split_head += 1
+ todo_texts.append(todo_text[i_split_tail:i_split_head])
+ i_split_tail = i_split_head
+ else:
+ i_split_head += 1
+ return todo_texts
+
+
+def change_reference_audio(prompt_text, transcripts):
+ return transcripts[prompt_text]
+
+
+models = []
+models_info = json.load(open("./models/models_info.json", "r", encoding="utf-8"))
+
+
+
+for i, info in models_info.items():
+ title = info['title']
+ cover = info['cover']
+ gpt_weight = info['gpt_weight']
+ sovits_weight = info['sovits_weight']
+ example_reference = info['example_reference']
+ transcripts = {}
+ transcript_path = info["transcript_path"]
+ path = os.path.dirname(transcript_path)
+ with open(transcript_path, 'r', encoding='utf-8') as file:
+ for line in file:
+ line = line.strip().replace("\\", "/")
+ items = line.split("|")
+ wav,t = items[0], items[-1]
+ wav = os.path.basename(wav)
+ transcripts[t] = os.path.join(os.path.join(path,"reference_audio"), wav)
+
+ vq_model, ssl_model, t2s_model, hps, config, hz, max_sec = load_model(sovits_weight, gpt_weight)
+
+
+ models.append(
+ (
+ i,
+ title,
+ cover,
+ transcripts,
+ example_reference,
+ create_tts_fn(
+ vq_model, ssl_model, t2s_model, hps, config, hz, max_sec
+ )
+ )
+ )
+with gr.Blocks() as app:
+ gr.Markdown(
+        "# GPT-SoVITS Demo\n"
+ )
+ with gr.Tabs():
+ for (name, title, cover, transcripts, example_reference, tts_fn) in models:
+ with gr.TabItem(name):
+ with gr.Row():
+ gr.Markdown(
+ '')
+ with gr.Row():
+ with gr.Column():
+ prompt_text = gr.Dropdown(
+ label="Transcript of the Reference Audio",
+ value=example_reference if example_reference in transcripts else list(transcripts.keys())[0],
+ choices=list(transcripts.keys())
+ )
+ inp_ref_audio = gr.Audio(
+ label="Reference Audio",
+ type="filepath",
+ interactive=False,
+ value=transcripts[example_reference] if example_reference in transcripts else list(transcripts.values())[0]
+ )
+ transcripts_state = gr.State(value=transcripts)
+ prompt_text.change(
+ fn=change_reference_audio,
+ inputs=[prompt_text, transcripts_state],
+ outputs=[inp_ref_audio]
+ )
+ prompt_language = gr.State(value="ja")
+ with gr.Column():
+ text = gr.Textbox(label="Input Text", value="私はお兄ちゃんのだいだいだーいすきな妹なんだから、言うことなんでも聞いてくれますよね!")
+ text_language = gr.Dropdown(
+ label="Language",
+ choices=["ja"],
+ value="ja"
+ )
+ clean_button = gr.Button("Clean Text", variant="primary")
+ inference_button = gr.Button("Generate", variant="primary")
+ cleaned_text = gr.Textbox(label="Cleaned Text")
+ output = gr.Audio(label="Output Audio")
+ output_file = gr.File(label="Output Audio File")
+ om = gr.Textbox(label="Output Message")
+ clean_button.click(
+ fn=get_str_list_from_phone,
+ inputs=[text, text_language],
+ outputs=[cleaned_text]
+ )
+ inference_button.click(
+ fn=tts_fn,
+ inputs=[inp_ref_audio, prompt_text, prompt_language, cleaned_text, text_language, text],
+ outputs=[om, output, output_file]
+ )
+
+app.launch(share=True)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8ed0d73e4e446ea168026990f6de1d65f3920cb2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,35 @@
+numpy==1.23.4
+scipy
+tensorboard
+librosa==0.9.2
+numba==0.56.4
+pytorch-lightning
+gradio
+gradio_client
+ffmpeg-python
+onnxruntime; sys_platform == 'darwin'
+onnxruntime-gpu; sys_platform != 'darwin'
+tqdm
+funasr==1.0.27
+cn2an
+pypinyin
+pyopenjtalk
+g2p_en
+torchaudio
+modelscope==1.10.0
+sentencepiece
+transformers
+chardet
+PyYAML
+psutil
+jieba_fast
+jieba
+LangSegment>=0.2.0
+wordsegment
+rotary_embedding_torch
+pyjyutping
+g2pk2
+ko_pron
+opencc; sys_platform != 'linux'
+opencc==1.1.1; sys_platform == 'linux'
+python_mecab_ko; sys_platform != 'win32'
\ No newline at end of file
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/__pycache__/__init__.cpython-310.pyc b/tools/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ceef744777488674b7b7119a82c948a72791fed6
Binary files /dev/null and b/tools/__pycache__/__init__.cpython-310.pyc differ
diff --git a/tools/__pycache__/my_utils.cpython-310.pyc b/tools/__pycache__/my_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1bd98df4a7af1f1391cbeddcbd3b863805851b8b
Binary files /dev/null and b/tools/__pycache__/my_utils.cpython-310.pyc differ
diff --git a/tools/asr/__pycache__/config.cpython-310.pyc b/tools/asr/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97fff4f061d54ebe4f7b5e0dd0704e8a095a0ce8
Binary files /dev/null and b/tools/asr/__pycache__/config.cpython-310.pyc differ
diff --git a/tools/asr/config.py b/tools/asr/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0b3e0422a013956995da59cc07e455dd6303114
--- /dev/null
+++ b/tools/asr/config.py
@@ -0,0 +1,33 @@
+import os
+
+def check_fw_local_models():
+ '''
+ 启动时检查本地是否有 Faster Whisper 模型.
+ '''
+ model_size_list = [
+ "tiny", "tiny.en",
+ "base", "base.en",
+ "small", "small.en",
+ "medium", "medium.en",
+ "large", "large-v1",
+ "large-v2", "large-v3"]
+ for i, size in enumerate(model_size_list):
+ if os.path.exists(f'tools/asr/models/faster-whisper-{size}'):
+ model_size_list[i] = size + '-local'
+ return model_size_list
+
+asr_dict = {
+ "达摩 ASR (中文)": {
+ 'lang': ['zh'],
+ 'size': ['large'],
+ 'path': 'funasr_asr.py',
+ 'precision': ['float32']
+ },
+ "Faster Whisper (多语种)": {
+ 'lang': ['auto', 'zh', 'en', 'ja'],
+ 'size': check_fw_local_models(),
+ 'path': 'fasterwhisper_asr.py',
+ 'precision': ['float32', 'float16', 'int8']
+ },
+}
+
diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py
new file mode 100644
index 0000000000000000000000000000000000000000..24befd47f5d73ecc160ec41e08e8a50445e659cc
--- /dev/null
+++ b/tools/asr/fasterwhisper_asr.py
@@ -0,0 +1,114 @@
+import argparse
+import os
+import traceback
+
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import torch
+from faster_whisper import WhisperModel
+from tqdm import tqdm
+
+from tools.asr.config import check_fw_local_models
+
+language_code_list = [
+ "af", "am", "ar", "as", "az",
+ "ba", "be", "bg", "bn", "bo",
+ "br", "bs", "ca", "cs", "cy",
+ "da", "de", "el", "en", "es",
+ "et", "eu", "fa", "fi", "fo",
+ "fr", "gl", "gu", "ha", "haw",
+ "he", "hi", "hr", "ht", "hu",
+ "hy", "id", "is", "it", "ja",
+ "jw", "ka", "kk", "km", "kn",
+ "ko", "la", "lb", "ln", "lo",
+ "lt", "lv", "mg", "mi", "mk",
+ "ml", "mn", "mr", "ms", "mt",
+ "my", "ne", "nl", "nn", "no",
+ "oc", "pa", "pl", "ps", "pt",
+ "ro", "ru", "sa", "sd", "si",
+ "sk", "sl", "sn", "so", "sq",
+ "sr", "su", "sv", "sw", "ta",
+ "te", "tg", "th", "tk", "tl",
+ "tr", "tt", "uk", "ur", "uz",
+ "vi", "yi", "yo", "zh", "yue",
+ "auto"]
+
+def execute_asr(input_folder, output_folder, model_size, language, precision):
+ if '-local' in model_size:
+ model_size = model_size[:-6]
+ model_path = f'tools/asr/models/faster-whisper-{model_size}'
+ else:
+ model_path = model_size
+ if language == 'auto':
+ language = None #不设置语种由模型自动输出概率最高的语种
+ print("loading faster whisper model:",model_size,model_path)
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ try:
+ model = WhisperModel(model_path, device=device, compute_type=precision)
+    except Exception:
+ return print(traceback.format_exc())
+
+ input_file_names = os.listdir(input_folder)
+ input_file_names.sort()
+
+ output = []
+ output_file_name = os.path.basename(input_folder)
+
+ for file_name in tqdm(input_file_names):
+ try:
+ file_path = os.path.join(input_folder, file_name)
+ segments, info = model.transcribe(
+ audio = file_path,
+ beam_size = 5,
+ vad_filter = True,
+ vad_parameters = dict(min_silence_duration_ms=700),
+ language = language)
+ text = ''
+
+ if info.language == "zh":
+ print("检测为中文文本, 转 FunASR 处理")
+ if("only_asr"not in globals()):
+ from tools.asr.funasr_asr import \
+ only_asr # #如果用英文就不需要导入下载模型
+ text = only_asr(file_path)
+
+ if text == '':
+ for segment in segments:
+ text += segment.text
+ output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
+ except:
+ print(traceback.format_exc())
+
+ output_folder = output_folder or "output/asr_opt"
+ os.makedirs(output_folder, exist_ok=True)
+ output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
+
+ with open(output_file_path, "w", encoding="utf-8") as f:
+ f.write("\n".join(output))
+ print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
+ return output_file_path
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input_folder", type=str, required=True,
+ help="Path to the folder containing WAV files.")
+ parser.add_argument("-o", "--output_folder", type=str, required=True,
+ help="Output folder to store transcriptions.")
+ parser.add_argument("-s", "--model_size", type=str, default='large-v3',
+ choices=check_fw_local_models(),
+ help="Model Size of Faster Whisper")
+ parser.add_argument("-l", "--language", type=str, default='ja',
+ choices=language_code_list,
+ help="Language of the audio files.")
+ parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32','int8'],
+ help="fp16, int8 or fp32")
+
+ cmd = parser.parse_args()
+ output_file_path = execute_asr(
+ input_folder = cmd.input_folder,
+ output_folder = cmd.output_folder,
+ model_size = cmd.model_size,
+ language = cmd.language,
+ precision = cmd.precision,
+ )
diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py
new file mode 100644
index 0000000000000000000000000000000000000000..9352c1fe4ccf23a26d4cad7afad306f26e5464b0
--- /dev/null
+++ b/tools/asr/funasr_asr.py
@@ -0,0 +1,79 @@
+# -*- coding:utf-8 -*-
+
+import argparse
+import os
+import traceback
+from tqdm import tqdm
+# from funasr.utils import version_checker
+# version_checker.check_for_update = lambda: None
+from funasr import AutoModel
+
+path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
+path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
+path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+
+
+model = AutoModel(
+ model = path_asr,
+ model_revision = "v2.0.4",
+ vad_model = path_vad,
+ vad_model_revision = "v2.0.4",
+ punc_model = path_punc,
+ punc_model_revision = "v2.0.4",
+)
+
+def only_asr(input_file):
+ try:
+ text = model.generate(input=input_file)[0]["text"]
+ except:
+ text = ''
+ print(traceback.format_exc())
+ return text
+
+def execute_asr(input_folder, output_folder, model_size, language):
+ input_file_names = os.listdir(input_folder)
+ input_file_names.sort()
+
+ output = []
+ output_file_name = os.path.basename(input_folder)
+
+ for file_name in tqdm(input_file_names):
+ try:
+ file_path = os.path.join(input_folder, file_name)
+ text = model.generate(input=file_path)[0]["text"]
+ output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
+ except:
+ print(traceback.format_exc())
+
+ output_folder = output_folder or "output/asr_opt"
+ os.makedirs(output_folder, exist_ok=True)
+ output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
+
+ with open(output_file_path, "w", encoding="utf-8") as f:
+ f.write("\n".join(output))
+ print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
+ return output_file_path
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input_folder", type=str, required=True,
+ help="Path to the folder containing WAV files.")
+ parser.add_argument("-o", "--output_folder", type=str, required=True,
+ help="Output folder to store transcriptions.")
+ parser.add_argument("-s", "--model_size", type=str, default='large',
+ help="Model Size of FunASR is Large")
+ parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'],
+ help="Language of the audio files.")
+ parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
+ help="fp16 or fp32")#还没接入
+
+ cmd = parser.parse_args()
+ execute_asr(
+ input_folder = cmd.input_folder,
+ output_folder = cmd.output_folder,
+ model_size = cmd.model_size,
+ language = cmd.language,
+ )
diff --git a/tools/asr/models/.gitignore b/tools/asr/models/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a3a0c8b5f48c0260a4cb43aa577f9b18896ee280
--- /dev/null
+++ b/tools/asr/models/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/tools/cmd-denoise.py b/tools/cmd-denoise.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fdcab6dc1c8a3727d69faa96349b889b0d76d6d
--- /dev/null
+++ b/tools/cmd-denoise.py
@@ -0,0 +1,33 @@
+import os,argparse
+import traceback
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from tqdm import tqdm
+
+path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
+path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
+ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
+def execute_denoise(input_folder,output_folder):
+ os.makedirs(output_folder,exist_ok=True)
+ # print(input_folder)
+ # print(list(os.listdir(input_folder).sort()))
+ for name in tqdm(os.listdir(input_folder)):
+ try:
+ ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name))
+ except:
+ traceback.print_exc()
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input_folder", type=str, required=True,
+ help="Path to the folder containing WAV files.")
+ parser.add_argument("-o", "--output_folder", type=str, required=True,
+ help="Output folder to store transcriptions.")
+ parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
+ help="fp16 or fp32")#还没接入
+ cmd = parser.parse_args()
+ execute_denoise(
+ input_folder = cmd.input_folder,
+ output_folder = cmd.output_folder,
+ )
\ No newline at end of file
diff --git a/tools/denoise-model/.gitignore b/tools/denoise-model/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..005717ead0bb8f920c00d76feb8207deb7946a57
--- /dev/null
+++ b/tools/denoise-model/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/tools/i18n/__pycache__/i18n.cpython-310.pyc b/tools/i18n/__pycache__/i18n.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..983fce0e4be24615cd323973a7b5ee6ed5673480
Binary files /dev/null and b/tools/i18n/__pycache__/i18n.cpython-310.pyc differ
diff --git a/tools/i18n/i18n.py b/tools/i18n/i18n.py
new file mode 100644
index 0000000000000000000000000000000000000000..a20c109e4e3b8bb3e1b2bdfe92af10a57617253a
--- /dev/null
+++ b/tools/i18n/i18n.py
@@ -0,0 +1,30 @@
+import json
+import locale
+import os
+
+I18N_JSON_DIR : os.PathLike = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'locale')
+
+def load_language_list(language):
+ with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f:
+ language_list = json.load(f)
+ return language_list
+
+class I18nAuto:
+ def __init__(self, language=None):
+ if language in ["Auto", None]:
+ language = locale.getdefaultlocale()[0]
+ # getlocale can't identify the system's language ((None, None))
+ if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")):
+ language = "en_US"
+ self.language = language
+ self.language_map = load_language_list(language)
+
+ def __call__(self, key):
+ return self.language_map.get(key, key)
+
+ def __repr__(self):
+ return "Use Language: " + self.language
+
+if __name__ == "__main__":
+ i18n = I18nAuto(language='en_US')
+ print(i18n)
\ No newline at end of file
diff --git a/tools/i18n/locale/en_US.json b/tools/i18n/locale/en_US.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4b1dbd6fe5b6f8f1e25d287a45f3d390dffa38c
--- /dev/null
+++ b/tools/i18n/locale/en_US.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb): Best choice for dual-channel reverberation, cannot remove single-channel reverberation;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverberation, can remove mono reverberation, but does not clean heavily high-frequency plate reverberation.",
+ "*GPT模型列表": "*GPT models list",
+ "*SoVITS模型列表": "*SoVITS models list",
+ "*实验/模型名": "*Experiment/model name",
+ "*文本标注文件": "*Text labelling file",
+ "*训练集音频文件目录": "*Audio dataset folder",
+ "*请上传并填写参考信息": "*Please upload and fill reference information",
+ "*请填写需要合成的目标文本和语种模式": "*Please fill in the target text and language mode for synthesis",
+ ".list标注文件的路径": ".list annotation file path",
+ "0-前置数据集获取工具": "0-Fetch dataset",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5 webui (for vocal separation, deecho, dereverb and denoise)",
+ "0b-语音切分工具": "0b-Audio slicer",
+ "0bb-语音降噪工具": "0bb-Voice denoiser",
+ "0c-中文批量离线ASR工具": "0c-Chinese ASR tool",
+ "0d-语音文本校对标注工具": "0d-Speech to text proofreading tool",
+ "1-GPT-SoVITS-TTS": "1-GPT-SOVITS-TTS",
+ "1A-训练集格式化工具": "1A-Dataset formatting",
+ "1Aa-文本内容": "1Aa-Text",
+ "1Aabc-训练集格式化一键三连": "1Aabc-One-click formatting",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL self-supervised feature extraction",
+ "1Ac-语义token提取": "1Ac-semantics token extraction",
+ "1B-微调训练": "1B-Fine-tuned training",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS training. The model is located in SoVITS_weights.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT training. The model is located in GPT_weights.",
+ "1C-推理": "1C-inference",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. The DeEcho-DeReverb model's processing time is nearly twice that of the other two DeEcho models.",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Preserve Vocals: Choose this option for audio without harmonies, as it better retains the main vocal compared to the HP5 model. This option includes two built-in models, HP2 and HP3. HP3 may slightly let through some accompaniment but retains the main vocal slightly better than HP2.",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-Voice Changer",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverb Model is slow;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Keep Only Main Vocal: Choose this option for audio with harmonies, as it may slightly reduce the main vocal. Includes one built-in HP5 model;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. Personal Recommendation for the cleanest configuration: First use MDX-Net followed by DeEcho-Aggressive",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Reverberation and delay removal model(by FoxJoy):",
+ "ASR 模型": "ASR model",
+ "ASR 模型尺寸": "ASR model size",
+ "数据类型精度": "Computing precision",
+ "ASR 语言设置": "ASR language",
+ "ASR进程输出信息": "ASR output log",
+ "GPT模型列表": "GPT weight list",
+ "GPT训练进程输出信息": "GPT training output log",
+ "GPU卡号,只能填1个整数": "GPU number, can only input ONE integer",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ",
+ "SSL进程输出信息": "SSL output log",
+ "SoVITS模型列表": "SoVITS weight list",
+ "SoVITS训练进程输出信息": "SoVITS training output log",
+ "TTS推理WebUI进程输出信息": "TTS inference webui output log",
+ "TTS推理进程已关闭": "TTS inference process closed",
+ "TTS推理进程已开启": "TTS inference process is opened",
+ "UVR5已关闭": "UVR5 closed",
+ "UVR5已开启": "UVR5 opened ",
+ "UVR5进程输出信息": "UVR5 process output log",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT sampling parameters (not too low when there's no reference text. Use default if unsure):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
+ "max:归一化后最大值多少": "Loudness multiplier after normalized",
+ "max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
+ "min_interval:最短切割间隔": "Minumum interval for audio cutting",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: the minimum length of each segment. If the first segment is too short, it will be concatenated with the next segment until it exceeds this value",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "Noise gate threshold (loudness below this value will be treated as noise",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "One-click formatting output",
+ "不切": "No slice",
+ "中文": "Chinese",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Chinese Tutorial:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Chinese-English Mixed",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "Multiple audio files can also be imported. If a folder path exists, this input is ignored.",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Batch processing for vocal and instrumental separation, using the UVR5 model.",
+ "人声提取激进程度": "Vocal extraction aggressiveness",
+ "伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "When using the no-reference text mode, it is recommended to use a fine-tuned GPT. If the reference audio is unclear and you don't know what to write, you can enable this feature, which will ignore the reference text you've entered.",
+ "保存频率save_every_epoch": "Save frequency (save_every_epoch):",
+ "凑50字一切": "Cut per 50 characters",
+ "凑四句一切": "Slice once every 4 sentences",
+ "切分后文本": "Text after sliced",
+ "切分后的子音频的输出根目录": "Audio slicer output folder",
+ "切割使用的进程数": "CPU threads used for audio slicing",
+ "刷新模型路径": "refreshing model paths",
+ "前端处理后的文本(每句):": "Processed text from the frontend (per sentence):",
+ "去混响/去延迟,附:": "Dereverberation/Delay Removal, including:",
+ "参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range, please choose another one!",
+ "参考音频的文本": "Text for reference audio",
+ "参考音频的语种": "Language for reference audio",
+ "合成语音": "Start inference",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": " Step-to-step phoneme transformation and modification coming soon!",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Please fill in the segmented audio files' directory! The full path of the audio file = the directory concatenated with the filename corresponding to the waveform in the list file (not the full path). If left blank, the absolute full path in the .list file will be used.",
+ "多语种混合": "Multilingual Mixed",
+ "实际输入的参考文本:": "Actual Input Reference Text:",
+ "实际输入的目标文本(切句后):": "Actual Input Target Text (after sentence segmentation):",
+ "实际输入的目标文本(每句):": "Actual Input Target Text (per sentence):",
+ "实际输入的目标文本:": "Actual Input Target Text:",
+ "导出文件格式": "Export file format",
+ "开启GPT训练": "Start GPT training",
+ "开启SSL提取": "Start SSL extracting",
+ "开启SoVITS训练": "Start SoVITS training",
+ "开启一键三连": "Start one-click formatting",
+ "开启文本获取": "Start speech-to-text",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.",
+ "开启离线批量ASR": "Start batch ASR",
+ "开启语义token提取": "Start semantics token extraction",
+ "开启语音切割": "Start audio slicer",
+ "开启语音降噪": "Start voice denoiser",
+ "怎么切": "How to slice the sentence",
+ "总训练轮数total_epoch": "Total training epochs (total_epoch):",
+ "总训练轮数total_epoch,不建议太高": "Total epochs, do not increase to a value that is too high",
+ "打标工具WebUI已关闭": "proofreading tool webui is closed",
+ "打标工具WebUI已开启": "proofreading tool webui is opened",
+ "打标工具进程输出信息": "Proofreading tool output log",
+ "指定输出主人声文件夹": "Specify the output folder for vocals:",
+ "指定输出非主人声文件夹": "Specify the output folder for accompaniment:",
+ "按中文句号。切": "Slice by Chinese punct",
+ "按标点符号切": "Slice by every punct",
+ "按英文句号.切": "Slice by English punct",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Text slicer tool, since there will be issues when infering long texts, so it is advised to cut first. When infering, it will infer respectively then combined together.",
+ "文本模块学习率权重": "Text model learning rate weighting",
+ "文本进程输出信息": "Text processing output",
+ "施工中,请静候佳音": "In construction, please wait",
+ "日文": "Japanese",
+ "日英混合": "Japanese-English Mixed",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "Save only the latest '.ckpt' file to save disk space:",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "Save a small final model to the 'weights' folder at each save point:",
+ "是否开启TTS推理WebUI": "Open TTS inference WEBUI",
+ "是否开启UVR5-WebUI": "Open UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "Enable DPO training (experimental feature)",
+ "是否开启打标WebUI": "Open labelling WebUI",
+ "是否直接对上次合成结果调整语速。防止随机性。": "Whether to directly adjust the speech rate of the last synthesis result to prevent randomness.",
+ "显卡信息": "GPU Information",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "This software is open source under the MIT license. The author does not have any control over the software. Users who use the software and distribute the sounds exported by the software are solely responsible.
If you do not agree with this clause, you cannot use or reference any codes and files within the software package. See the root directory Agreement-LICENSE for details.",
+ "模型": "Model",
+ "模型分为三类:": "Models are categorized into three types:",
+ "模型切换": "Model switch",
+ "每张显卡的batch_size": "Batch size per GPU:",
+ "终止ASR进程": "Stop ASR task",
+ "终止GPT训练": "Stop GPT training",
+ "终止SSL提取进程": "Stop SSL extraction",
+ "终止SoVITS训练": "Stop SoVITS training",
+ "终止一键三连": "Stop one-click formatting",
+ "终止文本获取进程": "Stop speech-to-text",
+ "终止语义token提取进程": "Stop semantics token extraction",
+ "终止语音切割": "Stop audio cutting",
+ "终止语音降噪进程": "Stop voice denoising",
+ "英文": "English",
+ "语义token提取进程输出信息": "Sematics token extraction output log",
+ "语速": "Speech rate",
+ "语速调整,高为更快": "Adjust speech rate, higher for faster",
+ "语音切割进程输出信息": "Audio slicer output log",
+ "语音降噪进程输出信息": "Voice Denoiser Process Output Information",
+ "请上传3~10秒内参考音频,超过会报错!": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
+ "请输入有效文本": "Please enter valid text.",
+ "转换": "Convert",
+ "输入待处理音频文件夹路径": "Enter the path of the audio folder to be processed:",
+ "输入文件夹路径": "Input folder path",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "output folder (logs/{experiment name}) should have files and folders starts with 23456.",
+ "输出信息": "Output information",
+ "输出文件夹路径": "Output folder path",
+ "输出的语音": "Inference Result",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. The default one is a pretrain, so you can experience zero shot TTS.",
+ "降噪结果输出文件夹": "Denoised Results Output Folder",
+ "降噪音频文件输入文件夹": "Denoising Audio File Input Folder",
+ "需要合成的切分前文本": "Inference text that needs to be sliced",
+ "需要合成的文本": "Inference text",
+ "需要合成的语种": "Inference text language",
+ "音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)",
+ "预训练的GPT模型路径": "Pretrained GPT model path",
+ "预训练的SSL模型路径": "Pretrained SSL model path",
+ "预训练的SoVITS-D模型路径": "Pretrained SoVITS-D model path",
+ "预训练的SoVITS-G模型路径": "Pretrained SoVITS-G model path",
+ "预训练的中文BERT模型路径": " Pretrained BERT model path"
+}
diff --git a/tools/i18n/locale/es_ES.json b/tools/i18n/locale/es_ES.json
new file mode 100644
index 0000000000000000000000000000000000000000..ad0e64bc7b7122803a9e96a070e6d34241a6c27a
--- /dev/null
+++ b/tools/i18n/locale/es_ES.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): reverberación estéreo, la mejor opción; no puede eliminar reverberación mono",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Eliminar el efecto de retardo. Aggressive elimina más que Normal, DeReverb elimina reverberación adicional, puede eliminar reverberación mono, pero no limpia bien la reverberación de placa de alta frecuencia",
+ "*GPT模型列表": "*Lista de modelos GPT",
+ "*SoVITS模型列表": "*Lista de modelos SoVITS",
+ "*实验/模型名": "*Nombre del experimento/modelo",
+ "*文本标注文件": "*Archivo de etiquetado de texto",
+ "*训练集音频文件目录": "*Directorio de archivos de audio de entrenamiento",
+ "*请上传并填写参考信息": "*Por favor, suba y complete la información de referencia",
+ "*请填写需要合成的目标文本和语种模式": "*Por favor, complete el texto objetivo a sintetizar y el modo de idioma",
+ ".list标注文件的路径": "Ruta del archivo de anotación .list",
+ "0-前置数据集获取工具": "0-Herramienta de obtención de conjunto de datos previo",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-Herramienta de separación de voz y acompañamiento UVR5 y eliminación de reverberación y retardo",
+ "0b-语音切分工具": "0b-Herramienta de división de voz",
+ "0bb-语音降噪工具": "0bb-Herramienta de reducción de ruido de voz",
+ "0c-中文批量离线ASR工具": "0c-Herramienta de ASR en lote fuera de línea en chino",
+ "0d-语音文本校对标注工具": "0d-Herramienta de corrección y etiquetado de texto de voz",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-Herramienta de formateo del conjunto de datos de entrenamiento",
+ "1Aa-文本内容": "1Aa-Contenido del texto",
+ "1Aabc-训练集格式化一键三连": "1Aabc-Formateo del conjunto de datos de entrenamiento en un solo paso",
+ "1Ab-SSL自监督特征提取": "1Ab-Extracción de características auto-supervisada SSL",
+ "1Ac-语义token提取": "1Ac-Extracción de tokens semánticos",
+ "1B-微调训练": "1B-Entrenamiento de ajuste fino",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-Entrenamiento de SoVITS. Los archivos de modelo para compartir se encuentran en SoVITS_weights.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-Entrenamiento de GPT. Los archivos de modelo para compartir se encuentran en GPT_weights.",
+ "1C-推理": "1C-Inferencia",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. El modelo DeEcho-DeReverb tarda casi el doble que los otros dos modelos DeEcho",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Retener voz principal: seleccione este para audio sin coros, retiene mejor la voz principal que HP5. Incluye dos modelos, HP2 y HP3; HP3 puede filtrar ligeramente el acompañamiento pero retiene mejor la voz principal que HP2",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-Cambio de voz",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. El modelo MDX-Net-Dereverb es bastante lento",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Solo retener voz principal: seleccione este para audio con coros, puede debilitar la voz principal. Incluye un modelo HP5",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. La configuración más limpia recomendada es primero MDX-Net, luego DeEcho-Aggressive",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Modelos de eliminación de reverberación y retardo (por FoxJoy)",
+ "ASR 模型": "Modelo ASR",
+ "ASR 模型尺寸": "Tamaño del modelo ASR",
+ "数据类型精度": "precisión del tipo de datos",
+ "ASR 语言设置": "Configuración del idioma ASR",
+ "ASR进程输出信息": "Información de salida del proceso ASR",
+ "GPT模型列表": "Lista de modelos GPT",
+ "GPT训练进程输出信息": "Información de salida del proceso de entrenamiento de GPT",
+ "GPU卡号,只能填1个整数": "Número de tarjeta GPU, solo se puede ingresar un número entero",
+ "GPU卡号以-分割,每个卡号一个进程": "Número de tarjeta GPU separado por '-', cada número de tarjeta es un proceso",
+ "SSL进程输出信息": "Información de salida del proceso SSL",
+ "SoVITS模型列表": "Lista de modelos SoVITS",
+ "SoVITS训练进程输出信息": "Información de salida del proceso de entrenamiento de SoVITS",
+ "TTS推理WebUI进程输出信息": "Información de salida del proceso de interfaz web de inferencia TTS",
+ "TTS推理进程已关闭": "Proceso de inferencia TTS cerrado",
+ "TTS推理进程已开启": "Proceso de inferencia TTS iniciado",
+ "UVR5已关闭": "UVR5 está deshabilitado",
+ "UVR5已开启": "UVR5 está habilitado",
+ "UVR5进程输出信息": "Información de salida del proceso UVR5",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "Parámetros de muestreo de GPT (no demasiado bajos cuando no hay texto de referencia. Use los valores por defecto si no está seguro):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)",
+ "max:归一化后最大值多少": "max: valor máximo después de la normalización",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept: duración máxima del silencio después del corte",
+ "min_interval:最短切割间隔": "min_interval: intervalo mínimo de corte",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: longitud mínima de cada segmento; si el primer segmento es demasiado corto, se une al siguiente hasta superar este valor",
+ "temperature": "temperatura",
+ "threshold:音量小于这个值视作静音的备选切割点": "umbral: puntos de corte alternativos considerados como silencio si el volumen es menor que este valor",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "Información de salida del proceso de triple acción",
+ "不切": "No cortar",
+ "中文": "Chino",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentación del tutorial en chino: https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Chino e inglés mezclados",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden ingresar archivos de audio por lotes, seleccionar uno, prioridad para leer carpetas",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Procesamiento por lotes de separación de voz y acompañamiento utilizando el modelo UVR5",
+ "人声提取激进程度": "Nivel de agresividad en la extracción de voz",
+ "伴奏人声分离&去混响&去回声": "Separación de acompañamiento y voz principal y eliminación de reverberación y eco",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Se recomienda usar un GPT ajustado en modo sin texto de referencia; habilítelo si no puede entender el audio de referencia (si no sabe qué escribir). Una vez habilitado, ignorará el texto de referencia ingresado.",
+ "保存频率save_every_epoch": "Frecuencia de guardado (cada epoch)",
+ "凑50字一切": "Todo para alcanzar las 50 palabras",
+ "凑四句一切": "Completa cuatro oraciones para rellenar todo",
+ "切分后文本": "Texto después de la división",
+ "切分后的子音频的输出根目录": "Directorio raíz de salida de los sub-audios después de la división",
+ "切割使用的进程数": "Número de procesos utilizados para la división",
+ "刷新模型路径": "Actualizar la ruta del modelo",
+ "前端处理后的文本(每句):": "Texto después del procesamiento previo (por frase):",
+ "去混响/去延迟,附:": "Eliminación de reverberación/retardo, incluye:",
+ "参考音频在3~10秒范围外,请更换!": "El audio de referencia está fuera del rango de 3 a 10 segundos, ¡por favor cámbielo!",
+ "参考音频的文本": "Texto de referencia del audio",
+ "参考音频的语种": "Idioma del audio de referencia",
+ "合成语音": "Síntesis de voz",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Ejemplo de formato de ruta de carpeta válida: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simplemente copie desde la barra de direcciones del administrador de archivos).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "En el futuro se admitirá la conversión de fonemas, la modificación manual de fonemas y la síntesis de voz paso a paso.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Ingrese el directorio donde se encuentran los audios después de la división. La ruta completa de los archivos de audio leídos = este directorio + nombre de archivo correspondiente en el archivo .list (no la ruta completa). Si se deja en blanco, se utilizará la ruta completa del archivo .list.",
+ "多语种混合": "Mezcla de varios idiomas",
+ "实际输入的参考文本:": "Texto de referencia realmente ingresado:",
+ "实际输入的目标文本(切句后):": "Texto objetivo realmente ingresado (después de dividir en frases):",
+ "实际输入的目标文本(每句):": "Texto objetivo realmente ingresado (por frase):",
+ "实际输入的目标文本:": "Texto objetivo realmente ingresado:",
+ "导出文件格式": "Formato de archivo de exportación",
+ "开启GPT训练": "Iniciar entrenamiento de GPT",
+ "开启SSL提取": "Habilitar la extracción SSL",
+ "开启SoVITS训练": "Iniciar entrenamiento de SoVITS",
+ "开启一键三连": "Habilitar un solo paso de formateo",
+ "开启文本获取": "Habilitar la obtención de texto",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Habilitar el modo sin texto de referencia. No llenar el texto de referencia también lo habilita.",
+ "开启离线批量ASR": "Habilitar ASR en lote fuera de línea",
+ "开启语义token提取": "Habilitar la extracción de tokens semánticos",
+ "开启语音切割": "Habilitar la división de voz",
+ "开启语音降噪": "Habilitar la reducción de ruido de voz",
+ "怎么切": "Cómo cortar",
+ "总训练轮数total_epoch": "Número total de épocas de entrenamiento",
+ "总训练轮数total_epoch,不建议太高": "Número total de épocas de entrenamiento, no se recomienda demasiado alto",
+ "打标工具WebUI已关闭": "Interfaz web de la herramienta de etiquetado cerrada",
+ "打标工具WebUI已开启": "Interfaz web de la herramienta de etiquetado iniciada",
+ "打标工具进程输出信息": "Información de salida del proceso de la herramienta de etiquetado",
+ "指定输出主人声文件夹": "Especificar carpeta de salida de voz principal",
+ "指定输出非主人声文件夹": "Especificar carpeta de salida de no voz principal",
+ "按中文句号。切": "Cortar según puntos en chino",
+ "按标点符号切": "Cortar según los signos de puntuación",
+ "按英文句号.切": "Cortar por puntos en inglés.",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Herramienta de división de texto. El resultado de la síntesis puede no ser bueno para textos demasiado largos, por lo que se recomienda dividirlos primero. La síntesis se realiza separando el texto según los saltos de línea y luego uniendo los fragmentos.",
+ "文本模块学习率权重": "Peso de la tasa de aprendizaje del módulo de texto",
+ "文本进程输出信息": "Información de salida del proceso de obtención de texto",
+ "施工中,请静候佳音": "En construcción, por favor espere pacientemente",
+ "日文": "Japonés",
+ "日英混合": "Mezcla de japonés e inglés",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "¿Guardar solo el último archivo ckpt para ahorrar espacio en disco?",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "¿Guardar el modelo final pequeño en la carpeta de pesos en cada punto de guardado?",
+ "是否开启TTS推理WebUI": "¿Habilitar la interfaz web de inferencia TTS?",
+ "是否开启UVR5-WebUI": "¿Habilitar UVR5-WebUI?",
+ "是否开启dpo训练选项(实验性)": "¿Habilitar la opción de entrenamiento dpo (experimental)?",
+ "是否开启打标WebUI": "¿Habilitar la interfaz web de etiquetado?",
+ "是否直接对上次合成结果调整语速。防止随机性。": "¿Si se ajusta directamente la velocidad de habla del último resultado de síntesis para evitar aleatoriedad?",
+ "显卡信息": "Información de la tarjeta gráfica",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Este software es de código abierto bajo la licencia MIT. El autor no tiene control sobre el software. El usuario que lo utilice o distribuya, y el que genere sonidos a partir del software, asume toda la responsabilidad.
Si no acepta estos términos, no puede utilizar ni hacer referencia a ningún código o archivo dentro del paquete de software. Consulte el archivo LICENSE en el directorio raíz para obtener más detalles.",
+ "模型": "Modelo",
+ "模型分为三类:": "Los modelos se dividen en tres categorías:",
+ "模型切换": "Cambio de modelo",
+ "每张显卡的batch_size": "Tamaño de lote por tarjeta gráfica",
+ "终止ASR进程": "Terminar el proceso ASR",
+ "终止GPT训练": "Detener entrenamiento de GPT",
+ "终止SSL提取进程": "Terminar el proceso de extracción SSL",
+ "终止SoVITS训练": "Detener entrenamiento de SoVITS",
+ "终止一键三连": "Terminar el proceso de un solo paso de formateo",
+ "终止文本获取进程": "Terminar el proceso de obtención de texto",
+ "终止语义token提取进程": "Terminar el proceso de extracción de tokens semánticos",
+ "终止语音切割": "Terminar la división de voz",
+ "终止语音降噪进程": "Terminar el proceso de reducción de ruido de voz",
+ "英文": "Inglés",
+ "语义token提取进程输出信息": "Información de salida del proceso de extracción de tokens semánticos",
+ "语速": "Velocidad de habla",
+ "语速调整,高为更快": "Ajustar la velocidad de habla, más alta para más rápido",
+ "语音切割进程输出信息": "Información de salida del proceso de división de voz",
+ "语音降噪进程输出信息": "Información de salida del proceso de reducción de ruido de voz",
+ "请上传3~10秒内参考音频,超过会报错!": "Por favor, suba un audio de referencia de entre 3 y 10 segundos, ¡más de eso causará un error!",
+ "请输入有效文本": "Por favor, introduzca un texto válido",
+ "转换": "Convertir",
+ "输入待处理音频文件夹路径": "Ingrese la ruta de la carpeta de audio a procesar",
+ "输入文件夹路径": "Ingrese la ruta de la carpeta",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "Debe haber archivos y carpetas que comiencen con 23456 en el directorio logs/nombre del experimento",
+ "输出信息": "Información de salida",
+ "输出文件夹路径": "Ruta de la carpeta de salida",
+ "输出的语音": "Audio de salida",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Seleccione el modelo almacenado en SoVITS_weights y GPT_weights después del entrenamiento. Uno de ellos es el modelo base, útil para experimentar con TTS de 5 segundos sin entrenamiento.",
+ "降噪结果输出文件夹": "Carpeta de salida de los resultados de reducción de ruido",
+ "降噪音频文件输入文件夹": "Carpeta de entrada de archivos de audio para reducción de ruido",
+ "需要合成的切分前文本": "Texto a sintetizar antes de la división",
+ "需要合成的文本": "Texto a sintetizar",
+ "需要合成的语种": "Idioma para la síntesis",
+ "音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
+ "预训练的GPT模型路径": "Ruta del modelo GPT preentrenado",
+ "预训练的SSL模型路径": "Ruta del modelo SSL preentrenado",
+ "预训练的SoVITS-D模型路径": "Ruta del modelo SoVITS-D preentrenado",
+ "预训练的SoVITS-G模型路径": "Ruta del modelo SoVITS-G preentrenado",
+ "预训练的中文BERT模型路径": "Ruta del modelo BERT en chino preentrenado"
+}
diff --git a/tools/i18n/locale/fr_FR.json b/tools/i18n/locale/fr_FR.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2eecd18ea69b199aaee6f52c645814d9836556f
--- /dev/null
+++ b/tools/i18n/locale/fr_FR.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1) MDX-Net (onnx_dereverb) : C'est le meilleur choix pour la réverbération à deux canaux, mais il ne peut pas éliminer la réverbération à un seul canal;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho : Supprime les effets de délai. Aggressive est plus exhaustif que Normal dans la suppression, DeReverb élimine également la réverbération, peut supprimer la réverbération monocanal, mais n'élimine pas complètement la réverbération de plaque à haute fréquence.",
+ "*GPT模型列表": "*Liste des modèles GPT",
+ "*SoVITS模型列表": "*Liste des modèles SoVITS",
+ "*实验/模型名": "*Nom de l'expérience/modèle",
+ "*文本标注文件": "*Fichier d'annotation de texte",
+ "*训练集音频文件目录": "*Répertoire des fichiers audio d'entraînement",
+ "*请上传并填写参考信息": "*Veuillez télécharger et remplir les informations de référence",
+ "*请填写需要合成的目标文本和语种模式": "*Veuillez saisir le texte cible à synthétiser et le mode de langue.",
+ ".list标注文件的路径": "Chemin du fichier d'annotation .list",
+ "0-前置数据集获取工具": "0-Outil de récupération de jeu de données préalable",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-Outil de séparation de la voix humaine et de l'accompagnement UVR5 & suppression de la réverbération et du retard",
+ "0b-语音切分工具": "0b-Outil de découpage vocal",
+ "0bb-语音降噪工具": "0bb-Outil de réduction du bruit vocal",
+ "0c-中文批量离线ASR工具": "0c-Outil chinois de transcription automatique hors ligne en masse",
+ "0d-语音文本校对标注工具": "0d-Outil de correction et d'annotation de texte vocal",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-Outil de formatage du jeu de données d'entraînement",
+ "1Aa-文本内容": "1Aa-Contenu du texte",
+ "1Aabc-训练集格式化一键三连": "1Aabc-Formatage en un clic du jeu de données d'entraînement",
+ "1Ab-SSL自监督特征提取": "1Ab-Extraction de caractéristiques auto-supervisée SSL",
+ "1Ac-语义token提取": "1Ac-Extraction de jetons sémantiques",
+ "1B-微调训练": "1B-Entraînement fin",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-Entraînement SoVITS. Les fichiers de modèle destinés au partage sont enregistrés sous SoVITS_weights.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-Entraînement GPT. Les fichiers de modèle destinés au partage sont enregistrés sous GPT_weights.",
+ "1C-推理": "1C-Inférence",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. Le temps de traitement du modèle DeEcho-DeReverb est presque le double de celui des deux autres modèles DeEcho;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Préserver les voix : Choisissez cette option pour les audio sans harmonie, car elle conserve mieux la voix principale par rapport au modèle HP5. Deux modèles intégrés, HP2 et HP3, sont disponibles. HP3 peut légèrement laisser passer l'accompagnement mais conserve la voix principale un peu mieux que HP2;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-Modification de la voix",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. Le modèle MDX-Net-Dereverb est assez lent;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Conserver uniquement la voix principale : Choisissez cette option pour les audio avec harmonie, car elle peut affaiblir la voix principale. Un modèle HP5 intégré est disponible;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. La configuration la plus propre que je recommande est d'utiliser d'abord MDX-Net, puis DeEcho-Aggressive.",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Modèle de suppression de réverbération et de retard (par FoxJoy) :",
+ "ASR 模型": "Modèle ASR",
+ "ASR 模型尺寸": "Taille du modèle ASR",
+ "数据类型精度": "précision du type de données",
+ "ASR 语言设置": "Paramètres de langue ASR",
+ "ASR进程输出信息": "Informations de processus ASR",
+ "GPT模型列表": "Liste des modèles GPT",
+ "GPT训练进程输出信息": "Informations de processus d'entraînement GPT",
+ "GPU卡号,只能填1个整数": "Numéro de carte GPU, ne peut contenir qu'un seul entier",
+ "GPU卡号以-分割,每个卡号一个进程": "Numéro de carte GPU séparé par des tirets, un processus par numéro de carte",
+ "SSL进程输出信息": "Informations de processus SSL",
+ "SoVITS模型列表": "Liste des modèles SoVITS",
+ "SoVITS训练进程输出信息": "Informations de processus d'entraînement SoVITS",
+ "TTS推理WebUI进程输出信息": "Informations de processus de l'interface Web d'inférence TTS",
+ "TTS推理进程已关闭": "Le processus d'inférence TTS est terminé",
+ "TTS推理进程已开启": "Le processus d'inférence TTS est en cours",
+ "UVR5已关闭": "UVR5 est désactivé",
+ "UVR5已开启": "UVR5 est activé",
+ "UVR5进程输出信息": "Informations de processus UVR5",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion d'audio normalisé mélangé",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "Paramètres d'échantillonnage de GPT (ne pas mettre trop bas lorsqu'il n'y a pas de texte de référence. Utilisez les valeurs par défaut si vous n'êtes pas sûr):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: comment calculer la courbe de volume, plus petit pour une précision plus élevée mais une charge de calcul plus élevée (ce n'est pas une meilleure précision)",
+ "max:归一化后最大值多少": "max: valeur maximale après normalisation",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept: durée maximale de silence après la coupe",
+ "min_interval:最短切割间隔": "min_interval: intervalle de coupe minimum",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:longueur minimale de chaque segment ; si le premier segment est trop court, il est concaténé avec les segments suivants jusqu'à ce que la longueur dépasse cette valeur",
+ "temperature": "température",
+ "threshold:音量小于这个值视作静音的备选切割点": "seuil: le volume inférieur à cette valeur est considéré comme un point de coupe silencieux alternatif",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "Informations du processus de formatage en un clic",
+ "不切": "Pas de découpe",
+ "中文": "Chinois",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentation du tutoriel en chinois:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Mélange de chinois et d'anglais",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "Également possible d'entrer en lot des fichiers audio, au choix, privilégiez la lecture du dossier",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Traitement par lot de séparation voix-accompagnement en utilisant le modèle UVR5.",
+ "人声提取激进程度": "Degré d'extraction des voix",
+ "伴奏人声分离&去混响&去回声": "Séparation de la voix et de l'accompagnement, suppression de la réverbération et de l'écho",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Il est recommandé d'utiliser un GPT finement ajusté en mode sans texte de référence. Si vous ne comprenez pas ce que dit l'audio de référence (et ne savez pas quoi écrire), vous pouvez activer cette option ; une fois activée, le texte de référence saisi sera ignoré.",
+ "保存频率save_every_epoch": "Fréquence de sauvegarde (sauvegarder à chaque époque)",
+ "凑50字一切": "Regrouper par tranches de 50 caractères",
+ "凑四句一切": "Regrouper par tranches de quatre phrases",
+ "切分后文本": "Texte après découpage",
+ "切分后的子音频的输出根目录": "Répertoire racine de sortie des sous-audios après découpage",
+ "切割使用的进程数": "Nombre de processus utilisés pour le découpage",
+ "刷新模型路径": "Actualiser le chemin du modèle",
+ "前端处理后的文本(每句):": "Texte après traitement frontal (par phrase):",
+ "去混响/去延迟,附:": "Suppression de la réverbération / suppression du retard, ci-joint:",
+ "参考音频在3~10秒范围外,请更换!": "Veuillez remplacer l'audio de référence si sa durée est en dehors de la plage de 3 à 10 secondes!",
+ "参考音频的文本": "Texte de l'audio de référence",
+ "参考音频的语种": "Langue de l'audio de référence",
+ "合成语音": "Synthèse vocale",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemple de format de chemin de dossier valide : E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copiez-le depuis la barre d'adresse de l'explorateur de fichiers).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "Des fonctionnalités futures incluront la conversion en phonèmes, la modification manuelle des phonèmes et l'exécution par étapes de la synthèse vocale.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Veuillez indiquer le répertoire contenant les audio découpés ! Le chemin complet du fichier audio à lire = ce répertoire - nom du fichier correspondant à l'onde dans le fichier .list (pas le chemin complet). Si laissé vide, le chemin absolu dans le fichier .list sera utilisé.",
+ "多语种混合": "Mélange multilingue",
+ "实际输入的参考文本:": "Texte de référence réellement saisi:",
+ "实际输入的目标文本(切句后):": "Texte cible réellement saisi (après découpage):",
+ "实际输入的目标文本(每句):": "Texte cible réellement saisi (par phrase):",
+ "实际输入的目标文本:": "Texte cible réellement saisi:",
+ "导出文件格式": "Format d'exportation du fichier",
+ "开启GPT训练": "Activer l'entraînement GPT",
+ "开启SSL提取": "Activer l'extraction SSL",
+ "开启SoVITS训练": "Activer l'entraînement SoVITS",
+ "开启一键三连": "Activer le formatage en un clic",
+ "开启文本获取": "Activer l'extraction de texte",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Activer le mode sans texte de référence. Laisser le texte de référence vide équivaut également à activer le mode.",
+ "开启离线批量ASR": "Activer la transcription automatique hors ligne en masse",
+ "开启语义token提取": "Activer l'extraction de jetons sémantiques",
+ "开启语音切割": "Activer le découpage vocal",
+ "开启语音降噪": "Activer la réduction de bruit vocal",
+ "怎么切": "Comment découper",
+ "总训练轮数total_epoch": "Nombre total d'époques d'entraînement",
+ "总训练轮数total_epoch,不建议太高": "Nombre total d'époques d'entraînement, pas recommandé d'être trop élevé",
+ "打标工具WebUI已关闭": "L'interface Web de l'outil d'annotation est terminée",
+ "打标工具WebUI已开启": "L'interface Web de l'outil d'annotation est en cours",
+ "打标工具进程输出信息": "Informations de processus de l'outil d'annotation",
+ "指定输出主人声文件夹": "Spécifier le dossier de sortie pour la voix principale",
+ "指定输出非主人声文件夹": "Spécifier le dossier de sortie pour la non-voix principale",
+ "按中文句号。切": "Couper selon les points en chinois.",
+ "按标点符号切": "Couper selon les signes de ponctuation",
+ "按英文句号.切": "Couper selon les points en anglais",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Outil de découpage de texte. Un texte trop long peut ne pas donner un bon résultat, donc il est recommandé de le couper d'abord s'il est trop long. La synthèse se fera en séparant le texte par les sauts de ligne puis en les assemblant.",
+ "文本模块学习率权重": "Poids du taux d'apprentissage du module de texte",
+ "文本进程输出信息": "Informations de processus de texte",
+ "施工中,请静候佳音": "En construction, veuillez attendre patiemment",
+ "日文": "Japonais",
+ "日英混合": "Mélange Japonais-Anglais",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "Sauvegarder uniquement le dernier fichier ckpt pour économiser de l'espace disque",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "Sauvegarder le petit modèle final dans le dossier weights à chaque point de sauvegarde",
+ "是否开启TTS推理WebUI": "Activer l'interface Web d'inférence TTS",
+ "是否开启UVR5-WebUI": "Activer UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "Activer l'option d'entraînement DPO (expérimental)",
+ "是否开启打标WebUI": "Activer l'interface Web d'annotation",
+ "是否直接对上次合成结果调整语速。防止随机性。": "Est-ce qu'on ajuste directement la vitesse de parole du dernier résultat de synthèse pour éviter l'aléatoire ?",
+ "显卡信息": "Informations sur la carte graphique",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Ce logiciel est open source sous la licence MIT. L'auteur n'a aucun contrôle sur le logiciel. Les utilisateurs et les diffuseurs du son exporté par le logiciel en assument l'entière responsabilité.
Si vous n'acceptez pas ces termes, vous ne pouvez ni utiliser ni citer aucun code ou fichier à l'intérieur du package. Voir LICENSE dans le répertoire racine pour plus de détails.",
+ "模型": "Modèle",
+ "模型分为三类:": "Les modèles sont classés en trois catégories:",
+ "模型切换": "Changement de modèle",
+ "每张显卡的batch_size": "Taille de lot par carte graphique",
+ "终止ASR进程": "Arrêter le processus ASR",
+ "终止GPT训练": "Arrêter l'entraînement GPT",
+ "终止SSL提取进程": "Arrêter le processus d'extraction SSL",
+ "终止SoVITS训练": "Arrêter l'entraînement SoVITS",
+ "终止一键三连": "Arrêter le formatage en un clic",
+ "终止文本获取进程": "Arrêter le processus d'extraction de texte",
+ "终止语义token提取进程": "Arrêter le processus d'extraction de jetons sémantiques",
+ "终止语音切割": "Arrêter le découpage vocal",
+ "终止语音降噪进程": "Arrêter le processus de réduction du bruit vocal",
+ "英文": "Anglais",
+ "语义token提取进程输出信息": "Informations de processus d'extraction de jetons sémantiques",
+ "语速": "Débit de parole",
+ "语速调整,高为更快": "Ajuster la vitesse de parole, plus élevée pour plus rapide",
+ "语音切割进程输出信息": "Informations de processus de découpage vocal",
+ "语音降噪进程输出信息": "Informations de sortie du processus de réduction du bruit vocal",
+ "请上传3~10秒内参考音频,超过会报错!": "Veuillez télécharger une référence audio de 3 à 10 secondes ; les fichiers plus longs généreront une erreur!",
+ "请输入有效文本": "Veuillez entrer un texte valide",
+ "转换": "Conversion",
+ "输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter",
+ "输入文件夹路径": "Chemin du dossier à entrer",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "Les fichiers et dossiers commençant par 23456 devraient être présents dans le répertoire logs/nom de l'expérience",
+ "输出信息": "Informations de sortie",
+ "输出文件夹路径": "Chemin du dossier de sortie",
+ "输出的语音": "Audio de sortie",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choisissez le modèle entraîné stocké sous SoVITS_weights et GPT_weights. Par défaut, l'un d'eux est un modèle de base pour l'expérience de TTS Zero Shot de 5 secondes.",
+ "降噪结果输出文件夹": "Dossier de sortie des résultats de réduction du bruit",
+ "降噪音频文件输入文件夹": "Dossier d'entrée des fichiers audio de réduction du bruit",
+ "需要合成的切分前文本": "Texte à synthétiser avant découpage",
+ "需要合成的文本": "Texte à synthétiser",
+ "需要合成的语种": "Langue de synthèse requise",
+ "音频自动切分输入路径,可文件可文件夹": "Chemin d'entrée automatique de découpage audio, peut être un fichier ou un dossier",
+ "预训练的GPT模型路径": "Chemin du modèle GPT pré-entraîné",
+ "预训练的SSL模型路径": "Chemin du modèle SSL pré-entraîné",
+ "预训练的SoVITS-D模型路径": "Chemin du modèle SoVITS-D pré-entraîné",
+ "预训练的SoVITS-G模型路径": "Chemin du modèle SoVITS-G pré-entraîné",
+ "预训练的中文BERT模型路径": "Chemin du modèle BERT chinois pré-entraîné"
+}
diff --git a/tools/i18n/locale/it_IT.json b/tools/i18n/locale/it_IT.json
new file mode 100644
index 0000000000000000000000000000000000000000..60569d87527ae9aeca9ee87dac1509038d79eb19
--- /dev/null
+++ b/tools/i18n/locale/it_IT.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): È la scelta migliore per la riverberazione a due canali, ma non può rimuovere la riverberazione a canale singolo;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Rimuove gli effetti di ritardo. Aggressive è più completo di Normal nella rimozione, DeReverb rimuove ulteriormente la riverberazione, può rimuovere la riverberazione a canale singolo, ma non rimuove completamente la riverberazione a piastra ad alta frequenza.",
+ "*GPT模型列表": "*Lista dei modelli GPT",
+ "*SoVITS模型列表": "*Lista dei modelli SoVITS",
+ "*实验/模型名": "*Nome dell'esperimento/modello",
+ "*文本标注文件": "*File di annotazione del testo",
+ "*训练集音频文件目录": "*Directory dei file audio del set di addestramento",
+ "*请上传并填写参考信息": "*Carica e compila le informazioni di riferimento",
+ "*请填写需要合成的目标文本和语种模式": "*Si prega di inserire il testo di destinazione da sintetizzare e la modalità lingua",
+ ".list标注文件的路径": "Percorso del file di annotazione .list",
+ "0-前置数据集获取工具": "0-Strumento di acquisizione del dataset preliminare",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-Strumento di separazione voce e accompagnamento UVR5 & Rimozione riverbero e ritardo",
+ "0b-语音切分工具": "0b-Strumento di segmentazione vocale",
+ "0bb-语音降噪工具": "0bb-Strumento di riduzione del rumore vocale",
+ "0c-中文批量离线ASR工具": "0c-Strumento di ASR offline batch in cinese",
+ "0d-语音文本校对标注工具": "0d-Strumento di correzione e annotazione testo vocale",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-Strumento di formattazione del set di addestramento",
+ "1Aa-文本内容": "1Aa-Contenuto del testo",
+ "1Aabc-训练集格式化一键三连": "1Aabc-Strumento di formattazione del set di addestramento con tre passaggi",
+ "1Ab-SSL自监督特征提取": "1Ab-Estrazione di caratteristiche auto-supervisionata SSL",
+ "1Ac-语义token提取": "1Ac-Estrazione del token semantico",
+ "1B-微调训练": "1B-Allenamento di affinamento",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-Allenamento di SoVITS. I file del modello destinati alla condivisione sono salvati in SoVITS_weights.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-Allenamento di GPT. I file del modello destinati alla condivisione sono salvati in GPT_weights.",
+ "1C-推理": "1C-Inferenza",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. Il tempo di elaborazione del modello DeEcho-DeReverb è quasi il doppio di quello degli altri due modelli DeEcho;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Conserva la voce principale: scegli questa opzione per audio senza armonie, poiché conserva meglio la voce principale rispetto al modello HP5. Include due modelli integrati, HP2 e HP3. HP3 potrebbe far passare leggermente l'accompagnamento ma conserva meglio la voce principale rispetto a HP2;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-Voce modificata",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. Il modello MDX-Net-Dereverb è piuttosto lento;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Solo conserva la voce principale: scegli questa opzione per audio con armonie, poiché potrebbe indebolire la voce principale. Include un modello HP5;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. La configurazione più pulita consigliata è MDX-Net seguito da DeEcho-Aggressive.",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Modello per rimuovere la riverberazione e il ritardo (by FoxJoy):",
+ "ASR 模型": "Modello ASR",
+ "ASR 模型尺寸": "Dimensioni del modello ASR",
+ "数据类型精度": "precisione del tipo di dati",
+ "ASR 语言设置": "Impostazioni linguistiche ASR",
+ "ASR进程输出信息": "Informazioni sull'output del processo ASR",
+ "GPT模型列表": "Elenco dei modelli GPT",
+ "GPT训练进程输出信息": "Informazioni sull'output del processo di allenamento di GPT",
+ "GPU卡号,只能填1个整数": "Numero della scheda grafica, può essere inserito solo un numero intero",
+ "GPU卡号以-分割,每个卡号一个进程": "Numero di GPU separati da '-'; ogni numero corrisponde a un processo",
+ "SSL进程输出信息": "Informazioni sull'output del processo SSL",
+ "SoVITS模型列表": "Elenco dei modelli SoVITS",
+ "SoVITS训练进程输出信息": "Informazioni sull'output del processo di allenamento di SoVITS",
+ "TTS推理WebUI进程输出信息": "Informazioni sull'output del processo dell'interfaccia utente Web per l'inferenza TTS",
+ "TTS推理进程已关闭": "Il processo di inferenza TTS è stato chiuso",
+ "TTS推理进程已开启": "Il processo di inferenza TTS è stato avviato",
+ "UVR5已关闭": "UVR5 è disattivato",
+ "UVR5已开启": "UVR5 è attivato",
+ "UVR5进程输出信息": "Informazioni sull'output del processo UVR5",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Quanta proporzione dell'audio normalizzato deve essere miscelata",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "Parametri di campionamento di GPT (non troppo bassi quando non c'è testo di riferimento. Utilizzare i valori predefiniti in caso di incertezza):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: Come calcolare la curva del volume. Più piccolo è, maggiore è la precisione ma aumenta la complessità computazionale (non significa che una maggiore precisione dà risultati migliori)",
+ "max:归一化后最大值多少": "max: Massimo valore dopo la normalizzazione",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept: Massima durata del silenzio dopo il taglio",
+ "min_interval:最短切割间隔": "min_interval: Intervallo minimo di taglio",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: Lunghezza minima per segmento; se il primo segmento è troppo corto, sarà unito ai segmenti successivi fino a superare questo valore",
+ "temperature": "temperatura",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold: Punto di taglio alternativo considerato silenzioso se il volume è inferiore a questo valore",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "Informazioni sull'output del processo di formattazione con tre passaggi",
+ "不切": "Nessuna suddivisione",
+ "中文": "Cinese",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentazione del tutorial in cinese:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Cinese e inglese misti",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "È possibile anche inserire file audio in batch, una delle due opzioni, con priorità alla lettura della cartella",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Separazione voce-accompagnamento in batch, utilizza il modello UVR5.",
+ "人声提取激进程度": "Grado di aggressività dell'estrazione vocale",
+ "伴奏人声分离&去混响&去回声": "Separazione tra accompagnamento e voce & Rimozione del riverbero & Rimozione dell'eco",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Si consiglia di utilizzare GPT fine-tuned quando si utilizza la modalità senza testo di riferimento. Se non si riesce a capire cosa dice l'audio di riferimento (e non si sa cosa scrivere), è possibile abilitare questa opzione, ignorando il testo di riferimento inserito.",
+ "保存频率save_every_epoch": "Frequenza di salvataggio ogni epoca",
+ "凑50字一切": "Suddividere ogni 50 caratteri",
+ "凑四句一切": "Suddividere ogni quattro frasi",
+ "切分后文本": "Testo dopo il taglio",
+ "切分后的子音频的输出根目录": "Directory radice di output per gli audio segmentati",
+ "切割使用的进程数": "Numero di processi utilizzati per il taglio",
+ "刷新模型路径": "Aggiorna il percorso del modello",
+ "前端处理后的文本(每句):": "Testo elaborato dal front-end (per frase):",
+ "去混响/去延迟,附:": "Rimozione della riverberazione/ritardo, allegato:",
+ "参考音频在3~10秒范围外,请更换!": "L'audio di riferimento è al di fuori dell'intervallo di 3-10 secondi. Si prega di cambiarlo!",
+ "参考音频的文本": "Testo dell'audio di riferimento",
+ "参考音频的语种": "Lingua dell'audio di riferimento",
+ "合成语音": "Sintesi vocale",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Formato di percorso della cartella valido: E:\\codes\\py39\\vits_vc_gpu\\Esempio di test di BaiLuShuangHua (copiare direttamente dalla barra degli indirizzi del gestore file).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "In futuro, sarà supportata la conversione in fonemi, la modifica manuale dei fonemi e l'esecuzione passo-passo della sintesi vocale.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Inserisci la directory dell'audio segmentato! Il percorso completo del file audio letto = questa directory - unione del nome del file corrispondente alle forme d'onda nel file .list (non il percorso completo). Se lasciato vuoto, verrà utilizzato il percorso assoluto nel file .list.",
+ "多语种混合": "Mix multilingue",
+ "实际输入的参考文本:": "Testo di riferimento effettivamente inserito:",
+ "实际输入的目标文本(切句后):": "Testo di destinazione effettivamente inserito (dopo il taglio delle frasi):",
+ "实际输入的目标文本(每句):": "Testo di destinazione effettivamente inserito (per frase):",
+ "实际输入的目标文本:": "Testo di destinazione effettivamente inserito:",
+ "导出文件格式": "Formato di esportazione del file",
+ "开启GPT训练": "Attivare l'allenamento di GPT",
+ "开启SSL提取": "Attivare l'estrazione SSL",
+ "开启SoVITS训练": "Attivare l'allenamento di SoVITS",
+ "开启一键三连": "Attivare la formattazione con tre passaggi",
+ "开启文本获取": "Attivare l'estrazione del testo",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Attivare la modalità senza testo di riferimento. Anche se non inserisci un testo di riferimento, la modalità verrà attivata.",
+ "开启离线批量ASR": "Attivare ASR offline batch",
+ "开启语义token提取": "Attivare l'estrazione del token semantico",
+ "开启语音切割": "Attivare la segmentazione vocale",
+ "开启语音降噪": "Attivare la riduzione del rumore vocale",
+ "怎么切": "Come tagliare",
+ "总训练轮数total_epoch": "Numero totale di epoche di addestramento",
+ "总训练轮数total_epoch,不建议太高": "Numero totale di epoche di addestramento, non raccomandato troppo alto",
+ "打标工具WebUI已关闭": "L'interfaccia utente Web dello strumento di annotazione è stata chiusa",
+ "打标工具WebUI已开启": "L'interfaccia utente Web dello strumento di annotazione è stata avviata",
+ "打标工具进程输出信息": "Informazioni sull'output del processo di annotazione",
+ "指定输出主人声文件夹": "Specifica la cartella di output per la voce principale",
+ "指定输出非主人声文件夹": "Specifica la cartella di output per la non voce principale",
+ "按中文句号。切": "Taglia secondo il punto cinese.",
+ "按标点符号切": "Taglia secondo i segni di punteggiatura",
+ "按英文句号.切": "Taglia secondo il punto inglese",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Strumento di divisione del testo. I testi troppo lunghi potrebbero non avere un buon effetto di sintesi, quindi è consigliabile dividerli prima della sintesi. La sintesi verrà separata in base ai ritorni a capo nel testo e successivamente ricomposta.",
+ "文本模块学习率权重": "Peso del tasso di apprendimento del modulo di testo",
+ "文本进程输出信息": "Informazioni sull'output del processo di estrazione del testo",
+ "施工中,请静候佳音": "In costruzione, attendi pazientemente le buone notizie",
+ "日文": "Giapponese",
+ "日英混合": "Mix giapponese e inglese",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "Salvare solo il file ckpt più recente per risparmiare spazio su disco",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "Salvare il modello finale più piccolo nella cartella weights ad ogni punto di salvataggio",
+ "是否开启TTS推理WebUI": "Attivare l'interfaccia utente Web per l'inferenza TTS",
+ "是否开启UVR5-WebUI": "Attivare UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "Attivare l'opzione di addestramento DPO (sperimentale)",
+ "是否开启打标WebUI": "Attivare l'interfaccia utente Web di annotazione",
+ "是否直接对上次合成结果调整语速。防止随机性。": "Se regolare direttamente la velocità della voce dell'ultimo risultato di sintesi per evitare casualità.",
+ "显卡信息": "Informazioni sulla scheda grafica",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Questo software è open source con licenza MIT. L'autore non ha alcun controllo sul software. L'utente che utilizza il software o diffonde i suoni derivati dal software ne è responsabile.
 Se non accetti questi termini, non puoi utilizzare o citare alcun codice o file all'interno del pacchetto software. Vedi LICENSE nella directory principale per i dettagli.",
+ "模型": "Modello",
+ "模型分为三类:": "I modelli sono divisi in tre categorie:",
+ "模型切换": "Cambio del modello",
+ "每张显卡的batch_size": "Batch size per ogni scheda grafica",
+ "终止ASR进程": "Terminare il processo ASR",
+ "终止GPT训练": "Terminare l'allenamento di GPT",
+ "终止SSL提取进程": "Terminare il processo di estrazione SSL",
+ "终止SoVITS训练": "Terminare l'allenamento di SoVITS",
+ "终止一键三连": "Terminare la formattazione con tre passaggi",
+ "终止文本获取进程": "Terminare il processo di estrazione del testo",
+ "终止语义token提取进程": "Terminare il processo di estrazione del token semantico",
+ "终止语音切割": "Terminare la segmentazione vocale",
+ "终止语音降噪进程": "Termina il processo di riduzione del rumore vocale",
+ "英文": "Inglese",
+ "语义token提取进程输出信息": "Informazioni sull'output del processo di estrazione del token semantico",
+ "语速": "Velocità della voce",
+ "语速调整,高为更快": "Regolare la velocità della voce, più alta per più veloce",
+ "语音切割进程输出信息": "Informazioni sull'output del processo di segmentazione vocale",
+ "语音降噪进程输出信息": "Informazioni sull'output del processo di riduzione del rumore vocale",
+ "请上传3~10秒内参考音频,超过会报错!": "Carica un audio di riferimento della durata compresa tra 3 e 10 secondi. Superiore a questo, verrà generato un errore!",
+ "请输入有效文本": "Inserisci un testo valido",
+ "转换": "Converti",
+ "输入待处理音频文件夹路径": "Inserisci il percorso della cartella dei file audio da elaborare",
+ "输入文件夹路径": "Inserisci il percorso della cartella",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "Nella cartella logs/nome dell'esperimento dovrebbero esserci file e cartelle che iniziano con 23456",
+ "输出信息": "Informazioni di output",
+ "输出文件夹路径": "Percorso della cartella di output",
+ "输出的语音": "Audio di output",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Scegli il modello salvato in SoVITS_weights e GPT_weights dopo l'addestramento. Uno di default è il modello di base, utilizzato per l'esperienza di Zero Shot TTS in 5 secondi.",
+ "降噪结果输出文件夹": "Cartella di output dei risultati di riduzione del rumore",
+ "降噪音频文件输入文件夹": "Cartella di input dei file audio per la riduzione del rumore",
+ "需要合成的切分前文本": "Testo da sintetizzare prima del taglio",
+ "需要合成的文本": "Testo da sintetizzare",
+ "需要合成的语种": "Lingua da sintetizzare",
+ "音频自动切分输入路径,可文件可文件夹": "Percorso di input per la segmentazione automatica dell'audio, può essere un file o una cartella",
+ "预训练的GPT模型路径": "Percorso del modello preaddestrato GPT",
+ "预训练的SSL模型路径": "Percorso del modello SSL preaddestrato",
+ "预训练的SoVITS-D模型路径": "Percorso del modello preaddestrato SoVITS-D",
+ "预训练的SoVITS-G模型路径": "Percorso del modello preaddestrato SoVITS-G",
+ "预训练的中文BERT模型路径": "Percorso del modello BERT cinese preaddestrato"
+}
diff --git a/tools/i18n/locale/ja_JP.json b/tools/i18n/locale/ja_JP.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb6370337f299cdd7793ef8e1c9f831dfc7d20f8
--- /dev/null
+++ b/tools/i18n/locale/ja_JP.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):二重チャンネルのリバーブに最適な選択ですが、単一チャンネルのリバーブは除去できません;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:遅延効果を除去します。AggressiveはNormalよりも徹底的に除去し、DeReverbは追加でリバーブを除去し、モノラルリバーブを除去できますが、高周波数のプレートリバーブは完全には除去できません。",
+ "*GPT模型列表": "*GPTモデルリスト",
+ "*SoVITS模型列表": "*SoVITSモデルリスト",
+ "*实验/模型名": "*実験/モデル名",
+ "*文本标注文件": "*テキスト注釈ファイル",
+ "*训练集音频文件目录": "*トレーニングデータのオーディオファイルディレクトリ",
+ "*请上传并填写参考信息": "*参照情報をアップロードして記入してください",
+ "*请填写需要合成的目标文本和语种模式": "*合成対象テキストと言語モードを入力してください",
+ ".list标注文件的路径": ".listアノテーションファイルのパス",
+ "0-前置数据集获取工具": "0-データセット取得ツールの事前処理",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5ボーカルアカンパニメント分離&リバーブおよびディレイ除去ツール",
+ "0b-语音切分工具": "0b-音声分割ツール",
+ "0bb-语音降噪工具": "0bb-音声ノイズ除去ツール",
+ "0c-中文批量离线ASR工具": "0c-中国語バッチオフラインASRツール",
+ "0d-语音文本校对标注工具": "0d-音声テキストの校正アノテーションツール",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-トレーニングデータのフォーマットツール",
+ "1Aa-文本内容": "1Aa-テキストの内容",
+ "1Aabc-训练集格式化一键三连": "1Aabc-トレーニングデータのフォーマットワンクリック三連",
+ "1Ab-SSL自监督特征提取": "1Ab-SSLセルフスーパーバイズ特徴抽出",
+ "1Ac-语义token提取": "1Ac-セマンティックトークン抽出",
+ "1B-微调训练": "1B-ファインチューニングトレーニング",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITSトレーニング。共有用のモデルファイルはSoVITS_weightsディレクトリに出力されます。",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPTトレーニング。共有用のモデルファイルはGPT_weightsディレクトリに出力されます。",
+ "1C-推理": "1C-推論",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1、DeEcho-DeReverbモデルの処理時間は、他の2つのDeEchoモデルのほぼ2倍です;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1、主音を保持: ハーモニーなしの音声にはこのオプションを選択し、HP5よりも主音の保持が優れています。HP2とHP3の2つのモデルが内蔵されており、HP3はわずかに伴奏を漏らす可能性がありますが、HP2よりも主音の保持がわずかに良いです;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-ボイスチェンジャー",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverbモデルはかなり遅いです;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2、主音のみを保持: ハーモニー付きの音声にはこのオプションを選択し、主音が弱くなる可能性があります。HP5モデルが1つ内蔵されています;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3、最もクリーンな設定は、MDX-Netの後にDeEcho-Aggressiveを使用することをお勧めします。",
+ "3、去混响、去延迟模型(by FoxJoy):": "3、リバーブ除去と遅延除去モデル(by FoxJoy):",
+ "ASR 模型": "ASR モデル",
+ "ASR 模型尺寸": "ASRモデルサイズ",
+ "数据类型精度": "データ型の精度",
+ "ASR 语言设置": "ASR 言語設定",
+ "ASR进程输出信息": "ASRプロセスの出力情報",
+ "GPT模型列表": "GPTモデルリスト",
+ "GPT训练进程输出信息": "GPTトレーニングプロセスの出力情報",
+ "GPU卡号,只能填1个整数": "GPU番号、1つの整数しか入力できません",
+ "GPU卡号以-分割,每个卡号一个进程": "GPUカード番号はハイフンで区切り、各カード番号ごとに1つのプロセスが実行されます",
+ "SSL进程输出信息": "SSLプロセスの出力情報",
+ "SoVITS模型列表": "SoVITSモデルリスト",
+ "SoVITS训练进程输出信息": "SoVITSトレーニングプロセスの出力情報",
+ "TTS推理WebUI进程输出信息": "TTS推論WebUIプロセスの出力情報",
+ "TTS推理进程已关闭": "TTS推論プロセスが終了しました",
+ "TTS推理进程已开启": "TTS推論プロセスが開始されました",
+ "UVR5已关闭": "UVR5がオフになっています",
+ "UVR5已开启": "UVR5がオンになっています",
+ "UVR5进程输出信息": "UVR5プロセスの出力情報",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:正規化後のオーディオが入る割合",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT サンプリングパラメーター(参照テキストがない場合はあまり低くしないでください。わからない場合はデフォルトを使用してください):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: 音量曲線の計算方法、小さいほど精度が高くなりますが、計算量が増加します(精度が高いほど必ずしも効果が良いわけではありません)",
+ "max:归一化后最大值多少": "max:正規化後の最大値",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:切り終えた後、最大でどれだけ静かにするか",
+ "min_interval:最短切割间隔": "min_interval:最短カット間隔",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:各セグメントの最小長さ。最初のセグメントが短すぎる場合、連続して後続のセグメントに接続され、この値を超えるまで続きます。",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "閾値:この値未満の音量は静音と見なされ、代替のカットポイントとして扱われます",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "ワンクリック三連プロセスの出力情報",
+ "不切": "切らない",
+ "中文": "中国語",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中国語チュートリアルドキュメント:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "中英混合",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "複数のオーディオファイルもインポートできます。フォルダパスが存在する場合、この入力は無視されます。",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "人声と伴奏の分離をバッチ処理で行い、UVR5モデルを使用します。",
+ "人声提取激进程度": "人声抽出の積極性",
+ "伴奏人声分离&去混响&去回声": "ボーカル/伴奏の分離と残響の除去",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "参考テキストなしモードを使用する場合は、微調整されたGPTの使用をお勧めします。参考音声が聞き取れない場合(何を書けば良いかわからない場合)は、有効にすると、入力した参考テキストを無視します。",
+ "保存频率save_every_epoch": "保存頻度save_every_epoch",
+ "凑50字一切": "50文字ずつカット",
+ "凑四句一切": "4つの文で埋める",
+ "切分后文本": "分割後のテキスト",
+ "切分后的子音频的输出根目录": "分割後のサブオーディオの出力ルートディレクトリ",
+ "切割使用的进程数": "分割に使用されるプロセス数",
+ "刷新模型路径": "モデルのパスを更新",
+ "前端处理后的文本(每句):": "フロントエンド処理後のテキスト(文ごと):",
+ "去混响/去延迟,附:": "残響除去/遅延除去、附:",
+ "参考音频在3~10秒范围外,请更换!": "参照音声が3~10秒の範囲外です。別の音声に変更してください!",
+ "参考音频的文本": "参照オーディオのテキスト",
+ "参考音频的语种": "参照オーディオの言語",
+ "合成语音": "推論を開始",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "適切なフォルダパスの例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华テストサンプル(ファイルマネージャのアドレスバーからコピーしてください)。",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "今後、音素変換、手動音素修正、音声合成の段階的実行をサポートする予定です。",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "切断後の音声ファイルが格納されているディレクトリを入力してください!読み取り対象の音声ファイルの完全パス = このディレクトリ - 結合 - listファイル内の波形に対応するファイル名(完全パスではありません)。空白の場合、.listファイル内の絶対完全パスを使用します。",
+ "多语种混合": "多言語混合",
+ "实际输入的参考文本:": "実際に入力された参照テキスト:",
+ "实际输入的目标文本(切句后):": "実際に入力された目標テキスト(文分割後):",
+ "实际输入的目标文本(每句):": "実際に入力された目標テキスト(文ごと):",
+ "实际输入的目标文本:": "実際に入力された目標テキスト:",
+ "导出文件格式": "エクスポートファイル形式",
+ "开启GPT训练": "GPTトレーニングを開始",
+ "开启SSL提取": "SSL抽出を開始",
+ "开启SoVITS训练": "SoVITSトレーニングを開始",
+ "开启一键三连": "ワンクリック三連を開始",
+ "开启文本获取": "テキストの取得を開始",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "参照テキストなしモードを有効にします。参照テキストを入力しない場合も同様に有効になります。",
+ "开启离线批量ASR": "オフラインバッチASRを開始",
+ "开启语义token提取": "セマンティックトークン抽出を開始",
+ "开启语音切割": "音声の分割を開始",
+ "开启语音降噪": "音声ノイズ除去を有効にする",
+ "怎么切": "どうやって切るか",
+ "总训练轮数total_epoch": "総トレーニングエポック数total_epoch",
+ "总训练轮数total_epoch,不建议太高": "総トレーニングエポック数total_epoch、高すぎないようにお勧めします",
+ "打标工具WebUI已关闭": "校正ツールWebUIが終了しました",
+ "打标工具WebUI已开启": "校正ツールWebUIが開始されました",
+ "打标工具进程输出信息": "アノテーションツールプロセスの出力情報",
+ "指定输出主人声文件夹": "ボーカルの出力フォルダを指定:",
+ "指定输出非主人声文件夹": "伴奏の出力フォルダを指定:",
+ "按中文句号。切": "中国語の句点でカット",
+ "按标点符号切": "句読点で分割",
+ "按英文句号.切": "英文のピリオドで切ってください",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "テキストスライサーツール。長文を変換すると効果が不安定になる可能性があるため、長文の場合は事前に切り分けることをお勧めします。推論時には、テキストを個別に推論し、それを組み合わせて再構築します。",
+ "文本模块学习率权重": "テキストモジュールの学習率の重み",
+ "文本进程输出信息": "テキストプロセスの出力情報",
+ "施工中,请静候佳音": "施工中、お待ちください",
+ "日文": "日本語",
+ "日英混合": "日英混合",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "最新のckptファイルのみを保存してディスクスペースを節約するかどうか",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "各保存時間点で最終的な小さなモデルをweightsフォルダに保存するかどうか",
+ "是否开启TTS推理WebUI": "TTS推論WebUIを開く",
+ "是否开启UVR5-WebUI": "UVR5-WebUIをオンにしますか",
+ "是否开启dpo训练选项(实验性)": "DPOトレーニングオプションを有効にするかどうか(実験的)",
+ "是否开启打标WebUI": "WebUIを使用したアノテーションを開始しますか",
+ "是否直接对上次合成结果调整语速。防止随机性。": "直前の合成結果の話速を直接調整して、ランダム性を防ぐか。",
+ "显卡信息": "グラフィックカード情報",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.\n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "このソフトウェアはMITライセンスでオープンソース化されており、作者はソフトウェアに対して一切の制御権を持っていません。ソフトウェアを使用する者、ソフトウェアから導出される音声を広める者は、自己責任で行ってください。\nこの条件を認めない場合、ソフトウェアパッケージ内の任意のコードやファイルを使用または引用することはできません。詳細はルートディレクトリのLICENSEを参照してください。",
+ "模型": "モデル",
+ "模型分为三类:": "モデルは3種類に分かれています:",
+ "模型切换": "モデル切り替え",
+ "每张显卡的batch_size": "各グラフィックカードのバッチサイズ",
+ "终止ASR进程": "ASRプロセスを停止",
+ "终止GPT训练": "GPTトレーニングを停止",
+ "终止SSL提取进程": "SSL抽出プロセスを停止",
+ "终止SoVITS训练": "SoVITSトレーニングを停止",
+ "终止一键三连": "ワンクリック三連を停止",
+ "终止文本获取进程": "テキスト取得プロセスを停止",
+ "终止语义token提取进程": "セマンティックトークン抽出プロセスを停止",
+ "终止语音切割": "音声の分割を停止",
+ "终止语音降噪进程": "音声ノイズ除去プロセスを終了する",
+ "英文": "英語",
+ "语义token提取进程输出信息": "セマンティックトークン抽出プロセスの出力情報",
+ "语速": "話速",
+ "语速调整,高为更快": "話速調整、高いほど速く",
+ "语音切割进程输出信息": "音声分割プロセスの出力情報",
+ "语音降噪进程输出信息": "音声ノイズ除去プロセスの出力情報",
+ "请上传3~10秒内参考音频,超过会报错!": "3~10秒以内の参照音声をアップロードしてください。それを超えるとエラーが発生します!",
+ "请输入有效文本": "有効なテキストを入力してください",
+ "转换": "変換",
+ "输入待处理音频文件夹路径": "処理するオーディオフォルダのパスを入力してください:",
+ "输入文件夹路径": "入力フォルダのパス",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "logs/実験名ディレクトリには23456で始まるファイルとフォルダが含まれている必要があります",
+ "输出信息": "出力情報",
+ "输出文件夹路径": "出力フォルダのパス",
+ "输出的语音": "推論結果",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "SoVITS_weightsおよびGPT_weightsに保存されたモデルを選択します。デフォルトのものはプレトレインであり、ゼロショットTTSを体験できます。",
+ "降噪结果输出文件夹": "ノイズ除去結果出力フォルダ",
+ "降噪音频文件输入文件夹": "ノイズ除去音声ファイル入力フォルダ",
+ "需要合成的切分前文本": "推論が必要な分割前のテキスト",
+ "需要合成的文本": "推論テキスト",
+ "需要合成的语种": "推論テキストの言語",
+ "音频自动切分输入路径,可文件可文件夹": "オーディオの自動分割入力パス、ファイルまたはフォルダを指定できます",
+ "预训练的GPT模型路径": "事前にトレーニングされたGPTモデルのパス",
+ "预训练的SSL模型路径": "事前にトレーニングされたSSLモデルのパス",
+ "预训练的SoVITS-D模型路径": "事前にトレーニングされたSoVITS-Dモデルのパス",
+ "预训练的SoVITS-G模型路径": "事前にトレーニングされたSoVITS-Gモデルのパス",
+ "预训练的中文BERT模型路径": "事前にトレーニングされた中文BERTモデルのパス"
+}
diff --git a/tools/i18n/locale/ko_KR.json b/tools/i18n/locale/ko_KR.json
new file mode 100644
index 0000000000000000000000000000000000000000..b703b2134f579f120983f26afa8112cf0d9042b4
--- /dev/null
+++ b/tools/i18n/locale/ko_KR.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): 듀얼 채널 리버브에는 가장 적합하지만, 싱글 채널 리버브는 제거할 수 없습니다",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:지연 효과를 제거합니다. Aggressive는 Normal보다 더 철저하게 제거하며, DeReverb는 추가로 리버브를 제거하여 단일 채널 리버브를 제거할 수 있지만 고주파 리버브는 완전히 제거하지 못합니다.",
+ "*GPT模型列表": "*GPT 모델 목록",
+ "*SoVITS模型列表": "*SoVITS 모델 목록",
+ "*实验/模型名": "*실험/모델 이름",
+ "*文本标注文件": "*텍스트 주석 파일",
+ "*训练集音频文件目录": "*훈련 세트 오디오 파일 디렉터리",
+ "*请上传并填写参考信息": "*참고 정보를 업로드하고 입력하십시오",
+ "*请填写需要合成的目标文本和语种模式": "*합성할 목표 텍스트와 언어 모드를 입력하세요",
+ ".list标注文件的路径": ".list 주석 파일 경로",
+ "0-前置数据集获取工具": "0-전방 데이터 세트 수집 도구",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5 보컬 및 반주 분리 및 에코 및 지연 제거 도구",
+ "0b-语音切分工具": "0b-음성 분리 도구",
+ "0bb-语音降噪工具": "0bb-음성 노이즈 제거 도구",
+ "0c-中文批量离线ASR工具": "0c-중국어 대량 오프라인 ASR 도구",
+ "0d-语音文本校对标注工具": "0d-음성 텍스트 교정 주석 도구",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-훈련 세트 형식 지정 도구",
+ "1Aa-文本内容": "1Aa-텍스트 내용",
+ "1Aabc-训练集格式化一键三连": "1Aabc-훈련 세트 형식 지정 일괄 처리",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL 자기 지도 특징 추출",
+ "1Ac-语义token提取": "1Ac-의미 토큰 추출",
+ "1B-微调训练": "1B-미세 조정 훈련",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS 훈련. 공유 용 모델 파일은 SoVITS_weights 하위에 출력됩니다.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT 훈련. 공유 용 모델 파일은 GPT_weights 하위에 출력됩니다.",
+ "1C-推理": "1C-추론",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. DeEcho-DeReverb 모델의 처리 시간은 다른 두 DeEcho 모델의 거의 두 배입니다;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. 사람 목소리를 유지: 화음이 없는 오디오를 선택하면 HP5보다 사람 목소리를 더 잘 유지할 수 있습니다. 내장된 HP2와 HP3 모델이 있으며, HP3는 화음을 약간 놓칠 수 있지만 HP2보다 사람 목소리를 조금 더 잘 유지합니다;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-음성 변환",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. MDX-Net-Dereverb 모델은 꽤 느립니다;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. 주 목소리만 유지: 화음이 있는 오디오에 이 모델을 선택하면 주 목소리가 약해질 수 있습니다. 내장된 HP5 모델이 있습니다;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. 개인적으로 가장 깨끗한 설정은 먼저 MDX-Net을 사용하고 그 다음에 DeEcho-Aggressive를 사용하는 것입니다;",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. 잔향 제거 및 지연 제거 모델 (by FoxJoy):",
+ "ASR 模型": "ASR 모델",
+ "ASR 模型尺寸": "ASR 모델 크기",
+ "数据类型精度": "데이터 유형 정밀도",
+ "ASR 语言设置": "ASR 언어 설정",
+ "ASR进程输出信息": "ASR 프로세스 출력 정보",
+ "GPT模型列表": "GPT 모델 목록",
+ "GPT训练进程输出信息": "GPT 훈련 프로세스 출력 정보",
+ "GPU卡号,只能填1个整数": "GPU 카드 번호, 1개의 정수만 입력 가능",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU 카드 번호는 -로 구분되며 각 카드 번호에 하나의 프로세스가 있어야 함",
+ "SSL进程输出信息": "SSL 프로세스 출력 정보",
+ "SoVITS模型列表": "SoVITS 모델 목록",
+ "SoVITS训练进程输出信息": "SoVITS 훈련 프로세스 출력 정보",
+ "TTS推理WebUI进程输出信息": "TTS 추론 WebUI 프로세스 출력 정보",
+ "TTS推理进程已关闭": "TTS 추론 프로세스가 닫혔습니다",
+ "TTS推理进程已开启": "TTS 추론 프로세스가 열렸습니다",
+ "UVR5已关闭": "UVR5가 비활성화되었습니다",
+ "UVR5已开启": "UVR5가 활성화되었습니다",
+ "UVR5进程输出信息": "UVR5 프로세스 출력 정보",
+ "alpha_mix:混多少比例归一化后音频进来": "알파 믹스: 정규화된 오디오가 들어오는 비율",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT 샘플링 매개변수 (참조 텍스트가 없을 때 너무 낮게 설정하지 마십시오. 확실하지 않으면 기본값을 사용하십시오):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop 크기: 볼륨 곡선을 계산하는 방법. 작을수록 정확도가 높아지지만 계산량이 높아집니다 (정확도가 높다고 효과가 좋아지지 않음)",
+ "max:归一化后最大值多少": "최대 값 (정규화 후)",
+ "max_sil_kept:切完后静音最多留多长": "최대 유지되는 정적 길이 (분리 후)",
+ "min_interval:最短切割间隔": "최소 분리 간격",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:각 부분의 최소 길이, 첫 번째 부분이 너무 짧으면 다음 부분과 계속 연결하여 이 값을 초과할 때까지",
+ "temperature": "온도",
+ "threshold:音量小于这个值视作静音的备选切割点": "임계 값: 이 값보다 작은 볼륨은 대체 분리 지점으로 간주됩니다.",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "일괄 처리 프로세스 출력 정보",
+ "不切": "자르지 않음",
+ "中文": "중국어",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "중국어 튜토리얼 문서:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "중영 혼합",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "오디오 파일을 일괄로 입력할 수도 있습니다. 둘 중 하나를 선택하고 폴더를 읽기를 우선합니다.",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "보컬과 반주 분리 배치 처리, UVR5 모델 사용.",
+ "人声提取激进程度": "보컬 추출의 공격성",
+ "伴奏人声分离&去混响&去回声": "반주 및 보컬 분리 & 리버브 제거 & 에코 제거",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "참고 텍스트가 없을 때는 미세 조정된 GPT를 사용하는 것이 좋습니다. 참고 오디오에서 무엇을 말하는지 잘 들리지 않으면 이 모드를 켜서 입력한 참고 텍스트를 무시할 수 있습니다.",
+ "保存频率save_every_epoch": "저장 빈도 (각 라운드마다)",
+ "凑50字一切": "50자를 채우십시오",
+ "凑四句一切": "네 문장의 세트를 완성하세요.",
+ "切分后文本": "분리된 텍스트",
+ "切分后的子音频的输出根目录": "분리된 하위 오디오의 출력 기본 디렉터리",
+ "切割使用的进程数": "사용되는 프로세스 수로 자르기",
+ "刷新模型路径": "모델 경로 새로 고침",
+ "前端处理后的文本(每句):": "프론트엔드 처리 후 텍스트(문장별):",
+ "去混响/去延迟,附:": "리버브 제거/지연 제거, 부록:",
+ "参考音频在3~10秒范围外,请更换!": "참고 오디오가 3~10초 범위를 벗어났습니다. 다른 것으로 바꾸십시오!",
+ "参考音频的文本": "참고 오디오의 텍스트",
+ "参考音频的语种": "참고 오디오의 언어",
+ "合成语音": "합성 음성",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "적절한 폴더 경로 형식 예: E:\\codes\\py39\\vits_vc_gpu\\백로서리 테스트 샘플 (파일 관리자 주소 표시줄에서 복사하면 됩니다).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "앞으로 음소 변환, 음소 수동 수정, 음성 합성 단계별 실행을 지원할 예정입니다.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "분리된 오디오가 위치한 디렉터리를 입력하세요! 읽어들인 오디오 파일의 전체 경로 = 이 디렉터리 - list 파일에서 파형에 해당하는 파일명(전체 경로가 아님). 비워 두면 .list 파일의 절대 전체 경로를 사용합니다.",
+ "多语种混合": "다국어 혼합",
+ "实际输入的参考文本:": "실제 입력된 참고 텍스트:",
+ "实际输入的目标文本(切句后):": "실제 입력된 목표 텍스트(문장 분리 후):",
+ "实际输入的目标文本(每句):": "실제 입력된 목표 텍스트(문장별):",
+ "实际输入的目标文本:": "실제 입력된 목표 텍스트:",
+ "导出文件格式": "내보내기 파일 형식",
+ "开启GPT训练": "GPT 훈련 활성화",
+ "开启SSL提取": "SSL 추출 활성화",
+ "开启SoVITS训练": "SoVITS 훈련 활성화",
+ "开启一键三连": "일괄 처리 활성화",
+ "开启文本获取": "텍스트 추출 활성화",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "참고 텍스트 없이 모드를 활성화합니다. 참고 텍스트를 입력하지 않으면 자동으로 활성화됩니다.",
+ "开启离线批量ASR": "오프라인 대량 ASR 활성화",
+ "开启语义token提取": "의미 토큰 추출 활성화",
+ "开启语音切割": "음성 분리 활성화",
+ "开启语音降噪": "음성 노이즈 제거 활성화",
+ "怎么切": "자르기 옵션",
+ "总训练轮数total_epoch": "총 훈련 라운드 수 (total_epoch)",
+ "总训练轮数total_epoch,不建议太高": "총 훈련 라운드 수 (total_epoch), 너무 높지 않게 권장됨",
+ "打标工具WebUI已关闭": "주석 도구 WebUI가 닫혔습니다",
+ "打标工具WebUI已开启": "주석 도구 WebUI가 열렸습니다",
+ "打标工具进程输出信息": "주석 도구 프로세스 출력 정보",
+ "指定输出主人声文件夹": "지정된 주인 목소리 출력 폴더",
+ "指定输出非主人声文件夹": "지정된 비주인 목소리 출력 폴더",
+ "按中文句号。切": "중국어 문장으로 분리하십시오.",
+ "按标点符号切": "구두점을 기준으로 자르기",
+ "按英文句号.切": "영어 문장으로 분리하기",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "텍스트 분리 도구. 너무 긴 텍스트는 합성 결과가 항상 좋지 않을 수 있으므로 너무 길면 먼저 분리하는 것이 좋습니다. 합성은 텍스트 줄 바꿈을 기준으로 분리되어 다시 조합됩니다.",
+ "文本模块学习率权重": "텍스트 모듈 학습률 가중치",
+ "文本进程输出信息": "텍스트 프로세스 출력 정보",
+ "施工中,请静候佳音": "공사 중입니다. 기다려주십시오.",
+ "日文": "일본어",
+ "日英混合": "일본어와 영어 혼합",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "디스크 공간을 절약하기 위해 최신 ckpt 파일만 저장할지 여부",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "각 저장 시간에 최종 작은 모델을 weights 폴더에 저장할지 여부",
+ "是否开启TTS推理WebUI": "TTS 추론 WebUI 활성화 여부",
+ "是否开启UVR5-WebUI": "UVR5-WebUI를 여시겠습니까?",
+ "是否开启dpo训练选项(实验性)": "dpo 훈련 옵션(실험적) 활성화 여부",
+ "是否开启打标WebUI": "웹 기반 주석 활성화 여부",
+ "是否直接对上次合成结果调整语速。防止随机性。": "직전 합성 결과의 언어 속도를 직접 조정하여 무작위성을 방지할까요?",
+ "显卡信息": "그래픽 카드 정보",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.\n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "본 소프트웨어는 MIT 라이선스로 오픈 소스로 제공되며, 제작자는 소프트웨어에 대해 어떠한 제어력도 가지지 않습니다. 소프트웨어 사용자 및 소프트웨어에서 내보낸 소리를 전파하는 자는 전적으로 책임져야 합니다.\n이 조항을 인정하지 않으면 소프트웨어의 코드 및 파일을 사용하거나 인용할 수 없습니다. 루트 디렉터리의 LICENSE를 참조하십시오.",
+ "模型": "모델",
+ "模型分为三类:": "모델은 3가지로 나뉩니다:",
+ "模型切换": "모델 전환",
+ "每张显卡的batch_size": "각 그래픽 카드의 배치 크기",
+ "终止ASR进程": "ASR 프로세스 종료",
+ "终止GPT训练": "GPT 훈련 종료",
+ "终止SSL提取进程": "SSL 추출 프로세스 종료",
+ "终止SoVITS训练": "SoVITS 훈련 종료",
+ "终止一键三连": "일괄 처리 종료",
+ "终止文本获取进程": "텍스트 추출 프로세스 종료",
+ "终止语义token提取进程": "의미 토큰 추출 프로세스 종료",
+ "终止语音切割": "음성 분리 종료",
+ "终止语音降噪进程": "음성 노이즈 제거 프로세스 종료",
+ "英文": "영어",
+ "语义token提取进程输出信息": "의미 토큰 추출 프로세스 출력 정보",
+ "语速": "언어 속도",
+ "语速调整,高为更快": "언어 속도 조정, 높을수록 빠름",
+ "语音切割进程输出信息": "음성 분리 프로세스 출력 정보",
+ "语音降噪进程输出信息": "음성 노이즈 제거 프로세스 출력 정보",
+ "请上传3~10秒内参考音频,超过会报错!": "3~10초 이내의 참고 오디오를 업로드하십시오. 초과하면 오류가 발생합니다!",
+ "请输入有效文本": "유효한 텍스트를 입력하세요",
+ "转换": "변환",
+ "输入待处理音频文件夹路径": "처리 대기 중인 오디오 폴더 경로 입력",
+ "输入文件夹路径": "폴더 경로 입력",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "logs/실험 이름 디렉터리에는 23456으로 시작하는 파일과 폴더가 있어야 함",
+ "输出信息": "출력 정보",
+ "输出文件夹路径": "출력 폴더 경로",
+ "输出的语音": "출력 음성",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "SoVITS_weights 및 GPT_weights에 저장된 훈련 완료된 모델 중 선택. 기본적으로 하나는 기본 모델이며 5초 Zero Shot TTS를 체험할 수 있습니다.",
+ "降噪结果输出文件夹": "노이즈 제거 결과 출력 폴더",
+ "降噪音频文件输入文件夹": "노이즈 제거 오디오 파일 입력 폴더",
+ "需要合成的切分前文本": "합성해야 할 분할 전 텍스트",
+ "需要合成的文本": "합성해야 할 텍스트",
+ "需要合成的语种": "합성해야 할 언어",
+ "音频自动切分输入路径,可文件可文件夹": "오디오 자동 분리 입력 경로, 파일 또는 폴더 가능",
+ "预训练的GPT模型路径": "사전 훈련된 GPT 모델 경로",
+ "预训练的SSL模型路径": "사전 훈련된 SSL 모델 경로",
+ "预训练的SoVITS-D模型路径": "사전 훈련된 SoVITS-D 모델 경로",
+ "预训练的SoVITS-G模型路径": "사전 훈련된 SoVITS-G 모델 경로",
+ "预训练的中文BERT模型路径": "사전 훈련된 중국어 BERT 모델 경로"
+}
diff --git a/tools/i18n/locale/pt_BR.json b/tools/i18n/locale/pt_BR.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ca993299987797390a109a6bcfcf0c8ba34a83b
--- /dev/null
+++ b/tools/i18n/locale/pt_BR.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): É a melhor opção para reverberação de dois canais, mas não pode remover a reverberação de um único canal;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:Remove os efeitos de atraso. Aggressive é mais completo que Normal na remoção, DeReverb remove adicionalmente a reverberação, pode remover a reverberação de um canal único, mas não remove completamente a reverberação de placa de alta frequência.",
+ "*GPT模型列表": "*Lista de modelos GPT",
+ "*SoVITS模型列表": "*Lista de modelos Sovits",
+ "*实验/模型名": "*Nome do experimento/modelo",
+ "*文本标注文件": "*Arquivo de marcação de texto",
+ "*训练集音频文件目录": "*Diretório de arquivos de áudio do conjunto de treinamento",
+ "*请上传并填写参考信息": "Por favor, faça o upload e preencha as informações de referência",
+ "*请填写需要合成的目标文本和语种模式": "*Por favor, insira o texto alvo a ser sintetizado e o modo de idioma.",
+ ".list标注文件的路径": "Caminho do arquivo de anotação .list",
+ "0-前置数据集获取工具": "0- Ferramenta de aquisição de conjunto de dados pré-frontal",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0A-UVR5 separação de voz e acompanhamento instrumental & ferramenta para remover reverberação e atraso",
+ "0b-语音切分工具": "0b- Ferramenta de corte de voz",
+ "0bb-语音降噪工具": "0bb- Ferramenta de redução de ruído de voz",
+ "0c-中文批量离线ASR工具": "0c- Ferramenta chinesa de ASR offline em lote",
+ "0d-语音文本校对标注工具": "0d- Ferramenta de correção e marcação de texto de voz",
+ "1-GPT-SoVITS-TTS": "1-GPT-SOVITS-TTS",
+ "1A-训练集格式化工具": "1A-Ferramenta de formatação de conjunto de dados de treinamento",
+ "1Aa-文本内容": "1AA-Conteúdo do texto",
+ "1Aabc-训练集格式化一键三连": "1AABC-Formatação de conjunto de treinamento em um clique",
+ "1Ab-SSL自监督特征提取": "1AB-Extração de características auto-supervisionadas SSL",
+ "1Ac-语义token提取": "1AC-Extração de token semântico",
+ "1B-微调训练": "1B-Treinamento de ajuste fino",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-Treinamento SoVITS. O arquivo de modelo para compartilhamento é gerado em SoVITS_weights",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-Treinamento GPT. O arquivo de modelo para compartilhamento é gerado em GPT_weights",
+ "1C-推理": "1C-raciocínio",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. O tempo de processamento do modelo DeEcho-DeReverb é quase o dobro dos outros dois modelos DeEcho;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Manter a voz: selecione isso para áudio sem harmonia, que preserva melhor a voz principal do que o HP5. Inclui dois modelos, HP2 e HP3; o HP3 pode permitir um pequeno vazamento de acompanhamento, mas preserva a voz principal um pouco melhor do que o HP2;",
+ "2-GPT-SoVITS-变声": "2-gpt-sovits-mudança de voz",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. O modelo MDX-Net-Dereverb é bastante lento;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Manter apenas a voz principal: selecione isso para áudio com harmonia, pode haver uma redução na voz principal. Inclui um modelo HP5;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. A configuração mais limpa recomendada é usar primeiro o MDX-Net e depois o DeEcho-Aggressive.",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Modelo de remoção de reverberação e atraso (por FoxJoy):",
+ "ASR 模型": "Modelo ASR",
+ "ASR 模型尺寸": "Tamanho do modelo ASR",
+ "数据类型精度": "precisão do tipo de dado",
+ "ASR 语言设置": "Configurações de idioma do ASR",
+ "ASR进程输出信息": "Informações de saída do processo ASR",
+ "GPT模型列表": "Lista de modelos GPT",
+ "GPT训练进程输出信息": "Informações de saída do processo de treinamento GPT",
+ "GPU卡号,只能填1个整数": "Número da placa de vídeo, só é possível preencher com um número inteiro",
+ "GPU卡号以-分割,每个卡号一个进程": "Número da placa de vídeo dividido por-, cada número de placa é um processo",
+ "SSL进程输出信息": "Informações de saída do processo SSL",
+ "SoVITS模型列表": "Lista de modelos SoVITS",
+ "SoVITS训练进程输出信息": "Informações de saída do processo de treinamento SoVITS",
+ "TTS推理WebUI进程输出信息": "Informações de saída do processo webui de raciocínio TTS",
+ "TTS推理进程已关闭": "O processo de inferência TTS foi desativado",
+ "TTS推理进程已开启": "O processo de inferência TTS foi iniciado",
+ "UVR5已关闭": "UVR5 está desativado",
+ "UVR5已开启": "UVR5 está ativado",
+ "UVR5进程输出信息": "Informações de saída do processo UVR5",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix: Em que proporção o áudio normalizado é misturado de volta",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "Parâmetros de amostragem do GPT (não muito baixos quando não houver texto de referência. Use o padrão se não tiver certeza):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "HOP_SIZE: Como calcular a curva de volume, quanto menor a precisão, maior a quantidade de cálculos (não significa que quanto maior a precisão, melhor o efeito)",
+ "max:归一化后最大值多少": "MAX: Qual é o valor máximo após a normalização?",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept: Depois de cortar, por quanto tempo no máximo o silêncio é mantido",
+ "min_interval:最短切割间隔": "min_interval: O intervalo de corte mínimo",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: Comprimento mínimo de cada segmento. Se o primeiro segmento for muito curto, ele será unido aos segmentos seguintes até exceder este valor",
+ "temperature": "temperatura",
+ "threshold:音量小于这个值视作静音的备选切割点": "Limiar: O volume menor que este valor é considerado como um ponto de corte mudo alternativo",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "Informações de saída do processo de um clique",
+ "不切": "Não dividir",
+ "中文": "Chinês",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Documentação do tutorial em chinês:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Mistura de Chinês e Inglês",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "Também é possível inserir arquivos de áudio em lote; escolha uma opção, preferencialmente leia a pasta.",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Processamento em lote de separação de voz e acompanhamento, usando o modelo UVR5.",
+ "人声提取激进程度": "Grau de agressividade da extração de voz",
+ "伴奏人声分离&去混响&去回声": "Separação de acompanhamento e voz & remoção de reverberação & remoção de eco",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Ao usar o modo sem texto de referência, recomenda-se usar um GPT ajustado. Se não conseguir ouvir claramente o áudio de referência (não sabe o que escrever), você pode ativar o modo e ignorar o texto de referência fornecido.",
+ "保存频率save_every_epoch": "Frequência de salvamento save_every_epoch",
+ "凑50字一切": "Complete com 50 caracteres",
+ "凑四句一切": "Complete com quatro frases",
+ "切分后文本": "Texto após divisão",
+ "切分后的子音频的输出根目录": "Diretório raiz de saída do sub-áudio após o corte",
+ "切割使用的进程数": "Número de processos para corte",
+ "刷新模型路径": "Atualizar caminho do modelo",
+ "前端处理后的文本(每句):": "Texto após processamento front-end (por frase):",
+ "去混响/去延迟,附:": "Remoção de reverberação/remoção de atraso, anexo:",
+ "参考音频在3~10秒范围外,请更换!": "O áudio de referência está fora do intervalo de 3 a 10 segundos. Por favor, substitua!",
+ "参考音频的文本": "Texto do áudio de referência",
+ "参考音频的语种": "Idioma do áudio de referência",
+ "合成语音": "Voz sintetizada",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemplo de formato de caminho de pasta válido: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copie do endereço da barra do gerenciador de arquivos).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "Futuramente, serão suportadas a conversão de fonemas, a modificação manual de fonemas e a execução em etapas da síntese de voz.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Preencha o diretório onde os áudios cortados estão localizados! O caminho completo dos arquivos de áudio lidos = este diretório - concatenação com o nome do arquivo de forma correspondente no arquivo .list (não o caminho completo). Se deixar em branco, use o caminho absoluto no arquivo .list.",
+ "多语种混合": "Mistura de múltiplos idiomas",
+ "实际输入的参考文本:": "Texto de referência realmente inserido:",
+ "实际输入的目标文本(切句后):": "Texto alvo realmente inserido (após divisão de frases):",
+ "实际输入的目标文本(每句):": "Texto alvo realmente inserido (por frase):",
+ "实际输入的目标文本:": "Texto alvo realmente inserido:",
+ "导出文件格式": "Formato de arquivo de exportação",
+ "开启GPT训练": "Ativar treinamento GPT",
+ "开启SSL提取": "Ativar extração SSL",
+ "开启SoVITS训练": "Ativar treinamento SoVITS",
+ "开启一键三连": "Ativar um clique",
+ "开启文本获取": "Ativar obtenção de texto",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Ativar o modo sem texto de referência. Não preencher o texto de referência também equivale a ativar.",
+ "开启离线批量ASR": "Ativar ASR offline em lote",
+ "开启语义token提取": "Ativar extração de token semântico",
+ "开启语音切割": "Ativar corte de voz",
+ "开启语音降噪": "Ativar redução de ruído de voz",
+ "怎么切": "Como cortar",
+ "总训练轮数total_epoch": "Total de epoch de treinamento",
+ "总训练轮数total_epoch,不建议太高": "Total de epoch de treinamento, não é recomendável um valor muito alto",
+ "打标工具WebUI已关闭": "A ferramenta de marcação WebUI foi desativada",
+ "打标工具WebUI已开启": "A ferramenta de marcação WebUI está ativada",
+ "打标工具进程输出信息": "Informações de saída do processo da ferramenta de marcação",
+ "指定输出主人声文件夹": "Especificar a pasta de saída da voz principal",
+ "指定输出非主人声文件夹": "Especificar a pasta de saída da voz secundária",
+ "按中文句号。切": "Dividir por ponto final chinês",
+ "按标点符号切": "Dividir por sinais de pontuação",
+ "按英文句号.切": "Dividir por ponto final em inglês",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Ferramenta de divisão de texto. Textos muito longos podem não ter bons resultados de síntese, então é recomendado dividir textos muito longos primeiro. A síntese será feita dividindo e recombinando com base nas quebras de linha do texto.",
+ "文本模块学习率权重": "Weight da taxa de aprendizado do módulo de texto",
+ "文本进程输出信息": "Informações de saída do processo de texto",
+ "施工中,请静候佳音": "Em construção, por favor, aguarde por um bom som",
+ "日文": "Japonês",
+ "日英混合": "Mistura de Japonês e Inglês",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "Se deve salvar apenas o último arquivo CKPT para economizar espaço em disco",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "Se deve salvar o modelo pequeno final na pasta Weights em cada ponto de salvamento de tempo",
+ "是否开启TTS推理WebUI": "Se deseja ativar o webui de raciocínio TTS",
+ "是否开启UVR5-WebUI": "Se deseja ativar a UVR5-WEBUI",
+ "是否开启dpo训练选项(实验性)": "Se deseja ativar a opção de treinamento DPO (experimental)",
+ "是否开启打标WebUI": "Se deseja abrir o webui de marcação",
+ "是否直接对上次合成结果调整语速。防止随机性。": "Se deve ajustar diretamente a velocidade da fala do último resultado de síntese para evitar aleatoriedade.",
+ "显卡信息": "Informações da placa de vídeo",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.\n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Este software é de código aberto sob a licença MIT. O autor não tem controle sobre o software. Aqueles que usam o software e difundem os sons exportados pelo software são totalmente responsáveis.\nSe você não concorda com esta cláusula, não pode usar ou citar nenhum código e arquivo dentro do pacote de software. Consulte o diretório raiz LICENSE para mais detalhes.\nTraduzido por Rafael Godoy Ebert",
+ "模型": "Modelo",
+ "模型分为三类:": "Modelos dividem-se em três categorias:",
+ "模型切换": "Troca de modelo",
+ "每张显卡的batch_size": "Tamanho do lote de cada placa de vídeo",
+ "终止ASR进程": "Encerrar processo ASR",
+ "终止GPT训练": "Encerrar treinamento GPT",
+ "终止SSL提取进程": "Encerrar processo de extração SSL",
+ "终止SoVITS训练": "Encerrar treinamento SoVITS",
+ "终止一键三连": "Encerrar um clique",
+ "终止文本获取进程": "Encerrar processo de obtenção de texto",
+ "终止语义token提取进程": "Encerrar processo de extração de token semântico",
+ "终止语音切割": "Encerrar corte de voz",
+ "终止语音降噪进程": "Encerrar o processo de redução de ruído de voz",
+ "英文": "Inglês",
+ "语义token提取进程输出信息": "Informações de saída do processo de extração de token semântico",
+ "语速": "Velocidade da fala",
+ "语速调整,高为更快": "Ajustar a velocidade da fala, mais alta para mais rápido",
+ "语音切割进程输出信息": "Informações de saída do processo de corte de voz",
+ "语音降噪进程输出信息": "Informações de saída do processo de redução de ruído de voz",
+ "请上传3~10秒内参考音频,超过会报错!": "Por favor, faça upload de um áudio de referência com duração entre 3 e 10 segundos. Áudios fora dessa faixa causarão erro!",
+ "请输入有效文本": "Por favor, insira um texto válido",
+ "转换": "Converter",
+ "输入待处理音频文件夹路径": "Caminho da pasta de arquivos de áudio a ser processados",
+ "输入文件夹路径": "Caminho da pasta de entrada",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "Logs de saída/deve haver arquivos e pastas começando com 23456 no diretório do nome do experimento",
+ "输出信息": "Informações de saída",
+ "输出文件夹路径": "Caminho da pasta de saída",
+ "输出的语音": "Áudio de saída",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Selecione os modelos armazenados em SoVITS_weights e GPT_weights. O padrão é o modelo inferior, experiência para 5 segundos de Zero Shot TTS",
+ "降噪结果输出文件夹": "Pasta de saída dos resultados de redução de ruído",
+ "降噪音频文件输入文件夹": "Pasta de entrada dos arquivos de áudio para redução de ruído",
+ "需要合成的切分前文本": "Texto a ser sintetizado antes da divisão",
+ "需要合成的文本": "Texto a ser sintetizado",
+ "需要合成的语种": "Idioma a ser sintetizado",
+ "音频自动切分输入路径,可文件可文件夹": "Caminho de entrada automático de corte de áudio, pode ser um arquivo ou uma pasta",
+ "预训练的GPT模型路径": "Caminho do modelo GPT pre-train",
+ "预训练的SSL模型路径": "Caminho do modelo SSL pre-train",
+ "预训练的SoVITS-D模型路径": "Caminho do modelo SoVITS-D pre-train",
+ "预训练的SoVITS-G模型路径": "Caminho do modelo SoVITS-G pre-train",
+ "预训练的中文BERT模型路径": "Caminho do modelo BERT chinês pre-train"
+}
diff --git a/tools/i18n/locale/ru_RU.json b/tools/i18n/locale/ru_RU.json
new file mode 100644
index 0000000000000000000000000000000000000000..3991115ba993bfdd26e8ec8b23fcbb6806ac7f05
--- /dev/null
+++ b/tools/i18n/locale/ru_RU.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):Это лучший выбор для реверберации с двумя каналами, но он не может устранить реверберацию с одним каналом;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:Устраняет эффект задержки. Aggressive устраняет более тщательно, чем Normal, DeReverb дополнительно устраняет реверберацию, может устранить реверберацию с одного канала, но не полностью устраняет высокочастотную реверберацию.",
+ "*GPT模型列表": "*Список моделей GPT",
+ "*SoVITS模型列表": "*Список моделей SoVITS",
+ "*实验/模型名": "*Название эксперимента/модели",
+ "*文本标注文件": "*Файл текстовой аннотации",
+ "*训练集音频文件目录": "*Директория аудиофайлов обучающего набора",
+ "*请上传并填写参考信息": "*Пожалуйста, загрузите и заполните референтные данные",
+ "*请填写需要合成的目标文本和语种模式": "*Пожалуйста, введите целевой текст для синтеза и режим языка",
+ ".list标注文件的路径": "Путь к файлу аннотации .list",
+ "0-前置数据集获取工具": "0-Инструмент для получения предварительного набора данных",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-Инструмент для разделения вокала и аккомпанемента UVR5 & устранения реверберации и задержек",
+ "0b-语音切分工具": "0b-Инструмент для разделения речи",
+ "0bb-语音降噪工具": "0bb-Инструмент для подавления шумов в голосе",
+ "0c-中文批量离线ASR工具": "0c-Инструмент для пакетной офлайн ASR на китайском",
+ "0d-语音文本校对标注工具": "0d-Инструмент для коррекции и аннотации текста речи",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-Инструмент для форматирования обучающего набора",
+ "1Aa-文本内容": "1Aa-Содержание текста",
+ "1Aabc-训练集格式化一键三连": "1Aabc-Форматирование обучающего набора одним нажатием",
+ "1Ab-SSL自监督特征提取": "1Ab-Самоконтролируемое извлечение признаков SSL",
+ "1Ac-语义token提取": "1Ac-Извлечение семантических токенов",
+ "1B-微调训练": "1B-Дообучение",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-Обучение SoVITS. Файлы моделей для распространения находятся в SoVITS_weights.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-Обучение GPT. Файлы моделей для распространения находятся в GPT_weights.",
+ "1C-推理": "1C-Инференс",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. Время обработки модели DeEcho-DeReverb почти вдвое больше, чем у двух других моделей DeEcho;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Сохранение голоса: выберите этот для аудио без гармоний, сохранение голоса будет лучше, чем HP5. Встроенные модели HP2 и HP3, HP3 может немного пропускать сопровождение, но сохраняет голос немного лучше, чем HP2;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-переозвучивание",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. Модель MDX-Net-Dereverb довольно медленная;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Сохранение только основного голоса: выберите это для аудио с гармониями, может ослабить основной голос. Встроенная модель HP5;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. Лично рекомендованная самая чистая конфигурация — сначала MDX-Net, затем DeEcho-Aggressive.",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Модель удаления реверберации и задержек (от FoxJoy):",
+ "ASR 模型": "Модель ASR",
+ "ASR 模型尺寸": "Размер модели ASR",
+ "数据类型精度": "точность типа данных",
+ "ASR 语言设置": "Настройки языка ASR",
+ "ASR进程输出信息": "Информация о процессе ASR",
+ "GPT模型列表": "Список моделей GPT",
+ "GPT训练进程输出信息": "Информация о процессе обучения GPT",
+ "GPU卡号,只能填1个整数": "Номер GPU, можно указать только одно целое число",
+ "GPU卡号以-分割,每个卡号一个进程": "Номера GPU разделяются дефисом, на каждый номер отдельный процесс",
+ "SSL进程输出信息": "Информация о процессе SSL",
+ "SoVITS模型列表": "Список моделей SoVITS",
+ "SoVITS训练进程输出信息": "Информация о процессе обучения SoVITS",
+ "TTS推理WebUI进程输出信息": "Информация о процессе TTS инференса WebUI",
+ "TTS推理进程已关闭": "Процесс TTS-инференции остановлен",
+ "TTS推理进程已开启": "Процесс TTS-инференции запущен",
+ "UVR5已关闭": "UVR5 выключен",
+ "UVR5已开启": "UVR5 включен",
+ "UVR5进程输出信息": "Вывод информации процесса UVR5",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Какая доля нормализованного аудио смешивается",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "Параметры выборки GPT (не устанавливайте слишком низкие значения, если нет ссылочного текста. Используйте значения по умолчанию, если не уверены):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Как рассчитывается кривая громкости, чем меньше, тем выше точность и больше вычислительная нагрузка (большая точность не всегда означает лучший результат)",
+ "max:归一化后最大值多少": "max:Максимальное значение после нормализации",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:Максимальная длительность тишины после разреза",
+ "min_interval:最短切割间隔": "min_interval:Минимальный интервал разреза",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:Минимальная длина каждого отрезка; если первый отрезок слишком короткий, он будет соединен с последующими до достижения этого значения",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold:Значение громкости ниже этого считается тишиной для альтернативной точки разреза",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "Информация о процессе одного нажатия",
+ "不切": "Не разрезать",
+ "中文": "Китайский",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Документация на китайском языке:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Китайский и английский",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "Можно также импортировать несколько аудиофайлов. Если путь к папке существует, то этот ввод игнорируется.",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Обработка разделения вокала и аккомпанемента пакетно с использованием модели UVR5.",
+ "人声提取激进程度": "Степень агрессивности извлечения вокала",
+ "伴奏人声分离&去混响&去回声": "Разделение вокала/аккомпанемента и удаление эхо",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "При использовании режима без референсного текста рекомендуется использовать настроенную модель GPT. Если не удается разобрать, что говорит референсное аудио (не знаете, что писать), можете включить этот режим, и он проигнорирует введенный референсный текст.",
+ "保存频率save_every_epoch": "Частота сохранения save_every_epoch",
+ "凑50字一切": "Соберите все в 50 символов",
+ "凑四句一切": "Собрать четыре предложения и разрезать",
+ "切分后文本": "Текст после разделения",
+ "切分后的子音频的输出根目录": "Корневой каталог вывода для подаудио после разделения",
+ "切割使用的进程数": "Количество процессов, используемых для разрезания",
+ "刷新模型路径": "Обновить путь к модели",
+ "前端处理后的文本(每句):": "Текст после предварительной обработки (каждое предложение):",
+ "去混响/去延迟,附:": "Удаление реверберации/удаление задержки, примечание:",
+ "参考音频在3~10秒范围外,请更换!": "Референтное аудио вне диапазона 3~10 секунд, пожалуйста, замените!",
+ "参考音频的文本": "Текст референтного аудио",
+ "参考音频的语种": "Язык референтного аудио",
+ "合成语音": "Синтезированный голос",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Пример допустимого формата пути к папке: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (просто скопируйте из адресной строки файлового менеджера).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "В будущем будет поддерживаться преобразование в фонемы, ручная коррекция фонем, пошаговая синтезация речи.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Заполните каталог, где находятся аудиофайлы после разрезания! Полный путь к читаемым аудиофайлам = каталог - файл .list, имя файла соответствует волне (не полный путь). Если оставить пустым, будет использоваться абсолютный путь из файла .list.",
+ "多语种混合": "Смешанные языки",
+ "实际输入的参考文本:": "Фактически введенный референсный текст:",
+ "实际输入的目标文本(切句后):": "Фактически введенный целевой текст (после разбиения на предложения):",
+ "实际输入的目标文本(每句):": "Фактически введенный целевой текст (каждое предложение):",
+ "实际输入的目标文本:": "Фактически введенный целевой текст:",
+ "导出文件格式": "Формат выходных файлов",
+ "开启GPT训练": "Включить обучение GPT",
+ "开启SSL提取": "Включить извлечение SSL",
+ "开启SoVITS训练": "Включить обучение SoVITS",
+ "开启一键三连": "Включить одно нажатие",
+ "开启文本获取": "Включить получение текста",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Включить режим без референтного текста. Не заполняя референтный текст, вы также включаете этот режим.",
+ "开启离线批量ASR": "Включить пакетную офлайн ASR",
+ "开启语义token提取": "Включить извлечение семантических токенов",
+ "开启语音切割": "Включить разрезание речи",
+ "开启语音降噪": "Включить шумоподавление",
+ "怎么切": "Как разрезать",
+ "总训练轮数total_epoch": "Общее количество эпох обучения total_epoch",
+ "总训练轮数total_epoch,不建议太高": "Общее количество эпох обучения total_epoch, не рекомендуется слишком высокое",
+ "打标工具WebUI已关闭": "WebUI инструмента маркировки остановлен",
+ "打标工具WebUI已开启": "WebUI инструмента маркировки запущен",
+ "打标工具进程输出信息": "Информация о процессе аннотации",
+ "指定输出主人声文件夹": "Путь к папке для сохранения вокала:",
+ "指定输出非主人声文件夹": "Путь к папке для сохранения аккомпанемента:",
+ "按中文句号。切": "Разделение по китайским точкам.",
+ "按标点符号切": "Разрезать по пунктуационным знакам",
+ "按英文句号.切": "Разрезать по английской точке.",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Инструмент для разделения текста. Слишком длинные тексты могут не давать хороших результатов синтеза, поэтому рекомендуется сначала их разделить. Синтез будет выполняться отдельно для каждого абзаца, а затем результаты будут соединены вместе.",
+ "文本模块学习率权重": "Веса скорости обучения текстового модуля",
+ "文本进程输出信息": "Информация о процессе обработки текста",
+ "施工中,请静候佳音": "В разработке, ожидайте хороших новостей",
+ "日文": "Японский",
+ "日英混合": "Японский и английский",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "Сохранять только последние файлы ckpt для экономии места на диске?",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "Сохранять финальную версию модели в папке weights на каждом этапе сохранения?",
+ "是否开启TTS推理WebUI": "Включить TTS инференс WebUI",
+ "是否开启UVR5-WebUI": "Включить UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "Включить опцию тренировки dpo (экспериментально)",
+ "是否开启打标WebUI": "Включить интерфейс веб-аннотации",
+ "是否直接对上次合成结果调整语速。防止随机性。": "Следует ли непосредственно регулировать скорость речи последнего синтезированного результата, чтобы избежать случайности?",
+ "显卡信息": "Информация о видеокарте",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Это программное обеспечение открыто по лицензии MIT, автор не имеет никакого контроля над программным обеспечением, пользователи программного обеспечения и те, кто распространяет звуки, экспортированные программным обеспечением, несут полную ответственность.
Если вы не согласны с этими условиями, вы не можете использовать или ссылаться на любой код и файлы в пакете программного обеспечения. Смотрите LICENSE в корневом каталоге.",
+ "模型": "Модели",
+ "模型分为三类:": "Модели делятся на три типа:",
+ "模型切换": "Переключение модели",
+ "每张显卡的batch_size": "Размер пакета для каждой видеокарты",
+ "终止ASR进程": "Прекратить процесс ASR",
+ "终止GPT训练": "Прекратить обучение GPT",
+ "终止SSL提取进程": "Прекратить процесс извлечения SSL",
+ "终止SoVITS训练": "Прекратить обучение SoVITS",
+ "终止一键三连": "Прекратить одно нажатие",
+ "终止文本获取进程": "Прекратить процесс получения текста",
+ "终止语义token提取进程": "Прекратить процесс извлечения семантических токенов",
+ "终止语音切割": "Прекратить разрезание речи",
+ "终止语音降噪进程": "Прекратить процесс шумоподавления",
+ "英文": "Английский",
+ "语义token提取进程输出信息": "Информация о процессе извлечения семантических токенов",
+ "语速": "Скорость речи",
+ "语速调整,高为更快": "Регулировка скорости речи, чем выше, тем быстрее",
+ "语音切割进程输出信息": "Информация о процессе разрезания речи",
+ "语音降噪进程输出信息": "Информация о процессе шумоподавления",
+ "请上传3~10秒内参考音频,超过会报错!": "Пожалуйста, загрузите референтное аудио длительностью от 3 до 10 секунд, иначе будет ошибка!",
+ "请输入有效文本": "Введите действительный текст",
+ "转换": "Преобразовать",
+ "输入待处理音频文件夹路径": "Путь к папке с аудиофайлами для обработки:",
+ "输入文件夹路径": "Введите путь к папке",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "В директории logs/имя_эксперимента должны быть файлы и папки, начинающиеся с 23456",
+ "输出信息": "Статистика",
+ "输出文件夹路径": "Путь к папке для вывода",
+ "输出的语音": "Выводимый звук",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Выберите модель, сохраненную в SoVITS_weights и GPT_weights после обучения. По умолчанию используется базовая модель для 5-секундного Zero Shot TTS.",
+ "降噪结果输出文件夹": "Папка для вывода результатов шумоподавления",
+ "降噪音频文件输入文件夹": "Папка для ввода аудиофайлов для шумоподавления",
+ "需要合成的切分前文本": "Текст для синтеза до разделения",
+ "需要合成的文本": "Текст для синтеза",
+ "需要合成的语种": "Язык для синтеза",
+ "音频自动切分输入路径,可文件可文件夹": "Путь ввода для автоматического разделения аудио, может быть файлом или папкой",
+ "预训练的GPT模型路径": "Путь к предварительно обученной модели GPT",
+ "预训练的SSL模型路径": "Путь к предварительно обученной модели SSL",
+ "预训练的SoVITS-D模型路径": "Путь к предварительно обученной модели SoVITS-D",
+ "预训练的SoVITS-G模型路径": "Путь к предварительно обученной модели SoVITS-G",
+ "预训练的中文BERT模型路径": "Путь к предварительно обученной китайской модели BERT"
+}
diff --git a/tools/i18n/locale/tr_TR.json b/tools/i18n/locale/tr_TR.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7a4e9a28ef176186a3ca65e92667ad105b1806c
--- /dev/null
+++ b/tools/i18n/locale/tr_TR.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):İki kanallı yankılar için en iyi seçimdir, ancak tek kanallı yankıları ortadan kaldıramaz;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:Gecikme etkilerini giderir. Aggressive, Normal'dan daha kapsamlı bir şekilde giderir, DeReverb ek olarak yankıyı giderir, tek kanallı yankıyı giderebilir, ancak yüksek frekanslı plaka yankısını tamamen gideremez.",
+ "*GPT模型列表": "*GPT model listesi",
+ "*SoVITS模型列表": "*SoVITS model listesi",
+ "*实验/模型名": "*Deney/model adı",
+ "*文本标注文件": "*Metin etiketleme dosyası",
+ "*训练集音频文件目录": "*Eğitim seti ses dosyası dizini",
+ "*请上传并填写参考信息": "*Lütfen referans bilgilerini yükleyin ve doldurun",
+ "*请填写需要合成的目标文本和语种模式": "*Lütfen sentezlenecek hedef metni ve dil modunu giriniz.",
+ ".list标注文件的路径": ".list etiketleme dosyasının yolu",
+ "0-前置数据集获取工具": "0-Ön veri seti alma aracı",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5 vokal eşlik ayırma & yankıyı giderme gecikme aracı",
+ "0b-语音切分工具": "0b-Ses bölme aracı",
+ "0bb-语音降噪工具": "0bb-Ses gürültü azaltma aracı",
+ "0c-中文批量离线ASR工具": "0c-Çince toplu offline ASR aracı",
+ "0d-语音文本校对标注工具": "0d-Ses ve metin düzeltme etiketleme aracı",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-Eğitim seti formatlama aracı",
+ "1Aa-文本内容": "1Aa-Metin içeriği",
+ "1Aabc-训练集格式化一键三连": "1Aabc-Eğitim seti formatlama tek tuşla üçleme",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL kendi kendine denetimli özellik çıkarma",
+ "1Ac-语义token提取": "1Ac-Anlamsal token çıkarma",
+ "1B-微调训练": "1B-Fine-tuning eğitimi",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS eğitimi. Paylaşım için model dosyaları SoVITS_weights altında çıkarılır.",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT eğitimi. Paylaşım için model dosyaları GPT_weights altında çıkarılır.",
+ "1C-推理": "1C-Çıkarım",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. DeEcho-DeReverb modelinin işleme süresi, diğer iki DeEcho modelinin neredeyse iki katıdır;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Ses koruma: Arka vokal içermeyen sesler için bu seçeneği kullanın, ana sesi HP5'ten daha iyi korur. HP2 ve HP3 adlı iki model içerir; HP3, arka vokali biraz kaçırabilir ancak ana sesi HP2'ye göre biraz daha iyi korur;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-Ses Değiştirme",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2. MDX-Net-Dereverb modeli oldukça yavaştır;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Sadece ana sesi koruma: Arka vokalleri içeren sesler için bu seçeneği kullanın, ana sesi zayıflatabilir. İçinde HP5 modeli var;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. Kişisel olarak en temiz konfigürasyon MDX-Net'in ardından DeEcho-Aggressive'dir.",
+ "3、去混响、去延迟模型(by FoxJoy):": "3. Yankı ve gecikme giderme modeli (FoxJoy tarafından):",
+ "ASR 模型": "ASR modeli",
+ "ASR 模型尺寸": "ASR model boyutu",
+ "数据类型精度": "veri türü doğruluğu",
+ "ASR 语言设置": "ASR dil ayarları",
+ "ASR进程输出信息": "ASR işlemi çıktı bilgisi",
+ "GPT模型列表": "GPT model listesi",
+ "GPT训练进程输出信息": "GPT eğitimi işlemi çıktı bilgisi",
+ "GPU卡号,只能填1个整数": "GPU kart numarası, sadece bir tamsayı girilebilir",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU kart numaraları - ile ayrılır, her kart numarası için bir işlem",
+ "SSL进程输出信息": "SSL işlemi çıktı bilgisi",
+ "SoVITS模型列表": "SoVITS model listesi",
+ "SoVITS训练进程输出信息": "SoVITS eğitimi işlemi çıktı bilgisi",
+ "TTS推理WebUI进程输出信息": "TTS çıkarımı WebUI işlemi çıktı bilgisi",
+ "TTS推理进程已关闭": "TTS çıkarım işlemi kapatıldı",
+ "TTS推理进程已开启": "TTS çıkarım işlemi başlatıldı",
+ "UVR5已关闭": "UVR5 kapandı",
+ "UVR5已开启": "UVR5 açıldı",
+ "UVR5进程输出信息": "UVR5 işlem çıktı bilgisi",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:Normalizasyondan sonraki sesin ne kadarlık bir oranı karıştırılsın",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT örnekleme parametreleri (referans metin olmadığında çok düşük olmamalıdır. Emin değilseniz varsayılanı kullanın):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:Ses seviyesi eğrisi nasıl hesaplanır, ne kadar küçükse hassasiyet o kadar yüksek ve hesaplama yükü o kadar artar (hassasiyet arttıkça etki mutlaka daha iyi olmaz)",
+ "max:归一化后最大值多少": "max:Normalizasyondan sonra maksimum değer ne kadar",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:Kesimden sonra en fazla ne kadar sessizlik bırakılır",
+ "min_interval:最短切割间隔": "min_interval:Minimum kesim aralığı",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: bölümün minimum uzunluğu, ilk bölüm çok kısa ise, bu değeri aşana kadar sonraki bölümlerle birleştirilir",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold:Ses bu değerden düşükse sessiz olarak kabul edilen alternatif kesim noktası",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "Tek tuşla üçleme işlemi çıktı bilgisi",
+ "不切": "Kesme",
+ "中文": "Çince",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "Çince öğretici belge:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "Çince ve İngilizce karışık",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "Ses dosyaları ayrıca toplu olarak, iki seçimle, öncelikli okuma klasörüyle içe aktarılabilir",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "Vokal ve akor ayırma toplu işleme, UVR5 modelini kullanarak.",
+ "人声提取激进程度": "Vokal çıkarma agresiflik derecesi",
+ "伴奏人声分离&去混响&去回声": "Vokal/Müzik Ayrıştırma ve Yankı Giderme",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "Referans metin modu olmadan kullanıldığında, referans sesi net duyulmadığında (ne yazılacağı bilinmiyorsa) açık bırakılması önerilir, bu durumda girilen referans metni göz ardı edilir.",
+ "保存频率save_every_epoch": "Kayıt sıklığı save_every_epoch",
+ "凑50字一切": "50 kelime birleştir ve kes",
+ "凑四句一切": "Dört cümleyi bir araya getirip kes",
+ "切分后文本": "Bölündükten sonra metin",
+ "切分后的子音频的输出根目录": "Bölündükten sonra alt ses dosyalarının çıktı kök dizini",
+ "切割使用的进程数": "Kesim için kullanılan işlem sayısı",
+ "刷新模型路径": "Model yolu yenile",
+ "前端处理后的文本(每句):": "Ön işleme tabi tutulan metin (her cümle):",
+ "去混响/去延迟,附:": "Yankı giderme/Geçikme giderme, ek:",
+ "参考音频在3~10秒范围外,请更换!": "Referans ses dosyası 3~10 saniye aralığının dışında, lütfen değiştirin!",
+ "参考音频的文本": "Referans ses dosyasının metni",
+ "参考音频的语种": "Referans ses dosyasının dili",
+ "合成语音": "Ses sentezi",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Geçerli klasör yolu formatı örneği: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (dosya yöneticisi adres çubuğundan kopyalayabilirsiniz).",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "İlerleyen zamanlarda fonem dönüştürme, manuel fonem düzenleme ve adım adım ses sentezi desteklenecek.",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Kesmeye uygun ses dosyalarının bulunduğu dizini doldurun! Okunan ses dosyasının tam yolu = bu dizin + list dosyasındaki dalga biçimiyle eşleşen dosya adı (tam yol değil). Boş bırakılırsa, .list dosyasındaki tam yol kullanılır.",
+ "多语种混合": "Çok dilli karışım",
+ "实际输入的参考文本:": "Gerçekten girilen referans metin:",
+ "实际输入的目标文本(切句后):": "Gerçekten girilen hedef metin (cümleler kesildikten sonra):",
+ "实际输入的目标文本(每句):": "Gerçekten girilen hedef metin (her cümle):",
+ "实际输入的目标文本:": "Gerçekten girilen hedef metin:",
+ "导出文件格式": "Dışa aktarma dosya formatı",
+ "开启GPT训练": "GPT eğitimini başlat",
+ "开启SSL提取": "SSL çıkarmayı başlat",
+ "开启SoVITS训练": "SoVITS eğitimini başlat",
+ "开启一键三连": "Tek tuşla üçlemeyi başlat",
+ "开启文本获取": "Metin alma başlat",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "Referans metni olmayan mod açık. Referans metni doldurulmazsa bu mod otomatik olarak açılır.",
+ "开启离线批量ASR": "Offline toplu ASR başlat",
+ "开启语义token提取": "Anlamsal token çıkarmayı başlat",
+ "开启语音切割": "Ses kesimi başlat",
+ "开启语音降噪": "Ses gürültü azaltmayı başlat",
+ "怎么切": "Nasıl kesilir",
+ "总训练轮数total_epoch": "Toplam eğitim turu sayısı total_epoch",
+ "总训练轮数total_epoch,不建议太高": "Toplam eğitim turu sayısı total_epoch, çok yüksek önerilmez",
+ "打标工具WebUI已关闭": "Etiketleme aracı WebUI'si kapatıldı",
+ "打标工具WebUI已开启": "Etiketleme aracı WebUI'si açıldı",
+ "打标工具进程输出信息": "Etiketleme aracı işlemi çıktı bilgisi",
+ "指定输出主人声文件夹": "Vokal için çıkış klasörünü belirtin:",
+ "指定输出非主人声文件夹": "Müzik ve diğer sesler için çıkış klasörünü belirtin:",
+ "按中文句号。切": "Çince dönem işaretine göre kes",
+ "按标点符号切": "Noktalama işaretlerine göre kes",
+ "按英文句号.切": "İngilizce nokta işaretine göre kes",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Metin bölme aracı. Çok uzun metinlerin sentezi her zaman iyi sonuçlar vermez, bu yüzden önerilen önce kesmektir. Sentez, metnin yeni satırlarına göre ayrı ayrı yapılır ve daha sonra birleştirilir.",
+ "文本模块学习率权重": "Metin modülü öğrenme oranı ağırlığı",
+ "文本进程输出信息": "Metin işlemi çıktı bilgisi",
+ "施工中,请静候佳音": "Yapım aşamasında, lütfen iyi haberler için bekleyin",
+ "日文": "Japonca",
+ "日英混合": "Japonca ve İngilizce karışımı",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "Sadece en yeni ckpt dosyasını kaydederek disk alanından tasarruf edilsin mi",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "Her kayıt zamanında son küçük modelin weights klasörüne kaydedilmesi gerekiyor mu",
+ "是否开启TTS推理WebUI": "TTS çıkarımı WebUI'si başlatılsın mı",
+ "是否开启UVR5-WebUI": "UVR5-WebUI açılsın mı",
+ "是否开启dpo训练选项(实验性)": "dpo eğitim seçeneği açılsın mı? (deneysel)",
+ "是否开启打标WebUI": "Etiketleme WebUI'si başlatılsın mı",
+ "是否直接对上次合成结果调整语速。防止随机性。": "Son sentez sonucunun konuşma hızını doğrudan ayarlamak, rastlantısallığı önlemek için mi?",
+ "显卡信息": "Ekran kartı bilgisi",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Bu yazılım MIT lisansı ile açık kaynaktır, yazar yazılım üzerinde herhangi bir kontrol gücüne sahip değildir, yazılımı kullanıcılar ve yazılım tarafından üretilen sesleri yayınlayanlar tüm sorumluluğu üstlenir.
Eğer bu şartları kabul etmiyorsanız, yazılım paketindeki hiçbir kodu veya dosyayı kullanamaz veya atıfta bulunamazsınız. Ayrıntılar için ana dizindeki LICENSE'ı görün.",
+ "模型": "Model",
+ "模型分为三类:": "Modeller üç türdedir:",
+ "模型切换": "Model değiştirme",
+ "每张显卡的batch_size": "Her bir ekran kartı için batch_size",
+ "终止ASR进程": "ASR işlemini durdur",
+ "终止GPT训练": "GPT eğitimini durdur",
+ "终止SSL提取进程": "SSL çıkarma işlemini durdur",
+ "终止SoVITS训练": "SoVITS eğitimini durdur",
+ "终止一键三连": "Tek tuşla üçlemeyi durdur",
+ "终止文本获取进程": "Metin alma işlemini durdur",
+ "终止语义token提取进程": "Anlamsal token çıkarma işlemini durdur",
+ "终止语音切割": "Ses kesimini durdur",
+ "终止语音降噪进程": "Gürültü azaltma işlemini durdur",
+ "英文": "İngilizce",
+ "语义token提取进程输出信息": "Anlamsal token çıkarma işlemi çıktı bilgisi",
+ "语速": "Konuşma hızı",
+ "语速调整,高为更快": "Konuşma hızını ayarla, yüksek daha hızlı",
+ "语音切割进程输出信息": "Ses kesim işlemi çıktı bilgisi",
+ "语音降噪进程输出信息": "Gürültü azaltma işlemi çıktı bilgisi",
+ "请上传3~10秒内参考音频,超过会报错!": "Lütfen 3~10 saniye arasında bir referans ses dosyası yükleyin, aşım durumunda hata verilecektir!",
+ "请输入有效文本": "Geçerli metin girin",
+ "转换": "Dönüştür",
+ "输入待处理音频文件夹路径": "İşlenecek ses klasörünün yolunu girin:",
+ "输入文件夹路径": "Dosya klasörü yolu girin",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "Çıktı logs/deney adı dizininde 23456 ile başlayan dosya ve klasörler olmalı",
+ "输出信息": "Çıkış bilgisi",
+ "输出文件夹路径": "Çıktı klasörü yolu",
+ "输出的语音": "Çıktı sesi",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Eğitimi tamamlanmış ve SoVITS_weights ile GPT_weights altına kaydedilmiş modeli seçin. Varsayılan bir temel modeldir, 5 saniyelik Zero Shot TTS deneyimi için kullanılır.",
+ "降噪结果输出文件夹": "Gürültü azaltma sonuçları çıktı klasörü",
+ "降噪音频文件输入文件夹": "Gürültü azaltma ses dosyaları giriş klasörü",
+ "需要合成的切分前文本": "Bölünmeden önce sentezlenmesi gereken metin",
+ "需要合成的文本": "Sentezlenmesi gereken metin",
+ "需要合成的语种": "Sentezlenmesi gereken dil",
+ "音频自动切分输入路径,可文件可文件夹": "Ses otomatik bölme giriş yolu, dosya veya klasör olabilir",
+ "预训练的GPT模型路径": "Ön eğitilmiş GPT model yolu",
+ "预训练的SSL模型路径": "Ön eğitilmiş SSL model yolu",
+ "预训练的SoVITS-D模型路径": "Ön eğitilmiş SoVITS-D model yolu",
+ "预训练的SoVITS-G模型路径": "Ön eğitilmiş SoVITS-G model yolu",
+ "预训练的中文BERT模型路径": "Ön eğitilmiş Çince BERT model yolu"
+}
diff --git a/tools/i18n/locale/zh_CN.json b/tools/i18n/locale/zh_CN.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f1dc830185be6eb806474d7d06dcf002feb0b49
--- /dev/null
+++ b/tools/i18n/locale/zh_CN.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:去除延迟效果。Aggressive 比 Normal 去除得更彻底,DeReverb 额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。",
+ "*GPT模型列表": "*GPT模型列表",
+ "*SoVITS模型列表": "*SoVITS模型列表",
+ "*实验/模型名": "*实验/模型名",
+ "*文本标注文件": "*文本标注文件",
+ "*训练集音频文件目录": "*训练集音频文件目录",
+ "*请上传并填写参考信息": "*请上传并填写参考信息",
+ "*请填写需要合成的目标文本和语种模式": "*请填写需要合成的目标文本和语种模式",
+ ".list标注文件的路径": ".list标注文件的路径",
+ "0-前置数据集获取工具": "0-前置数据集获取工具",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5人声伴奏分离&去混响去延迟工具",
+ "0b-语音切分工具": "0b-语音切分工具",
+ "0bb-语音降噪工具": "0bb-语音降噪工具",
+ "0c-中文批量离线ASR工具": "0c-中文批量离线ASR工具",
+ "0d-语音文本校对标注工具": "0d-语音文本校对标注工具",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-训练集格式化工具",
+ "1Aa-文本内容": "1Aa-文本内容",
+ "1Aabc-训练集格式化一键三连": "1Aabc-训练集格式化一键三连",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL自监督特征提取",
+ "1Ac-语义token提取": "1Ac-语义token提取",
+ "1B-微调训练": "1B-微调训练",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。",
+ "1C-推理": "1C-推理",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-变声",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverb模型挺慢的;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。",
+ "3、去混响、去延迟模型(by FoxJoy):": "3、去混响、去延迟模型(by FoxJoy):",
+ "ASR 模型": "ASR 模型",
+ "ASR 模型尺寸": "ASR 模型尺寸",
+ "数据类型精度": "数据类型精度",
+ "ASR 语言设置": "ASR 语言设置",
+ "ASR进程输出信息": "ASR进程输出信息",
+ "GPT模型列表": "GPT模型列表",
+ "GPT训练进程输出信息": "GPT训练进程输出信息",
+ "GPU卡号,只能填1个整数": "GPU卡号,只能填1个整数",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU卡号以-分割,每个卡号一个进程",
+ "SSL进程输出信息": "SSL进程输出信息",
+ "SoVITS模型列表": "SoVITS模型列表",
+ "SoVITS训练进程输出信息": "SoVITS训练进程输出信息",
+ "TTS推理WebUI进程输出信息": "TTS推理WebUI进程输出信息",
+ "TTS推理进程已关闭": "TTS推理进程已关闭",
+ "TTS推理进程已开启": "TTS推理进程已开启",
+ "UVR5已关闭": "UVR5已关闭",
+ "UVR5已开启": "UVR5已开启",
+ "UVR5进程输出信息": "UVR5进程输出信息",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例归一化后音频进来",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "gpt采样参数(无参考文本时不要太低。不懂就用默认):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)",
+ "max:归一化后最大值多少": "max:归一化后最大值多少",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完后静音最多留多长",
+ "min_interval:最短切割间隔": "min_interval:最短切割间隔",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold:音量小于这个值视作静音的备选切割点",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "一键三连进程输出信息",
+ "不切": "不切",
+ "中文": "中文",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "中英混合",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量输入音频文件, 二选一, 优先读文件夹",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "人声伴奏分离批量处理, 使用UVR5模型。",
+ "人声提取激进程度": "人声提取激进程度",
+ "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。",
+ "保存频率save_every_epoch": "保存频率save_every_epoch",
+ "凑50字一切": "凑50字一切",
+ "凑四句一切": "凑四句一切",
+ "切分后文本": "切分后文本",
+ "切分后的子音频的输出根目录": "切分后的子音频的输出根目录",
+ "切割使用的进程数": "切割使用的进程数",
+ "刷新模型路径": "刷新模型路径",
+ "前端处理后的文本(每句):": "前端处理后的文本(每句):",
+ "去混响/去延迟,附:": "去混响/去延迟,附:",
+ "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!",
+ "参考音频的文本": "参考音频的文本",
+ "参考音频的语种": "参考音频的语种",
+ "合成语音": "合成语音",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "后续将支持转音素、手工修改音素、语音合成分步执行。",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。",
+ "多语种混合": "多语种混合",
+ "实际输入的参考文本:": "实际输入的参考文本:",
+ "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):",
+ "实际输入的目标文本(每句):": "实际输入的目标文本(每句):",
+ "实际输入的目标文本:": "实际输入的目标文本:",
+ "导出文件格式": "导出文件格式",
+ "开启GPT训练": "开启GPT训练",
+ "开启SSL提取": "开启SSL提取",
+ "开启SoVITS训练": "开启SoVITS训练",
+ "开启一键三连": "开启一键三连",
+ "开启文本获取": "开启文本获取",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "开启无参考文本模式。不填参考文本亦相当于开启。",
+ "开启离线批量ASR": "开启离线批量ASR",
+ "开启语义token提取": "开启语义token提取",
+ "开启语音切割": "开启语音切割",
+ "开启语音降噪": "开启语音降噪",
+ "怎么切": "怎么切",
+ "总训练轮数total_epoch": "总训练轮数total_epoch",
+ "总训练轮数total_epoch,不建议太高": "总训练轮数total_epoch,不建议太高",
+ "打标工具WebUI已关闭": "打标工具WebUI已关闭",
+ "打标工具WebUI已开启": "打标工具WebUI已开启",
+ "打标工具进程输出信息": "打标工具进程输出信息",
+ "指定输出主人声文件夹": "指定输出主人声文件夹",
+ "指定输出非主人声文件夹": "指定输出非主人声文件夹",
+ "按中文句号。切": "按中文句号。切",
+ "按标点符号切": "按标点符号切",
+ "按英文句号.切": "按英文句号.切",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。",
+ "文本模块学习率权重": "文本模块学习率权重",
+ "文本进程输出信息": "文本进程输出信息",
+ "施工中,请静候佳音": "施工中,请静候佳音",
+ "日文": "日文",
+ "日英混合": "日英混合",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "是否仅保存最新的ckpt文件以节省硬盘空间",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存时间点将最终小模型保存至weights文件夹",
+ "是否开启TTS推理WebUI": "是否开启TTS推理WebUI",
+ "是否开启UVR5-WebUI": "是否开启UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "是否开启dpo训练选项(实验性)",
+ "是否开启打标WebUI": "是否开启打标WebUI",
+ "是否直接对上次合成结果调整语速。防止随机性。": "是否直接对上次合成结果调整语速。防止随机性。",
+ "显卡信息": "显卡信息",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.",
+ "模型": "模型",
+ "模型分为三类:": "模型分为三类:",
+ "模型切换": "模型切换",
+ "每张显卡的batch_size": "每张显卡的batch_size",
+ "终止ASR进程": "终止ASR进程",
+ "终止GPT训练": "终止GPT训练",
+ "终止SSL提取进程": "终止SSL提取进程",
+ "终止SoVITS训练": "终止SoVITS训练",
+ "终止一键三连": "终止一键三连",
+ "终止文本获取进程": "终止文本获取进程",
+ "终止语义token提取进程": "终止语义token提取进程",
+ "终止语音切割": "终止语音切割",
+ "终止语音降噪进程": "终止语音降噪进程",
+ "英文": "英文",
+ "语义token提取进程输出信息": "语义token提取进程输出信息",
+ "语速": "语速",
+ "语速调整,高为更快": "语速调整,高为更快",
+ "语音切割进程输出信息": "语音切割进程输出信息",
+ "语音降噪进程输出信息": "语音降噪进程输出信息",
+ "请上传3~10秒内参考音频,超过会报错!": "请上传3~10秒内参考音频,超过会报错!",
+ "请输入有效文本": "请输入有效文本",
+ "转换": "转换",
+ "输入待处理音频文件夹路径": "输入待处理音频文件夹路径",
+ "输入文件夹路径": "输入文件夹路径",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "输出logs/实验名目录下应有23456开头的文件和文件夹",
+ "输出信息": "输出信息",
+ "输出文件夹路径": "输出文件夹路径",
+ "输出的语音": "输出的语音",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。",
+ "降噪结果输出文件夹": "降噪结果输出文件夹",
+ "降噪音频文件输入文件夹": "降噪音频文件输入文件夹",
+ "需要合成的切分前文本": "需要合成的切分前文本",
+ "需要合成的文本": "需要合成的文本",
+ "需要合成的语种": "需要合成的语种",
+ "音频自动切分输入路径,可文件可文件夹": "音频自动切分输入路径,可文件可文件夹",
+ "预训练的GPT模型路径": "预训练的GPT模型路径",
+ "预训练的SSL模型路径": "预训练的SSL模型路径",
+ "预训练的SoVITS-D模型路径": "预训练的SoVITS-D模型路径",
+ "预训练的SoVITS-G模型路径": "预训练的SoVITS-G模型路径",
+ "预训练的中文BERT模型路径": "预训练的中文BERT模型路径"
+}
diff --git a/tools/i18n/locale/zh_HK.json b/tools/i18n/locale/zh_HK.json
new file mode 100644
index 0000000000000000000000000000000000000000..d02c40857ebcae48e5e4d2a1bac051ed7e667993
--- /dev/null
+++ b/tools/i18n/locale/zh_HK.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最佳選擇,但不能去除單通道混響;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: 去除延遲效果。Aggressive 比 Normal 去除得更徹底,DeReverb 額外去除混響,可去除單聲道混響,但對高頻重的板式混響去不乾淨。",
+ "*GPT模型列表": "*GPT模型列表",
+ "*SoVITS模型列表": "*SoVITS模型列表",
+ "*实验/模型名": "*實驗/模型名",
+ "*文本标注文件": "*文本標注文件",
+ "*训练集音频文件目录": "*訓練集音頻文件目錄",
+ "*请上传并填写参考信息": "*請上傳並填寫參考信息",
+ "*请填写需要合成的目标文本和语种模式": "請填寫需要合成的目標文本和語言模式",
+ ".list标注文件的路径": ".list標註文件的路徑",
+ "0-前置数据集获取工具": "0-前置數據集獲取工具",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5人聲伴奏分離&去混響去延遲工具",
+ "0b-语音切分工具": "0b-語音切分工具",
+ "0bb-语音降噪工具": "0bb-語音降噪工具",
+ "0c-中文批量离线ASR工具": "0c-中文批量離線ASR工具",
+ "0d-语音文本校对标注工具": "0d-語音文本校對標注工具",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-訓練集格式化工具",
+ "1Aa-文本内容": "1Aa-文本內容",
+ "1Aabc-训练集格式化一键三连": "1Aabc-訓練集格式化一鍵三連",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL自監督特徵提取",
+ "1Ac-语义token提取": "1Ac-語義token提取",
+ "1B-微调训练": "1B-微調訓練",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS訓練。用於分享的模型文件輸出在SoVITS_weights下。",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT訓練。用於分享的模型文件輸出在GPT_weights下。",
+ "1C-推理": "1C-推理",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1、DeEcho-DeReverb 模型的處理時間是另外兩個 DeEcho 模型的接近兩倍;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1、保留人聲:不帶和聲的音頻選這個,對主人聲保留比HP5更好。內置HP2和HP3兩個模型,HP3可能輕微漏伴奏但對主人聲保留比HP2稍微好一點點;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-變聲",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverb 模型的處理時間挺慢的;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2、僅保留主人聲:帶和聲的音頻選這個,對主人聲可能有削弱。內置HP5一個模型;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3、個人推薦的最乾淨的配置是先 MDX-Net 再 DeEcho-Aggressive。",
+ "3、去混响、去延迟模型(by FoxJoy):": "3、去混響、去延遲模型(by FoxJoy):",
+ "ASR 模型": "ASR 模型",
+ "ASR 模型尺寸": "ASR 模型尺寸",
+ "数据类型精度": "數據類型精度",
+ "ASR 语言设置": "ASR 語言設置",
+ "ASR进程输出信息": "ASR進程輸出信息",
+ "GPT模型列表": "GPT模型列表",
+ "GPT训练进程输出信息": "GPT訓練進程輸出信息",
+ "GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
+ "SSL进程输出信息": "SSL進程輸出信息",
+ "SoVITS模型列表": "SoVITS模型列表",
+ "SoVITS训练进程输出信息": "SoVITS訓練進程輸出信息",
+ "TTS推理WebUI进程输出信息": "TTS推理WebUI進程輸出信息",
+ "TTS推理进程已关闭": "TTS推理進程已關閉",
+ "TTS推理进程已开启": "TTS推理進程已開啟",
+ "UVR5已关闭": "UVR5已關閉",
+ "UVR5已开启": "UVR5已開啟",
+ "UVR5进程输出信息": "UVR5進程輸出信息",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT 採樣參數(無參考文本時不要太低。不懂就用默認):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
+ "max:归一化后最大值多少": "max:歸一化後最大值多少",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完後靜音最多留多長",
+ "min_interval:最短切割间隔": "min_interval:最短切割間隔",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:每段最小多長,如果第一段太短一直和後面段連起來直到超過這個值",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold:音量小於這個值視作靜音的備選切割點",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "一鍵三連進程輸出信息",
+ "不切": "不切",
+ "中文": "中文",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文檔:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "中英混合",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻文件, 二選一, 優先讀文件夾",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "人聲伴奏分離批量處理, 使用UVR5模型。",
+ "人声提取激进程度": "人聲提取激進程度",
+ "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT,聽不清參考音頻說的是啥(不知道寫啥)可以開啟,開啟後無視填寫的參考文本。",
+ "保存频率save_every_epoch": "保存頻率save_every_epoch",
+ "凑50字一切": "湊50字一切",
+ "凑四句一切": "湊四句一切",
+ "切分后文本": "切分後文本",
+ "切分后的子音频的输出根目录": "切分後的子音頻的輸出根目錄",
+ "切割使用的进程数": "切割使用的進程數",
+ "刷新模型路径": "刷新模型路徑",
+ "前端处理后的文本(每句):": "前端處理後的文本(每句):",
+ "去混响/去延迟,附:": "去混響/去延遲,附:",
+ "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
+ "参考音频的文本": "參考音頻的文本",
+ "参考音频的语种": "參考音頻的語種",
+ "合成语音": "合成語音",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的文件夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試樣例(去文件管理器地址欄拷就行了)。",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "填切割後音頻所在目錄!讀取的音頻文件完整路徑=該目錄-拼接-list文件裡波形對應的文件名(不是全路徑)。如果留空則使用.list文件裡的絕對全路徑。",
+ "多语种混合": "多語種混合",
+ "实际输入的参考文本:": "實際輸入的參考文本:",
+ "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
+ "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
+ "实际输入的目标文本:": "實際輸入的目標文本:",
+ "导出文件格式": "導出檔格式",
+ "开启GPT训练": "開啟GPT訓練",
+ "开启SSL提取": "開啟SSL提取",
+ "开启SoVITS训练": "開啟SoVITS訓練",
+ "开启一键三连": "開啟一鍵三連",
+ "开启文本获取": "開啟文本獲取",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
+ "开启离线批量ASR": "開啟離線批量ASR",
+ "开启语义token提取": "開啟語義token提取",
+ "开启语音切割": "開啟語音切割",
+ "开启语音降噪": "開啟語音降噪",
+ "怎么切": "怎麼切",
+ "总训练轮数total_epoch": "總訓練輪數total_epoch",
+ "总训练轮数total_epoch,不建议太高": "總訓練輪數total_epoch,不建議太高",
+ "打标工具WebUI已关闭": "打標工具WebUI已關閉",
+ "打标工具WebUI已开启": "打標工具WebUI已開啟",
+ "打标工具进程输出信息": "打標工具進程輸出信息",
+ "指定输出主人声文件夹": "指定輸出主人聲文件夾",
+ "指定输出非主人声文件夹": "指定輸出非主人聲文件夾",
+ "按中文句号。切": "按中文句號。切",
+ "按标点符号切": "按標點符號切",
+ "按英文句号.切": "按英文句號.切",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太長的文本合成出來效果不一定好,所以太長建議先切。合成會根據文本的換行分開合成再拼起來。",
+ "文本模块学习率权重": "文本模塊學習率權重",
+ "文本进程输出信息": "文本進程輸出信息",
+ "施工中,请静候佳音": "施工中,請靜候佳音",
+ "日文": "日文",
+ "日英混合": "日英混合",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt文件以節省硬碟空間",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights文件夾",
+ "是否开启TTS推理WebUI": "是否開啟TTS推理WebUI",
+ "是否开启UVR5-WebUI": "是否開啟UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "是否開啟dpo訓練選項(實驗性)",
+ "是否开启打标WebUI": "是否開啟打標WebUI",
+ "是否直接对上次合成结果调整语速。防止随机性。": "是否直接對上次合成結果調整語速。防止隨機性。",
+ "显卡信息": "顯卡信息",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.\n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟件以MIT協議開源, 作者不對軟件具備任何控制力, 使用軟件者、傳播軟件導出的聲音者自負全責.\n如不認可該條款, 則不能使用或引用軟件包內任何代碼和文件. 詳見根目錄LICENSE.",
+ "模型": "模型",
+ "模型分为三类:": "模型分為三類:",
+ "模型切换": "模型切換",
+ "每张显卡的batch_size": "每張顯卡的batch_size",
+ "终止ASR进程": "終止ASR進程",
+ "终止GPT训练": "終止GPT訓練",
+ "终止SSL提取进程": "終止SSL提取進程",
+ "终止SoVITS训练": "終止SoVITS訓練",
+ "终止一键三连": "終止一鍵三連",
+ "终止文本获取进程": "終止文本獲取進程",
+ "终止语义token提取进程": "終止語義token提取進程",
+ "终止语音切割": "終止語音切割",
+ "终止语音降噪进程": "終止語音降噪進程",
+ "英文": "英文",
+ "语义token提取进程输出信息": "語義token提取進程輸出信息",
+ "语速": "語速",
+ "语速调整,高为更快": "調整語速,高為更快",
+ "语音切割进程输出信息": "語音切割進程輸出信息",
+ "语音降噪进程输出信息": "語音降噪進程輸出信息",
+ "请上传3~10秒内参考音频,超过会报错!": "請上傳3~10秒內參考音頻,超過會報錯!",
+ "请输入有效文本": "請輸入有效文本",
+ "转换": "轉換",
+ "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
+ "输入文件夹路径": "輸入文件夾路徑",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "輸出logs/實驗名目錄下應有23456開頭的文件和文件夾",
+ "输出信息": "輸出訊息",
+ "输出文件夹路径": "輸出文件夾路徑",
+ "输出的语音": "輸出的語音",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模,體驗5秒Zero Shot TTS用。",
+ "降噪结果输出文件夹": "降噪結果輸出文件夾",
+ "降噪音频文件输入文件夹": "降噪音頻文件輸入文件夾",
+ "需要合成的切分前文本": "需要合成的切分前文本",
+ "需要合成的文本": "需要合成的文本",
+ "需要合成的语种": "需要合成的語種",
+ "音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
+ "预训练的GPT模型路径": "預訓練的GPT模型路徑",
+ "预训练的SSL模型路径": "預訓練的SSL模型路徑",
+ "预训练的SoVITS-D模型路径": "預訓練的SoVITS-D模型路徑",
+ "预训练的SoVITS-G模型路径": "預訓練的SoVITS-G模型路徑",
+ "预训练的中文BERT模型路径": "預訓練的中文BERT模型路徑"
+}
diff --git a/tools/i18n/locale/zh_SG.json b/tools/i18n/locale/zh_SG.json
new file mode 100644
index 0000000000000000000000000000000000000000..192c3adde25843fb950eafedf2bf579c231859aa
--- /dev/null
+++ b/tools/i18n/locale/zh_SG.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最好的選擇,不能去除單通道混響;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Aggressive 比 Normal 去除得更徹底,DeReverb 額外去除混響,可去除單聲道混響,但是對高頻重的板式混響去不乾淨。",
+ "*GPT模型列表": "*GPT模型列表",
+ "*SoVITS模型列表": "*SoVITS模型列表",
+ "*实验/模型名": "*實驗/模型名",
+ "*文本标注文件": "*文本標註文件",
+ "*训练集音频文件目录": "*訓練集音頻文件目錄",
+ "*请上传并填写参考信息": "*請上傳並填寫參考信息",
+ "*请填写需要合成的目标文本和语种模式": "請填寫需要合成的目標文本和語言模式",
+ ".list标注文件的路径": ".list標註文件的路徑",
+ "0-前置数据集获取工具": "0-前置數據集獲取工具",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5人聲伴奏分離&去混響去延遲工具",
+ "0b-语音切分工具": "0b-語音切分工具",
+ "0bb-语音降噪工具": "0bb-語音降噪工具",
+ "0c-中文批量离线ASR工具": "0c-中文批量離線ASR工具",
+ "0d-语音文本校对标注工具": "0d-語音文本校對標註工具",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-訓練集格式化工具",
+ "1Aa-文本内容": "1Aa-文本內容",
+ "1Aabc-训练集格式化一键三连": "1Aabc-訓練集格式化一鍵三連",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL自監督特徵提取",
+ "1Ac-语义token提取": "1Ac-語義token提取",
+ "1B-微调训练": "1B-微調訓練",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS訓練。用於分享的模型文件輸出在SoVITS_weights下。",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT訓練。用於分享的模型文件輸出在GPT_weights下。",
+ "1C-推理": "1C-推理",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1、DeEcho-DeReverb 模型的耗時是另外兩個 DeEcho 模型的接近兩倍;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1、保留人聲:不帶和聲的音頻選這個,對主人聲保留比HP5更好。內置HP2和HP3兩個模型,HP3可能輕微漏伴奏但對主人聲保留比HP2稍微好一丁點;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-變聲",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverb 模型的處理時間挺慢的;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2、僅保留主人聲:帶和聲的音頻選這個,對主人聲可能有削弱。內置HP5一個模型;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3、個人推薦的最乾淨的配置是先 MDX-Net 再 DeEcho-Aggressive。",
+ "3、去混响、去延迟模型(by FoxJoy):": "3、去混響、去延遲模型(by FoxJoy):",
+ "ASR 模型": "ASR 模型",
+ "ASR 模型尺寸": "ASR 模型尺寸",
+ "数据类型精度": "數據類型精度",
+ "ASR 语言设置": "ASR 語言設定",
+ "ASR进程输出信息": "ASR進程輸出資訊",
+ "GPT模型列表": "GPT模型列表",
+ "GPT训练进程输出信息": "GPT訓練進程輸出資訊",
+ "GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
+ "SSL进程输出信息": "SSL進程輸出資訊",
+ "SoVITS模型列表": "SoVITS模型列表",
+ "SoVITS训练进程输出信息": "SoVITS訓練進程輸出資訊",
+ "TTS推理WebUI进程输出信息": "TTS推理WebUI進程輸出資訊",
+ "TTS推理进程已关闭": "TTS推理進程已關閉",
+ "TTS推理进程已开启": "TTS推理進程已開啟",
+ "UVR5已关闭": "UVR5已關閉",
+ "UVR5已开启": "UVR5已開啟",
+ "UVR5进程输出信息": "UVR5進程輸出資訊",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT 採樣參數(無參考文本時不要太低。不懂就用默認):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
+ "max:归一化后最大值多少": "max:歸一化後最大值多少",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完後靜音最多留多長",
+ "min_interval:最短切割间隔": "min_interval:最短切割間隔",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:每段最小多長,如果第一段太短一直和後面段連起來直到超過這個值",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold:音量小於這個值視作靜音的備選切割點",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "一鍵三連進程輸出資訊",
+ "不切": "不切",
+ "中文": "中文",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文檔:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "中英混合",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻文件, 二選一, 優先讀文件夾",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "人聲伴奏分離批量處理, 使用UVR5模型。",
+ "人声提取激进程度": "人聲提取激進程度",
+ "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT,聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。",
+ "保存频率save_every_epoch": "保存頻率save_every_epoch",
+ "凑50字一切": "湊50字一切",
+ "凑四句一切": "湊四句一切",
+ "切分后文本": "切分後文本",
+ "切分后的子音频的输出根目录": "切分後的子音頻的輸出根目錄",
+ "切割使用的进程数": "切割使用的進程數",
+ "刷新模型路径": "刷新模型路徑",
+ "前端处理后的文本(每句):": "前端處理後的文本(每句):",
+ "去混响/去延迟,附:": "去混響/去延遲,附:",
+ "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
+ "参考音频的文本": "參考音頻的文本",
+ "参考音频的语种": "參考音頻的語種",
+ "合成语音": "合成語音",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "填切割後音頻所在目錄!讀取的音頻檔案完整路徑=該目錄-拼接-list檔案裡波形對應的檔案名(不是全路徑)。如果留空則使用.list檔案裡的絕對全路徑。",
+ "多语种混合": "多語種混合",
+ "实际输入的参考文本:": "實際輸入的參考文本:",
+ "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
+ "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
+ "实际输入的目标文本:": "實際輸入的目標文本:",
+ "导出文件格式": "導出檔格式",
+ "开启GPT训练": "開啟GPT訓練",
+ "开启SSL提取": "開啟SSL提取",
+ "开启SoVITS训练": "開啟SoVITS訓練",
+ "开启一键三连": "開啟一鍵三連",
+ "开启文本获取": "開啟文本獲取",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
+ "开启离线批量ASR": "開啟離線批量ASR",
+ "开启语义token提取": "開啟語義token提取",
+ "开启语音切割": "開啟語音切割",
+ "开启语音降噪": "開啟語音降噪",
+ "怎么切": "怎麼切",
+ "总训练轮数total_epoch": "總訓練輪數total_epoch",
+ "总训练轮数total_epoch,不建议太高": "總訓練輪數total_epoch,不建議太高",
+ "打标工具WebUI已关闭": "打標工具WebUI已關閉",
+ "打标工具WebUI已开启": "打標工具WebUI已開啟",
+ "打标工具进程输出信息": "打標工具進程輸出資訊",
+ "指定输出主人声文件夹": "指定輸出主人聲文件夾",
+ "指定输出非主人声文件夹": "指定輸出非主人聲文件夾",
+ "按中文句号。切": "按中文句號。切",
+ "按标点符号切": "按標點符號切",
+ "按英文句号.切": "按英文句號.切",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太長的文本合成出來效果不一定好,所以太長建議先切。合成會根據文本的換行分開合成再拼起來。",
+ "文本模块学习率权重": "文本模塊學習率權重",
+ "文本进程输出信息": "文本進程輸出資訊",
+ "施工中,请静候佳音": "施工中,請靜候佳音",
+ "日文": "日文",
+ "日英混合": "日英混合",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt文件以節省硬盤空間",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights文件夾",
+ "是否开启TTS推理WebUI": "是否開啟TTS推理WebUI",
+ "是否开启UVR5-WebUI": "是否開啟UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "是否開啟dpo訓練選項(實驗性)",
+ "是否开启打标WebUI": "是否開啟打標WebUI",
+ "是否直接对上次合成结果调整语速。防止随机性。": "是否直接對上次合成結果調整語速。防止隨機性。",
+ "显卡信息": "顯卡資訊",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.\n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。\n如不認可該條款,則不能使用或引用軟體包內任何代碼和文件。詳見根目錄LICENSE。",
+ "模型": "模型",
+ "模型分为三类:": "模型分為三類:",
+ "模型切换": "模型切換",
+ "每张显卡的batch_size": "每張顯卡的batch_size",
+ "终止ASR进程": "終止ASR進程",
+ "终止GPT训练": "終止GPT訓練",
+ "终止SSL提取进程": "終止SSL提取進程",
+ "终止SoVITS训练": "終止SoVITS訓練",
+ "终止一键三连": "終止一鍵三連",
+ "终止文本获取进程": "終止文本獲取進程",
+ "终止语义token提取进程": "終止語義token提取進程",
+ "终止语音切割": "終止語音切割",
+ "终止语音降噪进程": "終止語音降噪進程",
+ "英文": "英文",
+ "语义token提取进程输出信息": "語義token提取進程輸出資訊",
+ "语速": "語速",
+ "语速调整,高为更快": "調整語速,高為更快",
+ "语音切割进程输出信息": "語音切割進程輸出資訊",
+ "语音降噪进程输出信息": "語音降噪進程輸出資訊",
+ "请上传3~10秒内参考音频,超过会报错!": "請上傳3~10秒內參考音頻,超過會報錯!",
+ "请输入有效文本": "請輸入有效文本",
+ "转换": "轉換",
+ "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
+ "输入文件夹路径": "輸入文件夾路徑",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "輸出logs/實驗名目錄下應有23456開頭的文件和文件夾",
+ "输出信息": "輸出訊息",
+ "输出文件夹路径": "輸出文件夾路徑",
+ "输出的语音": "輸出的語音",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模,體驗5秒Zero Shot TTS用。",
+ "降噪结果输出文件夹": "降噪結果輸出文件夾",
+ "降噪音频文件输入文件夹": "降噪音頻文件輸入文件夾",
+ "需要合成的切分前文本": "需要合成的切分前文本",
+ "需要合成的文本": "需要合成的文本",
+ "需要合成的语种": "需要合成的語種",
+ "音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
+ "预训练的GPT模型路径": "預訓練的GPT模型路徑",
+ "预训练的SSL模型路径": "預訓練的SSL模型路徑",
+ "预训练的SoVITS-D模型路径": "預訓練的SoVITS-D模型路徑",
+ "预训练的SoVITS-G模型路径": "預訓練的SoVITS-G模型路徑",
+ "预训练的中文BERT模型路径": "預訓練的中文BERT模型路徑"
+}
diff --git a/tools/i18n/locale/zh_TW.json b/tools/i18n/locale/zh_TW.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff90766057b639a17501c67edd2c0c081f61511d
--- /dev/null
+++ b/tools/i18n/locale/zh_TW.json
@@ -0,0 +1,172 @@
+{
+ "(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb):對於雙通道混響是最好的選擇,不能去除單通道混響;",
+ "(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho:去除延遲效果。Aggressive 比 Normal 去除得更徹底,DeReverb 額外去除混響,可去除單聲道混響,但是對高頻重的板式混響去不乾淨。",
+ "*GPT模型列表": "*GPT模型列表",
+ "*SoVITS模型列表": "*SoVITS模型列表",
+ "*实验/模型名": "*實驗/模型名",
+ "*文本标注文件": "*文本標注文件",
+ "*训练集音频文件目录": "*訓練集音頻文件目錄",
+ "*请上传并填写参考信息": "*請上傳並填寫參考資訊",
+ "*请填写需要合成的目标文本和语种模式": "請填寫需要合成的目標文本和語言模式",
+ ".list标注文件的路径": ".list標注文件的路徑",
+ "0-前置数据集获取工具": "0-前置數據集獲取工具",
+ "0a-UVR5人声伴奏分离&去混响去延迟工具": "0a-UVR5人聲伴奏分離&去混響去延遲工具",
+ "0b-语音切分工具": "0b-語音切分工具",
+ "0bb-语音降噪工具": "0bb-語音降噪工具",
+ "0c-中文批量离线ASR工具": "0c-中文批量離線ASR工具",
+ "0d-语音文本校对标注工具": "0d-語音文本校對標注工具",
+ "1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
+ "1A-训练集格式化工具": "1A-訓練集格式化工具",
+ "1Aa-文本内容": "1Aa-文本內容",
+ "1Aabc-训练集格式化一键三连": "1Aabc-訓練集格式化一鍵三連",
+ "1Ab-SSL自监督特征提取": "1Ab-SSL自監督特徵提取",
+ "1Ac-语义token提取": "1Ac-語義token提取",
+ "1B-微调训练": "1B-微調訓練",
+ "1Ba-SoVITS训练。用于分享的模型文件输出在SoVITS_weights下。": "1Ba-SoVITS訓練。用於分享的模型文件輸出在SoVITS_weights下。",
+ "1Bb-GPT训练。用于分享的模型文件输出在GPT_weights下。": "1Bb-GPT訓練。用於分享的模型文件輸出在GPT_weights下。",
+ "1C-推理": "1C-推理",
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1、DeEcho-DeReverb 模型的耗時是另外兩個 DeEcho 模型的接近兩倍;",
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1、保留人聲:不帶和聲的音頻選這個,對主人聲保留比HP5更好。內置HP2和HP3兩個模型,HP3可能輕微漏伴奏但對主人聲保留比HP2稍微好一丁點;",
+ "2-GPT-SoVITS-变声": "2-GPT-SoVITS-變聲",
+ "2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverb模型挺慢的;",
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2、僅保留主人聲:帶和聲的音頻選這個,對主人聲可能有削弱。內置HP5一個模型;",
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3、個人推薦的最乾淨的配置是先 MDX-Net 再 DeEcho-Aggressive。",
+ "3、去混响、去延迟模型(by FoxJoy):": "3、去混響、去延遲模型(by FoxJoy):",
+ "ASR 模型": "ASR 模型",
+ "ASR 模型尺寸": "ASR 模型尺寸",
+ "数据类型精度": "數據類型精度",
+ "ASR 语言设置": "ASR 語言設置",
+ "ASR进程输出信息": "ASR進程輸出資訊",
+ "GPT模型列表": "GPT模型列表",
+ "GPT训练进程输出信息": "GPT訓練進程輸出資訊",
+ "GPU卡号,只能填1个整数": "GPU卡號,只能填1個整數",
+ "GPU卡号以-分割,每个卡号一个进程": "GPU卡號以-分割,每個卡號一個進程",
+ "SSL进程输出信息": "SSL進程輸出資訊",
+ "SoVITS模型列表": "SoVITS模型列表",
+ "SoVITS训练进程输出信息": "SoVITS訓練進程輸出資訊",
+ "TTS推理WebUI进程输出信息": "TTS推理WebUI進程輸出資訊",
+ "TTS推理进程已关闭": "TTS推理進程已關閉",
+ "TTS推理进程已开启": "TTS推理進程已開啟",
+ "UVR5已关闭": "UVR5已關閉",
+ "UVR5已开启": "UVR5已開啟",
+ "UVR5进程输出信息": "UVR5進程輸出資訊",
+ "alpha_mix:混多少比例归一化后音频进来": "alpha_mix:混多少比例歸一化後音頻進來",
+ "gpt采样参数(无参考文本时不要太低。不懂就用默认):": "GPT 採樣參數(無參考文本時不要太低。不懂就用默認):",
+ "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size:怎麼算音量曲線,越小精度越大計算量越高(不是精度越大效果越好)",
+ "max:归一化后最大值多少": "max:歸一化後最大值多少",
+ "max_sil_kept:切完后静音最多留多长": "max_sil_kept:切完後靜音最多留多長",
+ "min_interval:最短切割间隔": "min_interval:最短切割間隔",
+ "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:每段最小多長,如果第一段太短一直和後面段連起來直到超過這個值",
+ "temperature": "temperature",
+ "threshold:音量小于这个值视作静音的备选切割点": "threshold:音量小於這個值視作靜音的備選切割點",
+ "top_k": "top_k",
+ "top_p": "top_p",
+ "一键三连进程输出信息": "一鍵三連進程輸出資訊",
+ "不切": "不切",
+ "中文": "中文",
+ "中文教程文档:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e": "中文教程文檔:https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e",
+ "中英混合": "中英混合",
+ "也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻文件, 二選一, 優先讀文件夾",
+ "人声伴奏分离批量处理, 使用UVR5模型。": "人聲伴奏分離批量處理, 使用UVR5模型。",
+ "人声提取激进程度": "人聲提取激進程度",
+ "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。": "使用無參考文本模式時建議使用微調的GPT,聽不清參考音頻說的啥(不曉得寫啥)可以開,開啟後無視填寫的參考文本。",
+ "保存频率save_every_epoch": "保存頻率save_every_epoch",
+ "凑50字一切": "湊50字一切",
+ "凑四句一切": "湊四句一切",
+ "切分后文本": "切分後文本",
+ "切分后的子音频的输出根目录": "切分後的子音頻的輸出根目錄",
+ "切割使用的进程数": "切割使用的進程數",
+ "刷新模型路径": "刷新模型路徑",
+ "前端处理后的文本(每句):": "前端處理後的文本(每句):",
+ "去混响/去延迟,附:": "去混響/去延遲,附:",
+ "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
+ "参考音频的文本": "參考音頻的文本",
+ "参考音频的语种": "參考音頻的語種",
+ "合成语音": "合成語音",
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "合格的資料夾路徑格式舉例: E:\\codes\\py39\\vits_vc_gpu\\白鷺霜華測試範例(去文件管理器地址欄拷就行了)。",
+ "后续将支持转音素、手工修改音素、语音合成分步执行。": "後續將支持轉音素、手工修改音素、語音合成分步執行。",
+ "填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "填切割後音頻所在目錄!讀取的音頻檔案完整路徑=該目錄-拼接-list檔案裡波形對應的檔案名(不是全路徑)。如果留空則使用.list檔案裡的絕對全路徑。",
+ "多语种混合": "多語種混合",
+ "实际输入的参考文本:": "實際輸入的參考文本:",
+ "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
+ "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
+ "实际输入的目标文本:": "實際輸入的目標文本:",
+ "导出文件格式": "導出檔格式",
+ "开启GPT训练": "開啟GPT訓練",
+ "开启SSL提取": "開啟SSL提取",
+ "开启SoVITS训练": "開啟SoVITS訓練",
+ "开启一键三连": "開啟一鍵三連",
+ "开启文本获取": "開啟文本獲取",
+ "开启无参考文本模式。不填参考文本亦相当于开启。": "開啟無參考文本模式。不填參考文本亦相當於開啟。",
+ "开启离线批量ASR": "開啟離線批量ASR",
+ "开启语义token提取": "開啟語義token提取",
+ "开启语音切割": "開啟語音切割",
+ "开启语音降噪": "開啟語音降噪",
+ "怎么切": "怎麼切",
+ "总训练轮数total_epoch": "總訓練輪數total_epoch",
+ "总训练轮数total_epoch,不建议太高": "總訓練輪數total_epoch,不建議太高",
+ "打标工具WebUI已关闭": "打標工具WebUI已關閉",
+ "打标工具WebUI已开启": "打標工具WebUI已開啟",
+ "打标工具进程输出信息": "打標工具進程輸出資訊",
+ "指定输出主人声文件夹": "指定輸出主人聲文件夾",
+ "指定输出非主人声文件夹": "指定輸出非主人聲文件夾",
+ "按中文句号。切": "按中文句號。切",
+ "按标点符号切": "按標點符號切",
+ "按英文句号.切": "按英文句號.切",
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "文本切分工具。太長的文本合成出來效果不一定好,所以太長建議先切。合成會根據文本的換行分開合成再拼起來。",
+ "文本模块学习率权重": "文本模塊學習率權重",
+ "文本进程输出信息": "文本進程輸出資訊",
+ "施工中,请静候佳音": "施工中,請靜候佳音",
+ "日文": "日文",
+ "日英混合": "日英混合",
+ "是否仅保存最新的ckpt文件以节省硬盘空间": "是否僅保存最新的ckpt文件以節省硬盤空間",
+ "是否在每次保存时间点将最终小模型保存至weights文件夹": "是否在每次保存時間點將最終小模型保存至weights文件夾",
+ "是否开启TTS推理WebUI": "是否開啟TTS推理WebUI",
+ "是否开启UVR5-WebUI": "是否開啟UVR5-WebUI",
+ "是否开启dpo训练选项(实验性)": "是否開啟dpo訓練選項(實驗性)",
+ "是否开启打标WebUI": "是否開啟打標WebUI",
+ "是否直接对上次合成结果调整语速。防止随机性。": "是否直接對上次合成結果調整語速。防止隨機性。",
+ "显卡信息": "顯卡資訊",
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.\n如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "本軟體以MIT協議開源,作者不對軟體具備任何控制力,使用軟體者、傳播軟體導出的聲音者自負全責。\n如不認可該條款,則不能使用或引用軟體包內任何代碼和文件。詳見根目錄LICENSE。",
+ "模型": "模型",
+ "模型分为三类:": "模型分為三類:",
+ "模型切换": "模型切換",
+ "每张显卡的batch_size": "每張顯卡的batch_size",
+ "终止ASR进程": "終止ASR進程",
+ "终止GPT训练": "終止GPT訓練",
+ "终止SSL提取进程": "終止SSL提取進程",
+ "终止SoVITS训练": "終止SoVITS訓練",
+ "终止一键三连": "終止一鍵三連",
+ "终止文本获取进程": "終止文本獲取進程",
+ "终止语义token提取进程": "終止語義token提取進程",
+ "终止语音切割": "終止語音切割",
+ "终止语音降噪进程": "終止語音降噪進程",
+ "英文": "英文",
+ "语义token提取进程输出信息": "語義token提取進程輸出資訊",
+ "语速": "語速",
+ "语速调整,高为更快": "調整語速,高為更快",
+ "语音切割进程输出信息": "語音切割進程輸出資訊",
+ "语音降噪进程输出信息": "語音降噪進程輸出資訊",
+ "请上传3~10秒内参考音频,超过会报错!": "請上傳3~10秒內參考音頻,超過會報錯!",
+ "请输入有效文本": "請輸入有效文本",
+ "转换": "轉換",
+ "输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
+ "输入文件夹路径": "輸入文件夾路徑",
+ "输出logs/实验名目录下应有23456开头的文件和文件夹": "輸出logs/實驗名目錄下應有23456開頭的文件和文件夾",
+ "输出信息": "輸出訊息",
+ "输出文件夹路径": "輸出文件夾路徑",
+ "输出的语音": "輸出的語音",
+ "选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "選擇訓練完存放在SoVITS_weights和GPT_weights下的模型。默認的一個是底模,體驗5秒Zero Shot TTS用。",
+ "降噪结果输出文件夹": "降噪結果輸出文件夾",
+ "降噪音频文件输入文件夹": "降噪音頻文件輸入文件夾",
+ "需要合成的切分前文本": "需要合成的切分前文本",
+ "需要合成的文本": "需要合成的文本",
+ "需要合成的语种": "需要合成的語種",
+ "音频自动切分输入路径,可文件可文件夹": "音頻自動切分輸入路徑,可文件可文件夾",
+ "预训练的GPT模型路径": "預訓練的GPT模型路徑",
+ "预训练的SSL模型路径": "預訓練的SSL模型路徑",
+ "预训练的SoVITS-D模型路径": "預訓練的SoVITS-D模型路徑",
+ "预训练的SoVITS-G模型路径": "預訓練的SoVITS-G模型路徑",
+ "预训练的中文BERT模型路径": "預訓練的中文BERT模型路徑"
+}
diff --git a/tools/i18n/scan_i18n.py b/tools/i18n/scan_i18n.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7433152c4f09eff847326bd47a173a6e36955c0
--- /dev/null
+++ b/tools/i18n/scan_i18n.py
@@ -0,0 +1,119 @@
+import ast
+import glob
+import json
+import os
+from collections import OrderedDict
+
# Directory holding the per-language locale JSON files, resolved relative
# to this script (relpath keeps the printed report paths short).
I18N_JSON_DIR : str = os.path.join(os.path.dirname(os.path.relpath(__file__)), 'locale')
DEFAULT_LANGUAGE: str = "zh_CN" # default language: its values mirror the keys verbatim
TITLE_LEN : int = 60 # display width of the "=" banner titles
KEY_LEN : int = 30 # left-justified width of report labels
SHOW_KEYS : bool = False # whether to print each individual affected key
+
def extract_i18n_strings(node):
    """Recursively collect every string literal passed to an ``i18n(...)`` call.

    Args:
        node: Any AST node; the subtree rooted at it is walked depth-first.

    Returns:
        list[str]: The string-literal arguments of all ``i18n(...)`` calls,
        in source order (duplicates preserved).
    """
    i18n_strings = []

    # A call expression whose callee is the bare name `i18n`.
    if (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == "i18n"
    ):
        for arg in node.args:
            # ast.Str was deprecated in Python 3.8 and removed in 3.12;
            # string literals are ast.Constant nodes with a str value.
            if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
                i18n_strings.append(arg.value)

    for child_node in ast.iter_child_nodes(node):
        i18n_strings.extend(extract_i18n_strings(child_node))

    return i18n_strings
+
def scan_i18n_strings():
    """Collect the set of i18n keys used anywhere under the current directory.

    Recursively visits every ``.py`` file, parses each file that references
    ``I18nAuto`` into an AST, and extracts the string literals passed to
    ``i18n(...)`` calls.

    Returns:
        set[str]: The unique i18n keys found in the code base.
    """
    collected = []
    print(" Scanning Files and Extracting i18n Strings ".center(TITLE_LEN, "="))
    for py_file in glob.iglob("**/*.py", recursive=True):
        with open(py_file, "r", encoding="utf-8") as fp:
            source = fp.read()
        # Skip files that never instantiate the i18n helper.
        if "I18nAuto" not in source:
            continue
        found = extract_i18n_strings(ast.parse(source))
        print(f"{py_file.ljust(30)}: {len(found)}")
        collected.extend(found)

    code_keys = set(collected)
    print(f"{'Total Unique'.ljust(30)}: {len(code_keys)}")
    return code_keys
+
def update_i18n_json(json_file, standard_keys):
    """Synchronise one locale JSON file with the keys used in the code.

    Missing keys are added (marked untranslated with a ``#!`` prefix, except
    for the default language whose value equals the key), stale keys are
    removed, and the file is rewritten alphabetically sorted. A report is
    printed, flagging untranslated entries and values shared by several keys.

    Args:
        json_file: Path of the locale ``.json`` file, updated in place.
        standard_keys: Iterable of i18n keys extracted from the code.
    """
    standard_keys = set(standard_keys)
    print(f" Process {json_file} ".center(TITLE_LEN, "="))
    # Load the existing translations, preserving file order for the report.
    with open(json_file, "r", encoding="utf-8") as f:
        json_data = json.load(f, object_pairs_hook=OrderedDict)
    len_before = len(json_data)
    print(f"{'Total Keys'.ljust(KEY_LEN)}: {len_before}")
    # Add keys that exist in the code but not in this locale file.
    miss_keys = standard_keys - set(json_data.keys())
    if len(miss_keys) > 0:
        print(f"{'Missing Keys (+)'.ljust(KEY_LEN)}: {len(miss_keys)}")
        for key in miss_keys:
            if DEFAULT_LANGUAGE in json_file:
                # The default language maps every key onto itself.
                json_data[key] = key
            else:
                # Mark the entry as untranslated with a "#!" prefix.
                json_data[key] = "#!" + key
            if SHOW_KEYS:
                print(f"{'Added Missing Key'.ljust(KEY_LEN)}: {key}")
    # Drop keys that are no longer referenced by the code.
    diff_keys = set(json_data.keys()) - standard_keys
    if len(diff_keys) > 0:
        print(f"{'Unused Keys (-)'.ljust(KEY_LEN)}: {len(diff_keys)}")
        for key in diff_keys:
            del json_data[key]
            if SHOW_KEYS:
                print(f"{'Removed Unused Key'.ljust(KEY_LEN)}: {key}")
    # NOTE: no manual re-ordering here -- json.dump(..., sort_keys=True)
    # below already writes the keys alphabetically. The previous
    # list(standard_keys).index() sort was O(n^2), nondeterministic
    # (standard_keys is a set), and its result was discarded anyway.
    if len(miss_keys) != 0 or len(diff_keys) != 0:
        print(f"{'Total Keys (After)'.ljust(KEY_LEN)}: {len(json_data)}")
    # Report untranslated entries and collect values for duplicate detection.
    num_miss_translation = 0
    duplicate_items = {}
    for key, value in json_data.items():
        if value.startswith("#!"):
            num_miss_translation += 1
            if SHOW_KEYS:
                print(f"{'Missing Translation'.ljust(KEY_LEN)}: {key}")
        duplicate_items.setdefault(value, []).append(key)
    # Flag any translation value shared by more than one key.
    for value, keys in duplicate_items.items():
        if len(keys) > 1:
            print("\n".join([f"\033[31m{'[Failed] Duplicate Value'.ljust(KEY_LEN)}: {key} -> {value}\033[0m" for key in keys]))

    if num_miss_translation > 0:
        print(f"\033[31m{'[Failed] Missing Translation'.ljust(KEY_LEN)}: {num_miss_translation}\033[0m")
    else:
        print("\033[32m[Passed] All Keys Translated\033[0m")
    # Write the synchronised dictionary back, alphabetically sorted.
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4, sort_keys=True)
        f.write("\n")
    print(f" Updated {json_file} ".center(TITLE_LEN, "=") + '\n')
+
if __name__ == "__main__":
    # Gather every i18n key used in the code, then bring each locale
    # file in the locale directory up to date with that key set.
    code_keys = scan_i18n_strings()
    for locale_name in os.listdir(I18N_JSON_DIR):
        if not locale_name.endswith(".json"):
            continue
        update_i18n_json(os.path.join(I18N_JSON_DIR, locale_name), code_keys)
\ No newline at end of file
diff --git a/tools/my_utils.py b/tools/my_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..60acb2741dbb2570cb00f4ef2c55dd8a41342d22
--- /dev/null
+++ b/tools/my_utils.py
@@ -0,0 +1,33 @@
+import platform,os,traceback
+import ffmpeg
+import numpy as np
+
+
def load_audio(file, sr):
    """Decode an audio file to a mono float32 waveform at sample rate ``sr``.

    Decoding is delegated to the ffmpeg CLI (through the ffmpeg-python
    bindings), which down-mixes to one channel and resamples as necessary.
    Adapted from https://github.com/openai/whisper/blob/main/whisper/audio.py#L26

    Args:
        file: Path to the audio file; stray quotes/whitespace are stripped first.
        sr: Target sample rate in Hz.

    Returns:
        np.ndarray: 1-D float32 PCM samples.

    Raises:
        RuntimeError: If the path does not exist or ffmpeg fails to decode it.
    """
    try:
        file = clean_path(file)  # users often paste paths with stray spaces/quotes/newlines
        if not os.path.exists(file):
            raise RuntimeError(
                "You input a wrong audio path that does not exists, please fix it!"
            )
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as e:
        traceback.print_exc()
        # Chain the original cause so callers can inspect the ffmpeg error.
        raise RuntimeError(f"Failed to load audio: {e}") from e

    return np.frombuffer(out, np.float32).flatten()
+
+
def clean_path(path_str: str) -> str:
    """Normalise a user-pasted filesystem path.

    Removes trailing path separators, converts ``/`` to ``\\`` on Windows,
    and strips surrounding spaces, double quotes, newlines and the invisible
    LEFT-TO-RIGHT EMBEDDING character (U+202A) that Windows Explorer's
    "copy as path" can prepend.
    """
    # Iterative equivalent of the old recursive trailing-separator removal.
    path_str = path_str.rstrip("\\/")
    if platform.system() == 'Windows':
        path_str = path_str.replace('/', '\\')
    # A single combined strip handles any interleaving of the junk
    # characters, which the previous fixed-order strip chain could miss
    # (e.g. a newline before a quoted path).
    return path_str.strip(' "\n\u202a')
\ No newline at end of file
diff --git a/tools/slice_audio.py b/tools/slice_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a06292d993825ca49d57f1274865c029c0b2bb4
--- /dev/null
+++ b/tools/slice_audio.py
@@ -0,0 +1,48 @@
+import os,sys,numpy as np
+import traceback
+from scipy.io import wavfile
+# parent_directory = os.path.dirname(os.path.abspath(__file__))
+# sys.path.append(parent_directory)
+from tools.my_utils import load_audio
+from slicer2 import Slicer
+
def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
    """Split long audio file(s) into silence-delimited 32 kHz wav chunks on disk.

    NOTE: the name shadows the builtin ``slice`` but is kept for backward
    compatibility with existing callers and the CLI entry point below.

    Args:
        inp: Input audio file, or a directory of audio files.
        opt_root: Output directory (created if missing).
        threshold, min_length, min_interval, hop_size, max_sil_kept:
            Slicer tuning values, forwarded to ``slicer2.Slicer``.
        _max, alpha: Loudness-normalisation ceiling and mix ratio.
        i_part, all_part: This worker's index and the total worker count,
            used to stripe the file list across parallel processes.

    Returns:
        str: A human-readable completion / error message (in Chinese).
    """
    os.makedirs(opt_root, exist_ok=True)
    if os.path.isfile(inp):
        files = [inp]
    elif os.path.isdir(inp):
        files = [os.path.join(inp, name) for name in sorted(os.listdir(inp))]
    else:
        return "输入路径存在但既不是文件也不是文件夹"
    slicer = Slicer(
        sr=32000,  # everything is decoded/resampled to 32 kHz
        threshold=int(threshold),  # below this dB level counts as silence
        min_length=int(min_length),  # minimum chunk length (ms)
        min_interval=int(min_interval),  # minimum silence gap to cut at (ms)
        hop_size=int(hop_size),  # RMS hop (ms): smaller = finer but slower
        max_sil_kept=int(max_sil_kept),  # max silence kept around a cut (ms)
    )
    _max = float(_max)
    alpha = float(alpha)
    # Stripe the file list so `all_part` parallel workers share the load.
    for inp_path in files[int(i_part)::int(all_part)]:
        try:
            name = os.path.basename(inp_path)
            audio = load_audio(inp_path, 32000)
            for chunk, start, end in slicer.slice(audio):  # start/end are frame indices
                tmp_max = np.abs(chunk).max()
                if tmp_max > 1:
                    chunk /= tmp_max
                # Blend the peak-normalised signal with the original.
                # NOTE(review): this divides by tmp_max again even after the
                # clipping guard above, and an all-silence chunk (tmp_max == 0)
                # would produce NaNs -- kept as-is to preserve the established
                # output levels; confirm before changing.
                chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
                wavfile.write(
                    "%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
                    32000,
                    (chunk * 32767).astype(np.int16),
                )
        except Exception:
            # Best-effort: report the failing file and keep slicing the rest.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            print(inp_path, "->fail->", traceback.format_exc())
    return "执行完毕,请检查输出文件"


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers a CLI run;
    # all CLI arguments arrive as strings.
    print(slice(*sys.argv[1:]))
+
diff --git a/tools/slicer2.py b/tools/slicer2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba6794b6335fc50a494ba1b1cfb375536ab7a1aa
--- /dev/null
+++ b/tools/slicer2.py
@@ -0,0 +1,261 @@
+import numpy as np
+
+
# This function is obtained from librosa.
def get_rms(
    y,
    frame_length=2048,
    hop_length=512,
    pad_mode="constant",
):
    """Frame-wise RMS energy of *y*, using librosa-style centered frames."""
    half = int(frame_length // 2)
    # Center the frames by padding half a window on each side (np.pad applies
    # the same (before, after) pair to every axis, matching the original).
    y = np.pad(y, (half, half), mode=pad_mode)

    # Overlapping windows along the last axis, keeping every hop_length-th one.
    windows = np.lib.stride_tricks.sliding_window_view(y, frame_length, axis=-1)
    windows = windows[..., ::hop_length, :]

    # Mean power per window, then restore a singleton axis where the window
    # axis used to sit so the output shape matches the strided formulation.
    power = np.mean(np.abs(windows) ** 2, axis=-1)
    return np.sqrt(power)[..., None, :]
+
+
class Slicer:
    """Silence-based audio slicer.

    Works on an RMS curve computed with hop ``hop_size``; the millisecond
    constructor arguments are converted once into RMS-frame units, and
    ``slice`` converts back to sample offsets when emitting chunks.
    """

    def __init__(
        self,
        sr: int,
        threshold: float = -40.0,
        min_length: int = 5000,
        min_interval: int = 300,
        hop_size: int = 20,
        max_sil_kept: int = 5000,
    ):
        """sr: sample rate; threshold: silence level in dB; all others in ms."""
        if not min_length >= min_interval >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: min_length >= min_interval >= hop_size"
            )
        if not max_sil_kept >= hop_size:
            raise ValueError(
                "The following condition must be satisfied: max_sil_kept >= hop_size"
            )
        min_interval = sr * min_interval / 1000
        # dB threshold -> linear amplitude.
        self.threshold = 10 ** (threshold / 20.0)
        self.hop_size = round(sr * hop_size / 1000)
        self.win_size = min(round(min_interval), 4 * self.hop_size)
        # The remaining fields are measured in RMS frames (hops), not samples.
        self.min_length = round(sr * min_length / 1000 / self.hop_size)
        self.min_interval = round(min_interval / self.hop_size)
        self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)

    def _apply_slice(self, waveform, begin, end):
        """Cut frames [begin, end) out of a mono or multi-channel waveform."""
        if len(waveform.shape) > 1:
            return waveform[
                :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
            ]
        else:
            return waveform[
                begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
            ]

    # @timeit
    def slice(self, waveform):
        """Return a list of [audio, start_sample, end_sample] chunks."""
        if len(waveform.shape) > 1:
            samples = waveform.mean(axis=0)  # downmix only for silence detection
        else:
            samples = waveform
        if samples.shape[0] <= self.min_length:
            # Too short to slice. Return the same [audio, start, end] triple
            # format as every other code path so callers can always unpack
            # three values (the previous bare [waveform] crashed callers).
            return [[waveform, 0, int(samples.shape[0])]]
        rms_list = get_rms(
            y=samples, frame_length=self.win_size, hop_length=self.hop_size
        ).squeeze(0)
        sil_tags = []
        silence_start = None
        clip_start = 0
        for i, rms in enumerate(rms_list):
            # Keep looping while frame is silent.
            if rms < self.threshold:
                # Record start of silent frames.
                if silence_start is None:
                    silence_start = i
                continue
            # Keep looping while frame is not silent and silence start has not been recorded.
            if silence_start is None:
                continue
            # Clear recorded silence start if interval is not enough or clip is too short
            is_leading_silence = silence_start == 0 and i > self.max_sil_kept
            need_slice_middle = (
                i - silence_start >= self.min_interval
                and i - clip_start >= self.min_length
            )
            if not is_leading_silence and not need_slice_middle:
                silence_start = None
                continue
            # Need slicing. Record the range of silent frames to be removed.
            if i - silence_start <= self.max_sil_kept:
                # Short silence: cut at its quietest frame.
                pos = rms_list[silence_start : i + 1].argmin() + silence_start
                if silence_start == 0:
                    sil_tags.append((0, pos))
                else:
                    sil_tags.append((pos, pos))
                clip_start = pos
            elif i - silence_start <= self.max_sil_kept * 2:
                # Medium silence: keep up to max_sil_kept on both sides.
                pos = rms_list[
                    i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
                ].argmin()
                pos += i - self.max_sil_kept
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                    clip_start = pos_r
                else:
                    sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
                    clip_start = max(pos_r, pos)
            else:
                # Long silence: drop everything between the two kept margins.
                pos_l = (
                    rms_list[
                        silence_start : silence_start + self.max_sil_kept + 1
                    ].argmin()
                    + silence_start
                )
                pos_r = (
                    rms_list[i - self.max_sil_kept : i + 1].argmin()
                    + i
                    - self.max_sil_kept
                )
                if silence_start == 0:
                    sil_tags.append((0, pos_r))
                else:
                    sil_tags.append((pos_l, pos_r))
                clip_start = pos_r
            silence_start = None
        # Deal with trailing silence.
        total_frames = rms_list.shape[0]
        if (
            silence_start is not None
            and total_frames - silence_start >= self.min_interval
        ):
            silence_end = min(total_frames, silence_start + self.max_sil_kept)
            pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
            sil_tags.append((pos, total_frames + 1))
        # Apply and return slices.
        # Each chunk is [audio, start_sample, end_sample].
        if len(sil_tags) == 0:
            return [[waveform,0,int(total_frames*self.hop_size)]]
        else:
            chunks = []
            if sil_tags[0][0] > 0:
                chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)])
            for i in range(len(sil_tags) - 1):
                chunks.append(
                    [self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)]
                )
            if sil_tags[-1][1] < total_frames:
                chunks.append(
                    [self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)]
                )
            return chunks
+
+
def main():
    """CLI wrapper: slice one audio file and write the chunks to --out."""
    import os.path
    from argparse import ArgumentParser

    import librosa
    import soundfile

    parser = ArgumentParser()
    parser.add_argument("audio", type=str, help="The audio to be sliced")
    parser.add_argument(
        "--out", type=str, help="Output directory of the sliced audio clips"
    )
    parser.add_argument(
        "--db_thresh",
        type=float,
        required=False,
        default=-40,
        help="The dB threshold for silence detection",
    )
    parser.add_argument(
        "--min_length",
        type=int,
        required=False,
        default=5000,
        help="The minimum milliseconds required for each sliced audio clip",
    )
    parser.add_argument(
        "--min_interval",
        type=int,
        required=False,
        default=300,
        help="The minimum milliseconds for a silence part to be sliced",
    )
    parser.add_argument(
        "--hop_size",
        type=int,
        required=False,
        default=10,
        help="Frame length in milliseconds",
    )
    parser.add_argument(
        "--max_sil_kept",
        type=int,
        required=False,
        default=500,
        help="The maximum silence length kept around the sliced clip, presented in milliseconds",
    )
    args = parser.parse_args()
    out = args.out
    if out is None:
        # Default to writing the clips next to the input file.
        out = os.path.dirname(os.path.abspath(args.audio))
    audio, sr = librosa.load(args.audio, sr=None, mono=False)
    slicer = Slicer(
        sr=sr,
        threshold=args.db_thresh,
        min_length=args.min_length,
        min_interval=args.min_interval,
        hop_size=args.hop_size,
        max_sil_kept=args.max_sil_kept,
    )
    chunks = slicer.slice(audio)
    if not os.path.exists(out):
        os.makedirs(out)
    for i, chunk in enumerate(chunks):
        # Slicer.slice yields [audio, start_sample, end_sample] lists; keep
        # only the audio (previously chunk.shape/chunk.T failed on the list).
        if isinstance(chunk, list):
            chunk = chunk[0]
        if len(chunk.shape) > 1:
            chunk = chunk.T  # soundfile expects (frames, channels)
        soundfile.write(
            os.path.join(
                out,
                f"%s_%d.wav"
                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
            ),
            chunk,
            sr,
        )
diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25f260cf2fc66f628bab6eb6ea96bf71c9f613d
--- /dev/null
+++ b/tools/subfix_webui.py
@@ -0,0 +1,498 @@
+import argparse,os
+import copy
+import json
+import os
+import uuid
+
+import librosa
+import gradio as gr
+import numpy as np
+import soundfile
+
# --- Module-level UI state shared across the gradio callbacks below ---
g_json_key_text = ""  # record key holding the transcript text
g_json_key_path = ""  # record key holding the wav path
g_load_file = ""      # path of the .json/.list annotation file being edited
g_load_format = ""    # "json" or "list"

g_max_json_index = 0  # last valid index into g_data_json
g_index = 0           # index of the first record currently displayed
g_batch = 10          # number of rows shown per page
g_text_list = []      # gradio Textbox components, one per row
g_audio_list = []     # gradio Audio components, one per row
g_checkbox_list = []  # gradio Checkbox components, one per row
g_data_json = []      # loaded annotation records
+
+
def reload_data(index, batch):
    """Remember the new page position and return that page's records."""
    global g_index, g_batch
    g_index = index
    g_batch = batch
    page = g_data_json[index:index + batch]
    # Expose only the two keys the UI needs from each record.
    return [
        {
            g_json_key_text: item[g_json_key_text],
            g_json_key_path: item[g_json_key_path],
        }
        for item in page
    ]
+
+
def b_change_index(index, batch):
    """Build gradio updates for one page: texts, then audios, then checkboxes."""
    global g_index, g_batch
    g_index, g_batch = index, batch
    records = reload_data(index, batch)
    output = []
    # Textbox updates for loaded rows, then blank fillers up to g_batch rows.
    for i, record in enumerate(records):
        output.append(
            {
                "__type__": "update",
                "label": f"Text {i+index}",
                "value": record[g_json_key_text],
            }
        )
    for _ in range(g_batch - len(records)):
        output.append(
            {
                "__type__": "update",
                "label": f"Text",
                "value": ""
            }
        )
    # Audio paths, padded with None for empty rows.
    output.extend(record[g_json_key_path] for record in records)
    output.extend(None for _ in range(g_batch - len(records)))
    # Every checkbox is reset to unchecked.
    output.extend(False for _ in range(g_batch))
    return output
+
+
def b_next_index(index, batch):
    """Save, then advance one page; clamp at the end of the data."""
    b_save_file()
    new_index = index + batch if (index + batch) <= g_max_json_index else index
    return new_index, *b_change_index(new_index, batch)
+
+
def b_previous_index(index, batch):
    """Save, then go back one page; clamp at zero."""
    b_save_file()
    new_index = index - batch if (index - batch) >= 0 else 0
    return new_index, *b_change_index(new_index, batch)
+
+
def b_submit_change(*text_list):
    """Write edited texts back into the records; save only if anything changed."""
    global g_data_json
    changed = False
    for offset, new_text in enumerate(text_list):
        pos = g_index + offset
        if pos > g_max_json_index:
            continue
        # NOTE: a single trailing space is appended deliberately (matches the
        # .list on-disk format produced elsewhere in this module).
        normalized = new_text.strip() + ' '
        if g_data_json[pos][g_json_key_text] != normalized:
            g_data_json[pos][g_json_key_text] = normalized
            changed = True
    if changed:
        b_save_file()
    return g_index, *b_change_index(g_index, g_batch)
+
+
def b_delete_audio(*checkbox_list):
    """Remove every checked record, clamp the page index, and refresh the UI.

    Returns a slider update dict followed by the refreshed page widgets.
    """
    global g_data_json, g_index, g_max_json_index
    b_save_file()
    change = False
    # Iterate in reverse so earlier pops do not shift indices still to visit.
    for i, checkbox in reversed(list(enumerate(checkbox_list))):
        if g_index + i < len(g_data_json):
            if (checkbox == True):
                g_data_json.pop(g_index + i)
                change = True

    g_max_json_index = len(g_data_json)-1
    # Clamp the current index into the (possibly now empty) remaining range.
    if g_index > g_max_json_index:
        g_index = g_max_json_index
    g_index = g_index if g_index >= 0 else 0
    if change:
        b_save_file()
    # return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch)
    return {"value":g_index,"__type__":"update","maximum":(g_max_json_index if g_max_json_index>=0 else 0)},*b_change_index(g_index, g_batch)
+
+
def b_invert_selection(*checkbox_list):
    """Flip every checked box; anything that is not exactly True becomes True."""
    return [item is not True for item in checkbox_list]
+
+
def get_next_path(filename):
    """Next free '<base>_NN.wav' beside *filename*; uuid name as a last resort."""
    folder = os.path.dirname(filename)
    stem = os.path.splitext(os.path.basename(filename))[0]
    for n in range(100):
        candidate = os.path.join(folder, f"{stem}_{str(n).zfill(2)}.wav")
        if not os.path.exists(candidate):
            return candidate
    # All 100 numbered slots taken: fall back to a random unique name.
    return os.path.join(folder, f'{str(uuid.uuid4())}.wav')
+
+
def b_audio_split(audio_breakpoint, *checkbox_list):
    """Split the single checked clip at *audio_breakpoint* seconds.

    The first half overwrites the original wav; the second half is written to
    a fresh sibling path and inserted as a new record right after the
    original. Only acts when exactly one checkbox is ticked.
    """
    global g_data_json , g_max_json_index
    checked_index = []
    for i, checkbox in enumerate(checkbox_list):
        if (checkbox == True and g_index+i < len(g_data_json)):
            checked_index.append(g_index + i)
    if len(checked_index) == 1 :
        index = checked_index[0]
        audio_json = copy.deepcopy(g_data_json[index])
        path = audio_json[g_json_key_path]
        data, sample_rate = librosa.load(path, sr=None, mono=True)
        audio_maxframe = len(data)
        # Seconds -> sample index; split only if strictly inside the clip.
        break_frame = int(audio_breakpoint * sample_rate)

        if (break_frame >= 1 and break_frame < audio_maxframe):
            audio_first = data[0:break_frame]
            audio_second = data[break_frame:]
            nextpath = get_next_path(path)
            soundfile.write(nextpath, audio_second, sample_rate)
            soundfile.write(path, audio_first, sample_rate)
            # Duplicate the record for the tail half; both halves keep the text.
            g_data_json.insert(index + 1, audio_json)
            g_data_json[index + 1][g_json_key_path] = nextpath
            b_save_file()

    g_max_json_index = len(g_data_json) - 1
    # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch)
    return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch)
+
def b_merge_audio(interval_r, *checkbox_list):
    """Merge all checked clips into the first, separated by *interval_r* seconds of silence.

    The concatenated audio overwrites the first checked clip's wav; the other
    records are removed and their texts appended onto the first record.
    """
    global g_data_json , g_max_json_index
    b_save_file()
    checked_index = []
    audios_path = []
    audios_text = []
    for i, checkbox in enumerate(checkbox_list):
        if (checkbox == True and g_index+i < len(g_data_json)):
            checked_index.append(g_index + i)

    if (len(checked_index)>1):
        for i in checked_index:
            audios_path.append(g_data_json[i][g_json_key_path])
            audios_text.append(g_data_json[i][g_json_key_text])
        # Pop merged-away records back-to-front so indices stay valid.
        for i in reversed(checked_index[1:]):
            g_data_json.pop(i)

        base_index = checked_index[0]
        base_path = audios_path[0]
        g_data_json[base_index][g_json_key_text] = "".join(audios_text)

        audio_list = []
        l_sample_rate = None
        for i, path in enumerate(audios_path):
            # sr=None for the first clip, then resample the rest to its rate.
            data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True)
            l_sample_rate = sample_rate
            if (i > 0):
                silence = np.zeros(int(l_sample_rate * interval_r))
                audio_list.append(silence)

            audio_list.append(data)

        audio_concat = np.concatenate(audio_list)

        soundfile.write(base_path, audio_concat, l_sample_rate)

        b_save_file()

    g_max_json_index = len(g_data_json) - 1

    # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch)
    return {"value":g_index,"maximum":g_max_json_index,"__type__":"update"}, *b_change_index(g_index, g_batch)
+
+
def b_save_json():
    """Dump the records to g_load_file as one JSON object per line."""
    with open(g_load_file,'w', encoding="utf-8") as file:
        file.writelines(
            f'{json.dumps(data, ensure_ascii = False)}\n' for data in g_data_json
        )
+
+
def b_save_list():
    """Dump the records to g_load_file in pipe-separated .list format."""
    with open(g_load_file,'w', encoding="utf-8") as file:
        for data in g_data_json:
            fields = [
                data["wav_path"],
                data["speaker_name"],
                data["language"],
                data["text"],
            ]
            file.write("|".join(fields).strip()+'\n')
+
+
def b_load_json():
    """Load g_load_file as JSON-lines into g_data_json."""
    global g_data_json, g_max_json_index
    with open(g_load_file, 'r', encoding="utf-8") as file:
        g_data_json = [json.loads(line) for line in file.readlines()]
    g_max_json_index = len(g_data_json) - 1
+
+
def b_load_list():
    """Append the pipe-separated records of g_load_file to g_data_json."""
    global g_data_json, g_max_json_index
    with open(g_load_file, 'r', encoding="utf-8") as source:
        for raw_line in source.readlines():
            fields = raw_line.split('|')
            if (len(fields) != 4):
                # Malformed line: report it and keep going.
                print("error line:", fields)
                continue
            wav_path, speaker_name, language, text = fields
            g_data_json.append(
                {
                    'wav_path': wav_path,
                    'speaker_name': speaker_name,
                    'language': language,
                    'text': text.strip(),
                }
            )
    g_max_json_index = len(g_data_json) - 1
+
+
def b_save_file():
    """Persist records in whichever format the file was loaded from."""
    savers = {"json": b_save_json, "list": b_save_list}
    saver = savers.get(g_load_format)
    if saver is not None:
        saver()
+
+
def b_load_file():
    """Load records using whichever format g_load_format selects."""
    loaders = {"json": b_load_json, "list": b_load_list}
    loader = loaders.get(g_load_format)
    if loader is not None:
        loader()
+
+
def set_global(load_json, load_list, json_key_text, json_key_path, batch):
    """Configure the module state from CLI arguments, then load the source file."""
    global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch

    g_batch = int(batch)

    # Prefer an explicit json file, then an explicit list file, else a demo list.
    if load_json != "None":
        g_load_format, g_load_file = "json", load_json
    elif load_list != "None":
        g_load_format, g_load_file = "list", load_list
    else:
        g_load_format, g_load_file = "list", "demo.list"

    g_json_key_text = json_key_text
    g_json_key_path = json_key_path

    b_load_file()
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Process some integers.')
+ parser.add_argument('--load_json', default="None", help='source file, like demo.json')
+ parser.add_argument('--is_share', default="False", help='whether webui is_share=True')
+ parser.add_argument('--load_list', default="None", help='source file, like demo.list')
+ parser.add_argument('--webui_port_subfix', default=9871, help='source file, like demo.list')
+ parser.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text')
+ parser.add_argument('--json_key_path', default="wav_path", help='the path key name in json, Default: wav_path')
+ parser.add_argument('--g_batch', default=10, help='max number g_batch wav to display, Default: 10')
+
+ args = parser.parse_args()
+
+ set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch)
+
+ with gr.Blocks() as demo:
+
+ with gr.Row():
+ btn_change_index = gr.Button("Change Index")
+ btn_submit_change = gr.Button("Submit Text")
+ btn_merge_audio = gr.Button("Merge Audio")
+ btn_delete_audio = gr.Button("Delete Audio")
+ btn_previous_index = gr.Button("Previous Index")
+ btn_next_index = gr.Button("Next Index")
+
+ with gr.Row():
+ index_slider = gr.Slider(
+ minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3
+ )
+ splitpoint_slider = gr.Slider(
+ minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3
+ )
+ btn_audio_split = gr.Button("Split Audio", scale=1)
+ btn_save_json = gr.Button("Save File", visible=True, scale=1)
+ btn_invert_selection = gr.Button("Invert Selection", scale=1)
+
+ with gr.Row():
+ with gr.Column():
+ for _ in range(0,g_batch):
+ with gr.Row():
+ text = gr.Textbox(
+ label = "Text",
+ visible = True,
+ scale=5
+ )
+ audio_output = gr.Audio(
+ label="Output Audio",
+ visible = True,
+ scale=5
+ )
+ audio_check = gr.Checkbox(
+ label="Yes",
+ show_label = True,
+ info = "Choose Audio",
+ scale=1
+ )
+ g_text_list.append(text)
+ g_audio_list.append(audio_output)
+ g_checkbox_list.append(audio_check)
+
+
+
+ with gr.Row():
+ batchsize_slider = gr.Slider(
+ minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False
+ )
+ interval_slider = gr.Slider(
+ minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3
+ )
+ btn_theme_dark = gr.Button("Light Theme", link="?__theme=light", scale=1)
+ btn_theme_light = gr.Button("Dark Theme", link="?__theme=dark", scale=1)
+
+ btn_change_index.click(
+ b_change_index,
+ inputs=[
+ index_slider,
+ batchsize_slider,
+ ],
+ outputs=[
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ],
+ )
+
+
+ btn_submit_change.click(
+ b_submit_change,
+ inputs=[
+ *g_text_list,
+ ],
+ outputs=[
+ index_slider,
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ],
+ )
+
+ btn_previous_index.click(
+ b_previous_index,
+ inputs=[
+ index_slider,
+ batchsize_slider,
+ ],
+ outputs=[
+ index_slider,
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ],
+ )
+
+ btn_next_index.click(
+ b_next_index,
+ inputs=[
+ index_slider,
+ batchsize_slider,
+ ],
+ outputs=[
+ index_slider,
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ],
+ )
+
+ btn_delete_audio.click(
+ b_delete_audio,
+ inputs=[
+ *g_checkbox_list
+ ],
+ outputs=[
+ index_slider,
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ]
+ )
+
+ btn_merge_audio.click(
+ b_merge_audio,
+ inputs=[
+ interval_slider,
+ *g_checkbox_list
+ ],
+ outputs=[
+ index_slider,
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ]
+ )
+
+ btn_audio_split.click(
+ b_audio_split,
+ inputs=[
+ splitpoint_slider,
+ *g_checkbox_list
+ ],
+ outputs=[
+ index_slider,
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ]
+ )
+
+ btn_invert_selection.click(
+ b_invert_selection,
+ inputs=[
+ *g_checkbox_list
+ ],
+ outputs=[
+ *g_checkbox_list
+ ]
+ )
+
+ btn_save_json.click(
+ b_save_file
+ )
+
+ demo.load(
+ b_change_index,
+ inputs=[
+ index_slider,
+ batchsize_slider,
+ ],
+ outputs=[
+ *g_text_list,
+ *g_audio_list,
+ *g_checkbox_list
+ ],
+ )
+
+ demo.launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ quiet=True,
+ share=eval(args.is_share),
+ server_port=int(args.webui_port_subfix)
+ )
\ No newline at end of file
diff --git a/tools/uvr5/bs_roformer/__init__.py b/tools/uvr5/bs_roformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/uvr5/bs_roformer/attend.py b/tools/uvr5/bs_roformer/attend.py
new file mode 100644
index 0000000000000000000000000000000000000000..899d26f6fbed25cfab78181237bf14bc270608ae
--- /dev/null
+++ b/tools/uvr5/bs_roformer/attend.py
@@ -0,0 +1,120 @@
+from functools import wraps
+from packaging import version
+from collections import namedtuple
+
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+from einops import rearrange, reduce
+
+# constants
+
# Backend toggles forwarded to torch.backends.cuda.sdp_kernel (flash / math / mem-efficient).
FlashAttentionConfig = namedtuple('FlashAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])
+
+# helpers
+
def exists(val):
    """True when *val* is anything other than None."""
    return val is not None
+
def default(v, d):
    """Return *v* unless it is None, in which case fall back to *d*."""
    if v is None:
        return d
    return v
+
def once(fn):
    """Wrap *fn* so only the first call runs; later calls return None."""
    state = {"done": False}

    @wraps(fn)
    def inner(x):
        if state["done"]:
            return
        state["done"] = True
        return fn(x)

    return inner


# Module-level single-shot printer used for one-time device notices.
print_once = once(print)
+
+# main class
+
class Attend(nn.Module):
    """Scaled dot-product attention with an optional flash-attention fast path.

    When ``flash=True`` the forward pass dispatches to
    F.scaled_dot_product_attention under a backend config chosen at
    construction time (per detected GPU); otherwise a plain einsum
    softmax-attention is used.
    """

    def __init__(
        self,
        dropout = 0.,
        flash = False,
        scale = None
    ):
        # scale: optional override of the default 1/sqrt(dim_head) scaling.
        super().__init__()
        self.scale = scale
        self.dropout = dropout
        self.attn_dropout = nn.Dropout(dropout)

        self.flash = flash
        assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'

        # determine efficient attention configs for cuda and cpu

        self.cpu_config = FlashAttentionConfig(True, True, True)
        self.cuda_config = None

        if not torch.cuda.is_available() or not flash:
            return

        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))

        # Compute capability 8.0 (A100) gets the pure-flash kernel; other GPUs
        # fall back to math / memory-efficient kernels.
        if device_properties.major == 8 and device_properties.minor == 0:
            print_once('A100 GPU detected, using flash attention if input tensor is on cuda')
            self.cuda_config = FlashAttentionConfig(True, False, False)
        else:
            print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda')
            self.cuda_config = FlashAttentionConfig(False, True, True)

    def flash_attn(self, q, k, v):
        """Run F.scaled_dot_product_attention under the selected backend config."""
        # Unpacking assumes q is 4-D (batch, heads, seq, dim).
        _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device

        # SDPA applies its own 1/sqrt(d) scaling, so a custom scale is folded
        # into q relative to that default.
        if exists(self.scale):
            default_scale = q.shape[-1] ** -0.5
            q = q * (self.scale / default_scale)

        # Check if there is a compatible device for flash attention

        config = self.cuda_config if is_cuda else self.cpu_config

        # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale

        # NOTE(review): torch.backends.cuda.sdp_kernel is deprecated in newer
        # torch releases (torch.nn.attention.sdpa_kernel) — confirm the target
        # torch version before upgrading.
        with torch.backends.cuda.sdp_kernel(**config._asdict()):
            out = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p = self.dropout if self.training else 0.
            )

        return out

    def forward(self, q, k, v):
        """
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        """

        q_len, k_len, device = q.shape[-2], k.shape[-2], q.device

        scale = default(self.scale, q.shape[-1] ** -0.5)

        if self.flash:
            return self.flash_attn(q, k, v)

        # similarity

        sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale

        # attention

        attn = sim.softmax(dim=-1)
        attn = self.attn_dropout(attn)

        # aggregate values

        out = einsum(f"b h i j, b h j d -> b h i d", attn, v)

        return out
diff --git a/tools/uvr5/bs_roformer/bs_roformer.py b/tools/uvr5/bs_roformer/bs_roformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f17e1bb756a63a30b2bb11b67548acc2f5e202ff
--- /dev/null
+++ b/tools/uvr5/bs_roformer/bs_roformer.py
@@ -0,0 +1,583 @@
+from functools import partial
+
+import torch
+from torch import nn, einsum, Tensor
+from torch.nn import Module, ModuleList
+import torch.nn.functional as F
+
+from bs_roformer.attend import Attend
+
+from typing import Tuple, Optional, List, Callable
+# from beartype.typing import Tuple, Optional, List, Callable
+# from beartype import beartype
+
+from rotary_embedding_torch import RotaryEmbedding
+
+from einops import rearrange, pack, unpack
+from einops.layers.torch import Rearrange
+
+# helper functions
+
def exists(val):
    """True when *val* is anything other than None."""
    return val is not None
+
+
def default(v, d):
    """Return *v* unless it is None, in which case fall back to *d*."""
    if v is None:
        return d
    return v
+
+
def pack_one(t, pattern):
    """einops.pack a single tensor; returns (packed, pack_shapes)."""
    packed, shapes = pack([t], pattern)
    return packed, shapes
+
+
def unpack_one(t, ps, pattern):
    """Inverse of pack_one: unpack and return the single contained tensor."""
    items = unpack(t, ps, pattern)
    return items[0]
+
+
+# norm
+
def l2norm(t):
    """Normalize the last axis of *t* to unit L2 norm."""
    return F.normalize(t, p=2, dim=-1)
+
+
class RMSNorm(Module):
    """RMS-style normalization with a learned per-channel gain.

    L2-normalizes the last axis, rescales by sqrt(dim) to restore unit
    magnitude, then applies the learned gamma.
    """

    def __init__(self, dim):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        normed = F.normalize(x, dim=-1)
        return normed * self.scale * self.gamma
+
+
+# attention
+
class FeedForward(Module):
    """Pre-norm two-layer MLP with GELU activation and dropout."""

    def __init__(
        self,
        dim,
        mult=4,
        dropout=0.
    ):
        super().__init__()
        hidden = int(dim * mult)
        # Sequential layout kept identical (state-dict compatible).
        self.net = nn.Sequential(
            RMSNorm(dim),
            nn.Linear(dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
+
+
class Attention(Module):
    """Multi-head self-attention with optional rotary embeddings and per-head
    sigmoid output gating, delegating the core attention math to Attend."""

    def __init__(
        self,
        dim,
        heads=8,
        dim_head=64,
        dropout=0.,
        rotary_embed=None,
        flash=True
    ):
        super().__init__()
        self.heads = heads
        self.scale = dim_head ** -0.5
        dim_inner = heads * dim_head

        self.rotary_embed = rotary_embed

        self.attend = Attend(flash=flash, dropout=dropout)

        self.norm = RMSNorm(dim)
        # Fused q/k/v projection; split into heads in forward.
        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)

        # Per-head scalar gates computed from the (normed) input.
        self.to_gates = nn.Linear(dim, heads)

        self.to_out = nn.Sequential(
            nn.Linear(dim_inner, dim, bias=False),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        # x: (batch, seq, dim) — pre-norm, project, split heads.
        x = self.norm(x)

        q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads)

        if exists(self.rotary_embed):
            q = self.rotary_embed.rotate_queries_or_keys(q)
            k = self.rotary_embed.rotate_queries_or_keys(k)

        out = self.attend(q, k, v)

        # Gate each head's output by a sigmoid of the input token.
        gates = self.to_gates(x)
        out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid()

        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
+
+
class LinearAttention(Module):
    """
    this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al.
    """

    # @beartype
    def __init__(
        self,
        *,
        dim,
        dim_head=32,
        heads=8,
        scale=8,
        flash=False,
        dropout=0.
    ):
        super().__init__()
        dim_inner = dim_head * heads
        self.norm = RMSNorm(dim)

        # Note the 'b h d n' layout: attention runs over the feature axis,
        # which is what makes this variant linear in sequence length.
        self.to_qkv = nn.Sequential(
            nn.Linear(dim, dim_inner * 3, bias=False),
            Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads)
        )

        # Learned per-head temperature applied to the normalized queries.
        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))

        self.attend = Attend(
            scale=scale,
            dropout=dropout,
            flash=flash
        )

        self.to_out = nn.Sequential(
            Rearrange('b h d n -> b n (h d)'),
            nn.Linear(dim_inner, dim, bias=False)
        )

    def forward(
        self,
        x
    ):
        # x: (batch, seq, dim).
        x = self.norm(x)

        q, k, v = self.to_qkv(x)

        # Cosine-similarity attention: unit-normalize q/k, then scale by the
        # learned temperature.
        q, k = map(l2norm, (q, k))
        q = q * self.temperature.exp()

        out = self.attend(q, k, v)

        return self.to_out(out)
+
+
class Transformer(Module):
    """Stack of (attention, feed-forward) residual pairs with an optional final norm."""

    def __init__(
        self,
        *,
        dim,
        depth,
        dim_head=64,
        heads=8,
        attn_dropout=0.,
        ff_dropout=0.,
        ff_mult=4,
        norm_output=True,
        rotary_embed=None,
        flash_attn=True,
        linear_attn=False
    ):
        super().__init__()
        self.layers = ModuleList([])

        for _ in range(depth):
            if linear_attn:
                attn_layer = LinearAttention(
                    dim=dim, dim_head=dim_head, heads=heads,
                    dropout=attn_dropout, flash=flash_attn,
                )
            else:
                attn_layer = Attention(
                    dim=dim, dim_head=dim_head, heads=heads,
                    dropout=attn_dropout, rotary_embed=rotary_embed,
                    flash=flash_attn,
                )
            ff_layer = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
            self.layers.append(ModuleList([attn_layer, ff_layer]))

        self.norm = RMSNorm(dim) if norm_output else nn.Identity()

    def forward(self, x):
        # Residual connections around both sub-blocks of every layer.
        for attn_layer, ff_layer in self.layers:
            x = attn_layer(x) + x
            x = ff_layer(x) + x
        return self.norm(x)
+
+
+# bandsplit module
+
class BandSplit(Module):
    """Project each frequency band (width dim_inputs[i]) to a shared feature dim."""

    # @beartype
    def __init__(
        self,
        dim,
        dim_inputs: Tuple[int, ...]
    ):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_features = ModuleList([])
        # One (RMSNorm, Linear) projection per band.
        for band_dim in dim_inputs:
            self.to_features.append(
                nn.Sequential(RMSNorm(band_dim), nn.Linear(band_dim, dim))
            )

    def forward(self, x):
        # Split the last axis into per-band slices, embed each, and stack so
        # bands form a new second-to-last axis.
        bands = x.split(self.dim_inputs, dim=-1)
        embedded = [project(band) for band, project in zip(bands, self.to_features)]
        return torch.stack(embedded, dim=-2)
+
+
def MLP(
    dim_in,
    dim_out,
    dim_hidden=None,
    depth=1,
    activation=nn.Tanh
):
    """Build a simple MLP as nn.Sequential.

    *depth* counts the linear layers; *activation* is inserted between
    consecutive linears (never after the last one). dim_hidden defaults
    to dim_in.
    """
    if dim_hidden is None:
        dim_hidden = dim_in

    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
    layers = []
    last = len(dims) - 2
    for idx, (fan_in, fan_out) in enumerate(zip(dims[:-1], dims[1:])):
        layers.append(nn.Linear(fan_in, fan_out))
        if idx != last:
            layers.append(activation())

    return nn.Sequential(*layers)
+
+
class MaskEstimator(Module):
    """Per-band MLP heads whose GLU outputs form the estimated band masks."""

    # @beartype
    def __init__(
        self,
        dim,
        dim_inputs: Tuple[int, ...],
        depth,
        mlp_expansion_factor=4
    ):
        """dim: shared feature dim; dim_inputs: per-band widths; depth: MLP depth."""
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_freqs = ModuleList([])
        dim_hidden = dim * mlp_expansion_factor

        for dim_in in dim_inputs:
            # Each band gets its own MLP; the GLU halves the 2*dim_in output
            # back down to dim_in values per frequency bin.
            # (Removed an unused `net = []` local that was never populated.)
            mlp = nn.Sequential(
                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth),
                nn.GLU(dim=-1)
            )

            self.to_freqs.append(mlp)

    def forward(self, x):
        """x: (..., num_bands, dim) -> per-band outputs concatenated on the last axis."""
        bands = x.unbind(dim=-2)

        outs = [mlp(band) for band, mlp in zip(bands, self.to_freqs)]

        return torch.cat(outs, dim=-1)
+
+
+# main class
+
# Default band widths (in STFT bins) for the band split: narrow 2-bin bands at
# low frequencies, progressively wider bands toward Nyquist. Entries sum to
# 1025, the bin count of the default 2048-point STFT.
DEFAULT_FREQS_PER_BANDS = (
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    12, 12, 12, 12, 12, 12, 12, 12,
    24, 24, 24, 24, 24, 24, 24, 24,
    48, 48, 48, 48, 48, 48, 48, 48,
    128, 129,
)
+
+
+class BSRoformer(Module):
+
+ # @beartype
+ def __init__(
+ self,
+ dim,
+ *,
+ depth,
+ stereo=False,
+ num_stems=1,
+ time_transformer_depth=2,
+ freq_transformer_depth=2,
+ linear_transformer_depth=0,
+ freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+ # in the paper, they divide into ~60 bands, test with 1 for starters
+ dim_head=64,
+ heads=8,
+ attn_dropout=0.,
+ ff_dropout=0.,
+ flash_attn=True,
+ dim_freqs_in=1025,
+ stft_n_fft=2048,
+ stft_hop_length=512,
+ # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
+ stft_win_length=2048,
+ stft_normalized=False,
+ stft_window_fn: Optional[Callable] = None,
+ mask_estimator_depth=2,
+ multi_stft_resolution_loss_weight=1.,
+ multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+ multi_stft_hop_size=147,
+ multi_stft_normalized=False,
+ multi_stft_window_fn: Callable = torch.hann_window
+ ):
+ super().__init__()
+
+ self.stereo = stereo
+ self.audio_channels = 2 if stereo else 1
+ self.num_stems = num_stems
+
+ self.layers = ModuleList([])
+
+ transformer_kwargs = dict(
+ dim=dim,
+ heads=heads,
+ dim_head=dim_head,
+ attn_dropout=attn_dropout,
+ ff_dropout=ff_dropout,
+ flash_attn=flash_attn,
+ norm_output=False
+ )
+
+ time_rotary_embed = RotaryEmbedding(dim=dim_head)
+ freq_rotary_embed = RotaryEmbedding(dim=dim_head)
+
+ for _ in range(depth):
+ tran_modules = []
+ if linear_transformer_depth > 0:
+ tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, **transformer_kwargs))
+ tran_modules.append(
+ Transformer(depth=time_transformer_depth, rotary_embed=time_rotary_embed, **transformer_kwargs)
+ )
+ tran_modules.append(
+ Transformer(depth=freq_transformer_depth, rotary_embed=freq_rotary_embed, **transformer_kwargs)
+ )
+ self.layers.append(nn.ModuleList(tran_modules))
+
+ self.final_norm = RMSNorm(dim)
+
+ self.stft_kwargs = dict(
+ n_fft=stft_n_fft,
+ hop_length=stft_hop_length,
+ win_length=stft_win_length,
+ normalized=stft_normalized
+ )
+
+ self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
+
+ freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, return_complex=True).shape[1]
+
+ assert len(freqs_per_bands) > 1
+ assert sum(
+ freqs_per_bands) == freqs, f'the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}'
+
+ freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands)
+
+ self.band_split = BandSplit(
+ dim=dim,
+ dim_inputs=freqs_per_bands_with_complex
+ )
+
+ self.mask_estimators = nn.ModuleList([])
+
+ for _ in range(num_stems):
+ mask_estimator = MaskEstimator(
+ dim=dim,
+ dim_inputs=freqs_per_bands_with_complex,
+ depth=mask_estimator_depth
+ )
+
+ self.mask_estimators.append(mask_estimator)
+
+ # for the multi-resolution stft loss
+
+ self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+ self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+ self.multi_stft_n_fft = stft_n_fft
+ self.multi_stft_window_fn = multi_stft_window_fn
+
+ self.multi_stft_kwargs = dict(
+ hop_length=multi_stft_hop_size,
+ normalized=multi_stft_normalized
+ )
+
    def forward(
            self,
            raw_audio,
            target=None,
            return_loss_breakdown=False
    ):
        """Separate `raw_audio` into stems; optionally compute training loss.

        Args:
            raw_audio: waveform tensor, either (b, t) mono or (b, s, t).
            target: optional ground-truth stems; when given, returns a loss
                (L1 on waveform + weighted multi-resolution STFT L1) instead
                of audio.
            return_loss_breakdown: when True (and target given), also return
                (l1_loss, multi_stft_resolution_loss).

        einops dimension legend used throughout:

        b - batch
        f - freq
        t - time
        s - audio channel (1 for mono, 2 for stereo)
        n - number of 'stems'
        c - complex (2)
        d - feature dimension
        """

        device = raw_audio.device

        # accept (b, t) mono input by inserting a channel axis
        if raw_audio.ndim == 2:
            raw_audio = rearrange(raw_audio, 'b t -> b 1 t')

        channels = raw_audio.shape[1]
        assert (not self.stereo and channels == 1) or (
                self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)'

        # to stft: fold (batch, channel) together so torch.stft sees 2D input

        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t')

        stft_window = self.stft_window_fn(device=device)

        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
        stft_repr = torch.view_as_real(stft_repr)  # complex -> trailing real/imag axis of size 2

        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c')
        stft_repr = rearrange(stft_repr,
                              'b s f t c -> b (f s) t c')  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting

        # flatten (freq, complex) into features per time step for the band-split MLPs
        x = rearrange(stft_repr, 'b f t c -> b t (f c)')
        x = self.band_split(x)

        # axial / hierarchical attention: per block, optional linear attention
        # over the flattened grid, then attention along time, then along freq

        for transformer_block in self.layers:

            if len(transformer_block) == 3:
                linear_transformer, time_transformer, freq_transformer = transformer_block

                x, ft_ps = pack([x], 'b * d')
                x = linear_transformer(x)
                x, = unpack(x, ft_ps, 'b * d')
            else:
                time_transformer, freq_transformer = transformer_block

            # time attention: fold freq into batch
            x = rearrange(x, 'b t f d -> b f t d')
            x, ps = pack([x], '* t d')

            x = time_transformer(x)
            x, = unpack(x, ps, '* t d')
            # freq attention: fold time into batch
            x = rearrange(x, 'b f t d -> b t f d')
            x, ps = pack([x], '* f d')

            x = freq_transformer(x)

            x, = unpack(x, ps, '* f d')

        x = self.final_norm(x)

        num_stems = len(self.mask_estimators)
        # one complex mask per stem, predicted from the shared features
        mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
        mask = rearrange(mask, 'b n t (f c) -> b n f t c', c=2)

        # modulate frequency representation

        stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c')

        # complex number multiplication (masking in the complex STFT domain)

        stft_repr = torch.view_as_complex(stft_repr)
        mask = torch.view_as_complex(mask)

        stft_repr = stft_repr * mask

        # istft: unfold (batch, stem, channel) for the inverse transform

        stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels)

        recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False)

        recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', s=self.audio_channels, n=num_stems)

        if num_stems == 1:
            recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t')

        # if a target is passed in, calculate loss for learning

        if not exists(target):
            return recon_audio

        if self.num_stems > 1:
            assert target.ndim == 4 and target.shape[1] == self.num_stems

        if target.ndim == 2:
            target = rearrange(target, '... t -> ... 1 t')

        target = target[..., :recon_audio.shape[-1]]  # protect against lost length on istft

        loss = F.l1_loss(recon_audio, target)

        multi_stft_resolution_loss = 0.

        # auxiliary loss: L1 over complex STFTs at several window sizes
        for window_size in self.multi_stft_resolutions_window_sizes:
            res_stft_kwargs = dict(
                n_fft=max(window_size, self.multi_stft_n_fft),  # not sure what n_fft is across multi resolution stft
                win_length=window_size,
                return_complex=True,
                window=self.multi_stft_window_fn(window_size, device=device),
                **self.multi_stft_kwargs,
            )

            recon_Y = torch.stft(rearrange(recon_audio, '... s t -> (... s) t'), **res_stft_kwargs)
            target_Y = torch.stft(rearrange(target, '... s t -> (... s) t'), **res_stft_kwargs)

            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)

        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight

        total_loss = loss + weighted_multi_resolution_loss

        if not return_loss_breakdown:
            return total_loss

        return total_loss, (loss, multi_stft_resolution_loss)
\ No newline at end of file
diff --git a/tools/uvr5/bsroformer.py b/tools/uvr5/bsroformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..bee9759ab59ce6383d075bab4d985bd3d8e9439a
--- /dev/null
+++ b/tools/uvr5/bsroformer.py
@@ -0,0 +1,216 @@
+# This code is modified from https://github.com/ZFTurbo/
+import pdb
+
+import librosa
+from tqdm import tqdm
+import os
+import torch
+import numpy as np
+import soundfile as sf
+import torch.nn as nn
+
+import warnings
+warnings.filterwarnings("ignore")
+from bs_roformer.bs_roformer import BSRoformer
+
class BsRoformer_Loader:
    """Inference wrapper for the BS-Roformer vocal-separation model.

    Builds the fixed architecture, loads a checkpoint, runs chunked
    inference with cross-faded overlapping windows, and writes the vocal
    and instrumental stems to disk.
    """

    def get_model_from_config(self):
        """Instantiate BSRoformer with the hard-coded hyperparameters.

        These values must match the checkpoint passed to ``__init__`` —
        changing any of them makes ``load_state_dict`` fail.
        """
        config = {
            "attn_dropout": 0.1,
            "depth": 12,
            "dim": 512,
            "dim_freqs_in": 1025,
            "dim_head": 64,
            "ff_dropout": 0.1,
            "flash_attn": True,
            "freq_transformer_depth": 1,
            "freqs_per_bands": (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, 24, 24, 24, 24, 24, 24, 24, 24, 48, 48, 48, 48, 48, 48, 48, 48, 128, 129),
            "heads": 8,
            "linear_transformer_depth": 0,
            "mask_estimator_depth": 2,
            "multi_stft_hop_size": 147,
            "multi_stft_normalized": False,
            "multi_stft_resolution_loss_weight": 1.0,
            "multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256),
            "num_stems": 1,
            "stereo": True,
            "stft_hop_length": 441,
            "stft_n_fft": 2048,
            "stft_normalized": False,
            "stft_win_length": 2048,
            "time_transformer_depth": 1,
        }

        model = BSRoformer(
            **dict(config)
        )

        return model

    def demix_track(self, model, mix, device):
        """Separate one track with overlapped, cross-faded chunks.

        Args:
            model: BSRoformer network already placed on ``device``.
            mix: float tensor of shape (channels, samples).
            device: torch device used for inference.

        Returns:
            dict mapping stem names ('vocals', 'other') to numpy arrays;
            with num_stems == 1 only 'vocals' is produced.
        """
        C = 352800          # chunk length in samples (8 s at 44100 Hz)
        N = 1               # num_overlap
        fade_size = C // 10
        step = int(C // N)
        border = C - step
        batch_size = 4

        length_init = mix.shape[-1]

        progress_bar = tqdm(total=length_init // step + 1)
        progress_bar.set_description("Processing")

        # Do pad from the beginning and end to account floating window results better
        if length_init > 2 * border and (border > 0):
            mix = nn.functional.pad(mix, (border, border), mode='reflect')

        # Prepare window arrays once; the cross-fades repair click artifacts
        # at segment edges
        window_size = C
        fadein = torch.linspace(0, 1, fade_size)
        fadeout = torch.linspace(1, 0, fade_size)
        window_start = torch.ones(window_size)
        window_middle = torch.ones(window_size)
        window_finish = torch.ones(window_size)
        window_start[-fade_size:] *= fadeout  # First audio chunk, no fadein
        window_finish[:fade_size] *= fadein  # Last audio chunk, no fadeout
        window_middle[-fade_size:] *= fadeout
        window_middle[:fade_size] *= fadein

        with torch.amp.autocast('cuda'):
            with torch.inference_mode():
                req_shape = (1, ) + tuple(mix.shape)

                result = torch.zeros(req_shape, dtype=torch.float32)
                counter = torch.zeros(req_shape, dtype=torch.float32)
                i = 0
                batch_data = []
                batch_locations = []
                while i < mix.shape[1]:
                    part = mix[:, i:i + C].to(device)
                    length = part.shape[-1]
                    if length < C:
                        # reflect padding requires pad < input length;
                        # fall back to zero padding for very short tails
                        if length > C // 2 + 1:
                            part = nn.functional.pad(input=part, pad=(0, C - length), mode='reflect')
                        else:
                            part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)
                    if self.is_half:
                        part = part.half()
                    batch_data.append(part)
                    batch_locations.append((i, length))
                    i += step
                    progress_bar.update(1)

                    if len(batch_data) >= batch_size or (i >= mix.shape[1]):
                        arr = torch.stack(batch_data, dim=0)
                        x = model(arr)

                        window = window_middle
                        if i - step == 0:  # First audio chunk, no fadein
                            window = window_start
                        elif i >= mix.shape[1]:  # Last audio chunk, no fadeout
                            window = window_finish

                        for j in range(len(batch_locations)):
                            start, l = batch_locations[j]
                            result[..., start:start + l] += x[j][..., :l].cpu() * window[..., :l]
                            counter[..., start:start + l] += window[..., :l]

                        batch_data = []
                        batch_locations = []

                estimated_sources = result / counter
                estimated_sources = estimated_sources.cpu().numpy()
                # counter may be zero where no window covered a sample
                np.nan_to_num(estimated_sources, copy=False, nan=0.0)

                if length_init > 2 * border and (border > 0):
                    # Remove pad
                    estimated_sources = estimated_sources[..., border:-border]

        progress_bar.close()

        return {k: v for k, v in zip(['vocals', 'other'], estimated_sources)}

    def run_folder(self, input, vocal_root, others_root, format):
        """Separate one audio file and write vocal + instrumental stems.

        Args:
            input: path of the audio file to process.
            vocal_root: output directory for the vocal stem.
            others_root: output directory for the instrumental stem.
            format: 'wav'/'flac' are written directly; any other value is
                written as wav first and then converted via ffmpeg.
        """
        self.model.eval()
        path = input

        # makedirs also creates missing parent directories (os.mkdir did not)
        os.makedirs(vocal_root, exist_ok=True)
        os.makedirs(others_root, exist_ok=True)

        try:
            mix, sr = librosa.load(path, sr=44100, mono=False)
        except Exception as e:
            # FIX: message previously read "Can read track", the opposite of
            # what it reports
            print('Cannot read track: {}'.format(path))
            print('Error message: {}'.format(str(e)))
            return

        # Convert mono to stereo if needed
        if len(mix.shape) == 1:
            mix = np.stack([mix, mix], axis=0)

        mix_orig = mix.copy()

        mixture = torch.tensor(mix, dtype=torch.float32)
        res = self.demix_track(self.model, mixture, self.device)

        # transpose to (samples, channels) as expected by soundfile
        estimates = res['vocals'].T

        if format in ["wav", "flac"]:
            sf.write("{}/{}_{}.{}".format(vocal_root, os.path.basename(path)[:-4], 'vocals', format), estimates, sr)
            sf.write("{}/{}_{}.{}".format(others_root, os.path.basename(path)[:-4], 'instrumental', format), mix_orig.T - estimates, sr)
        else:
            path_vocal = "%s/%s_vocals.wav" % (vocal_root, os.path.basename(path)[:-4])
            path_other = "%s/%s_instrumental.wav" % (others_root, os.path.basename(path)[:-4])
            sf.write(path_vocal, estimates, sr)
            sf.write(path_other, mix_orig.T - estimates, sr)
            opt_path_vocal = path_vocal[:-4] + ".%s" % format
            opt_path_other = path_other[:-4] + ".%s" % format
            # NOTE(review): os.system with single-quote interpolation breaks on
            # paths containing ' and is shell-injectable; prefer
            # subprocess.run([...], shell=False). Left as-is to keep behavior.
            if os.path.exists(path_vocal):
                os.system(
                    "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)
                )
                if os.path.exists(opt_path_vocal):
                    try:
                        os.remove(path_vocal)
                    except Exception:
                        pass  # best-effort cleanup of the intermediate wav
            if os.path.exists(path_other):
                os.system(
                    "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)
                )
                if os.path.exists(opt_path_other):
                    try:
                        os.remove(path_other)
                    except Exception:
                        pass  # best-effort cleanup of the intermediate wav

    def __init__(self, model_path, device, is_half):
        """Load the BSRoformer checkpoint at ``model_path`` onto ``device``.

        Args:
            model_path: path of a state_dict checkpoint matching
                get_model_from_config().
            device: torch device (string or object) for inference.
            is_half: run the model (and feed chunks) in float16 when True.
        """
        self.device = device
        self.extract_instrumental = True

        model = self.get_model_from_config()
        state_dict = torch.load(model_path, map_location="cpu")
        model.load_state_dict(state_dict)
        self.is_half = is_half
        if not is_half:
            self.model = model.to(device)
        else:
            self.model = model.half().to(device)

    def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False):
        """Entry point kept signature-compatible with the other UVR5 loaders.

        Note the (others_root, vocal_root) argument order here versus
        run_folder's (vocal_root, others_root); is_hp3 is accepted but unused.
        """
        self.run_folder(input, vocal_root, others_root, format)
+
diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/tools/uvr5/lib/lib_v5/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..831a83cdfcec8009d0d77a71ac8fd0eadc77b926
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/dataset.py
@@ -0,0 +1,183 @@
+import os
+import random
+
+import numpy as np
+import torch
+import torch.utils.data
+from tqdm import tqdm
+
+from . import spec_utils
+
+
class VocalRemoverValidationSet(torch.utils.data.Dataset):
    """Validation dataset over pre-saved spectrogram patch files.

    Each entry of ``patch_list`` is an .npz archive holding complex
    spectrograms under keys 'X' (mixture) and 'y' (instruments); items
    are returned as magnitude arrays.
    """

    def __init__(self, patch_list):
        self.patch_list = patch_list

    def __len__(self):
        return len(self.patch_list)

    def __getitem__(self, idx):
        archive = np.load(self.patch_list[idx])
        mix_spec = archive["X"]
        inst_spec = archive["y"]
        # magnitude only — phase is discarded for validation
        return np.abs(mix_spec), np.abs(inst_spec)
+
+
def make_pair(mix_dir, inst_dir):
    """Pair mixture and instrumental audio files from two directories.

    Files are matched positionally after sorting each directory listing, so
    both directories are expected to hold identically named files (this is
    not verified). The extension filter is now case-insensitive, so '.WAV'
    or '.MP3' files are no longer silently skipped — a backward-compatible
    generalization of the original lowercase-only check.

    Returns:
        list of (mixture_path, instrument_path) tuples.
    """
    input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]

    def _list_audio(root):
        # one sorted listing per directory, filtered by (lowercased) extension
        return sorted(
            os.path.join(root, fname)
            for fname in os.listdir(root)
            if os.path.splitext(fname)[1].lower() in input_exts
        )

    return list(zip(_list_audio(mix_dir), _list_audio(inst_dir)))
+
+
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
    """Split (mixture, instrument) pairs into train and validation lists.

    Args:
        dataset_dir: dataset root containing 'mixtures'/'instruments'
            (random mode) or 'training'/'validation' subtrees (subdirs mode).
        split_mode: 'random' or 'subdirs'.
        val_rate: fraction of pairs used for validation (random mode only).
        val_filelist: pre-chosen validation pairs; when non-empty in random
            mode these are excluded from training instead of re-splitting.

    Returns:
        (train_filelist, val_filelist) tuple of pair lists.
    """
    if split_mode == "random":
        filelist = make_pair(
            os.path.join(dataset_dir, "mixtures"),
            os.path.join(dataset_dir, "instruments"),
        )

        random.shuffle(filelist)

        if len(val_filelist) == 0:
            val_size = int(len(filelist) * val_rate)
            # BUGFIX: the original sliced filelist[:-val_size]; slicing with
            # -0 yields an EMPTY list, so whenever val_size rounded down to
            # zero the entire training set silently vanished.
            split = len(filelist) - val_size
            train_filelist = filelist[:split]
            val_filelist = filelist[split:]
        else:
            # keep the externally supplied validation pairs out of training
            train_filelist = [
                pair for pair in filelist if list(pair) not in val_filelist
            ]
    elif split_mode == "subdirs":
        if len(val_filelist) != 0:
            raise ValueError(
                "The `val_filelist` option is not available in `subdirs` mode"
            )

        train_filelist = make_pair(
            os.path.join(dataset_dir, "training/mixtures"),
            os.path.join(dataset_dir, "training/instruments"),
        )

        val_filelist = make_pair(
            os.path.join(dataset_dir, "validation/mixtures"),
            os.path.join(dataset_dir, "validation/instruments"),
        )
    else:
        # previously fell through to an UnboundLocalError on return
        raise ValueError("Unknown split_mode: {}".format(split_mode))

    return train_filelist, val_filelist
+
+
def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
    """Randomly augment paired mixture/instrument spectrograms IN PLACE.

    X, y are indexed as X[idx] with a leading sample axis and a channel
    axis second (the ::-1 channel swap assumes stereo patches — TODO
    confirm against the data produced by make_training_set).
    reduction_rate/reduction_mask control vocal reduction; mixup_rate and
    mixup_alpha control Beta-distributed mixup with the next sample in the
    random permutation. Returns the same (mutated) X, y arrays.
    """
    perm = np.random.permutation(len(X))
    for i, idx in enumerate(tqdm(perm)):
        if np.random.uniform() < reduction_rate:
            # aggressively reduce vocals in the target via spec_utils
            y[idx] = spec_utils.reduce_vocal_aggressively(
                X[idx], y[idx], reduction_mask
            )

        if np.random.uniform() < 0.5:
            # swap channel (left/right flip on the channel axis)
            X[idx] = X[idx, ::-1]
            y[idx] = y[idx, ::-1]
        if np.random.uniform() < 0.02:
            # mono: average the channels
            X[idx] = X[idx].mean(axis=0, keepdims=True)
            y[idx] = y[idx].mean(axis=0, keepdims=True)
        if np.random.uniform() < 0.02:
            # inst: replace the mixture with the instrumental target
            X[idx] = y[idx]

        if np.random.uniform() < mixup_rate and i < len(perm) - 1:
            # mixup with the next sample of the permutation
            lam = np.random.beta(mixup_alpha, mixup_alpha)
            X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
            y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]

    return X, y
+
+
def make_padding(width, cropsize, offset):
    """Compute padding so that `width` tiles into crop windows.

    Returns (left, right, roi_size): left pad equals `offset`; roi_size is
    the effective stride (cropsize minus both offsets, falling back to
    cropsize when that difference is zero); right pad extends the signal to
    a whole number of roi windows plus the trailing offset.
    """
    left = offset
    roi_size = cropsize - 2 * left
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left
    return left, right, roi_size
+
+
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
    """Build an in-memory training set of random spectrogram crops.

    For each (mixture, instrument) pair: load/cache spectrograms via
    spec_utils, normalize both by their joint peak magnitude, pad, and cut
    `patches` random crops. Returns (X_dataset, y_dataset) complex64 arrays
    of shape (patches * len(filelist), 2, n_fft // 2 + 1, cropsize).
    """
    len_dataset = patches * len(filelist)

    X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
    y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)

    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
        # joint normalization keeps the relative mixture/instrument scale
        coef = np.max([np.abs(X).max(), np.abs(y).max()])
        X, y = X / coef, y / coef

        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")

        # random crop starts; assumes X_pad.shape[2] > cropsize, otherwise
        # np.random.randint raises — TODO confirm for very short tracks
        starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
        ends = starts + cropsize
        for j in range(patches):
            idx = i * patches + j
            X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
            y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]

    return X_dataset, y_dataset
+
+
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
    """Cut validation patches to disk and wrap them in a Dataset.

    Patches are cached as .npz files in a directory named after the
    spectrogram parameters, so repeated runs with identical settings reuse
    the cached files instead of recomputing. Returns a
    VocalRemoverValidationSet over every patch path (cached or new).
    """
    patch_list = []
    patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
        cropsize, sr, hop_length, n_fft, offset
    )
    os.makedirs(patch_dir, exist_ok=True)

    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
        basename = os.path.splitext(os.path.basename(X_path))[0]

        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
        # joint normalization, same convention as make_training_set
        coef = np.max([np.abs(X).max(), np.abs(y).max()])
        X, y = X / coef, y / coef

        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")

        # deterministic patches every roi_size frames (unlike the random
        # crops used for training)
        len_dataset = int(np.ceil(X.shape[2] / roi_size))
        for j in range(len_dataset):
            outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
            start = j * roi_size
            if not os.path.exists(outpath):
                np.savez(
                    outpath,
                    X=X_pad[:, :, start : start + cropsize],
                    y=y_pad[:, :, start : start + cropsize],
                )
            patch_list.append(outpath)

    return VocalRemoverValidationSet(patch_list)
diff --git a/tools/uvr5/lib/lib_v5/layers.py b/tools/uvr5/lib/lib_v5/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cc26678f461c4cb5f7fb6c5f088934aa5ffe18e
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers.py
@@ -0,0 +1,118 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation as a single callable unit.

    Bias is disabled on the convolution since BatchNorm subsumes it.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        stages = [
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(nout),
            activ(),
        ]
        self.conv = nn.Sequential(*stages)

    # NOTE: overrides __call__ directly rather than forward(), bypassing
    # nn.Module hooks — consistent with the rest of this package.
    def __call__(self, x):
        return self.conv(x)
+
+
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable conv -> BatchNorm2d -> activation.

    A groups=nin spatial convolution followed by a 1x1 pointwise
    convolution: same receptive field as a full conv, fewer parameters.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        depthwise = nn.Conv2d(
            nin,
            nin,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            dilation=dilation,
            groups=nin,
            bias=False,
        )
        pointwise = nn.Conv2d(nin, nout, kernel_size=1, bias=False)
        self.conv = nn.Sequential(
            depthwise,
            pointwise,
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)
+
+
class Encoder(nn.Module):
    """Encoder stage: a stride-1 conv produces the skip connection, then a
    (possibly strided) conv produces the downsampled output."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)
        return self.conv2(skip), skip
+
+
class Decoder(nn.Module):
    """Decoder stage: 2x bilinear upsample, optional skip concat, conv,
    optional dropout."""

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        up = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # center-crop the skip to the upsampled size before concatenating
            up = torch.cat([up, spec_utils.crop_center(skip, up)], dim=1)
        out = self.conv(up)
        return out if self.dropout is None else self.dropout(out)
+
+
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling: a pooled-context branch, a 1x1
    branch, and three dilated separable branches, concatenated and then
    bottlenecked down to `nout` channels."""

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # per-row global context (pool height to 1, keep width)
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        pooled = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        branches = [pooled, self.conv2(x), self.conv3(x), self.conv4(x), self.conv5(x)]
        return self.bottleneck(torch.cat(branches, dim=1))
diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/tools/uvr5/lib/lib_v5/layers_123812KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cc26678f461c4cb5f7fb6c5f088934aa5ffe18e
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_123812KB.py
@@ -0,0 +1,118 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation (duplicate of lib_v5/layers.py,
    kept in sync across the per-model-size layers_*KB variants)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # BatchNorm makes the conv bias redundant
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    # overrides __call__ (not forward), bypassing nn.Module hooks
    def __call__(self, x):
        return self.conv(x)
+
+
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable conv -> BatchNorm2d -> activation (duplicate of
    lib_v5/layers.py)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            # depthwise: groups=nin applies one spatial filter per channel
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            # pointwise 1x1 mixes channels up to nout
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)
+
+
class Encoder(nn.Module):
    """Two-conv encoder stage returning (downsampled, skip) (duplicate of
    lib_v5/layers.py)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)  # full-resolution features for the decoder
        h = self.conv2(skip)  # strided conv does the downsampling

        return h, skip
+
+
class Decoder(nn.Module):
    """Upsample + optional skip-concat + conv decoder stage (duplicate of
    lib_v5/layers.py)."""

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # center-crop the skip to match the upsampled spatial size
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
+
+
class ASPPModule(nn.Module):
    """ASPP block: pooled, 1x1 and three dilated branches, concatenated and
    bottlenecked (duplicate of lib_v5/layers.py)."""

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # per-row global context branch (height pooled to 1, width kept)
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # broadcast the pooled context back to the input resolution
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        bottle = self.bottleneck(out)
        return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/tools/uvr5/lib/lib_v5/layers_123821KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cc26678f461c4cb5f7fb6c5f088934aa5ffe18e
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_123821KB.py
@@ -0,0 +1,118 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation (duplicate of lib_v5/layers.py,
    kept in sync across the per-model-size layers_*KB variants)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # BatchNorm makes the conv bias redundant
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    # overrides __call__ (not forward), bypassing nn.Module hooks
    def __call__(self, x):
        return self.conv(x)
+
+
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable conv -> BatchNorm2d -> activation (duplicate of
    lib_v5/layers.py)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            # depthwise: groups=nin applies one spatial filter per channel
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            # pointwise 1x1 mixes channels up to nout
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)
+
+
class Encoder(nn.Module):
    """Two-conv encoder stage returning (downsampled, skip) (duplicate of
    lib_v5/layers.py)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)  # full-resolution features for the decoder
        h = self.conv2(skip)  # strided conv does the downsampling

        return h, skip
+
+
class Decoder(nn.Module):
    """Upsample + optional skip-concat + conv decoder stage (duplicate of
    lib_v5/layers.py)."""

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # center-crop the skip to match the upsampled spatial size
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
+
+
class ASPPModule(nn.Module):
    """ASPP block: pooled, 1x1 and three dilated branches, concatenated and
    bottlenecked (duplicate of lib_v5/layers.py)."""

    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # per-row global context branch (height pooled to 1, width kept)
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # broadcast the pooled context back to the input resolution
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
        bottle = self.bottleneck(out)
        return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/tools/uvr5/lib/lib_v5/layers_33966KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..50c214e1aa8202b0fb4466e52e317cb2ac5bb1e7
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_33966KB.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation (duplicate of lib_v5/layers.py,
    kept in sync across the per-model-size layers_*KB variants)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
                nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False,  # BatchNorm makes the conv bias redundant
            ),
            nn.BatchNorm2d(nout),
            activ(),
        )

    # overrides __call__ (not forward), bypassing nn.Module hooks
    def __call__(self, x):
        return self.conv(x)
+
+
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable conv -> BatchNorm2d -> activation (duplicate of
    lib_v5/layers.py)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            # depthwise: groups=nin applies one spatial filter per channel
            nn.Conv2d(
                nin,
                nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,
                bias=False,
            ),
            # pointwise 1x1 mixes channels up to nout
            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
            nn.BatchNorm2d(nout),
            activ(),
        )

    def __call__(self, x):
        return self.conv(x)
+
+
class Encoder(nn.Module):
    """Two-conv encoder stage returning (downsampled, skip) (duplicate of
    lib_v5/layers.py)."""

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def __call__(self, x):
        skip = self.conv1(x)  # full-resolution features for the decoder
        h = self.conv2(skip)  # strided conv does the downsampling

        return h, skip
+
+
class Decoder(nn.Module):
    """Upsample + optional skip-concat + conv decoder stage (duplicate of
    lib_v5/layers.py)."""

    def __init__(
        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
    ):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def __call__(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        if skip is not None:
            # center-crop the skip to match the upsampled spatial size
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
+
+
class ASPPModule(nn.Module):
    """Seven-branch ASPP block used by the larger models: pooled context,
    1x1, and five dilated separable branches, concatenated (nin * 7) and
    bottlenecked to nout channels."""

    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # per-row global context branch (height pooled to 1, width kept)
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
        )
        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
        )
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
        )
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        # NOTE(review): conv6/conv7 reuse dilations[2] even though the
        # dilations tuple has 5 entries — dilations[3]/dilations[4] may have
        # been intended. Do NOT change without checking against the
        # pretrained checkpoints: a different dilation alters inference
        # output even with identical weights.
        self.conv6 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv7 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # broadcast the pooled context back to the input resolution
        feat1 = F.interpolate(
            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
        )
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        feat6 = self.conv6(x)
        feat7 = self.conv7(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        bottle = self.bottleneck(out)
        return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/tools/uvr5/lib/lib_v5/layers_537227KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..50c214e1aa8202b0fb4466e52e317cb2ac5bb1e7
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_537227KB.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv6 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv7 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ feat6 = self.conv6(x)
+ feat7 = self.conv7(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/tools/uvr5/lib/lib_v5/layers_537238KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..50c214e1aa8202b0fb4466e52e317cb2ac5bb1e7
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_537238KB.py
@@ -0,0 +1,126 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class SeperableConv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(SeperableConv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nin,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ groups=nin,
+ bias=False,
+ ),
+ nn.Conv2d(nin, nout, kernel_size=1, bias=False),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
+
+ def __call__(self, x):
+ skip = self.conv1(x)
+ h = self.conv2(skip)
+
+ return h, skip
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+ h = self.conv(x)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
+ self.conv3 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv6 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.conv7 = SeperableConv2DBNActiv(
+ nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = nn.Sequential(
+ Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ feat6 = self.conv6(x)
+ feat7 = self.conv7(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
+ bottle = self.bottleneck(out)
+ return bottle
diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/tools/uvr5/lib/lib_v5/layers_new.py
new file mode 100644
index 0000000000000000000000000000000000000000..83321555e0b488e2b6a08417ef4687bd186590c1
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/layers_new.py
@@ -0,0 +1,125 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
+class Conv2DBNActiv(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
+ super(Conv2DBNActiv, self).__init__()
+ self.conv = nn.Sequential(
+ nn.Conv2d(
+ nin,
+ nout,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ dilation=dilation,
+ bias=False,
+ ),
+ nn.BatchNorm2d(nout),
+ activ(),
+ )
+
+ def __call__(self, x):
+ return self.conv(x)
+
+
+class Encoder(nn.Module):
+ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
+ super(Encoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
+ self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+
+ def __call__(self, x):
+ h = self.conv1(x)
+ h = self.conv2(h)
+
+ return h
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
+ ):
+ super(Decoder, self).__init__()
+ self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
+ # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def __call__(self, x, skip=None):
+ x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
+
+ if skip is not None:
+ skip = spec_utils.crop_center(skip, x)
+ x = torch.cat([x, skip], dim=1)
+
+ h = self.conv1(x)
+ # h = self.conv2(h)
+
+ if self.dropout is not None:
+ h = self.dropout(h)
+
+ return h
+
+
+class ASPPModule(nn.Module):
+ def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
+ super(ASPPModule, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, None)),
+ Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
+ )
+ self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
+ self.conv3 = Conv2DBNActiv(
+ nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
+ )
+ self.conv4 = Conv2DBNActiv(
+ nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
+ )
+ self.conv5 = Conv2DBNActiv(
+ nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
+ )
+ self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
+ self.dropout = nn.Dropout2d(0.1) if dropout else None
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+ feat1 = F.interpolate(
+ self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
+ )
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
+ out = self.bottleneck(out)
+
+ if self.dropout is not None:
+ out = self.dropout(out)
+
+ return out
+
+
+class LSTMModule(nn.Module):
+ def __init__(self, nin_conv, nin_lstm, nout_lstm):
+ super(LSTMModule, self).__init__()
+ self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
+ self.lstm = nn.LSTM(
+ input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
+ )
+ self.dense = nn.Sequential(
+ nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
+ )
+
+ def forward(self, x):
+ N, _, nbins, nframes = x.size()
+ h = self.conv(x)[:, 0] # N, nbins, nframes
+ h = h.permute(2, 0, 1) # nframes, N, nbins
+ h, _ = self.lstm(h)
+ h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
+ h = h.reshape(nframes, N, 1, nbins)
+ h = h.permute(1, 2, 3, 0)
+
+ return h
diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/tools/uvr5/lib/lib_v5/model_param_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a886051ad37dcd8b7be29ff9443294c85f6add0
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/model_param_init.py
@@ -0,0 +1,69 @@
+import json
+import os
+import pathlib
+
+default_param = {}
+default_param["bins"] = 768
+default_param["unstable_bins"] = 9 # training only
+default_param["reduction_bins"] = 762 # training only
+default_param["sr"] = 44100
+default_param["pre_filter_start"] = 757
+default_param["pre_filter_stop"] = 768
+default_param["band"] = {}
+
+
+default_param["band"][1] = {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 960,
+ "crop_start": 0,
+ "crop_stop": 245,
+ "lpf_start": 61, # inference only
+ "res_type": "polyphase",
+}
+
+default_param["band"][2] = {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 1536,
+ "crop_start": 24,
+ "crop_stop": 547,
+ "hpf_start": 81, # inference only
+ "res_type": "sinc_best",
+}
+
+
+def int_keys(d):
+ r = {}
+ for k, v in d:
+ if k.isdigit():
+ k = int(k)
+ r[k] = v
+ return r
+
+
+class ModelParameters(object):
+ def __init__(self, config_path=""):
+ if ".pth" == pathlib.Path(config_path).suffix:
+ import zipfile
+
+ with zipfile.ZipFile(config_path, "r") as zip:
+ self.param = json.loads(
+ zip.read("param.json"), object_pairs_hook=int_keys
+ )
+ elif ".json" == pathlib.Path(config_path).suffix:
+ with open(config_path, "r") as f:
+ self.param = json.loads(f.read(), object_pairs_hook=int_keys)
+ else:
+ self.param = default_param
+
+ for k in [
+ "mid_side",
+ "mid_side_b",
+ "mid_side_b2",
+ "stereo_w",
+ "stereo_n",
+ "reverse",
+ ]:
+ if not k in self.param:
+ self.param[k] = False
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json
new file mode 100644
index 0000000000000000000000000000000000000000..da097dc2277d18511bf66487f847fb745d93731f
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 16000,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 16000,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 1024
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac49901adccf0838b0792ea4da517f4b2093e168
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 32000,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 32000,
+ "pre_filter_start": 1000,
+ "pre_filter_stop": 1021
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f5d9d9e175aeb2e82b9d64d168176808d44f521
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 33075,
+ "hl": 384,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 33075,
+ "pre_filter_start": 1000,
+ "pre_filter_stop": 1021
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json
new file mode 100644
index 0000000000000000000000000000000000000000..7cc7a4282f2530b8957ec6791355d6a2f98ad76d
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 1024,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 1024
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json
new file mode 100644
index 0000000000000000000000000000000000000000..c0b8c2c0f82b67a92aaff92822abded210c2a6a9
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json
@@ -0,0 +1,19 @@
+{
+ "bins": 256,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 0,
+ "crop_stop": 256,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 256,
+ "pre_filter_stop": 256
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json
new file mode 100644
index 0000000000000000000000000000000000000000..7cd0ed370c815803f4cede8aa7cf2f36d77e2a7a
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 1024,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 1024
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json
new file mode 100644
index 0000000000000000000000000000000000000000..2663ce443f3f2b3416029080356ad564281a991b
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json
@@ -0,0 +1,19 @@
+{
+ "bins": 1024,
+ "unstable_bins": 0,
+ "reduction_bins": 0,
+ "band": {
+ "1": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 700,
+ "hpf_start": -1,
+ "res_type": "sinc_best"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1023,
+ "pre_filter_stop": 700
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json b/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json
new file mode 100644
index 0000000000000000000000000000000000000000..f537c435f1786c3eca8e082ee21dc0ce0eba817c
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/2band_32000.json
@@ -0,0 +1,30 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 705,
+ "band": {
+ "1": {
+ "sr": 6000,
+ "hl": 66,
+ "n_fft": 512,
+ "crop_start": 0,
+ "crop_stop": 240,
+ "lpf_start": 60,
+ "lpf_stop": 118,
+ "res_type": "sinc_fastest"
+ },
+ "2": {
+ "sr": 32000,
+ "hl": 352,
+ "n_fft": 1024,
+ "crop_start": 22,
+ "crop_stop": 505,
+ "hpf_start": 44,
+ "hpf_stop": 23,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 32000,
+ "pre_filter_start": 710,
+ "pre_filter_stop": 731
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json b/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d4d6f3ccfaa85af9e0e54b23ef1f7d3de1613d3
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json
@@ -0,0 +1,30 @@
+{
+ "bins": 512,
+ "unstable_bins": 7,
+ "reduction_bins": 510,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 160,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 192,
+ "lpf_start": 41,
+ "lpf_stop": 139,
+ "res_type": "sinc_fastest"
+ },
+ "2": {
+ "sr": 44100,
+ "hl": 640,
+ "n_fft": 1024,
+ "crop_start": 10,
+ "crop_stop": 320,
+ "hpf_start": 47,
+ "hpf_stop": 15,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 510,
+ "pre_filter_stop": 512
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json b/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json
new file mode 100644
index 0000000000000000000000000000000000000000..be075f52e4a8ddba952cb2fc608b29e089e7f9f9
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/2band_48000.json
@@ -0,0 +1,30 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 705,
+ "band": {
+ "1": {
+ "sr": 6000,
+ "hl": 66,
+ "n_fft": 512,
+ "crop_start": 0,
+ "crop_stop": 240,
+ "lpf_start": 60,
+ "lpf_stop": 240,
+ "res_type": "sinc_fastest"
+ },
+ "2": {
+ "sr": 48000,
+ "hl": 528,
+ "n_fft": 1536,
+ "crop_start": 22,
+ "crop_stop": 505,
+ "hpf_start": 82,
+ "hpf_stop": 22,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 48000,
+ "pre_filter_start": 710,
+ "pre_filter_stop": 731
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json
new file mode 100644
index 0000000000000000000000000000000000000000..d99e23986cf7e68be023e3cf382b5d131409095d
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/3band_44100.json
@@ -0,0 +1,42 @@
+{
+ "bins": 768,
+ "unstable_bins": 5,
+ "reduction_bins": 733,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 278,
+ "lpf_start": 28,
+ "lpf_stop": 140,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 768,
+ "crop_start": 14,
+ "crop_stop": 322,
+ "hpf_start": 70,
+ "hpf_stop": 14,
+ "lpf_start": 283,
+ "lpf_stop": 314,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 131,
+ "crop_stop": 313,
+ "hpf_start": 154,
+ "hpf_stop": 141,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 757,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc2c487dd52d91beb32d69bc36ad8e3b6124978b
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json
@@ -0,0 +1,43 @@
+{
+ "mid_side": true,
+ "bins": 768,
+ "unstable_bins": 5,
+ "reduction_bins": 733,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 278,
+ "lpf_start": 28,
+ "lpf_stop": 140,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 768,
+ "crop_start": 14,
+ "crop_stop": 322,
+ "hpf_start": 70,
+ "hpf_stop": 14,
+ "lpf_start": 283,
+ "lpf_stop": 314,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 131,
+ "crop_stop": 313,
+ "hpf_start": 154,
+ "hpf_stop": 141,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 757,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json
new file mode 100644
index 0000000000000000000000000000000000000000..33b0877c2e964657af2c648b71cbb84ff6b1e581
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json
@@ -0,0 +1,43 @@
+{
+ "mid_side_b2": true,
+ "bins": 640,
+ "unstable_bins": 7,
+ "reduction_bins": 565,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 108,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 187,
+ "lpf_start": 92,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 216,
+ "n_fft": 768,
+ "crop_start": 0,
+ "crop_stop": 212,
+ "hpf_start": 68,
+ "hpf_stop": 34,
+ "lpf_start": 174,
+ "lpf_stop": 209,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 432,
+ "n_fft": 640,
+ "crop_start": 66,
+ "crop_stop": 307,
+ "hpf_start": 86,
+ "hpf_stop": 72,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 639,
+ "pre_filter_stop": 640
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ae850a08f6fe11e7a5a0267f3be35f993cc4eb6
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100.json
@@ -0,0 +1,54 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json
new file mode 100644
index 0000000000000000000000000000000000000000..6346701543891938e69fc35754b58b8da9b561d6
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json
@@ -0,0 +1,55 @@
+{
+ "bins": 768,
+ "unstable_bins": 7,
+ "mid_side": true,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json
new file mode 100644
index 0000000000000000000000000000000000000000..0bf477114c585236da7c48ffd81960919da38b81
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json
@@ -0,0 +1,55 @@
+{
+ "mid_side_b": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json
new file mode 100644
index 0000000000000000000000000000000000000000..0bf477114c585236da7c48ffd81960919da38b81
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json
@@ -0,0 +1,55 @@
+{
+ "mid_side_b": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json
new file mode 100644
index 0000000000000000000000000000000000000000..779a1c908357cccedcd22b695ca68df13c1967bd
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json
@@ -0,0 +1,55 @@
+{
+ "reverse": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fefd4aa50bf6c744294fbb305888742c96e4c4c
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_44100_sw.json
@@ -0,0 +1,55 @@
+{
+ "stereo_w": true,
+ "bins": 768,
+ "unstable_bins": 7,
+ "reduction_bins": 668,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 1024,
+ "crop_start": 0,
+ "crop_stop": 186,
+ "lpf_start": 37,
+ "lpf_stop": 73,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 11025,
+ "hl": 128,
+ "n_fft": 512,
+ "crop_start": 4,
+ "crop_stop": 185,
+ "hpf_start": 36,
+ "hpf_stop": 18,
+ "lpf_start": 93,
+ "lpf_stop": 185,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 22050,
+ "hl": 256,
+ "n_fft": 512,
+ "crop_start": 46,
+ "crop_stop": 186,
+ "hpf_start": 93,
+ "hpf_stop": 46,
+ "lpf_start": 164,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 512,
+ "n_fft": 768,
+ "crop_start": 121,
+ "crop_stop": 382,
+ "hpf_start": 138,
+ "hpf_stop": 123,
+ "res_type": "sinc_medium"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 740,
+ "pre_filter_stop": 768
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json
new file mode 100644
index 0000000000000000000000000000000000000000..af798108de02a7243335e71be5c57e4094a5d7b1
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_v2.json
@@ -0,0 +1,54 @@
+{
+ "bins": 672,
+ "unstable_bins": 8,
+ "reduction_bins": 637,
+ "band": {
+ "1": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 640,
+ "crop_start": 0,
+ "crop_stop": 85,
+ "lpf_start": 25,
+ "lpf_stop": 53,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 320,
+ "crop_start": 4,
+ "crop_stop": 87,
+ "hpf_start": 25,
+ "hpf_stop": 12,
+ "lpf_start": 31,
+ "lpf_stop": 62,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 14700,
+ "hl": 160,
+ "n_fft": 512,
+ "crop_start": 17,
+ "crop_stop": 216,
+ "hpf_start": 48,
+ "hpf_stop": 24,
+ "lpf_start": 139,
+ "lpf_stop": 210,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 480,
+ "n_fft": 960,
+ "crop_start": 78,
+ "crop_stop": 383,
+ "hpf_start": 130,
+ "hpf_stop": 86,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 668,
+ "pre_filter_stop": 672
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json
new file mode 100644
index 0000000000000000000000000000000000000000..319b99810f364946da7a30b15b916a5309981608
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_v2_sn.json
@@ -0,0 +1,55 @@
+{
+ "bins": 672,
+ "unstable_bins": 8,
+ "reduction_bins": 637,
+ "band": {
+ "1": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 640,
+ "crop_start": 0,
+ "crop_stop": 85,
+ "lpf_start": 25,
+ "lpf_stop": 53,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 320,
+ "crop_start": 4,
+ "crop_stop": 87,
+ "hpf_start": 25,
+ "hpf_stop": 12,
+ "lpf_start": 31,
+ "lpf_stop": 62,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 14700,
+ "hl": 160,
+ "n_fft": 512,
+ "crop_start": 17,
+ "crop_stop": 216,
+ "hpf_start": 48,
+ "hpf_stop": 24,
+ "lpf_start": 139,
+ "lpf_stop": 210,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 480,
+ "n_fft": 960,
+ "crop_start": 78,
+ "crop_stop": 383,
+ "hpf_start": 130,
+ "hpf_stop": 86,
+ "convert_channels": "stereo_n",
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 668,
+ "pre_filter_stop": 672
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json b/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a73bc97ac545145a75bdca7addc5d59f5b8574b
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/4band_v3.json
@@ -0,0 +1,54 @@
+{
+ "bins": 672,
+ "unstable_bins": 8,
+ "reduction_bins": 530,
+ "band": {
+ "1": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 640,
+ "crop_start": 0,
+ "crop_stop": 85,
+ "lpf_start": 25,
+ "lpf_stop": 53,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 7350,
+ "hl": 80,
+ "n_fft": 320,
+ "crop_start": 4,
+ "crop_stop": 87,
+ "hpf_start": 25,
+ "hpf_stop": 12,
+ "lpf_start": 31,
+ "lpf_stop": 62,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 14700,
+ "hl": 160,
+ "n_fft": 512,
+ "crop_start": 17,
+ "crop_stop": 216,
+ "hpf_start": 48,
+ "hpf_stop": 24,
+ "lpf_start": 139,
+ "lpf_stop": 210,
+ "res_type": "polyphase"
+ },
+ "4": {
+ "sr": 44100,
+ "hl": 480,
+ "n_fft": 960,
+ "crop_start": 78,
+ "crop_stop": 383,
+ "hpf_start": 130,
+ "hpf_stop": 86,
+ "res_type": "kaiser_fast"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 668,
+ "pre_filter_stop": 672
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/modelparams/ensemble.json b/tools/uvr5/lib/lib_v5/modelparams/ensemble.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca96bf19c593dbe127e1a013ae456ac093602e28
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/modelparams/ensemble.json
@@ -0,0 +1,43 @@
+{
+ "mid_side_b2": true,
+ "bins": 1280,
+ "unstable_bins": 7,
+ "reduction_bins": 565,
+ "band": {
+ "1": {
+ "sr": 11025,
+ "hl": 108,
+ "n_fft": 2048,
+ "crop_start": 0,
+ "crop_stop": 374,
+ "lpf_start": 92,
+ "lpf_stop": 186,
+ "res_type": "polyphase"
+ },
+ "2": {
+ "sr": 22050,
+ "hl": 216,
+ "n_fft": 1536,
+ "crop_start": 0,
+ "crop_stop": 424,
+ "hpf_start": 68,
+ "hpf_stop": 34,
+ "lpf_start": 348,
+ "lpf_stop": 418,
+ "res_type": "polyphase"
+ },
+ "3": {
+ "sr": 44100,
+ "hl": 432,
+ "n_fft": 1280,
+ "crop_start": 132,
+ "crop_stop": 614,
+ "hpf_start": 172,
+ "hpf_stop": 144,
+ "res_type": "polyphase"
+ }
+ },
+ "sr": 44100,
+ "pre_filter_start": 1280,
+ "pre_filter_stop": 1280
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/lib_v5/nets.py b/tools/uvr5/lib/lib_v5/nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d0341f05217f875e5975cf1449cd1578fc1edd5
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets.py
@@ -0,0 +1,123 @@
+import layers
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import spec_utils
+
+
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck.

    Four encoder stages widen channels from ``ch`` to ``ch * 8`` (each
    stage also returns a skip tensor); the ASPP module aggregates
    multi-scale context using ``dilations``; four decoders fuse the
    skips back down to ``ch`` output channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    # NOTE(review): defined as __call__ rather than forward(), so
    # nn.Module hooks are bypassed when this net is invoked.
    def __call__(self, x):
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        # Decoders consume the encoder skips in reverse order.
        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_123812KB.py b/tools/uvr5/lib/lib_v5/nets_123812KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..64ed70ea012b799849f58ff0f6c3172b0576a505
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_123812KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_123821KB as layers
+
+
class BaseASPPNet(nn.Module):
    """U-Net-like encoder/decoder around an ASPP bottleneck.

    Each encoder stage widens the channel count (up to ``ch * 8``) and
    yields a skip tensor; the ASPP module expands to ``ch * 16`` context
    features; the decoders fuse the matching skips and narrow back to
    ``ch`` output channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        feat, skip1 = self.enc1(x)
        feat, skip2 = self.enc2(feat)
        feat, skip3 = self.enc3(feat)
        feat, skip4 = self.enc4(feat)

        feat = self.aspp(feat)

        feat = self.dec4(feat, skip4)
        feat = self.dec3(feat, skip3)
        feat = self.dec2(feat, skip2)
        feat = self.dec1(feat, skip1)

        return feat
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_123821KB.py b/tools/uvr5/lib/lib_v5/nets_123821KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..64ed70ea012b799849f58ff0f6c3172b0576a505
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_123821KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_123821KB as layers
+
+
class BaseASPPNet(nn.Module):
    """U-Net-like encoder/decoder around an ASPP bottleneck.

    Each encoder stage widens the channel count (up to ``ch * 8``) and
    yields a skip tensor; the ASPP module expands to ``ch * 16`` context
    features; the decoders fuse the matching skips and narrow back to
    ``ch`` output channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        feat, skip1 = self.enc1(x)
        feat, skip2 = self.enc2(feat)
        feat, skip3 = self.enc3(feat)
        feat, skip4 = self.enc4(feat)

        feat = self.aspp(feat)

        feat = self.dec4(feat, skip4)
        feat = self.dec3(feat, skip3)
        feat = self.dec2(feat, skip2)
        feat = self.dec1(feat, skip1)

        return feat
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_33966KB.py b/tools/uvr5/lib/lib_v5/nets_33966KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cb1e208d9684d98a9a19223e6753d7d3e7f3b31
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_33966KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_33966KB as layers
+
+
class BaseASPPNet(nn.Module):
    """U-Net-like encoder/decoder around an ASPP bottleneck.

    Each encoder stage widens the channel count (up to ``ch * 8``) and
    yields a skip tensor; the ASPP module expands to ``ch * 16`` context
    features (this variant defaults to four dilation rates); the
    decoders fuse the matching skips and narrow back to ``ch`` channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        feat, skip1 = self.enc1(x)
        feat, skip2 = self.enc2(feat)
        feat, skip3 = self.enc3(feat)
        feat, skip4 = self.enc4(feat)

        feat = self.aspp(feat)

        feat = self.dec4(feat, skip4)
        feat = self.dec3(feat, skip3)
        feat = self.dec2(feat, skip2)
        feat = self.dec1(feat, skip1)

        return feat
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/tools/uvr5/lib/lib_v5/nets_537227KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..dda9b8fe03874ad9a609855c609bdbe4d70a5a6b
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_537227KB.py
@@ -0,0 +1,123 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_537238KB as layers
+
+
class BaseASPPNet(nn.Module):
    """U-Net-like encoder/decoder around an ASPP bottleneck.

    Each encoder stage widens the channel count (up to ``ch * 8``) and
    yields a skip tensor; the ASPP module expands to ``ch * 16`` context
    features; the decoders fuse the matching skips and narrow back to
    ``ch`` output channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        feat, skip1 = self.enc1(x)
        feat, skip2 = self.enc2(feat)
        feat, skip3 = self.enc3(feat)
        feat, skip4 = self.enc4(feat)

        feat = self.aspp(feat)

        feat = self.dec4(feat, skip4)
        feat = self.dec3(feat, skip3)
        feat = self.dec2(feat, skip2)
        feat = self.dec1(feat, skip1)

        return feat
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/tools/uvr5/lib/lib_v5/nets_537238KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..dda9b8fe03874ad9a609855c609bdbe4d70a5a6b
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_537238KB.py
@@ -0,0 +1,123 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_537238KB as layers
+
+
class BaseASPPNet(nn.Module):
    """U-Net-like encoder/decoder around an ASPP bottleneck.

    Each encoder stage widens the channel count (up to ``ch * 8``) and
    yields a skip tensor; the ASPP module expands to ``ch * 16`` context
    features; the decoders fuse the matching skips and narrow back to
    ``ch`` output channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        feat, skip1 = self.enc1(x)
        feat, skip2 = self.enc2(feat)
        feat, skip3 = self.enc3(feat)
        feat, skip4 = self.enc4(feat)

        feat = self.aspp(feat)

        feat = self.dec4(feat, skip4)
        feat = self.dec3(feat, skip3)
        feat = self.dec2(feat, skip2)
        feat = self.dec1(feat, skip1)

        return feat
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_61968KB.py b/tools/uvr5/lib/lib_v5/nets_61968KB.py
new file mode 100644
index 0000000000000000000000000000000000000000..64ed70ea012b799849f58ff0f6c3172b0576a505
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_61968KB.py
@@ -0,0 +1,122 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_123821KB as layers
+
+
class BaseASPPNet(nn.Module):
    """U-Net-like encoder/decoder around an ASPP bottleneck.

    Each encoder stage widens the channel count (up to ``ch * 8``) and
    yields a skip tensor; the ASPP module expands to ``ch * 16`` context
    features; the decoders fuse the matching skips and narrow back to
    ``ch`` output channels.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super(BaseASPPNet, self).__init__()
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        feat, skip1 = self.enc1(x)
        feat, skip2 = self.enc2(feat)
        feat, skip3 = self.enc3(feat)
        feat, skip4 = self.enc4(feat)

        feat = self.aspp(feat)

        feat = self.dec4(feat, skip4)
        feat = self.dec3(feat, skip3)
        feat = self.dec2(feat, skip2)
        feat = self.dec1(feat, skip1)

        return feat
+
+
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of BaseASPPNet magnitude-mask estimators.

    Stage 1 runs separate nets on the low and high halves of the
    frequency axis; stages 2 and 3 refine the full band, each seeing the
    input concatenated (channel dim) with all earlier stage outputs.
    ``forward`` returns the mask applied to the input magnitude (plus
    two auxiliary masked outputs while ``self.training`` is True).
    """

    def __init__(self, n_fft):
        super(CascadedASPPNet, self).__init__()
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        # 1x1 "bridge" convs shrink the concatenated channels before the
        # next full-band stage.
        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        # 1x1 output convs producing the 2-channel (stereo) mask.
        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1

        # Time frames trimmed from each edge in predict().
        self.offset = 128

    def forward(self, x, aggressiveness=None):
        # Untouched copy of the input magnitude; the mask multiplies it.
        mix = x.detach()
        x = x.clone()

        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: independent low-band / high-band nets, re-joined
        # along the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        aux1 = torch.cat(
            [
                self.stg1_low_band_net(x[:, :, :bandw]),
                self.stg1_high_band_net(x[:, :, bandw:]),
            ],
            dim=2,
        )

        # Stage 2: full band, conditioned on the stage-1 output.
        h = torch.cat([x, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(h))

        # Stage 3: full band, conditioned on both earlier outputs.
        h = torch.cat([x, aux1, aux2], dim=1)
        h = self.stg3_full_band_net(self.stg3_bridge(h))

        # Sigmoid mask in [0, 1], padded back to output_bin bins by
        # replicating the topmost predicted bin.
        mask = torch.sigmoid(self.out(h))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: also emit masks from stages 1 and 2.
            aux1 = torch.sigmoid(self.aux1_out(aux1))
            aux1 = F.pad(
                input=aux1,
                pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
                mode="replicate",
            )
            aux2 = torch.sigmoid(self.aux2_out(aux2))
            aux2 = F.pad(
                input=aux2,
                pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
                mode="replicate",
            )
            return mask * mix, aux1 * mix, aux2 * mix
        else:
            if aggressiveness:
                # Raise the mask to exponent 1 + value (scaled by 1/3
                # below 'split_bin'); in-place on the mask tensor.
                mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
                    mask[:, :, : aggressiveness["split_bin"]],
                    1 + aggressiveness["value"] / 3,
                )
                mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
                    mask[:, :, aggressiveness["split_bin"] :],
                    1 + aggressiveness["value"],
                )

            return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run forward, then trim ``offset`` frames from both time edges."""
        h = self.forward(x_mag, aggressiveness)

        if self.offset > 0:
            h = h[:, :, :, self.offset : -self.offset]
            assert h.size()[3] > 0

        return h
diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/tools/uvr5/lib/lib_v5/nets_new.py
new file mode 100644
index 0000000000000000000000000000000000000000..9170aae4c7b5026a1e8c0dd80d6df1f632fc995d
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/nets_new.py
@@ -0,0 +1,133 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from . import layers_new
+
+
class BaseNet(nn.Module):
    """Encoder/decoder net with an ASPP bottleneck and an LSTM branch.

    ``enc1`` is a plain conv stage; ``enc2``-``enc5`` are Encoder stages
    widening channels up to ``nout * 8``.  After ASPP, three decoders
    fuse the skips; the LSTM module's output (one channel, per the
    ``+ 1`` on ``dec1``'s input width) is concatenated before the final
    decoder.
    """

    def __init__(
        self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
    ):
        super(BaseNet, self).__init__()
        self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
        self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
        self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
        self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
        self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)

        self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)

        self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
        self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
        self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
        self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
        self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)

    def __call__(self, x):
        # NOTE(review): implemented as __call__ (not forward), which
        # bypasses nn.Module hooks; preserved as-is.
        skip1 = self.enc1(x)
        skip2 = self.enc2(skip1)
        skip3 = self.enc3(skip2)
        skip4 = self.enc4(skip3)
        out = self.aspp(self.enc5(skip4))

        out = self.dec4(out, skip4)
        out = self.dec3(out, skip3)
        out = self.dec2(out, skip2)
        # Append the LSTM branch output before the last decoder stage.
        out = torch.cat([out, self.lstm_dec2(out)], dim=1)
        out = self.dec1(out, skip1)

        return out
+
+
class CascadedNet(nn.Module):
    """Three-stage cascade of BaseNet mask predictors (newer variant).

    Stages 1 and 2 each process the low and high frequency halves with
    separate sub-nets; stage 3 sees the full band together with both
    earlier outputs.  ``forward`` returns the sigmoid mask (plus an
    auxiliary mask while ``self.training`` is True); ``predict`` applies
    the mask to the input magnitude.
    """

    def __init__(self, n_fft, nout=32, nout_lstm=128):
        super(CascadedNet, self).__init__()

        # Bins fed to the nets vs. bins expected at the output.
        self.max_bin = n_fft // 2
        self.output_bin = n_fft // 2 + 1
        self.nin_lstm = self.max_bin // 2
        # Time frames trimmed from each edge by the predict helpers.
        self.offset = 64

        # Stage-1 low band: BaseNet followed by a 1x1 conv that shrinks
        # the output to nout // 4 channels (matching the high-band net).
        self.stg1_low_band_net = nn.Sequential(
            BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
            layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
        )

        self.stg1_high_band_net = BaseNet(
            2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
        )

        # Stage-2 nets take the band input plus the stage-1 output.
        self.stg2_low_band_net = nn.Sequential(
            BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
            layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
        )
        self.stg2_high_band_net = BaseNet(
            nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
        )

        self.stg3_full_band_net = BaseNet(
            3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
        )

        # 1x1 output convs producing the 2-channel mask.
        self.out = nn.Conv2d(nout, 2, 1, bias=False)
        self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

    def forward(self, x):
        # Crop to max_bin (= n_fft // 2) frequency bins.
        x = x[:, :, : self.max_bin]

        # Stage 1: separate low/high band nets, outputs re-joined along
        # the frequency axis (dim 2).
        bandw = x.size()[2] // 2
        l1_in = x[:, :, :bandw]
        h1_in = x[:, :, bandw:]
        l1 = self.stg1_low_band_net(l1_in)
        h1 = self.stg1_high_band_net(h1_in)
        aux1 = torch.cat([l1, h1], dim=2)

        # Stage 2: each band again, conditioned on the stage-1 output
        # (channel concat).
        l2_in = torch.cat([l1_in, l1], dim=1)
        h2_in = torch.cat([h1_in, h1], dim=1)
        l2 = self.stg2_low_band_net(l2_in)
        h2 = self.stg2_high_band_net(h2_in)
        aux2 = torch.cat([l2, h2], dim=2)

        # Stage 3: full band on input + both stage outputs.
        f3_in = torch.cat([x, aux1, aux2], dim=1)
        f3 = self.stg3_full_band_net(f3_in)

        # Sigmoid mask padded back to output_bin bins (replicate top bin).
        mask = torch.sigmoid(self.out(f3))
        mask = F.pad(
            input=mask,
            pad=(0, 0, 0, self.output_bin - mask.size()[2]),
            mode="replicate",
        )

        if self.training:
            # Deep supervision: auxiliary mask from stages 1 and 2.
            aux = torch.cat([aux1, aux2], dim=1)
            aux = torch.sigmoid(self.aux_out(aux))
            aux = F.pad(
                input=aux,
                pad=(0, 0, 0, self.output_bin - aux.size()[2]),
                mode="replicate",
            )
            return mask, aux
        else:
            return mask

    def predict_mask(self, x):
        """Return the raw mask, trimmed by ``offset`` frames per time edge."""
        mask = self.forward(x)

        if self.offset > 0:
            mask = mask[:, :, :, self.offset : -self.offset]
            assert mask.size()[3] > 0

        return mask

    def predict(self, x, aggressiveness=None):
        """Return the masked magnitude, trimmed by ``offset`` frames per edge.

        NOTE(review): 'aggressiveness' is accepted for API parity with the
        older nets but is unused here.
        """
        mask = self.forward(x)
        pred_mag = x * mask

        if self.offset > 0:
            pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
            assert pred_mag.size()[3] > 0

        return pred_mag
diff --git a/tools/uvr5/lib/lib_v5/spec_utils.py b/tools/uvr5/lib/lib_v5/spec_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac05ebefa0898bb61c21db769cc58ee8f4b4d7fb
--- /dev/null
+++ b/tools/uvr5/lib/lib_v5/spec_utils.py
@@ -0,0 +1,676 @@
+import hashlib
+import json
+import math
+import os
+
+import librosa
+import numpy as np
+import soundfile as sf
+from tqdm import tqdm
+
+
def crop_center(h1, h2):
    """Center-crop ``h1`` along the time axis (dim 3) to match ``h2``'s width.

    Returns ``h1`` unchanged when the widths already agree; raises
    ``ValueError`` when ``h1`` is narrower than ``h2``.
    """
    w1 = h1.size()[3]
    w2 = h2.size()[3]

    if w1 == w2:
        return h1
    if w1 < w2:
        raise ValueError("h1_shape[3] must be greater than h2_shape[3]")

    start = (w1 - w2) // 2
    return h1[:, :, :, start : start + w2]
+
+
def wave_to_spectrogram(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """STFT a stereo wave into a (2, bins, frames) complex array.

    The channel pair may first be converted to a mid/side, "b2" mid/side,
    or time-reversed representation, matching the model's training layout.
    """
    if reverse:
        ch_a = np.flip(np.asfortranarray(wave[0]))
        ch_b = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        # mid = (L + R) / 2, side = L - R
        ch_a = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        ch_b = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        ch_a = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        ch_b = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        ch_a = np.asfortranarray(wave[0])
        ch_b = np.asfortranarray(wave[1])

    return np.asfortranarray(
        [
            librosa.stft(ch_a, n_fft=n_fft, hop_length=hop_length),
            librosa.stft(ch_b, n_fft=n_fft, hop_length=hop_length),
        ]
    )
+
+
def wave_to_spectrogram_mt(
    wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
    """Threaded variant of :func:`wave_to_spectrogram`.

    The left-channel STFT runs in a worker thread while the right channel is
    transformed on the calling thread.

    Args:
        wave: (2, n_samples) stereo signal.
        hop_length / n_fft: STFT parameters.
        mid_side / mid_side_b2 / reverse: optional channel pre-transforms,
            identical to ``wave_to_spectrogram``.

    Returns:
        Complex array of shape (2, n_fft // 2 + 1, n_frames).
    """
    import threading

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # BUGFIX: the original stored the worker's output in a module-level
    # `global spec_left`, so two concurrent calls raced on the same name.
    # A per-call container keeps the result private to this invocation.
    result = [None]

    def run_thread(**kwargs):
        result[0] = librosa.stft(**kwargs)

    thread = threading.Thread(
        target=run_thread,
        kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
    )
    thread.start()
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
    thread.join()

    spec = np.asfortranarray([result[0], spec_right])

    return spec
+
+
def combine_spectrograms(specs, mp):
    """Stack per-band spectrograms into one full-range spectrogram.

    Args:
        specs: dict mapping 1-based band index -> (2, bins, frames) complex
            array, as produced by ``wave_to_spectrogram`` per band.
        mp: ModelParameters; ``mp.param["band"][d]`` supplies each band's
            crop window, ``mp.param["bins"]`` the total bin budget.

    Returns:
        (2, mp.param["bins"] + 1, frames) complex64 Fortran-ordered array.

    Raises:
        ValueError: if the bands' crop windows exceed the bin budget.
    """
    # All bands are truncated to the shortest band's frame count.
    l = min([specs[i].shape[2] for i in specs])
    spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
    offset = 0
    bands_n = len(mp.param["band"])

    # Copy each band's cropped rows into consecutive row ranges of spec_c.
    for d in range(1, bands_n + 1):
        h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
        spec_c[:, offset : offset + h, :l] = specs[d][
            :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
        ]
        offset += h

    if offset > mp.param["bins"]:
        raise ValueError("Too much bins")

    # Lowpass filter above the pre-filter start bin.
    if (
        mp.param["pre_filter_start"] > 0
    ):  # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
        if bands_n == 1:
            # Single band: plain linear-ramp lowpass.
            spec_c = fft_lp_filter(
                spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
            )
        else:
            # Multi band: exponential per-bin roll-off whose slope eases as
            # the previous gain `gp` shrinks.
            gp = 1
            for b in range(
                mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
            ):
                g = math.pow(
                    10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
                )
                gp = g
                spec_c[:, b, :] *= g

    return np.asfortranarray(spec_c)
+
+
def spectrogram_to_image(spec, mode="magnitude"):
    """Render a spectrogram as an 8-bit image array.

    Args:
        spec: 2-D or 3-D (channels-first) array, real or complex.
        mode: "magnitude" (log-power display) or "phase".

    Returns:
        uint8 array normalized to 0..255; for 3-D input the result is HWC
        with a max-over-channels plane prepended as an extra channel.

    Raises:
        ValueError: for an unrecognized ``mode`` (the original fell through
        to an UnboundLocalError on ``y`` instead).
    """
    if mode == "magnitude":
        if np.iscomplexobj(spec):
            y = np.abs(spec)
        else:
            y = spec
        # log-power gives a perceptually useful dynamic range; the epsilon
        # guards log10(0)
        y = np.log10(y**2 + 1e-8)
    elif mode == "phase":
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            y = spec
    else:
        raise ValueError("mode must be 'magnitude' or 'phase'")

    # Normalize to 0..255 and quantize.
    y -= y.min()
    y *= 255 / y.max()
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)

    return img
+
+
def reduce_vocal_aggressively(X, y, softmask):
    """Attenuate bins of the instrumental estimate ``y`` where the residual
    ``X - y`` (treated as vocals) dominates, scaled by ``softmask``.

    Keeps ``y``'s phase and only shrinks its magnitude, clipping at zero.
    """
    residual = X - y
    y_mag = np.abs(y)
    res_mag = np.abs(residual)

    dominant = res_mag > y_mag
    reduced_mag = np.clip(y_mag - res_mag * dominant * softmask, 0, np.inf)

    return reduced_mag * np.exp(1.0j * np.angle(y))
+
+
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    """Add ``ref`` back into ``mag`` over time ranges where ``ref`` is
    near-silent, cross-fading at the range edges.

    Args:
        mag: (2, bins, frames) magnitude array; a copy is modified.
        ref: reference magnitudes of the same shape; frames whose mean over
            channels and bins falls below ``thres`` count as silent.
        thres: silence threshold on the per-frame mean of ``ref``.
        min_range: only silent runs longer than this are filled back in.
        fade_size: length of the linear fade at each edge of a run.

    Returns:
        The modified copy of ``mag``.

    Raises:
        ValueError: if ``min_range`` cannot hold two fades.
        IndexError: (NOTE(review)) if no frame is below ``thres`` —
            ``idx[0]`` on an empty index array.
    """
    if min_range < fade_size * 2:
        raise ValueError("min_range must be >= fade_area * 2")

    mag = mag.copy()

    # Frames considered silent in the reference.
    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    # Run boundaries: a new run starts wherever consecutive indices jump.
    starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
    ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            # Runs too close to the previous one share its fade-out region.
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                # Fade the reference in over [s, s + fade_size).
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
            else:
                # At the array head there is nothing to fade from; shift s so
                # the full-strength copy below starts at frame 0.
                s -= fade_size

            if e != mag.shape[2]:
                # Fade the reference out over [e - fade_size, e).
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
            else:
                # At the array tail, extend e so the copy reaches the end.
                e += fade_size

            # Full-strength copy between the two fades.
            mag[:, :, s + fade_size : e - fade_size] += ref[
                :, :, s + fade_size : e - fade_size
            ]
            old_e = e

    return mag
+
+
def align_wave_head_and_tail(a, b):
    """Trim two (channels, samples) waves to the same sample count.

    Args:
        a, b: arrays whose second axis is time; ``a[0].size`` is the sample
            count of the first channel.

    Returns:
        Views of ``a`` and ``b`` truncated along the time axis to the shorter
        of the two.  (BUGFIX: the original sliced BOTH axes with the sample
        count — ``a[:l, :l]`` — which only kept all channels by accident
        because l >= the channel count.)
    """
    l = min([a[0].size, b[0].size])

    return a[:, :l], b[:, :l]
+
+
def cache_or_load(mix_path, inst_path, mp):
    """Load a (mixture, instrumental) pair as combined multiband spectrograms,
    caching the results as .npy files keyed by a hash of the model parameters.

    Args:
        mix_path: path to the mixture audio file.
        inst_path: path to the instrumental audio file.
        mp: ModelParameters describing the band split.

    Returns:
        (X_spec_m, y_spec_m): combined complex spectrograms of equal shape.

    Raises:
        ValueError: if the two combined spectrograms differ in shape.
    """
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    # The cache directory name encodes the model parameters, so changing the
    # band layout automatically invalidates stale caches.
    cache_dir = "mph{}".format(
        hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
    )
    mix_cache_dir = os.path.join("cache", cache_dir)
    inst_cache_dir = os.path.join("cache", cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        # Cache hit: skip decoding, resampling and STFT entirely.
        X_spec_m = np.load(mix_cache_path)
        y_spec_m = np.load(inst_cache_path)
    else:
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

        # Bands are processed from the highest (native rate) down; each lower
        # band is produced by resampling the band above it.
        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):  # high-end band: decode from disk
                X_wave[d], _ = librosa.load(
                    mix_path,
                    sr = bp["sr"],
                    mono = False,
                    dtype = np.float32,
                    res_type = bp["res_type"]
                )
                y_wave[d], _ = librosa.load(
                    inst_path,
                    sr = bp["sr"],
                    mono = False,
                    dtype = np.float32,
                    res_type = bp["res_type"],
                )
            else:  # lower bands: resample from the band above
                X_wave[d] = librosa.resample(
                    X_wave[d + 1],
                    orig_sr = mp.param["band"][d + 1]["sr"],
                    target_sr = bp["sr"],
                    res_type = bp["res_type"],
                )
                y_wave[d] = librosa.resample(
                    y_wave[d + 1],
                    orig_sr = mp.param["band"][d + 1]["sr"],
                    target_sr = bp["sr"],
                    res_type = bp["res_type"],
                )

            # Trim both signals to a common length before the STFT.
            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

            X_spec_s[d] = wave_to_spectrogram(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )
            y_spec_s[d] = wave_to_spectrogram(
                y_wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        # Free the raw waveforms before combining the (large) spectrograms.
        del X_wave, y_wave

        X_spec_m = combine_spectrograms(X_spec_s, mp)
        y_spec_m = combine_spectrograms(y_spec_s, mp)

        if X_spec_m.shape != y_spec_m.shape:
            raise ValueError("The combined spectrograms are different: " + mix_path)

        _, ext = os.path.splitext(mix_path)  # NOTE(review): `ext` is unused

        np.save(mix_cache_path, X_spec_m)
        np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m
+
+
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Inverse-STFT a (2, bins, frames) spectrogram to a stereo wave,
    undoing the mid/side (or reversed) channel encoding applied on the
    forward path."""
    ch0 = librosa.istft(np.asfortranarray(spec[0]), hop_length=hop_length)
    ch1 = librosa.istft(np.asfortranarray(spec[1]), hop_length=hop_length)

    if reverse:
        stereo = [np.flip(ch0), np.flip(ch1)]
    elif mid_side:
        # mid/side back to left/right
        stereo = [np.add(ch0, ch1 / 2), np.subtract(ch0, ch1 / 2)]
    elif mid_side_b2:
        stereo = [
            np.add(ch1 / 1.25, 0.4 * ch0),
            np.subtract(ch0 / 1.25, 0.4 * ch1),
        ]
    else:
        stereo = [ch0, ch1]

    return np.asfortranarray(stereo)
+
+
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    """Threaded variant of :func:`spectrogram_to_wave`.

    The left channel's iSTFT runs in a worker thread while the right channel
    is processed on the calling thread.  NOTE: this variant's parameter order
    is (mid_side, reverse, mid_side_b2), unlike the single-threaded one.
    """
    import threading

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    # BUGFIX: the original stored the worker's output in a module-level
    # `global wave_left`, racing with any concurrent call; a per-call
    # container keeps the result private to this invocation.
    result = [None]

    def run_thread(**kwargs):
        result[0] = librosa.istft(**kwargs)

    thread = threading.Thread(
        target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
    )
    thread.start()
    wave_right = librosa.istft(spec_right, hop_length=hop_length)
    thread.join()
    wave_left = result[0]

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        return np.asfortranarray(
            [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
        )
    elif mid_side_b2:
        return np.asfortranarray(
            [
                np.add(wave_right / 1.25, 0.4 * wave_left),
                np.subtract(wave_left / 1.25, 0.4 * wave_right),
            ]
        )
    else:
        return np.asfortranarray([wave_left, wave_right])
+
+
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    """Inverse of ``combine_spectrograms``: split the combined spectrogram
    back into bands, iSTFT each at its own sample rate, and progressively
    resample and sum them up to the output rate.

    Args:
        spec_m: combined (2, bins, frames) complex spectrogram.
        mp: ModelParameters with the band layout.
        extra_bins_h / extra_bins: optional replacement content for the top
            bins of the highest band (the "high_end_process" bypass).

    Returns:
        Waveform transposed to shape (samples, 2).
    """
    wave_band = {}  # NOTE(review): never used
    bands_n = len(mp.param["band"])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = mp.param["band"][d]
        # Rebuild this band's full-height spectrogram, placing the combined
        # rows back into their original crop window.
        spec_s = np.ndarray(
            shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
        )
        h = bp["crop_stop"] - bp["crop_start"]
        spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
            :, offset : offset + h, :
        ]

        offset += h
        if d == bands_n:  # highest band
            if extra_bins_h:  # --high_end_process bypass
                max_bin = bp["n_fft"] // 2
                spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
                    :, :extra_bins_h, :
                ]
            if bp["hpf_start"] > 0:
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
            if bands_n == 1:
                # Single-band model: this IS the output wave.
                wave = spectrogram_to_wave(
                    spec_s,
                    bp["hl"],
                    mp.param["mid_side"],
                    mp.param["mid_side_b2"],
                    mp.param["reverse"],
                )
            else:
                # Add the top band onto the accumulated lower bands (already
                # resampled to this band's rate by the previous iteration).
                wave = np.add(
                    wave,
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                )
        else:
            # Target rate for this iteration: the NEXT band's sample rate.
            sr = mp.param["band"][d + 1]["sr"]
            if d == 1:  # lowest band: lowpass only, then upsample
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave = librosa.resample(
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                    orig_sr = bp["sr"],
                    target_sr = sr,
                    res_type = "sinc_fastest",
                )
            else:  # middle bands: band-pass, add, then upsample the sum
                spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
                spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
                wave2 = np.add(
                    wave,
                    spectrogram_to_wave(
                        spec_s,
                        bp["hl"],
                        mp.param["mid_side"],
                        mp.param["mid_side_b2"],
                        mp.param["reverse"],
                    ),
                )
                wave = librosa.core.resample(wave2, orig_sr=bp["sr"], target_sr=sr, res_type="scipy")

    return wave.T
+
+
def fft_lp_filter(spec, bin_start, bin_stop):
    """In-place low-pass: gain ramps linearly from 1 toward 0 across
    [bin_start, bin_stop); every bin from bin_stop upward is zeroed.
    Returns the same (mutated) array."""
    step = 1 / (bin_stop - bin_start)
    gain = 1.0
    for b in range(bin_start, bin_stop):
        gain -= step
        spec[:, b, :] = gain * spec[:, b, :]

    spec[:, bin_stop:, :] *= 0

    return spec
+
+
def fft_hp_filter(spec, bin_start, bin_stop):
    """In-place high-pass: walking DOWN from bin_start (exclusive stop at
    bin_stop), gain ramps linearly from 1 toward 0; bins 0..bin_stop are
    zeroed. Returns the same (mutated) array."""
    step = 1 / (bin_start - bin_stop)
    gain = 1.0
    for b in range(bin_start, bin_stop, -1):
        gain -= step
        spec[:, b, :] = gain * spec[:, b, :]

    spec[:, 0 : bin_stop + 1, :] *= 0

    return spec
+
+
def mirroring(a, spec_m, input_high_end, mp):
    """Synthesize the missing high band by mirroring the spectrum just below
    the model's pre-filter cutoff.

    ``a`` selects the algorithm ("mirroring" or "mirroring2"); any other
    value returns None, matching the original fall-through behavior.
    """
    if a not in ("mirroring", "mirroring2"):
        return None

    # Magnitudes of the band immediately below (cutoff - 10), flipped in
    # frequency so the spectrum folds upward.
    cutoff = mp.param["pre_filter_start"] - 10
    band = spec_m[:, cutoff - input_high_end.shape[1] : cutoff, :]
    mirror = np.flip(np.abs(band), 1)

    if a == "mirroring":
        # Reuse the input high end's phase; keep whichever of the two is the
        # weaker magnitude per bin.
        mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
        return np.where(
            np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
        )

    # "mirroring2": modulate the input high end by the mirrored magnitudes.
    mi = np.multiply(mirror, input_high_end * 1.7)
    return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
+
+
def ensembling(a, specs):
    """Fold a list of spectrograms into one by elementwise magnitude.

    ``a`` chooses "min_mag" or "max_mag"; any other value simply yields the
    first spectrogram trimmed to the shared frame count.  NOTE: entries of
    ``specs`` are rebound to trimmed views as a side effect, exactly like
    the original implementation.
    """
    for idx in range(1, len(specs)):
        if idx == 1:
            spec = specs[0]

        common = min(spec.shape[2], specs[idx].shape[2])
        spec = spec[:, :, :common]
        specs[idx] = specs[idx][:, :, :common]

        if a == "min_mag":
            spec = np.where(np.abs(specs[idx]) <= np.abs(spec), specs[idx], spec)
        if a == "max_mag":
            spec = np.where(np.abs(specs[idx]) >= np.abs(spec), specs[idx], spec)

    return spec
+
+
def stft(wave, nfft, hl):
    """Plain stereo STFT helper: (2, samples) -> (2, bins, frames)."""
    channels = [
        librosa.stft(np.asfortranarray(ch), n_fft=nfft, hop_length=hl)
        for ch in (wave[0], wave[1])
    ]
    return np.asfortranarray(channels)
+
+
def istft(spec, hl):
    """Plain stereo inverse-STFT helper: (2, bins, frames) -> (2, samples).

    BUGFIX: the original built ``wave`` and then fell off the end of the
    function, returning None; the result is now returned.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])

    return wave
+
+
if __name__ == "__main__":
    # CLI for offline experiments: invert an instrumental out of a mixture,
    # ensemble several spectrograms, or align two tracks.
    import argparse
    import sys
    import time

    import cv2
    from model_param_init import ModelParameters

    p = argparse.ArgumentParser()
    p.add_argument(
        "--algorithm",
        "-a",
        type=str,
        choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
        default="min_mag",
    )
    p.add_argument(
        "--model_params",
        "-m",
        type=str,
        default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
    )
    p.add_argument("--output_name", "-o", type=str, default="output")
    p.add_argument("--vocals_only", "-v", action="store_true")
    p.add_argument("input", nargs="+")
    args = p.parse_args()

    start_time = time.time()

    if args.algorithm.startswith("invert") and len(args.input) != 2:
        raise ValueError("There should be two input files.")

    if not args.algorithm.startswith("invert") and len(args.input) < 2:
        raise ValueError("There must be at least two input files.")

    wave, specs = {}, {}
    mp = ModelParameters(args.model_params)

    # Decode every input into a combined multiband spectrogram.
    for i in range(len(args.input)):
        spec = {}

        for d in range(len(mp.param["band"]), 0, -1):
            bp = mp.param["band"][d]

            if d == len(mp.param["band"]):  # high-end band: decode from disk
                wave[d], _ = librosa.load(
                    args.input[i],
                    sr = bp["sr"],
                    mono = False,
                    dtype = np.float32,
                    res_type = bp["res_type"],
                )

                if len(wave[d].shape) == 1:  # mono to stereo
                    wave[d] = np.array([wave[d], wave[d]])
            else:  # lower bands: resample from the band above
                wave[d] = librosa.resample(
                    wave[d + 1],
                    orig_sr = mp.param["band"][d + 1]["sr"],
                    target_sr = bp["sr"],
                    res_type = bp["res_type"],
                )

            spec[d] = wave_to_spectrogram(
                wave[d],
                bp["hl"],
                bp["n_fft"],
                mp.param["mid_side"],
                mp.param["mid_side_b2"],
                mp.param["reverse"],
            )

        specs[i] = combine_spectrograms(spec, mp)

    del wave

    if args.algorithm == "deep":
        # BUGFIX: the original compared against `spec[1]` — the leftover
        # per-band dict entry from the decoding loop above — instead of the
        # combined spectrogram `specs[1]`.
        d_spec = np.where(np.abs(specs[0]) <= np.abs(specs[1]), specs[0], specs[1])
        v_spec = d_spec - specs[1]
        sf.write(
            os.path.join("{}.wav".format(args.output_name)),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )

    if args.algorithm.startswith("invert"):
        # Trim both inputs to the common frame count.
        ln = min([specs[0].shape[2], specs[1].shape[2]])
        specs[0] = specs[0][:, :, :ln]
        specs[1] = specs[1][:, :, :ln]

        if "invert_p" == args.algorithm:
            # Phase-aware inversion using the max magnitude per bin.
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
            v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
        else:
            specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
            v_spec = specs[0] - specs[1]

        if not args.vocals_only:
            X_mag = np.abs(specs[0])
            y_mag = np.abs(specs[1])
            v_mag = np.abs(v_spec)

            X_image = spectrogram_to_image(X_mag)
            y_image = spectrogram_to_image(y_mag)
            v_image = spectrogram_to_image(v_mag)

            cv2.imwrite("{}_X.png".format(args.output_name), X_image)
            cv2.imwrite("{}_y.png".format(args.output_name), y_image)
            cv2.imwrite("{}_v.png".format(args.output_name), v_image)

            sf.write(
                "{}_X.wav".format(args.output_name),
                cmb_spectrogram_to_wave(specs[0], mp),
                mp.param["sr"],
            )
            sf.write(
                "{}_y.wav".format(args.output_name),
                cmb_spectrogram_to_wave(specs[1], mp),
                mp.param["sr"],
            )

        sf.write(
            "{}_v.wav".format(args.output_name),
            cmb_spectrogram_to_wave(v_spec, mp),
            mp.param["sr"],
        )
    else:
        if not args.algorithm == "deep":
            sf.write(
                os.path.join("ensembled", "{}.wav".format(args.output_name)),
                cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
                mp.param["sr"],
            )

    if args.algorithm == "align":
        trackalignment = [
            {
                "file1": '"{}"'.format(args.input[0]),
                "file2": '"{}"'.format(args.input[1]),
            }
        ]

        for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
            os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")

    # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
diff --git a/tools/uvr5/lib/name_params.json b/tools/uvr5/lib/name_params.json
new file mode 100644
index 0000000000000000000000000000000000000000..0390a523878c82076c7de2bdbe580f7927ac1f7e
--- /dev/null
+++ b/tools/uvr5/lib/name_params.json
@@ -0,0 +1,263 @@
+{
+ "equivalent" : [
+ {
+ "model_hash_name" : [
+ {
+ "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
+ "param_name": "4band_v2"
+ },
+ {
+ "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
+ "param_name": "4band_v2"
+ },
+ {
+ "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "a82f14e75892e55e994376edbf0c8435",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
+ "param_name": "4band_v2_sn"
+ },
+ {
+ "hash_name": "08611fb99bd59eaa79ad27c58d137727",
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
+ "param_name": "4band_v2_sn"
+ },
+ {
+ "hash_name": "5c7bbca45a187e81abbbd351606164e5",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
+ "param_name": "3band_44100_msb2"
+ },
+ {
+ "hash_name": "d6b2cb685a058a091e5e7098192d3233",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
+ "param_name": "3band_44100_msb2"
+ },
+ {
+ "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "68aa2c8093d0080704b200d140f59e54",
+ "model_params": "lib/lib_v5/modelparams/3band_44100.json",
+ "param_name": "3band_44100"
+ },
+ {
+ "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
+ "param_name": "3band_44100_mid.json"
+ },
+ {
+ "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
+ "param_name": "3band_44100_mid.json"
+ },
+ {
+ "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100.json"
+ },
+ {
+ "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100.json"
+ },
+ {
+ "hash_name": "89e83b511ad474592689e562d5b1f80e",
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
+ "param_name": "2band_32000.json"
+ },
+ {
+ "hash_name": "0b954da81d453b716b114d6d7c95177f",
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
+ "param_name": "2band_32000.json"
+ }
+
+ ],
+ "v4 Models": [
+ {
+ "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
+ "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
+ "param_name": "1band_sr16000_hl512"
+ },
+ {
+ "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr32000_hl512"
+ },
+ {
+ "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr32000_hl512"
+ },
+ {
+ "hash_name": "80ab74d65e515caa3622728d2de07d23",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr32000_hl512"
+ },
+ {
+ "hash_name": "edc115e7fc523245062200c00caa847f",
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
+ "param_name": "1band_sr33075_hl384"
+ },
+ {
+ "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
+ "param_name": "1band_sr33075_hl384"
+ },
+ {
+ "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
+ "param_name": "1band_sr44100_hl512"
+ },
+ {
+ "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
+ "param_name": "1band_sr44100_hl512"
+ },
+ {
+ "hash_name": "ae702fed0238afb5346db8356fe25f13",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
+ "param_name": "1band_sr44100_hl1024"
+ }
+ ]
+ }
+ ],
+ "User Models" : [
+ {
+ "1 Band": [
+ {
+ "hash_name": "1band_sr16000_hl512",
+ "model_params": "lib/lib_v5/modelparams/1band_sr16000_hl512.json",
+ "param_name": "1band_sr16000_hl512"
+ },
+ {
+ "hash_name": "1band_sr32000_hl512",
+ "model_params": "lib/lib_v5/modelparams/1band_sr32000_hl512.json",
+ "param_name": "1band_sr16000_hl512"
+ },
+ {
+ "hash_name": "1band_sr33075_hl384",
+ "model_params": "lib/lib_v5/modelparams/1band_sr33075_hl384.json",
+ "param_name": "1band_sr33075_hl384"
+ },
+ {
+ "hash_name": "1band_sr44100_hl256",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl256.json",
+ "param_name": "1band_sr44100_hl256"
+ },
+ {
+ "hash_name": "1band_sr44100_hl512",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl512.json",
+ "param_name": "1band_sr44100_hl512"
+ },
+ {
+ "hash_name": "1band_sr44100_hl1024",
+ "model_params": "lib/lib_v5/modelparams/1band_sr44100_hl1024.json",
+ "param_name": "1band_sr44100_hl1024"
+ }
+ ],
+ "2 Band": [
+ {
+ "hash_name": "2band_44100_lofi",
+ "model_params": "lib/lib_v5/modelparams/2band_44100_lofi.json",
+ "param_name": "2band_44100_lofi"
+ },
+ {
+ "hash_name": "2band_32000",
+ "model_params": "lib/lib_v5/modelparams/2band_32000.json",
+ "param_name": "2band_32000"
+ },
+ {
+ "hash_name": "2band_48000",
+ "model_params": "lib/lib_v5/modelparams/2band_48000.json",
+ "param_name": "2band_48000"
+ }
+ ],
+ "3 Band": [
+ {
+ "hash_name": "3band_44100",
+ "model_params": "lib/lib_v5/modelparams/3band_44100.json",
+ "param_name": "3band_44100"
+ },
+ {
+ "hash_name": "3band_44100_mid",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_mid.json",
+ "param_name": "3band_44100_mid"
+ },
+ {
+ "hash_name": "3band_44100_msb2",
+ "model_params": "lib/lib_v5/modelparams/3band_44100_msb2.json",
+ "param_name": "3band_44100_msb2"
+ }
+ ],
+ "4 Band": [
+ {
+ "hash_name": "4band_44100",
+ "model_params": "lib/lib_v5/modelparams/4band_44100.json",
+ "param_name": "4band_44100"
+ },
+ {
+ "hash_name": "4band_44100_mid",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_mid.json",
+ "param_name": "4band_44100_mid"
+ },
+ {
+ "hash_name": "4band_44100_msb",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_msb.json",
+ "param_name": "4band_44100_msb"
+ },
+ {
+ "hash_name": "4band_44100_msb2",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_msb2.json",
+ "param_name": "4band_44100_msb2"
+ },
+ {
+ "hash_name": "4band_44100_reverse",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_reverse.json",
+ "param_name": "4band_44100_reverse"
+ },
+ {
+ "hash_name": "4band_44100_sw",
+ "model_params": "lib/lib_v5/modelparams/4band_44100_sw.json",
+ "param_name": "4band_44100_sw"
+ },
+ {
+ "hash_name": "4band_v2",
+ "model_params": "lib/lib_v5/modelparams/4band_v2.json",
+ "param_name": "4band_v2"
+ },
+ {
+ "hash_name": "4band_v2_sn",
+ "model_params": "lib/lib_v5/modelparams/4band_v2_sn.json",
+ "param_name": "4band_v2_sn"
+ },
+ {
+ "hash_name": "tmodelparam",
+ "model_params": "lib/lib_v5/modelparams/tmodelparam.json",
+ "param_name": "User Model Param Set"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/tools/uvr5/lib/utils.py b/tools/uvr5/lib/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cd969dc69261f3e1e5c1255d15b648b378f5ec
--- /dev/null
+++ b/tools/uvr5/lib/utils.py
@@ -0,0 +1,121 @@
+import json
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+
def load_data(file_name: str = "./lib/name_params.json") -> dict:
    """Load the model-name -> parameter-file lookup table from JSON."""
    with open(file_name, "r") as fp:
        return json.load(fp)
+
+
def make_padding(width, cropsize, offset):
    """Compute (left_pad, right_pad, roi_size) so that ``width`` frames split
    into whole windows of ``roi_size`` with ``offset`` frames of context on
    each side of the crop window."""
    left = offset
    roi_size = cropsize - 2 * left
    if roi_size == 0:
        # degenerate case: the offsets consume the whole crop window
        roi_size = cropsize
    right = roi_size - (width % roi_size) + left
    return left, right, roi_size
+
+
def inference(X_spec, device, model, aggressiveness, data):
    """Run windowed mask inference over a full spectrogram.

    Args:
        X_spec: complex spectrogram, last axis is time (frames).
        device: torch device the model lives on.
        model: network exposing ``.offset``, ``.state_dict()`` and
            ``.predict(window, aggressiveness)``.
        aggressiveness: vocal-reduction setting forwarded to model.predict.
        data: config dict; uses "window_size" and "tta".

    Returns:
        (pred, X_mag, phase): predicted magnitudes rescaled by the input's
        peak, the input magnitudes, and the unit-phase array exp(1j*angle).
    """

    def _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
    ):
        # Slide a window of data["window_size"] frames across the padded
        # magnitude spectrogram and concatenate the per-window predictions.
        model.eval()
        with torch.no_grad():
            preds = []

            iterations = [n_window]  # NOTE(review): unused

            total_iterations = sum(iterations)  # NOTE(review): unused
            for i in tqdm(range(n_window)):
                start = i * roi_size
                X_mag_window = X_mag_pad[
                    None, :, :, start : start + data["window_size"]
                ]
                X_mag_window = torch.from_numpy(X_mag_window)
                if is_half:
                    X_mag_window = X_mag_window.half()
                X_mag_window = X_mag_window.to(device)

                pred = model.predict(X_mag_window, aggressiveness)

                pred = pred.detach().cpu().numpy()
                preds.append(pred[0])

            pred = np.concatenate(preds, axis=2)
        return pred

    def preprocess(X_spec):
        # Split the complex spectrogram into magnitude and phase.
        X_mag = np.abs(X_spec)
        X_phase = np.angle(X_spec)

        return X_mag, X_phase

    X_mag, X_phase = preprocess(X_spec)

    # Normalize magnitudes to [0, 1]; rescaled by `coef` on the way out.
    coef = X_mag.max()
    X_mag_pre = X_mag / coef

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    # Infer half-precision mode from the checkpoint's first tensor.
    if list(model.state_dict().values())[0].dtype == torch.float16:
        is_half = True
    else:
        is_half = False
    pred = _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
    )
    pred = pred[:, :, :n_frame]

    if data["tta"]:
        # Test-time augmentation: run again shifted by half a window and
        # average the two predictions.
        pad_l += roi_size // 2
        pad_r += roi_size // 2
        n_window += 1

        X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

        pred_tta = _execute(
            X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
        )
        pred_tta = pred_tta[:, :, roi_size // 2 :]
        pred_tta = pred_tta[:, :, :n_frame]

        return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
    else:
        return pred * coef, X_mag, np.exp(1.0j * X_phase)
+
+
def _get_name_params(model_path, model_hash):
    """Resolve (param_name, model_params_path) for a model checkpoint.

    Walks name_params.json, matching first by file hash and then by
    substring of the model path.  Within the "equivalent" section the first
    match wins immediately; elsewhere the last match seen is returned after
    the walk.  NOTE: raises UnboundLocalError when nothing matches at all
    (behavior preserved from the original).
    """
    data = load_data()
    matched = False
    search_name = model_path

    for section in list(data):
        for group in list(data[section][0]):
            entries = data[section][0][group]
            for idx in range(len(entries)):
                hash_name = str(entries[idx]["hash_name"])
                if hash_name == model_hash:
                    matched = True
                elif hash_name in search_name:
                    matched = True

                if matched:
                    model_params_auto = entries[idx]["model_params"]
                    param_name_auto = entries[idx]["param_name"]
                    if section == "equivalent":
                        return param_name_auto, model_params_auto
                    else:
                        matched = False

    return param_name_auto, model_params_auto
diff --git a/tools/uvr5/mdxnet.py b/tools/uvr5/mdxnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..151ac92c8aed9a0799b542fc140e12e8e03a7530
--- /dev/null
+++ b/tools/uvr5/mdxnet.py
@@ -0,0 +1,256 @@
+import os
+import logging
+
+logger = logging.getLogger(__name__)
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from tqdm import tqdm
+
+cpu = torch.device("cpu")
+
+
class ConvTDFNetTrim:
    """STFT/iSTFT front-end for the MDX Conv-TDF ONNX models.

    Only the spectrogram plumbing lives here; the network itself is run by
    onnxruntime (see Predictor).  NOTE(review): this is a plain object, not
    an nn.Module, despite the super().__init__() call.
    """

    def __init__(
        self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024
    ):
        super(ConvTDFNetTrim, self).__init__()

        # dim_t is given as an exponent: the model consumes 2**dim_t frames.
        self.dim_f = dim_f
        self.dim_t = 2**dim_t
        self.n_fft = n_fft
        self.hop = hop
        self.n_bins = self.n_fft // 2 + 1
        # samples per chunk so that a centered STFT yields exactly dim_t frames
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(
            device
        )
        self.target_name = target_name
        self.blender = "blender" in model_name

        # 4 channels = 2 audio channels x (real, imag)
        self.dim_c = 4
        out_c = self.dim_c * 4 if target_name == "*" else self.dim_c
        # zero filler for the bins above dim_f that the model does not output
        self.freq_pad = torch.zeros(
            [1, out_c, self.n_bins - self.dim_f, self.dim_t]
        ).to(device)

        self.n = L // 2

    def stft(self, x):
        """(B, 2, chunk_size) audio -> (B, dim_c, dim_f, dim_t) real/imag stack."""
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(
            x,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        x = torch.view_as_real(x)
        x = x.permute([0, 3, 1, 2])
        # fold (stereo, re/im) into one channel axis of size dim_c = 4
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
            [-1, self.dim_c, self.n_bins, self.dim_t]
        )
        # crop to the dim_f bins the model actually sees
        return x[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        """Inverse of stft(): re-pad the cropped bins, then resynthesize audio
        of shape (B, c, chunk_size)."""
        freq_pad = (
            self.freq_pad.repeat([x.shape[0], 1, 1, 1])
            if freq_pad is None
            else freq_pad
        )
        x = torch.cat([x, freq_pad], -2)
        c = 4 * 2 if self.target_name == "*" else 2
        x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
            [-1, 2, self.n_bins, self.dim_t]
        )
        x = x.permute([0, 2, 3, 1])
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(
            x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
        )
        return x.reshape([-1, c, self.chunk_size])
+
+
def get_models(device, dim_f, dim_t, n_fft):
    """Build the single Conv-TDF vocal-model wrapper used by Predictor."""
    config = {
        "device": device,
        "model_name": "Conv-TDF",
        "target_name": "vocals",
        "L": 11,
        "dim_f": dim_f,
        "dim_t": dim_t,
        "n_fft": n_fft,
    }
    return ConvTDFNetTrim(**config)
+
+
class Predictor:
    """Chunked MDX-Net inference driver.

    Splits the input audio into margin-overlapped chunks, runs the ONNX
    model per chunk, trims the overlaps and stitches the results back
    together.
    """

    def __init__(self, args):
        """``args`` must provide: onnx (model dir), dim_f, dim_t, n_fft,
        margin, chunks, denoise."""
        import onnxruntime as ort

        logger.info(ort.get_available_providers())
        self.args = args
        # Torch-side STFT/iSTFT wrapper; tensors stay on CPU.
        self.model_ = get_models(
            device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft
        )
        # The network weights live in <onnx dir>/vocals.onnx; provider order
        # prefers CUDA, then DirectML, then CPU.
        self.model = ort.InferenceSession(
            os.path.join(args.onnx, self.model_.target_name + ".onnx"),
            providers=[
                "CUDAExecutionProvider",
                "DmlExecutionProvider",
                "CPUExecutionProvider",
            ],
        )
        logger.info("ONNX load done")

    def demix(self, mix):
        """Split (2, n_samples) audio into overlapping chunks and demix.

        Chunks are keyed by their start offset; each carries `margin` extra
        samples of context on both sides (except at the edges).
        """
        samples = mix.shape[-1]
        margin = self.args.margin
        # args.chunks is a duration in seconds at the fixed 44.1 kHz rate
        chunk_size = self.args.chunks * 44100
        assert not margin == 0, "margin cannot be zero!"
        if margin > chunk_size:
            margin = chunk_size

        segmented_mix = {}

        # chunks == 0 means "process the whole file in one go".
        if self.args.chunks == 0 or samples < chunk_size:
            chunk_size = samples

        counter = -1
        for skip in range(0, samples, chunk_size):
            counter += 1

            s_margin = 0 if counter == 0 else margin
            end = min(skip + chunk_size + margin, samples)

            start = skip - s_margin

            segmented_mix[skip] = mix[:, start:end].copy()
            if end == samples:
                break

        sources = self.demix_base(segmented_mix, margin_size=margin)
        """
        mix:(2,big_sample)
        segmented_mix:offset->(2,small_sample)
        sources:(1,2,big_sample)
        """
        return sources

    def demix_base(self, mixes, margin_size):
        """Run the model over each chunk, trim margin_size overlap from the
        interior chunk edges, and concatenate along time."""
        chunked_sources = []
        progress_bar = tqdm(total=len(mixes))
        progress_bar.set_description("Processing")
        for mix in mixes:
            cmix = mixes[mix]
            sources = []
            n_sample = cmix.shape[1]
            model = self.model_
            trim = model.n_fft // 2
            gen_size = model.chunk_size - 2 * trim
            pad = gen_size - n_sample % gen_size
            # Zero-pad: `trim` samples of silence on each side plus `pad` to a
            # whole number of generation windows.
            mix_p = np.concatenate(
                (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
            )
            mix_waves = []
            i = 0
            while i < n_sample + pad:
                waves = np.array(mix_p[:, i : i + model.chunk_size])
                mix_waves.append(waves)
                i += gen_size
            mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)
            with torch.no_grad():
                _ort = self.model
                spek = model.stft(mix_waves)
                if self.args.denoise:
                    # Run on the negated input too and average the (negated)
                    # result: cancels sign-dependent artifacts.
                    spec_pred = (
                        -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
                        + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
                    )
                    tar_waves = model.istft(torch.tensor(spec_pred))
                else:
                    tar_waves = model.istft(
                        torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
                    )
                # Strip the silence trim and the length padding, then flatten
                # the window axis back into one continuous signal.
                tar_signal = (
                    tar_waves[:, :, trim:-trim]
                    .transpose(0, 1)
                    .reshape(2, -1)
                    .numpy()[:, :-pad]
                )

            # Drop the margin overlap, except at the very first/last chunk.
            start = 0 if mix == 0 else margin_size
            end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
            if margin_size == 0:
                end = None
            sources.append(tar_signal[:, start:end])

            progress_bar.update(1)

            chunked_sources.append(sources)
        _sources = np.concatenate(chunked_sources, axis=-1)
        # del self.model
        progress_bar.close()
        return _sources

    def prediction(self, m, vocal_root, others_root, format):
        """Separate file ``m``: writes <basename>_main_vocal and _others into
        the given roots, converting via ffmpeg when format is not wav/flac."""
        os.makedirs(vocal_root, exist_ok=True)
        os.makedirs(others_root, exist_ok=True)
        basename = os.path.basename(m)
        mix, rate = librosa.load(m, mono=False, sr=44100)
        if mix.ndim == 1:
            # mono input: duplicate to stereo
            mix = np.asfortranarray([mix, mix])
        mix = mix.T
        sources = self.demix(mix.T)
        opt = sources[0].T
        # `opt` is the model's output ("others"); vocals = mixture - others.
        if format in ["wav", "flac"]:
            sf.write(
                "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
            )
            sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
        else:
            # Write wav first, then transcode with ffmpeg and delete the wav
            # only if the transcode produced a file.
            path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
            path_other = "%s/%s_others.wav" % (others_root, basename)
            sf.write(path_vocal, mix - opt, rate)
            sf.write(path_other, opt, rate)
            opt_path_vocal = path_vocal[:-4] + ".%s" % format
            opt_path_other = path_other[:-4] + ".%s" % format
            if os.path.exists(path_vocal):
                os.system(
                    "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_vocal, opt_path_vocal)
                )
                if os.path.exists(opt_path_vocal):
                    try:
                        os.remove(path_vocal)
                    except:
                        pass
            if os.path.exists(path_other):
                os.system(
                    "ffmpeg -i '%s' -vn '%s' -q:a 2 -y" % (path_other, opt_path_other)
                )
                if os.path.exists(opt_path_other):
                    try:
                        os.remove(path_other)
                    except:
                        pass
+
+
class MDXNetDereverb:
    """Dereverb wrapper around the FoxJoy MDX ONNX model.

    Exposes the same ``_path_audio_`` interface as the VR-architecture
    AudioPre classes so the UVR5 pipeline can use either interchangeably.
    """

    def __init__(self, chunks):
        # Weights are expected next to this file under uvr5_weights/.
        self.onnx = "%s/uvr5_weights/onnx_dereverb_By_FoxJoy"%os.path.dirname(os.path.abspath(__file__))
        self.shifts = 10  # 'Predict with randomised equivariant stabilisation'
        self.mixing = "min_mag"  # ['default','min_mag','max_mag']
        self.chunks = chunks
        self.margin = 44100  # one second of chunk overlap at 44.1 kHz
        self.dim_t = 9  # 2**9 = 512 STFT frames per model window
        self.dim_f = 3072
        self.n_fft = 6144
        self.denoise = True
        # Predictor reads the attributes set above through `self`.
        self.pred = Predictor(self)
        self.device = cpu

    def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False):
        # is_hp3 is accepted for interface compatibility and ignored.
        self.pred.prediction(input, vocal_root, others_root, format)
diff --git a/tools/uvr5/uvr5_weights/.gitignore b/tools/uvr5/uvr5_weights/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..005717ead0bb8f920c00d76feb8207deb7946a57
--- /dev/null
+++ b/tools/uvr5/uvr5_weights/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/tools/uvr5/vr.py b/tools/uvr5/vr.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e63a8d6d4ef8c93683312245d6d70ee189fbbf2
--- /dev/null
+++ b/tools/uvr5/vr.py
@@ -0,0 +1,370 @@
+import os,sys
+parent_directory = os.path.dirname(os.path.abspath(__file__))
+import logging,pdb
+logger = logging.getLogger(__name__)
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+from lib.lib_v5 import nets_61968KB as Nets
+from lib.lib_v5 import spec_utils
+from lib.lib_v5.model_param_init import ModelParameters
+from lib.lib_v5.nets_new import CascadedNet
+from lib.utils import inference
+
+
class AudioPre:
    """Vocal/instrument separator using the UVR5 CascadedASPPNet (VR) models.

    Loads a 4band_v2 checkpoint and exposes `_path_audio_`, which splits one
    audio file into an instrumental track and a vocal track and writes them
    to the given output directories.
    """

    def __init__(self, agg, model_path, device, is_half, tta=False):
        # agg: vocal-extraction aggressiveness (later scaled by /100).
        # model_path: path to the .pth checkpoint to load.
        # device: torch device the model runs on.
        # is_half: run the network in fp16 when True.
        # tta: enable test-time augmentation in the inference helper.
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            "postprocess": False,
            "tta": tta,
            # Constants
            "window_size": 512,
            "agg": agg,
            "high_end_process": "mirroring",
        }
        mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v2.json"%parent_directory)
        model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    def _path_audio_(
        self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
    ):
        """Separate `music_file` and write instrument/vocal outputs.

        Outputs go to `ins_root` / `vocal_root` (each skipped when None).
        For formats other than wav/flac, a wav is written first and then
        converted with ffmpeg. When `is_hp3` is True the two roots and the
        filename prefixes are swapped (HP3 models emit the tracks reversed).
        """
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
        bands_n = len(self.mp.param["band"])
        # print(bands_n)
        # Build a spectrogram per frequency band, highest band first; each
        # lower band is obtained by resampling the band above it.
        for d in range(bands_n, 0, -1):
            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
                (
                    X_wave[d],
                    _,
                ) = librosa.core.load(  # In theory librosa may mis-read some audio; ffmpeg would be more robust but was judged too much hassle to integrate.
                    music_file,
                    sr = bp["sr"],
                    mono = False,
                    dtype = np.float32,
                    res_type = bp["res_type"],
                )
                if X_wave[d].ndim == 1:
                    # mono input: duplicate the channel to fake stereo
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.core.resample(
                    X_wave[d + 1],
                    orig_sr = self.mp.param["band"][d + 1]["sr"],
                    target_sr = bp["sr"],
                    res_type = bp["res_type"],
                )
            # Stft of wave source
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                self.mp.param["mid_side"],
                self.mp.param["mid_side_b2"],
                self.mp.param["reverse"],
            )
            # pdb.set_trace()
            if d == bands_n and self.data["high_end_process"] != "none":
                # Save the raw high-frequency slice so it can be mirrored
                # back into the output spectrograms after inference.
                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
                )
                input_high_end = X_spec_s[d][
                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
                ]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggresive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggresive_set,
            "split_bin": self.mp.param["band"][1]["crop_stop"],
        }
        with torch.no_grad():
            pred, X_mag, X_phase = inference(
                X_spec_m, self.device, self.model, aggressiveness, self.data
            )
        # Postprocess
        if self.data["postprocess"]:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        # y = predicted (instrument) magnitude recombined with the original
        # phase; v = residual spectrogram, treated as the vocals.
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        if is_hp3 == True:
            # HP3 models output vocal/instrument swapped relative to the rest.
            ins_root,vocal_root = vocal_root,ins_root

        if ins_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
                )
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
                    y_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
            logger.info("%s instruments done" % name)
            if is_hp3 == True:
                head = "vocal_"
            else:
                head = "instrument_"
            if format in ["wav", "flac"]:
                # float [-1, 1] -> int16 PCM
                sf.write(
                    os.path.join(
                        ins_root,
                        head + "{}_{}.{}".format(name, self.data["agg"], format),
                    ),
                    (np.array(wav_instrument) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )  #
            else:
                # Write a wav first, transcode via ffmpeg, then drop the wav.
                path = os.path.join(
                    ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
                )
                sf.write(
                    path,
                    (np.array(wav_instrument) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
                    if os.path.exists(opt_format_path):
                        try:
                            os.remove(path)
                        except:
                            pass
        if vocal_root is not None:
            if is_hp3 == True:
                head = "instrument_"
            else:
                head = "vocal_"
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
                )
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
                    v_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
            logger.info("%s vocals done" % name)
            if format in ["wav", "flac"]:
                sf.write(
                    os.path.join(
                        vocal_root,
                        head + "{}_{}.{}".format(name, self.data["agg"], format),
                    ),
                    (np.array(wav_vocals) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )
            else:
                # Same wav-then-ffmpeg conversion path as the instrument track.
                path = os.path.join(
                    vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
                )
                sf.write(
                    path,
                    (np.array(wav_vocals) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
                    if os.path.exists(opt_format_path):
                        try:
                            os.remove(path)
                        except:
                            pass
+
class AudioPreDeEcho:
    """De-echo / de-reverb separator using the UVR5 CascadedNet (VR) models.

    Loads a 4band_v3 checkpoint. Same pipeline as AudioPre, but note the
    swapped `vocal_root`/`ins_root` parameter order in `_path_audio_`: for
    the three VR de-echo models the vocal and instrument outputs are
    reversed, so the prefixes written here are intentionally crossed.
    """

    def __init__(self, agg, model_path, device, is_half, tta=False):
        # agg: vocal-extraction aggressiveness (later scaled by /100).
        # model_path: path to the .pth checkpoint; "DeReverb" in the name
        #             selects the larger (nout=64) network variant.
        # device: torch device the model runs on.
        # is_half: run the network in fp16 when True.
        # tta: enable test-time augmentation in the inference helper.
        self.model_path = model_path
        self.device = device
        self.data = {
            # Processing Options
            "postprocess": False,
            "tta": tta,
            # Constants
            "window_size": 512,
            "agg": agg,
            "high_end_process": "mirroring",
        }
        mp = ModelParameters("%s/lib/lib_v5/modelparams/4band_v3.json"%parent_directory)
        nout = 64 if "DeReverb" in model_path else 48
        model = CascadedNet(mp.param["bins"] * 2, nout)
        cpk = torch.load(model_path, map_location="cpu")
        model.load_state_dict(cpk)
        model.eval()
        if is_half:
            model = model.half().to(device)
        else:
            model = model.to(device)

        self.mp = mp
        self.model = model

    def _path_audio_(
        self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
    ):  # For the three VR de-echo models, vocal and instrument are swapped.
        """Separate `music_file`; write the model's two stems to the roots.

        `is_hp3` is accepted only for interface compatibility with AudioPre.
        """
        if ins_root is None and vocal_root is None:
            return "No save root."
        name = os.path.basename(music_file)
        if ins_root is not None:
            os.makedirs(ins_root, exist_ok=True)
        if vocal_root is not None:
            os.makedirs(vocal_root, exist_ok=True)
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
        bands_n = len(self.mp.param["band"])
        # print(bands_n)
        # Build a spectrogram per frequency band, highest band first; each
        # lower band is obtained by resampling the band above it.
        for d in range(bands_n, 0, -1):
            bp = self.mp.param["band"][d]
            if d == bands_n:  # high-end band
                (
                    X_wave[d],
                    _,
                ) = librosa.core.load(  # In theory librosa may mis-read some audio; ffmpeg would be more robust but was judged too much hassle to integrate.
                    music_file,
                    sr = bp["sr"],
                    mono = False,
                    dtype = np.float32,
                    res_type = bp["res_type"],
                )
                if X_wave[d].ndim == 1:
                    # mono input: duplicate the channel to fake stereo
                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
            else:  # lower bands
                X_wave[d] = librosa.core.resample(
                    X_wave[d + 1],
                    orig_sr = self.mp.param["band"][d + 1]["sr"],
                    target_sr = bp["sr"],
                    res_type = bp["res_type"],
                )
            # Stft of wave source
            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
                X_wave[d],
                bp["hl"],
                bp["n_fft"],
                self.mp.param["mid_side"],
                self.mp.param["mid_side_b2"],
                self.mp.param["reverse"],
            )
            # pdb.set_trace()
            if d == bands_n and self.data["high_end_process"] != "none":
                # Save the raw high-frequency slice so it can be mirrored
                # back into the output spectrograms after inference.
                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
                )
                input_high_end = X_spec_s[d][
                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
                ]

        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggresive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggresive_set,
            "split_bin": self.mp.param["band"][1]["crop_stop"],
        }
        with torch.no_grad():
            pred, X_mag, X_phase = inference(
                X_spec_m, self.device, self.model, aggressiveness, self.data
            )
        # Postprocess
        if self.data["postprocess"]:
            pred_inv = np.clip(X_mag - pred, 0, np.inf)
            pred = spec_utils.mask_silence(pred, pred_inv)
        # y = predicted magnitude with original phase; v = residual.
        y_spec_m = pred * X_phase
        v_spec_m = X_spec_m - y_spec_m

        if ins_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
                )
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
                    y_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
            logger.info("%s instruments done" % name)
            if format in ["wav", "flac"]:
                # float [-1, 1] -> int16 PCM; "vocal_" prefix because the
                # de-echo models' stems are swapped (see class docstring).
                sf.write(
                    os.path.join(
                        ins_root,
                        "vocal_{}_{}.{}".format(name, self.data["agg"], format),
                    ),
                    (np.array(wav_instrument) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )  #
            else:
                # Write a wav first, transcode via ffmpeg, then drop the wav.
                path = os.path.join(
                    ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
                )
                sf.write(
                    path,
                    (np.array(wav_instrument) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
                    if os.path.exists(opt_format_path):
                        try:
                            os.remove(path)
                        except:
                            pass
        if vocal_root is not None:
            if self.data["high_end_process"].startswith("mirroring"):
                input_high_end_ = spec_utils.mirroring(
                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
                )
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
                    v_spec_m, self.mp, input_high_end_h, input_high_end_
                )
            else:
                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
            logger.info("%s vocals done" % name)
            if format in ["wav", "flac"]:
                sf.write(
                    os.path.join(
                        vocal_root,
                        "instrument_{}_{}.{}".format(name, self.data["agg"], format),
                    ),
                    (np.array(wav_vocals) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )
            else:
                # Same wav-then-ffmpeg conversion path as above.
                path = os.path.join(
                    vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
                )
                sf.write(
                    path,
                    (np.array(wav_vocals) * 32768).astype("int16"),
                    self.mp.param["sr"],
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
                    os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
                    if os.path.exists(opt_format_path):
                        try:
                            os.remove(path)
                        except:
                            pass
diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e024dabc029dd74b4cd9d0b5df67c1ebdce1e35
--- /dev/null
+++ b/tools/uvr5/webui.py
@@ -0,0 +1,190 @@
+import os
+import traceback,gradio as gr
+import logging
+from tools.i18n.i18n import I18nAuto
+from tools.my_utils import clean_path
+i18n = I18nAuto()
+
+logger = logging.getLogger(__name__)
+import librosa,ffmpeg
+import soundfile as sf
+import torch
+import sys
+from mdxnet import MDXNetDereverb
+from vr import AudioPre, AudioPreDeEcho
+from bsroformer import BsRoformer_Loader
+
# Directory scanned for selectable UVR5 model weights.
weight_uvr5_root = "tools/uvr5/uvr5_weights"

# Collect model names: .pth/.ckpt checkpoints plus ONNX model directories,
# with checkpoint extensions stripped for display.
uvr5_names = []
for name in os.listdir(weight_uvr5_root):
    if name.endswith(".pth") or name.endswith(".ckpt") or "onnx" in name:
        uvr5_names.append(name.replace(".pth", "").replace(".ckpt", ""))


def _parse_bool(flag):
    """Parse a CLI boolean flag safely.

    Replaces ``eval(sys.argv[...])`` — evaluating untrusted command-line text
    is an arbitrary-code-execution hole. Accepts the same literals the old
    eval handled ("True"/"False") plus common lowercase variants.
    """
    return str(flag).strip().lower() in ("true", "1", "yes")


# CLI arguments: device string, half-precision flag, server port, share flag.
device = sys.argv[1]
is_half = _parse_bool(sys.argv[2])
webui_port_uvr5 = int(sys.argv[3])
is_share = _parse_bool(sys.argv[4])
+
def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
    """Gradio generator: batch-separate audio files with the chosen UVR5 model.

    Yields the accumulated per-file status log after each file. Inputs come
    from `inp_root` when it is non-empty, otherwise from the uploaded `paths`.
    Files that are not already stereo 44.1 kHz are first reformatted with
    ffmpeg into the TEMP directory. `agg` is the extraction aggressiveness
    and `format0` the output format ("wav"/"flac"/"mp3"/"m4a").
    """
    infos = []
    try:
        inp_root = clean_path(inp_root)
        save_root_vocal = clean_path(save_root_vocal)
        save_root_ins = clean_path(save_root_ins)
        is_hp3 = "HP3" in model_name  # HP3 models emit their stems swapped
        # Instantiate the separator matching the selected model family.
        if model_name == "onnx_dereverb_By_FoxJoy":
            pre_fun = MDXNetDereverb(15)
        elif model_name == "Bs_Roformer" or "bs_roformer" in model_name.lower():
            func = BsRoformer_Loader
            pre_fun = func(
                model_path = os.path.join(weight_uvr5_root, model_name + ".ckpt"),
                device = device,
                is_half=is_half
            )
        else:
            func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
            pre_fun = func(
                agg=int(agg),
                model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
                device=device,
                is_half=is_half,
            )
        if inp_root != "":
            paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
        else:
            paths = [path.name for path in paths]
        for path in paths:
            inp_path = os.path.join(inp_root, path)
            if(os.path.isfile(inp_path)==False):continue
            need_reformat = 1
            done = 0
            try:
                # Fast path: already stereo 44.1 kHz — process directly.
                info = ffmpeg.probe(inp_path, cmd="ffprobe")
                if (
                    info["streams"][0]["channels"] == 2
                    and info["streams"][0]["sample_rate"] == "44100"
                ):
                    need_reformat = 0
                    pre_fun._path_audio_(
                        inp_path, save_root_ins, save_root_vocal, format0,is_hp3
                    )
                    done = 1
            except:
                # Probe failure or processing error: fall back to reformatting.
                need_reformat = 1
                traceback.print_exc()
            if need_reformat == 1:
                # Convert to 16-bit stereo 44.1 kHz wav in TEMP, then process.
                tmp_path = "%s/%s.reformatted.wav" % (
                    os.path.join(os.environ["TEMP"]),
                    os.path.basename(inp_path),
                )
                os.system(
                    f'ffmpeg -i "{inp_path}" -vn -acodec pcm_s16le -ac 2 -ar 44100 "{tmp_path}" -y'
                )
                inp_path = tmp_path
            try:
                if done == 0:
                    pre_fun._path_audio_(
                        inp_path, save_root_ins, save_root_vocal, format0,is_hp3
                    )
                infos.append("%s->Success" % (os.path.basename(inp_path)))
                yield "\n".join(infos)
            except:
                # Record the per-file traceback in the UI log, keep going.
                infos.append(
                    "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
                )
                yield "\n".join(infos)
    except:
        infos.append(traceback.format_exc())
        yield "\n".join(infos)
    finally:
        # Best effort: free model weights and GPU cache before returning.
        try:
            if model_name == "onnx_dereverb_By_FoxJoy":
                del pre_fun.pred.model
                del pre_fun.pred.model_
            else:
                del pre_fun.model
            del pre_fun
        except:
            traceback.print_exc()
        print("clean_empty_cache")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    yield "\n".join(infos)
+
# Build and launch the UVR5 web interface.
# NOTE: the "<br>" join literals below were garbled (stripped to raw newlines
# inside the string expressions) in the previous revision, which is a syntax
# error; they are restored here. The help text is fed to gr.Markdown, which
# renders the <br> tags as line breaks.
with gr.Blocks(title="UVR5 WebUI") as app:
    gr.Markdown(
        value=
        i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
    )
    with gr.Tabs():
        with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
            with gr.Group():
                # Usage notes: model families and recommended processing order.
                gr.Markdown(
                    value=i18n("人声伴奏分离批量处理, 使用UVR5模型。") + "<br>" + \
                        i18n("合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。") + "<br>" + \
                        i18n("模型分为三类:") + "<br>" + \
                        i18n("1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;") + "<br>" + \
                        i18n("2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;") + "<br>" + \
                        i18n("3、去混响、去延迟模型(by FoxJoy):") + "<br>" + \
                        i18n("(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;") + "<br>" + \
                        i18n("(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。") + "<br>" + \
                        i18n("去混响/去延迟,附:") + "<br>" + \
                        i18n("1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;") + "<br>" + \
                        i18n("2、MDX-Net-Dereverb模型挺慢的;") + "<br>" + \
                        i18n("3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。")
                )
                with gr.Row():
                    with gr.Column():
                        dir_wav_input = gr.Textbox(
                            label=i18n("输入待处理音频文件夹路径"),
                            placeholder="C:\\Users\\Desktop\\todo-songs",
                        )
                        wav_inputs = gr.File(
                            file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
                        )
                    with gr.Column():
                        model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names)
                        agg = gr.Slider(
                            minimum=0,
                            maximum=20,
                            step=1,
                            label=i18n("人声提取激进程度"),
                            value=10,
                            interactive=True,
                            visible=False,  # not exposed to users yet
                        )
                        opt_vocal_root = gr.Textbox(
                            label=i18n("指定输出主人声文件夹"), value="output/uvr5_opt"
                        )
                        opt_ins_root = gr.Textbox(
                            label=i18n("指定输出非主人声文件夹"), value="output/uvr5_opt"
                        )
                        format0 = gr.Radio(
                            label=i18n("导出文件格式"),
                            choices=["wav", "flac", "mp3", "m4a"],
                            value="flac",
                            interactive=True,
                        )
                    but2 = gr.Button(i18n("转换"), variant="primary")
                    vc_output4 = gr.Textbox(label=i18n("输出信息"))
                # Wire the convert button to the `uvr` generator defined above.
                but2.click(
                    uvr,
                    [
                        model_choose,
                        dir_wav_input,
                        opt_vocal_root,
                        wav_inputs,
                        opt_ins_root,
                        agg,
                        format0,
                    ],
                    [vc_output4],
                    api_name="uvr_convert",
                )
app.queue(concurrency_count=511, max_size=1022).launch(
    server_name="0.0.0.0",
    inbrowser=True,
    share=is_share,
    server_port=webui_port_uvr5,
    quiet=True,
)
diff --git a/webui_nohup.py b/webui_nohup.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fc5ba1fd4a2316fb8a1b68538553b404e87cf3b
--- /dev/null
+++ b/webui_nohup.py
@@ -0,0 +1,59 @@
# Launcher variant of the main WebUI intended for nohup/background runs:
# performs environment setup (TEMP cleanup, sys.path wiring via a .pth file)
# before importing the heavy UI dependencies.
import os,shutil,sys,pdb,re
version="v2"#if sys.argv[-1]=="v2" else"v1"
os.environ["version"]=version
now_dir = os.getcwd()
sys.path.insert(0, now_dir)
import json,yaml,warnings,torch
import platform
import psutil
import signal

warnings.filterwarnings("ignore")
torch.manual_seed(233333)
# Use a project-local TEMP directory and wipe stale entries from previous
# runs; jieba's cache file is kept so tokenizer startup stays fast.
tmp = os.path.join(now_dir, "TEMP")
os.makedirs(tmp, exist_ok=True)
os.environ["TEMP"] = tmp
if(os.path.exists(tmp)):
    for name in os.listdir(tmp):
        if(name=="jieba.cache"):continue
        path="%s/%s"%(tmp,name)
        # files -> os.remove, directories -> shutil.rmtree
        delete=os.remove if os.path.isfile(path) else shutil.rmtree
        try:
            delete(path)
        except Exception as e:
            print(str(e))
            pass
import site
site_packages_roots = []
for path in site.getsitepackages():
    if "packages" in path:
        site_packages_roots.append(path)
# Fallback for the bundled Windows runtime layout.
if(site_packages_roots==[]):site_packages_roots=["%s/runtime/Lib/site-packages" % now_dir]
#os.environ["OPENBLAS_NUM_THREADS"] = "4"
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
os.environ["all_proxy"] = ""
# Write a users.pth into site-packages so the project subdirectories are
# importable without installation; stop at the first writable location.
for site_packages_root in site_packages_roots:
    if os.path.exists(site_packages_root):
        try:
            with open("%s/users.pth" % (site_packages_root), "w") as f:
                f.write(
                    "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
                    % (now_dir, now_dir, now_dir, now_dir, now_dir)
                )
            break
        except PermissionError:
            pass
from tools import my_utils
import traceback
import shutil
import pdb
import gradio as gr
from subprocess import Popen
import signal
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
from scipy.io import wavfile
from tools.my_utils import load_audio
from multiprocessing import cpu_count

# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # fall back to CPU when an op is unsupported on MPS