Voice-Cloning-for-Bilibili

Runtime error

App Files Files Community

Create app._use_multi_repo_ids_.py

#10

by nijisakai - opened Jul 25, 2023

base: refs/heads/main

←

from: refs/pr/10

Discussion Files changed

+298

-0

Files changed (1) hide show

app._use_multi_repo_ids_.py +298 -0

app._use_multi_repo_ids_.py ADDED Viewed

	@@ -0,0 +1,298 @@

+import json
+import os
+import subprocess
+from pathlib import Path
+import gradio as gr
+import librosa
+import numpy as np
+import torch
+from demucs.apply import apply_model
+from demucs.pretrained import DEFAULT_MODEL, get_model
+from huggingface_hub import hf_hub_download, list_repo_files
+from so_vits_svc_fork.hparams import HParams
+from so_vits_svc_fork.inference.core import Svc
+###################################################################
+# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+###################################################################
+# The Hugging Face Hub repo IDs - edit repo_ids here; any repo holding an already-trained model can be used.
+repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay","nijisakai/Eric_Cartman"]
+# If None, uses the latest ckpt (highest-numbered G_*.pth) in the repo
+ckpt_name = None
+# If None, uses "kmeans.pt" if it exists in the repo
+cluster_model_name = None
+# Set the default f0 type to use - use the one it was trained on.
+# The default for so-vits-svc-fork is "dio".
+# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+default_f0_method = "crepe"
+# The default ratio of cluster inference to SVC inference.
+# If the cluster model is not found in a repo, this is set to 0.
+default_cluster_infer_ratio = 0.5
+# Limit (passed to librosa as seconds) on the duration of audio at inference time; increase it if you can.
+# In the parent app, the limit is set to 30 seconds with the MAX_DURATION_SECONDS env var.
+# If you didn't set the env var and you run out of memory, try changing 9e9 to <=300ish.
+duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
+###################################################################
+models = []
+speakers = []
+for repo_id in repo_ids:
+    # Figure out the latest generator by taking highest value one.
+    # Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
+    if ckpt_name is None:
+        latest_id = sorted(
+            [
+                int(Path(x).stem.split("_")[1])
+                for x in list_repo_files(repo_id)
+                if x.startswith("G_") and x.endswith(".pth")
+            ]
+        )[-1]
+        ckpt_name = f"G_{latest_id}.pth"
+    cluster_model_name = cluster_model_name or "kmeans.pt"
+    if cluster_model_name in list_repo_files(repo_id):
+        print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
+        cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+    else:
+        print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
+        cluster_model_path = None
+    default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+    generator_path = hf_hub_download(repo_id, ckpt_name)
+    config_path = hf_hub_download(repo_id, "config.json")
+    hparams = HParams(**json.loads(Path(config_path).read_text()))
+    speaker = list(hparams.spk.keys())
+    speakers.extend(speaker)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
+    models.append(model)
+    # Reset ckpt_name and cluster_model_name for the next iteration
+    ckpt_name = None
+    cluster_model_name = None
+demucs_model = get_model(DEFAULT_MODEL)
+def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
+    wav, sr = librosa.load(filename, mono=False, sr=sr)
+    wav = torch.tensor(wav)
+    ref = wav.mean(0)
+    wav = (wav - ref.mean()) / ref.std()
+    sources = apply_model(
+        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
+    )[0]
+    sources = sources * ref.std() + ref.mean()
+    vocal_wav = sources[-1]
+    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
+    vocal_wav = vocal_wav.numpy()
+    vocal_wav = librosa.to_mono(vocal_wav)
+    vocal_wav = vocal_wav.T
+    instrumental_wav = sources[:-1].sum(0).numpy().T
+    return vocal_wav, instrumental_wav
+def download_youtube_clip(
+    video_identifier,
+    start_time,
+    end_time,
+    output_filename,
+    num_attempts=5,
+    url_base="https://www.youtube.com/watch?v=",
+    quiet=False,
+    force=False,
+):
+    output_path = Path(output_filename)
+    if output_path.exists():
+        if not force:
+            return output_path
+        else:
+            output_path.unlink()
+    quiet = "--quiet --no-warnings" if quiet else ""
+    command = f"""
+        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
+    """.strip()
+    attempts = 0
+    while True:
+        try:
+            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError:
+            attempts += 1
+            if attempts == num_attempts:
+                return None
+        else:
+            break
+    if output_path.exists():
+        return output_path
+    else:
+        return None
+def predict(
+    speaker,
+    audio,
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "crepe",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    model = models[speakers.index(speaker)]
+    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+    audio = model.infer_silence(
+        audio.astype(np.float32),
+        speaker=speaker,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        absolute_thresh=absolute_thresh,
+    )
+    return model.target_sample, audio
+def predict_song_from_yt(
+    ytid_or_url,
+    start,
+    end,
+    speaker=speakers[0],
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "dio",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    model = models[speakers.index(speaker)]
+    end = min(start + duration_limit, end)
+    original_track_filepath = download_youtube_clip(
+        ytid_or_url,
+        start,
+        end,
+        "track.wav",
+        force=True,
+        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
+    )
+    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
+    if transpose != 0:
+        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
+    cloned_vox = model.infer_silence(
+        vox_wav.astype(np.float32),
+        speaker=speaker,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        absolute_thresh=absolute_thresh,
+    )
+    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
+    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
+description = f"""
+<center>💡 - 如何使用此程序：在页面上方选择“从B站视频上传”模块，填写视频网址和视频起止时间后，点击“submit”按键即可！您还可以点击页面最下方的示例快速预览效果</center>
+""".strip()
+article = """
+<p style='text-align: center'> 注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。
+</p>
+""".strip()
+interface_mic = gr.Interface(
+    predict,
+    inputs=[
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手选择🎶"),
+        gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降 (如果使用，建议0.5左右)"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs="audio",
+    cache_examples=False,
+    title="🌊💕🎶 - 滔滔AI+音乐：可从B站直接上传素材，无需分离背景音",
+    description=description,
+    article=article,
+)
+interface_file = gr.Interface(
+    predict,
+    inputs=[
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手选择🎶"),
+        gr.Audio(type="filepath", source="upload", label="请上传您想转换的歌曲 (仅人声部分)"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降 (如果使用，建议0.5左右)"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs="audio",
+    cache_examples=False,
+    title="🌊💕🎶 可从B站直接上传素材，无需分离背景音",
+    description=description,
+    article=article,
+)
+interface_yt = gr.Interface(
+    predict_song_from_yt,
+    inputs=[
+        gr.Textbox(
+            label="Bilibili网址", info="请填写含有您喜欢歌曲的Bilibili网址，可直接填写相应的BV号"
+        ),
+        gr.Number(value=0, label="起始时间 (秒)"),
+        gr.Number(value=15, label="结束时间 (秒)"),
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手选择🎶"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs=[gr.Audio(label="AI歌手+伴奏🎵"), gr.Audio(label="AI歌手人声部分🎤")],
+    title="🌊💕🎶 - 可从B站直接上传素材，无需分离背景音",
+    description=description,
+    article=article,
+    cache_examples=False,
+)
+interface = gr.TabbedInterface(
+    [interface_yt, interface_mic, interface_file],
+    ["📺 - 从B站视频上传 ⭐推荐⭐", "🎙️ - 从麦克风上传", "🎵 - 从文件上传"],
+)
+if __name__ == "__main__":
+    interface.launch(show_error=True)