Voice-Cloning-for-Bilibili

Runtime error

App Files Files Community

nijisakai

kevinwang676 committed on Jul 24, 2023

Commit

12e4312

•

0 Parent(s):

Duplicate from kevinwang676/Voice-Cloning-for-Bilibili

Browse files

Co-authored-by: Kevin Wang <[email protected]>

Files changed (8) hide show

.gitattributes +34 -0
Makefile +11 -0
README.md +14 -0
app.py +306 -0
packages.txt +3 -0
pyproject.toml +17 -0
requirements.txt +6 -0
training_so_vits_svc_fork.ipynb +540 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Makefile ADDED Viewed

	@@ -0,0 +1,11 @@

+.PHONY: quality style
+# Check that source code meets quality standards
+quality:
+	black --check --diff .
+	ruff .
+# Format source code automatically
+style:
+	black .
+	ruff . --fix

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Voice Cloning
+emoji: 😻
+colorFrom: blue
+colorTo: yellow
+sdk: gradio
+sdk_version: 3.27.0
+app_file: app.py
+pinned: false
+license: mit
+duplicated_from: kevinwang676/Voice-Cloning-for-Bilibili
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import json
+import os
+import subprocess
+from pathlib import Path
+import gradio as gr
+import librosa
+import numpy as np
+import torch
+from demucs.apply import apply_model
+from demucs.pretrained import DEFAULT_MODEL, get_model
+from huggingface_hub import hf_hub_download, list_repo_files
+from so_vits_svc_fork.hparams import HParams
+from so_vits_svc_fork.inference.core import Svc
+###################################################################
+# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+###################################################################
+# The Hugging Face Hub repo ID - change repo_id here; it can be replaced
+# with the repo of any already-trained model.
+repo_id = "kevinwang676/talktalkai-qing"
+# If None, uses the latest checkpoint in the repo.
+ckpt_name = None
+# If None, uses "kmeans.pt" if it exists in the repo.
+cluster_model_name = None
+# Set the default f0 type to use - use the one the model was trained on.
+# The default for so-vits-svc-fork is "dio".
+# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+default_f0_method = "crepe"
+# The default ratio of cluster inference to SVC inference.
+# If cluster_model_name is not found in the repo, this is set to 0.
+default_cluster_infer_ratio = 0.5
+# Limit (in seconds) on the duration of audio at inference time. Increase it if you can.
+# In the parent app, the limit is set to 30 seconds with an env var.
+# If you didn't set the env var and you run out of memory, try changing 9e9 to <= 300 or so.
+duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
+###################################################################
+# Figure out the latest generator checkpoint by taking the one with the highest step.
+# Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
+# The repo is listed once and the listing is reused for the cluster-model lookup below,
+# which avoids a second network round trip.
+repo_files = list_repo_files(repo_id)
+if ckpt_name is None:
+    generator_steps = [
+        int(Path(x).stem.split("_")[1]) for x in repo_files if x.startswith("G_") and x.endswith(".pth")
+    ]
+    if not generator_steps:
+        # Fail with an actionable message instead of an IndexError on an empty list.
+        raise FileNotFoundError(f"No generator checkpoint (G_*.pth) found in {repo_id}")
+    ckpt_name = f"G_{max(generator_steps)}.pth"
+cluster_model_name = cluster_model_name or "kmeans.pt"
+if cluster_model_name in repo_files:
+    print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
+    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+else:
+    print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
+    cluster_model_path = None
+# Without a cluster model the cluster ratio has to default to 0.
+default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+# Download the generator weights and its config, then build the voice-conversion model.
+generator_path = hf_hub_download(repo_id, ckpt_name)
+config_path = hf_hub_download(repo_id, "config.json")
+hparams = HParams(**json.loads(Path(config_path).read_text()))
+# Speaker names offered in the UI come from the "spk" mapping of the model config.
+speakers = list(hparams.spk.keys())
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
+# Demucs model used to separate vocals from the accompaniment.
+demucs_model = get_model(DEFAULT_MODEL)
+def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
+    """Split an audio file into a vocal track and an instrumental track with Demucs.
+
+    Args:
+        model: Demucs separation model, passed to ``apply_model``. The vocals stem is
+            assumed to be the last source the model returns.
+        filename: Path of the audio file to load.
+        sr: Sample rate the file is resampled to on load.
+        device: Forwarded unchanged to ``apply_model``.
+        shifts: Forwarded unchanged to ``apply_model``.
+        split: Forwarded unchanged to ``apply_model``.
+        overlap: Forwarded unchanged to ``apply_model``.
+        jobs: Forwarded to ``apply_model`` as ``num_workers``.
+
+    Returns:
+        A ``(vocal_wav, instrumental_wav)`` tuple of NumPy arrays: the vocals stem mixed
+        down to mono, and the sum of every other stem, transposed.
+    """
+    wav, sr = librosa.load(filename, mono=False, sr=sr)
+    wav = torch.tensor(wav)
+    if wav.dim() == 1:
+        # librosa returns a 1-D array for mono files, but apply_model needs a
+        # (channels, samples) mix: duplicate the single channel to the channel count
+        # the model expects (2 if the model does not expose ``audio_channels``).
+        wav = wav[None].repeat(getattr(model, "audio_channels", 2), 1)
+    # Normalize with the statistics of the channel-averaged signal; undone after separation.
+    ref = wav.mean(0)
+    wav = (wav - ref.mean()) / ref.std()
+    sources = apply_model(
+        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
+    )[0]
+    sources = sources * ref.std() + ref.mean()
+    # We take just the vocals stem. The vocals for the default model are at index -1.
+    # If using a different model, check model.sources.index('vocals')
+    vocal_wav = sources[-1]
+    # Same normalization the so-vits model required: only scale down when the peak is near/above 1.
+    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
+    vocal_wav = vocal_wav.numpy()
+    vocal_wav = librosa.to_mono(vocal_wav)
+    vocal_wav = vocal_wav.T
+    # Everything except the vocals stem, summed back into a single accompaniment track.
+    instrumental_wav = sources[:-1].sum(0).numpy().T
+    return vocal_wav, instrumental_wav
+def download_youtube_clip(
+    video_identifier,
+    start_time,
+    end_time,
+    output_filename,
+    num_attempts=5,
+    url_base="https://www.youtube.com/watch?v=",
+    quiet=False,
+    force=False,
+):
+    """Download the audio of one section of a video as a WAV file using yt-dlp.
+
+    Args:
+        video_identifier: Video ID, or a full URL when ``url_base`` is empty.
+        start_time: Start of the section to download, in seconds.
+        end_time: End of the section to download, in seconds.
+        output_filename: Where yt-dlp should write the audio file.
+        num_attempts: How many times to run yt-dlp before giving up (at least one run).
+        url_base: Prefix prepended to ``video_identifier`` to form the URL.
+        quiet: Pass ``--quiet --no-warnings`` to yt-dlp.
+        force: Delete an existing output file and download again instead of reusing it.
+
+    Returns:
+        The output ``Path`` on success, or ``None`` if every attempt failed or the
+        file does not exist afterwards.
+    """
+    output_path = Path(output_filename)
+    if output_path.exists():
+        if not force:
+            return output_path
+        output_path.unlink()
+    # Build an argument list and run it without a shell: the identifier and the times can
+    # come straight from user input, so they must never be interpreted by a shell.
+    command = ["yt-dlp"]
+    if quiet:
+        command += ["--quiet", "--no-warnings"]
+    command += [
+        "-x",
+        "--audio-format",
+        "wav",
+        "-f",
+        "bestaudio",
+        "-o",
+        str(output_filename),
+        "--download-sections",
+        f"*{start_time}-{end_time}",
+        # "--" ends option parsing so an identifier starting with "-" is not read as a flag.
+        "--",
+        f"{url_base}{video_identifier}",
+    ]
+    for _ in range(max(num_attempts, 1)):
+        try:
+            subprocess.check_output(command, stderr=subprocess.STDOUT)
+        except (subprocess.CalledProcessError, OSError):
+            # OSError covers a missing yt-dlp executable, which the shell used to
+            # report as a non-zero exit status.
+            continue
+        break
+    else:
+        # Every attempt failed.
+        return None
+    return output_path if output_path.exists() else None
+def predict(
+    speaker,
+    audio,
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "crepe",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    """Convert an audio file to the selected speaker's voice.
+
+    The file is loaded at the model's target sample rate, truncated to
+    ``duration_limit`` seconds, and run through ``model.infer_silence`` with the
+    given inference settings.
+
+    Returns:
+        A ``(sample_rate, audio)`` tuple with the converted audio.
+    """
+    waveform, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+    inference_options = {
+        "speaker": speaker,
+        "transpose": transpose,
+        "auto_predict_f0": auto_predict_f0,
+        "cluster_infer_ratio": cluster_infer_ratio,
+        "noise_scale": noise_scale,
+        "f0_method": f0_method,
+        "db_thresh": db_thresh,
+        "pad_seconds": pad_seconds,
+        "chunk_seconds": chunk_seconds,
+        "absolute_thresh": absolute_thresh,
+    }
+    converted = model.infer_silence(waveform.astype(np.float32), **inference_options)
+    return model.target_sample, converted
+def predict_song_from_yt(
+    ytid_or_url,
+    start,
+    end,
+    speaker=speakers[0],
+    transpose: int = 0,
+    auto_predict_f0: bool = False,
+    cluster_infer_ratio: float = 0,
+    noise_scale: float = 0.4,
+    f0_method: str = "dio",
+    db_thresh: int = -40,
+    pad_seconds: float = 0.5,
+    chunk_seconds: float = 0.5,
+    absolute_thresh: bool = False,
+):
+    """Download a clip of a video, re-sing its vocals with the model, and remix it.
+
+    The section ``start``-``end`` (capped at ``duration_limit`` seconds) is downloaded
+    to ``track.wav``, split into vocals and accompaniment, the vocals are converted with
+    ``model.infer_silence``, and the result is added back onto the accompaniment.
+
+    Returns:
+        Two ``(sample_rate, audio)`` tuples: the full remixed song and the converted
+        vocals on their own.
+
+    Raises:
+        gr.Error: If the audio could not be downloaded.
+    """
+    end = min(start + duration_limit, end)
+    original_track_filepath = download_youtube_clip(
+        ytid_or_url,
+        start,
+        end,
+        "track.wav",
+        force=True,
+        # A full URL is used as-is; anything else is treated as a video ID.
+        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
+    )
+    if original_track_filepath is None:
+        # download_youtube_clip returns None after exhausting its attempts; report that
+        # clearly instead of letting the audio loader fail on a missing path.
+        raise gr.Error(f"Could not download audio from {ytid_or_url!r}; check the URL and the start/end times.")
+    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
+    if transpose != 0:
+        # Keep the accompaniment in the same key as the transposed vocals.
+        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
+    cloned_vox = model.infer_silence(
+        vox_wav.astype(np.float32),
+        speaker=speaker,
+        transpose=transpose,
+        auto_predict_f0=auto_predict_f0,
+        cluster_infer_ratio=cluster_infer_ratio,
+        noise_scale=noise_scale,
+        f0_method=f0_method,
+        db_thresh=db_thresh,
+        pad_seconds=pad_seconds,
+        chunk_seconds=chunk_seconds,
+        absolute_thresh=absolute_thresh,
+    )
+    # Add a trailing axis to the converted vocals so they broadcast over inst_wav.
+    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
+    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
+# Markdown/HTML shown above the inputs of every tab.
+# Plain string on purpose: there are no placeholders to interpolate.
+description = """
+## <center>🏞️ - 滔滔AI，为您提供全场景的AI声音服务（如AI拟声、AI歌手、AI变声等）</center>
+### <center>🌟 - 滔滔AI合作音乐人：[一清清清](https://space.bilibili.com/22960772?spm_id_from=333.337.0.0)；AI歌手，唱我想唱！</center>
+### <center>🎡 - 更多精彩，尽在[滔滔AI](http://www.talktalkai.com)；合作：[email protected]</center>
+<center>💡 - 如何使用此程序：在页面上方选择“从B站视频上传”模块，填写视频网址和视频起止时间后，点击“submit”按键即可！您还可以点击页面最下方的示例快速预览效果</center>
+<h1 align="center"><a href="http://www.talktalkai.com"><img src="https://y.qq.com/music/photo_new/T001R300x300M0000025Gr0r2OXvrn_2.jpg" alt="talktalkai" border="0" style="margin: 0 auto; height: 200px;" /></a> </h1>
+""".strip()
+# HTML shown below the outputs of every tab (usage disclaimer and a closing quote).
+article = """
+<p style='text-align: center'> 注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。
+</p>
+<p style='text-align: center'> 🌊🎶🏞️ - 江水东流急，滔滔无尽声。 明·顾璘
+</p>
+""".strip()
+# Microphone tab: converts a recording with `predict`.
+# The inputs are listed in the order of predict's leading parameters
+# (speaker, audio, transpose, auto_predict_f0, cluster_infer_ratio, noise_scale, f0_method);
+# the remaining predict parameters keep their defaults.
+interface_mic = gr.Interface(
+    predict,
+    inputs=[
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟"),
+        gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降 (如果使用，建议0.5左右)"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs="audio",
+    title="🌊💕🎶 - 滔滔AI+音乐：可从B站直接上传素材，无需分离背景音",
+    description=description,
+    article=article,
+)
+# File-upload tab: same controls as the microphone tab, but the audio comes from an
+# uploaded file (the label asks for a vocals-only track). Also served by `predict`.
+interface_file = gr.Interface(
+    predict,
+    inputs=[
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟"),
+        gr.Audio(type="filepath", source="upload", label="请上传您想转换的歌曲 (仅人声部分)"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降 (如果使用，建议0.5左右)"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs="audio",
+    title="🌊💕🎶 - 滔滔AI+音乐：可从B站直接上传素材，无需分离背景音",
+    description=description,
+    article=article,
+)
+# Video tab: takes a video URL (or ID) plus start/end times in seconds and runs
+# `predict_song_from_yt`. The inputs are listed in the order of that function's leading
+# parameters (ytid_or_url, start, end, speaker, transpose, auto_predict_f0,
+# cluster_infer_ratio, noise_scale, f0_method).
+# The two outputs are the remixed song and the converted vocals on their own.
+interface_yt = gr.Interface(
+    predict_song_from_yt,
+    inputs=[
+        gr.Textbox(
+            label="Bilibili网址", info="请填写含有您喜欢歌曲的Bilibili网址，可直接填写相应的BV号", value="https://www.bilibili.com/video/BV..."
+        ),
+        gr.Number(value=0, label="起始时间 (秒)"),
+        gr.Number(value=15, label="结束时间 (秒)"),
+        gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟"),
+        gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+        gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降"),
+        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+        gr.Dropdown(
+            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+            value=default_f0_method,
+            label="模型推理方法 (crepe推理效果最好)", visible=False
+        ),
+    ],
+    outputs=[gr.Audio(label="AI歌手+伴奏🎵"), gr.Audio(label="AI歌手人声部分🎤")],
+    title="🌊💕🎶 - 滔滔AI+音乐：可从B站直接上传素材，无需分离背景音",
+    description=description,
+    article=article,
+    # One example row; its values follow the same order as the inputs above.
+    examples=[
+        ["https://www.bilibili.com/video/BV1ip4y1p7Pn", 87, 103, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method],
+    ],
+)
+# Combine the three interfaces into one tabbed app; the tab names are given in the
+# same order as the interfaces (video, microphone, file upload).
+interface = gr.TabbedInterface(
+    [interface_yt, interface_mic, interface_file],
+    ["📺 - 从B站视频上传 ⭐推荐⭐", "🎙️ - 从麦克风上传", "🎵 - 从文件上传"],
+)
+# Only start the web server when this file is run as a script.
+if __name__ == "__main__":
+    interface.launch(show_error=True)

packages.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+ffmpeg
+x264
+libx264-dev

pyproject.toml ADDED Viewed

	@@ -0,0 +1,17 @@

+[tool.black]
+line-length = 119
+target_version = ['py37']
+[tool.ruff]
+# Never enforce `E501` (line length violations).
+ignore = ["C901", "E501", "E741", "W605"]
+select = ["C", "E", "F", "I", "W"]
+line-length = 119
+# Ignore import violations in all `__init__.py` files.
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["E402", "F401", "F403", "F811"]
+[tool.ruff.isort]
+known-first-party = ["so_vits_svc_fork"]
+lines-after-imports = 2

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+so-vits-svc-fork
+gradio
+huggingface_hub
+yt-dlp
+demucs
+gradio

training_so_vits_svc_fork.ipynb ADDED Viewed

	@@ -0,0 +1,540 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/nateraw/voice-cloning/blob/main/training_so_vits_svc_fork.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "jIcNJ5QfDsV_"
+      },
+      "outputs": [],
+      "source": [
+        "# %%capture\n",
+        "! pip install git+https://github.com/nateraw/so-vits-svc-fork@main\n",
+        "! pip install openai-whisper yt-dlp huggingface_hub demucs"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6uZAhUPOhFv9"
+      },
+      "source": [
+        "---\n",
+        "\n",
+        "# Restart runtime\n",
+        "\n",
+        "After running the cell above, you'll need to restart the Colab runtime because we installed a different version of numpy.\n",
+        "\n",
+        "`Runtime -> Restart runtime`\n",
+        "\n",
+        "---"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "DROusQatF-wF"
+      },
+      "outputs": [],
+      "source": [
+        "from huggingface_hub import login\n",
+        "\n",
+        "login()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Settings"
+      ],
+      "metadata": {
+        "id": "yOM9WWmmRqTA"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5oTDjDEKFz3W"
+      },
+      "outputs": [],
+      "source": [
+        "CHARACTER = \"kanye\"\n",
+        "DO_EXTRACT_VOCALS = False\n",
+        "MODEL_REPO_ID = \"dog/kanye\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BFd_ly1P_5Ht"
+      },
+      "source": [
+        "## Data Preparation\n",
+        "\n",
+        "Prepare a data.csv file here with `ytid,start,end` as the first line (they're the expected column names). Then, prepare a training set given YouTube IDs and their start and end segment times in seconds. Try to pick segments that have dry vocal only, as that'll provide the best results.\n",
+        "\n",
+        "An example is given below for Kanye West."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rBrtgDtWmhRb"
+      },
+      "outputs": [],
+      "source": [
+        "%%writefile data.csv\n",
+        "ytid,start,end\n",
+        "lkK4de9nbzQ,0,137\n",
+        "gXU9Am2Seo0,30,69\n",
+        "gXU9Am2Seo0,94,135\n",
+        "iVgrhWvQpqU,0,55\n",
+        "iVgrhWvQpqU,58,110\n",
+        "UIV-q-gneKA,85,99\n",
+        "UIV-q-gneKA,110,125\n",
+        "UIV-q-gneKA,127,141\n",
+        "UIV-q-gneKA,173,183\n",
+        "GmlyYCGE9ak,0,102\n",
+        "x-7aWcPmJ60,25,43\n",
+        "x-7aWcPmJ60,47,72\n",
+        "x-7aWcPmJ60,98,113\n",
+        "DK2LCIzIBrU,0,56\n",
+        "DK2LCIzIBrU,80,166\n",
+        "_W56nZk0fCI,184,224"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cxxp4uYoC0aG"
+      },
+      "outputs": [],
+      "source": [
+        "import subprocess\n",
+        "from pathlib import Path\n",
+        "import librosa\n",
+        "from scipy.io import wavfile\n",
+        "import numpy as np\n",
+        "from demucs.pretrained import get_model, DEFAULT_MODEL\n",
+        "from demucs.apply import apply_model\n",
+        "import torch\n",
+        "import csv\n",
+        "import whisper\n",
+        "\n",
+        "\n",
+        "def download_youtube_clip(video_identifier, start_time, end_time, output_filename, num_attempts=5, url_base=\"https://www.youtube.com/watch?v=\"):\n",
+        "    status = False\n",
+        "\n",
+        "    output_path = Path(output_filename)\n",
+        "    if output_path.exists():\n",
+        "        return True, \"Already Downloaded\"\n",
+        "\n",
+        "    command = f\"\"\"\n",
+        "        yt-dlp --quiet --no-warnings -x --audio-format wav -f bestaudio -o \"{output_filename}\" --download-sections \"*{start_time}-{end_time}\" \"{url_base}{video_identifier}\"\n",
+        "    \"\"\".strip()\n",
+        "\n",
+        "    attempts = 0\n",
+        "    while True:\n",
+        "        try:\n",
+        "            output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)\n",
+        "        except subprocess.CalledProcessError as err:\n",
+        "            attempts += 1\n",
+        "            if attempts == num_attempts:\n",
+        "                return status, err.output\n",
+        "        else:\n",
+        "            break\n",
+        "\n",
+        "    status = output_path.exists()\n",
+        "    return status, \"Downloaded\"\n",
+        "\n",
+        "\n",
+        "def split_long_audio(model, filepaths, character_name, save_dir=\"data_dir\", out_sr=44100):\n",
+        "    if isinstance(filepaths, str):\n",
+        "        filepaths = [filepaths]\n",
+        "\n",
+        "    for file_idx, filepath in enumerate(filepaths):\n",
+        "\n",
+        "        save_path = Path(save_dir) / character_name\n",
+        "        save_path.mkdir(exist_ok=True, parents=True)\n",
+        "\n",
+        "        print(f\"Transcribing file {file_idx}: '{filepath}' to segments...\")\n",
+        "        result = model.transcribe(filepath, word_timestamps=True, task=\"transcribe\", beam_size=5, best_of=5)\n",
+        "        segments = result['segments']\n",
+        "    \n",
+        "        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)\n",
+        "        wav, _ = librosa.effects.trim(wav, top_db=20)\n",
+        "        peak = np.abs(wav).max()\n",
+        "        if peak > 1.0:\n",
+        "            wav = 0.98 * wav / peak\n",
+        "        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)\n",
+        "        wav2 /= max(wav2.max(), -wav2.min())\n",
+        "\n",
+        "        for i, seg in enumerate(segments):\n",
+        "            start_time = seg['start']\n",
+        "            end_time = seg['end']\n",
+        "            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]\n",
+        "            wav_seg_name = f\"{character_name}_{file_idx}_{i}.wav\"\n",
+        "            out_fpath = save_path / wav_seg_name\n",
+        "            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))\n",
+        "\n",
+        "\n",
+        "def extract_vocal_demucs(model, filename, out_filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):\n",
+        "    wav, sr = librosa.load(filename, mono=False, sr=sr)\n",
+        "    wav = torch.tensor(wav)\n",
+        "    ref = wav.mean(0)\n",
+        "    wav = (wav - ref.mean()) / ref.std()\n",
+        "    sources = apply_model(\n",
+        "        model,\n",
+        "        wav[None],\n",
+        "        device=device,\n",
+        "        shifts=shifts,\n",
+        "        split=split,\n",
+        "        overlap=overlap,\n",
+        "        progress=True,\n",
+        "        num_workers=jobs\n",
+        "    )[0]\n",
+        "    sources = sources * ref.std() + ref.mean()\n",
+        "\n",
+        "    wav = sources[-1]\n",
+        "    wav = wav / max(1.01 * wav.abs().max(), 1)\n",
+        "    wavfile.write(out_filename, rate=sr, data=wav.numpy().T)\n",
+        "    return out_filename\n",
+        "\n",
+        "\n",
+        "def create_dataset(\n",
+        "    clips_csv_filepath = \"data.csv\",\n",
+        "    character = \"somebody\",\n",
+        "    do_extract_vocals = False,\n",
+        "    whisper_size = \"medium\",\n",
+        "    # Where raw yt clips will be downloaded to\n",
+        "    dl_dir = \"downloads\",\n",
+        "    # Where actual data will be organized\n",
+        "    data_dir = \"dataset_raw\",\n",
+        "    **kwargs\n",
+        "):\n",
+        "    dl_path = Path(dl_dir) / character\n",
+        "    dl_path.mkdir(exist_ok=True, parents=True)\n",
+        "    if do_extract_vocals:\n",
+        "        demucs_model = get_model(DEFAULT_MODEL)\n",
+        "\n",
+        "    with Path(clips_csv_filepath).open() as f:\n",
+        "        reader = csv.DictReader(f)\n",
+        "        for i, row in enumerate(reader):\n",
+        "            outfile_path = dl_path / f\"{character}_{i:04d}.wav\"\n",
+        "            download_youtube_clip(row['ytid'], row['start'], row['end'], outfile_path)\n",
+        "            if do_extract_vocals:\n",
+        "                extract_vocal_demucs(demucs_model, outfile_path, outfile_path)\n",
+        "\n",
+        "    filenames = sorted([str(x) for x in dl_path.glob(\"*.wav\")])\n",
+        "    whisper_model = whisper.load_model(whisper_size)\n",
+        "    split_long_audio(whisper_model, filenames, character, data_dir)    "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "D9GrcDUKEGro"
+      },
+      "outputs": [],
+      "source": [
+        "\"\"\"\n",
+        "Here, we override the generated training config: batch size, eval interval,\n",
+        "num_workers (set to the CPU count) and the settings for pushing to the Hub.\n",
+        "\"\"\"\n",
+        "\n",
+        "import json\n",
+        "from pathlib import Path\n",
+        "import multiprocessing\n",
+        "\n",
+        "def update_config(config_file=\"configs/44k/config.json\"):\n",
+        "    config_path = Path(config_file)\n",
+        "    data = json.loads(config_path.read_text())\n",
+        "    data['train']['batch_size'] = 32\n",
+        "    data['train']['eval_interval'] = 500\n",
+        "    data['train']['num_workers'] = multiprocessing.cpu_count()\n",
+        "    data['train']['persistent_workers'] = True\n",
+        "    data['train']['push_to_hub'] = True\n",
+        "    data['train']['repo_id'] = MODEL_REPO_ID # tuple(data['spk'])[0]\n",
+        "    data['train']['private'] = True\n",
+        "    config_path.write_text(json.dumps(data, indent=2, sort_keys=False))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Run all Preprocessing Steps"
+      ],
+      "metadata": {
+        "id": "aF6OZkTZRzhj"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "OAPnD3xKD_Gw"
+      },
+      "outputs": [],
+      "source": [
+        "create_dataset(character=CHARACTER, do_extract_vocals=DO_EXTRACT_VOCALS)\n",
+        "! svc pre-resample\n",
+        "! svc pre-config\n",
+        "! svc pre-hubert -fm crepe\n",
+        "update_config()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Training"
+      ],
+      "metadata": {
+        "id": "VpyGazF6R3CE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "background_save": true
+        },
+        "id": "MByHpf_wEByg"
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import annotations\n",
+        "\n",
+        "import os\n",
+        "import re\n",
+        "import warnings\n",
+        "from logging import getLogger\n",
+        "from multiprocessing import cpu_count\n",
+        "from pathlib import Path\n",
+        "from typing import Any\n",
+        "\n",
+        "import lightning.pytorch as pl\n",
+        "import torch\n",
+        "from lightning.pytorch.accelerators import MPSAccelerator, TPUAccelerator\n",
+        "from lightning.pytorch.loggers import TensorBoardLogger\n",
+        "from lightning.pytorch.strategies.ddp import DDPStrategy\n",
+        "from lightning.pytorch.tuner import Tuner\n",
+        "from torch.cuda.amp import autocast\n",
+        "from torch.nn import functional as F\n",
+        "from torch.utils.data import DataLoader\n",
+        "from torch.utils.tensorboard.writer import SummaryWriter\n",
+        "\n",
+        "import so_vits_svc_fork.f0\n",
+        "import so_vits_svc_fork.modules.commons as commons\n",
+        "import so_vits_svc_fork.utils\n",
+        "\n",
+        "from so_vits_svc_fork import utils\n",
+        "from so_vits_svc_fork.dataset import TextAudioCollate, TextAudioDataset\n",
+        "from so_vits_svc_fork.logger import is_notebook\n",
+        "from so_vits_svc_fork.modules.descriminators import MultiPeriodDiscriminator\n",
+        "from so_vits_svc_fork.modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss\n",
+        "from so_vits_svc_fork.modules.mel_processing import mel_spectrogram_torch\n",
+        "from so_vits_svc_fork.modules.synthesizers import SynthesizerTrn\n",
+        "\n",
+        "from so_vits_svc_fork.train import VitsLightning, VCDataModule\n",
+        "\n",
+        "LOG = getLogger(__name__)\n",
+        "torch.set_float32_matmul_precision(\"high\")\n",
+        "\n",
+        "\n",
+        "from pathlib import Path\n",
+        "\n",
+        "from huggingface_hub import create_repo, upload_folder, login, list_repo_files, delete_file\n",
+        "\n",
+        "# if os.environ.get(\"HF_TOKEN\"):\n",
+        "#     login(os.environ.get(\"HF_TOKEN\"))\n",
+        "\n",
+        "\n",
+        "class HuggingFacePushCallback(pl.Callback):\n",
+        "    def __init__(self, repo_id, private=False, every=100):\n",
+        "        self.repo_id = repo_id\n",
+        "        self.private = private\n",
+        "        self.every = every\n",
+        "\n",
+        "    def on_validation_epoch_end(self, trainer, pl_module):\n",
+        "        self.repo_url = create_repo(\n",
+        "            repo_id=self.repo_id,\n",
+        "            exist_ok=True,\n",
+        "            private=self.private\n",
+        "        )\n",
+        "        self.repo_id = self.repo_url.repo_id\n",
+        "        if pl_module.global_step == 0:\n",
+        "            return\n",
+        "        print(f\"\\n🤗 Pushing to Hugging Face Hub: {self.repo_url}...\")\n",
+        "        model_dir = pl_module.hparams.model_dir\n",
+        "        upload_folder(\n",
+        "            repo_id=self.repo_id,\n",
+        "            folder_path=model_dir,\n",
+        "            path_in_repo=\".\",\n",
+        "            commit_message=\"🍻 cheers\",\n",
+        "            ignore_patterns=[\"*.git*\", \"*README.md*\", \"*__pycache__*\"],\n",
+        "        )\n",
+        "        ckpt_pattern = r'^(D_|G_)\\d+\\.pth$'\n",
+        "        todelete = []\n",
+        "        repo_ckpts = [x for x in list_repo_files(self.repo_id) if re.match(ckpt_pattern, x) and x not in [\"G_0.pth\", \"D_0.pth\"]]\n",
+        "        local_ckpts = [x.name for x in Path(model_dir).glob(\"*.pth\") if re.match(ckpt_pattern, x.name)]\n",
+        "        to_delete = set(repo_ckpts) - set(local_ckpts)\n",
+        "\n",
+        "        for fname in to_delete:\n",
+        "            print(f\"🗑 Deleting {fname} from repo\")\n",
+        "            delete_file(fname, self.repo_id)\n",
+        "\n",
+        "\n",
+        "def train(\n",
+        "    config_path: Path | str, model_path: Path | str, reset_optimizer: bool = False\n",
+        "):\n",
+        "    config_path = Path(config_path)\n",
+        "    model_path = Path(model_path)\n",
+        "\n",
+        "    hparams = utils.get_backup_hparams(config_path, model_path)\n",
+        "    utils.ensure_pretrained_model(model_path, hparams.model.get(\"type_\", \"hifi-gan\"))\n",
+        "\n",
+        "    datamodule = VCDataModule(hparams)\n",
+        "    strategy = (\n",
+        "        (\n",
+        "            \"ddp_find_unused_parameters_true\"\n",
+        "            if os.name != \"nt\"\n",
+        "            else DDPStrategy(find_unused_parameters=True, process_group_backend=\"gloo\")\n",
+        "        )\n",
+        "        if torch.cuda.device_count() > 1\n",
+        "        else \"auto\"\n",
+        "    )\n",
+        "    LOG.info(f\"Using strategy: {strategy}\")\n",
+        "    \n",
+        "    callbacks = []\n",
+        "    if hparams.train.push_to_hub:\n",
+        "        callbacks.append(HuggingFacePushCallback(hparams.train.repo_id, hparams.train.private))\n",
+        "    if not is_notebook():\n",
+        "        callbacks.append(pl.callbacks.RichProgressBar())\n",
+        "    if callbacks == []:\n",
+        "        callbacks = None\n",
+        "\n",
+        "    trainer = pl.Trainer(\n",
+        "        logger=TensorBoardLogger(\n",
+        "            model_path, \"lightning_logs\", hparams.train.get(\"log_version\", 0)\n",
+        "        ),\n",
+        "        # profiler=\"simple\",\n",
+        "        val_check_interval=hparams.train.eval_interval,\n",
+        "        max_epochs=hparams.train.epochs,\n",
+        "        check_val_every_n_epoch=None,\n",
+        "        precision=\"16-mixed\"\n",
+        "        if hparams.train.fp16_run\n",
+        "        else \"bf16-mixed\"\n",
+        "        if hparams.train.get(\"bf16_run\", False)\n",
+        "        else 32,\n",
+        "        strategy=strategy,\n",
+        "        callbacks=callbacks,\n",
+        "        benchmark=True,\n",
+        "        enable_checkpointing=False,\n",
+        "    )\n",
+        "    tuner = Tuner(trainer)\n",
+        "    model = VitsLightning(reset_optimizer=reset_optimizer, **hparams)\n",
+        "\n",
+        "    # automatic batch size scaling\n",
+        "    batch_size = hparams.train.batch_size\n",
+        "    batch_split = str(batch_size).split(\"-\")\n",
+        "    batch_size = batch_split[0]\n",
+        "    init_val = 2 if len(batch_split) <= 1 else int(batch_split[1])\n",
+        "    max_trials = 25 if len(batch_split) <= 2 else int(batch_split[2])\n",
+        "    if batch_size == \"auto\":\n",
+        "        batch_size = \"binsearch\"\n",
+        "    if batch_size in [\"power\", \"binsearch\"]:\n",
+        "        model.tuning = True\n",
+        "        tuner.scale_batch_size(\n",
+        "            model,\n",
+        "            mode=batch_size,\n",
+        "            datamodule=datamodule,\n",
+        "            steps_per_trial=1,\n",
+        "            init_val=init_val,\n",
+        "            max_trials=max_trials,\n",
+        "        )\n",
+        "        model.tuning = False\n",
+        "    else:\n",
+        "        batch_size = int(batch_size)\n",
+        "    # automatic learning rate scaling is not supported for multiple optimizers\n",
+        "    \"\"\"if hparams.train.learning_rate  == \"auto\":\n",
+        "    lr_finder = tuner.lr_find(model)\n",
+        "    LOG.info(lr_finder.results)\n",
+        "    fig = lr_finder.plot(suggest=True)\n",
+        "    fig.savefig(model_path / \"lr_finder.png\")\"\"\"\n",
+        "\n",
+        "    trainer.fit(model, datamodule=datamodule)\n",
+        "\n",
+        "if __name__ == '__main__':\n",
+        "    train('configs/44k/config.json', 'logs/44k')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Train Cluster Model"
+      ],
+      "metadata": {
+        "id": "b2vNCDrSR8Xo"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "DBBEx-6Y1sOy"
+      },
+      "outputs": [],
+      "source": [
+        "! svc train-cluster"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "y_qYMuNY1tlm"
+      },
+      "outputs": [],
+      "source": [
+        "from huggingface_hub import upload_file\n",
+        "\n",
+        "upload_file(path_or_fileobj=\"/content/logs/44k/kmeans.pt\", repo_id=MODEL_REPO_ID, path_in_repo=\"kmeans.pt\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "machine_shape": "hm",
+      "provenance": [],
+      "authorship_tag": "ABX9TyOQeFSvxop9rlCaglNlNoXI",
+      "include_colab_link": true
+    },
+    "gpuClass": "premium",
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}