Voice-Cloning-for-Bilibili

Runtime error

App Files Community

nijisakai committed on Jul 25, 2023

Commit

eb7e85b

•

1 Parent(s): 43ee5dc

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -262

app.py CHANGED Viewed

@@ -1,6 +1,3 @@
-import torch.nn as nn
-import io
 import json
 import os
 import subprocess
@@ -17,273 +14,83 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
-repo_id = ["nijisakai/sunyanzi", "kevinwang676/talktalkai-qing"]
-ckpt_names = []
-latest_ids = []
-for repo in repo_id:
-    latest_id = sorted(
-        [
-            int(Path(x).stem.split("_")[1])
-            for x in list_repo_files(repo)
-            if x.startswith("G_") and x.endswith(".pth")
-        ]
-    )[-1]
-    ckpt_names.append(f"G_{latest_id}.pth")
-    latest_ids.append(latest_id)
-cluster_model_names = ["kmeans.pt" for _ in range(len(repo_id))]
-cluster_model_paths = [
-    hf_hub_download(repo, name) if name in list_repo_files(repo) else None for repo, name in zip(repo_id, cluster_model_names)
-]
-device = "cuda" if torch.cuda.is_available() else "cpu"
-generator_paths = [hf_hub_download(repo, ckpt_name) for repo, ckpt_name in zip(repo_id, ckpt_names)]
-config_paths = [hf_hub_download(repo, "config.json") for repo in repo_id]
-hparams_list = [HParams(**json.loads(Path(config_path).read_text())) for config_path in config_paths]
-speakers = []
-for hparams in hparams_list:
-    speakers.extend(list(hparams.spk.keys()))
-models = [
-    Svc(net_g_path=gen_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
-    for gen_path, config_path, cluster_model_path in zip(generator_paths, config_paths, cluster_model_paths)
-]
-demucs_model = get_model(DEFAULT_MODEL)
 duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
-def extract_vocal_demucs(model_path, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
-    model = nn.Module()
-    with open(model_path, "rb") as f:
-        buffer = io.BytesIO(f.read())
-    model_state_dict = torch.load(buffer)
-    model.load_state_dict(model_state_dict)
-    model.to(device)
-    wav, sr = librosa.load(filename, mono=False, sr=sr)
-    wav = torch.tensor(wav)
-    ref = wav.mean(0)
-    wav = (wav - ref.mean()) / ref.std()
-    sources = apply_model(
-        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
-    )[0]
-    sources = sources * ref.std() + ref.mean()
-    vocal_wav = sources[-1]
-    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
-    vocal_wav = vocal_wav.numpy()
-    vocal_wav = librosa.to_mono(vocal_wav)
-    vocal_wav = vocal_wav.T
-    instrumental_wav = sources[:-1].sum(0).numpy().T
-    return vocal_wav, instrumental_wav
-def predict(models, speaker, audio, transpose: int = 0, auto_predict_f0: bool = False, cluster_infer_ratio: float = 0,
-            noise_scale: float = 0.4, f0_method: str = "crepe", db_thresh: int = -40, pad_seconds: float = 0.5,
-            chunk_seconds: float = 0.5, absolute_thresh: bool = False):
-    audio, _ = librosa.load(audio, sr=models[0].target_sample, duration=duration_limit)
-    audio = model.infer_silence(
-        audio.astype(np.float32),
-        speaker=speaker,
-        transpose=transpose,
-        auto_predict_f0=auto_predict_f0,
-        cluster_infer_ratio=cluster_infer_ratio,
-        noise_scale=noise_scale,
-        f0_method=f0_method,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
-        chunk_seconds=chunk_seconds,
-        absolute_thresh=absolute_thresh,
-    )
-    return model.target_sample, audio
-def predict_song_from_yt(
-    ytid_or_url,
-    start,
-    end,
-    speaker,
-    transpose: int = 0,
-    auto_predict_f0: bool = False,
-    cluster_infer_ratio: float = 0,
-    noise_scale: float = 0.4,
-    f0_method: str = "dio",
-    db_thresh: int = -40,
-    pad_seconds: float = 0.5,
-    chunk_seconds: float = 0.5,
-    absolute_thresh: bool = False,
-):
-    # Check if start and end are valid numeric values
-    try:
-        start = float(start)
-        end = float(end)
-    except ValueError:
-        raise ValueError("Invalid start or end time. Please provide valid numeric values.")
-    end = min(start + duration_limit, end)
-    original_track_filepath = download_youtube_clip(
-        ytid_or_url,
-        start,
-        end,
-        "track.wav",
-        force=True,
-        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
-    )
-    vox_wav, inst_wav = extract_vocal_demucs(models[0], original_track_filepath)
-    if transpose != 0:
-        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=models[0].target_sample, n_steps=transpose).T
-    cloned_vox = models[0].infer_silence(
-        vox_wav.astype(np.float32),
-        speaker=speaker,
-        transpose=transpose,
-        auto_predict_f0=auto_predict_f0,
-        cluster_infer_ratio=cluster_infer_ratio,
-        noise_scale=noise_scale,
-        f0_method=f0_method,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
-        chunk_seconds=chunk_seconds,
-        absolute_thresh=absolute_thresh,
-    )
-    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
-    return (models[0].target_sample, full_song), (models[0].target_sample, cloned_vox)
-description = f"""
-<center>💡 - How to use this app: Select the "Predict from YouTube Video" tab above, fill in the YouTube video URL and the start and end times of the video, then click the "Submit" button!</center>
-""".strip()
-article = """
-<p style='text-align: center'> Note❗: Please do not generate content that may cause harm to individuals or organizations. This program is for research, learning, and personal entertainment purposes only.
-</p>
-""".strip()
-def download_youtube_clip(
-    video_identifier,
-    start_time,
-    end_time,
-    output_filename,
-    num_attempts=5,
-    url_base="https://www.youtube.com/watch?v=",
-    quiet=False,
-    force=False,
-):
-    output_path = Path(output_filename)
-    if output_path.exists():
-        if not force:
-            return output_path
-        else:
-            output_path.unlink()
-    quiet = "--quiet --no-warnings" if quiet else ""
-    command = f"""
-        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
-    """.strip()
-    attempts = 0
-    while True:
-        try:
-            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError:
-            attempts += 1
-            if attempts == num_attempts:
-                return None
-        else:
-            break
-    if output_path.exists():
-        return output_path
-    else:
-        return None
-interface_mic = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(speakers, label="🎤AI Singer Selection🎶"),
-        gr.Audio(type="filepath", source="microphone", label="Please upload the song you want to convert using the microphone"),
-        gr.Slider(-12, 12, value=0, step=1, label="Transpose (default is 0; positive values for pitch increase)"),
-        gr.Checkbox(False, label="Enable Automatic f0 Prediction", info="Check this box to enable; works best with clustering model for f0 prediction, use for voice conversion only", visible=False),
-        gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Cluster Inference Ratio", info="0-1 range, 0 for no clustering. Using clustering model can improve timbre similarity, but may affect articulation (recommended value around 0.5)"),
-        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Noise Scale (keep unchanged)", visible=False),
-        gr.Dropdown(
-            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-            value="crepe",
-            label="Model Inference Method (crepe gives the best results)", visible=False
-        ),
-    ],
-    outputs="audio",
-    cache_examples=False,
-    title="🌊💕🎶 - AI Music Generation: Upload from Bilibili Directly, No Need to Separate Background Audio",
-    description=description,
-    article=article,
-)
-interface_file = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(speakers, value=speakers[0], label="🎤AI Singer Selection🎶"),
-        gr.Audio(type="filepath", source="upload", label="Please upload the song you want to convert (vocals only)"),
-        gr.Slider(-12, 12, value=0, step=1, label="Transpose (default is 0; positive values for pitch increase)"),
-        gr.Checkbox(False, label="Enable Automatic f0 Prediction", info="Check this box to enable; works best with clustering model for f0 prediction, use for voice conversion only", visible=False),
-        gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Cluster Inference Ratio", info="0-1 range, 0 for no clustering. Using clustering model can improve timbre similarity, but may affect articulation (recommended value around 0.5)"),
-        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Noise Scale (keep unchanged)", visible=False),
-        gr.Dropdown(
-            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-            value="crepe",
-            label="Model Inference Method (crepe gives the best results)", visible=False
-        ),
-    ],
-    outputs="audio",
-    cache_examples=False,
-    title="🌊💕🎶 Upload from Bilibili Directly, No Need to Separate Background Audio",
-    description=description,
-    article=article,
-)
-interface_yt = gr.Interface(
-    predict_song_from_yt,
-    inputs=[
-        gr.Textbox(
-            label="Bilibili URL", info="Please provide the Bilibili URL containing the song you like, you can also directly input the BV number"
-        ),
-        gr.Number(value=0, label="Start Time (seconds)"),
-        gr.Number(value=15, label="End Time (seconds)"),
-        gr.Dropdown(speakers, value=speakers[0], label="🎤AI Singer Selection🎶"),
-        gr.Slider(-12, 12, value=0, step=1, label="Transpose (default is 0; positive values for pitch increase)"),
-        gr.Checkbox(False, label="Enable Automatic f0 Prediction", info="Check this box to enable; works best with clustering model for f0 prediction, use for voice conversion only", visible=False),
-        gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Cluster Inference Ratio", info="0-1 range, 0 for no clustering. Using clustering model can improve timbre similarity, but may affect articulation"),
-        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="Noise Scale (keep unchanged)", visible=False),
-        gr.Dropdown(
-            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
-            value="crepe",
-            label="Model Inference Method (crepe gives the best results)", visible=False
-        ),
-    ],
-    outputs=[gr.Audio(label="AI Singer + Accompaniment🎵"), gr.Audio(label="AI Singer Vocals🎤")],
-    title="🌊💕🎶 Upload from Bilibili Directly, No Need to Separate Background Audio",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
 interfaces = []
-for idx, model in enumerate(models):
-    interfaces.append(
-        gr.TabbedInterface(
-            [interface_yt, interface_mic, interface_file],
-            ["📺 Predict from Bilibili Video ⭐Recommended⭐", "🎙️ Predict from Microphone", "🎵 Predict from File"],
-        )
     )
 if __name__ == "__main__":
-    for idx, interface in enumerate(interfaces):
-        print(f"Launching Interface {idx + 1}")
-        interface.launch(show_error=True)

 import json
 import os
 import subprocess
 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
+###################################################################
+# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
+###################################################################
+# The Hugging Face Hub repo IDs - edit repo_ids here; any repo with an already-trained model can be substituted.
+repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay"]
+# If None, uses the latest checkpoint in the repo.
+ckpt_name = None
+# If None, uses "kmeans.pt" if it exists in the repo.
+cluster_model_name = None
+# Set the default f0 type to use - use the one it was trained on.
+# The default for so-vits-svc-fork is "dio".
+# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
+default_f0_method = "crepe"
+# The default ratio of cluster inference to SVC inference.
+# If cluster_model_name is not found in the repo, this is set to 0.
+default_cluster_infer_ratio = 0.5
+# Limit on the duration of audio at inference time. Increase it if you can.
+# In the parent app, the limit is set to 30 seconds via an env var.
+# If you didn't set the env var and you run out of memory (OOM), try changing 9e9 to <=300 or so.
 duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
+###################################################################
 interfaces = []
+for repo_id in repo_ids:
+    # Figure out the latest generator by taking the one with the highest numeric suffix.
+    # Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
+    if ckpt_name is None:
+        latest_id = sorted(
+            [
+                int(Path(x).stem.split("_")[1])
+                for x in list_repo_files(repo_id)
+                if x.startswith("G_") and x.endswith(".pth")
+            ]
+        )[-1]
+        ckpt_name = f"G_{latest_id}.pth"
+    cluster_model_name = cluster_model_name or "kmeans.pt"
+    if cluster_model_name in list_repo_files(repo_id):
+        print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
+        cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
+    else:
+        print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
+        cluster_model_path = None
+    default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
+    generator_path = hf_hub_download(repo_id, ckpt_name)
+    config_path = hf_hub_download(repo_id, "config.json")
+    hparams = HParams(**json.loads(Path(config_path).read_text()))
+    speakers = list(hparams.spk.keys())
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
+    demucs_model = get_model(DEFAULT_MODEL)
+    # ... (same code as before to define the functions)
+    interface = gr.Interface(
+        predict,
+        inputs=[
+            gr.Dropdown(speakers, label="🎤AI歌手选择🎶"),
+            gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲"),
+            # ... (same inputs as before)
+        ],
+        outputs="audio",
+        cache_examples=False,
+        title=f"🌊💕🎶 - 滔滔AI+音乐：可从B站直接上传素材，无需分离背景音 ({repo_id})",
+        description=description,
+        article=article,
     )
+    interfaces.append(interface)
+# Combine the interfaces using a TabbedInterface
+interface = gr.TabbedInterface(interfaces, [f"Model {i+1}" for i in range(len(interfaces))])
 if __name__ == "__main__":
+    interface.launch(show_error=True)