Voice-Cloning-for-Bilibili

Runtime error

App Files Community

nijisakai committed on Jul 25, 2023

Commit

f06136b

•

1 Parent(s): 1d3811a

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -423

app.py CHANGED Viewed

@@ -14,16 +14,11 @@ from huggingface_hub import hf_hub_download, list_repo_files
 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
 ###################################################################
 # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
 ###################################################################
-# The Hugging Face Hub repo IDs - Modify this list to include any pre-trained models you want!
-repo_ids = [
-    "nijisakai/sunyanzi",
-    "kevinwang676/jay",
-    # Add more repo IDs here...
-]
 # If None, Uses latest ckpt in the repo
 ckpt_name = None
@@ -42,14 +37,14 @@ default_cluster_infer_ratio = 0.5
 # Limit on duration of audio at inference time. increase if you can
 # In this parent app, we set the limit with an env var to 30 seconds
-# If you didn't set env var + you go OOM try changing 9e9 to <=300ish
 duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
 ###################################################################
-# Helper function to download model and cluster model
-def download_models(repo_id):
-    global ckpt_name, cluster_model_name
     if ckpt_name is None:
         latest_id = sorted(
             [
@@ -74,427 +69,173 @@ def download_models(repo_id):
     hparams = HParams(**json.loads(Path(config_path).read_text()))
     speakers = list(hparams.spk.keys())
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = Svc(
-        net_g_path=generator_path,
-        config_path=config_path,
-        device=device,
-        cluster_model_path=cluster_model_path,
-    )
     demucs_model = get_model(DEFAULT_MODEL)
-    return model, demucs_model, speakers
-# Helper function to extract vocals using the demucs model
-def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
-    wav, sr = librosa.load(filename, mono=False, sr=sr)
-    wav = torch.tensor(wav)
-    ref = wav.mean(0)
-    wav = (wav - ref.mean()) / ref.std()
-    sources = apply_model(
-        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
-    )[0]
-    sources = sources * ref.std() + ref.mean()
-    # We take just the vocals stem. I know the vocals for this model are at index -1
-    # If using a different model, check model.sources.index('vocals')
-    vocal_wav = sources[-1]
-    # I did this because it's the same normalization the so-vits model required
-    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
-    vocal_wav = vocal_wav.numpy()
-    vocal_wav = librosa.to_mono(vocal_wav)
-    vocal_wav = vocal_wav.T
-    instrumental_wav = sources[:-1].sum(0).numpy().T
-    return vocal_wav, instrumental_wav
-def download_youtube_clip(
-    video_identifier,
-    start_time,
-    end_time,
-    output_filename,
-    num_attempts=5,
-    url_base="https://www.youtube.com/watch?v=",
-    quiet=False,
-    force=False,
-):
-    output_path = Path(output_filename)
-    if output_path.exists():
-        if not force:
             return output_path
         else:
-            output_path.unlink()
-    quiet = "--quiet --no-warnings" if quiet else ""
-    command = f"""
-        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
-    """.strip()
-    attempts = 0
-    while True:
-        try:
-            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError:
-            attempts += 1
-            if attempts == num_attempts:
-                return None
-        else:
-            break
-    if output_path.exists():
-        return output_path
-    else:
-        return None
-def predict(
-    speaker,
-    audio,
-    transpose: int = 0,
-    auto_predict_f0: bool = False,
-    cluster_infer_ratio: float = 0,
-    noise_scale: float = 0.4,
-    f0_method: str = "crepe",
-    db_thresh: int = -40,
-    pad_seconds: float = 0.5,
-    chunk_seconds: float = 0.5,
-    absolute_thresh: bool = False,
-):
-    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
-    audio = model.infer_silence(
-        audio.astype(np.float32),
-        speaker=speaker,
-        transpose=transpose,
-        auto_predict_f0=auto_predict_f0,
-        cluster_infer_ratio=cluster_infer_ratio,
-        noise_scale=noise_scale,
-        f0_method=f0_method,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
-        chunk_seconds=chunk_seconds,
-        absolute_thresh=absolute_thresh,
-    )
-    return model.target_sample, audio
-def predict_song_from_yt(
-    ytid_or_url,
-    start,
-    end,
-    speaker=speakers[0],
-    transpose: int = 0,
-    auto_predict_f0: bool = False,
-    cluster_infer_ratio: float = 0,
-    noise_scale: float = 0.4,
-    f0_method: str = "dio",
-    db_thresh: int = -40,
-    pad_seconds: float = 0.5,
-    chunk_seconds: float = 0.5,
-    absolute_thresh: bool = False,
-):
-    end = min(start + duration_limit, end)
-    original_track_filepath = download_youtube_clip(
         ytid_or_url,
         start,
         end,
-        "track.wav",
-        force=True,
-        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
-    )
-    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
-    if transpose != 0:
-        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
-    cloned_vox = model.infer_silence(
-        vox_wav.astype(np.float32),
-        speaker=speaker,
-        transpose=transpose,
-        auto_predict_f0=auto_predict_f0,
-        cluster_infer_ratio=cluster_infer_ratio,
-        noise_scale=noise_scale,
-        f0_method=f0_method,
-        db_thresh=db_thresh,
-        pad_seconds=pad_seconds,
-        chunk_seconds=chunk_seconds,
-        absolute_thresh=absolute_thresh,
-    )
-    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
-    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
-# Create a dictionary to store all models, demucs models, and speakers
-all_models = {}
-for repo_id in repo_ids:
-    model, demucs_model, speakers = download_models(repo_id)
-    all_models[repo_id] = {
-        "model": model,
-        "demucs_model": demucs_model,
-        "speakers": speakers,
-    }
-# Interface definition
-description = """
-# ... (existing code)
-# No changes made to this part of the code, so skipping it
-interface_yt = gr.Interface(
-    predict_song_from_yt,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Textbox(
-            label="Bilibili URL",
-            info="Please enter the Bilibili URL containing the song you want to convert. You can also use the BV number directly.",
-            value="https://www.bilibili.com/video/BV...",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=[
-        gr.Audio(label="AI Singer + Accompaniment"),
-        gr.Audio(label="AI Singer Vocals Only"),
-    ],
-    title="🌊💕🎶 - Upload Audio from Bilibili, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface_mic = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="microphone",
-            label="Use Microphone to Upload Your Song",
-            description="Upload the song you want to convert using your microphone.",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio from Microphone, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface_file = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface_yt = gr.Interface(
-    predict_song_from_yt,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Textbox(
-            label="Bilibili URL",
-            info="Please enter the Bilibili URL containing the song you want to convert. You can also use the BV number directly.",
-            value="https://www.bilibili.com/video/BV...",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=[
-        gr.Audio(label="AI Singer + Accompaniment"),
-        gr.Audio(label="AI Singer Vocals Only"),
-    ],
-    title="🌊💕🎶 - Upload Audio from Bilibili, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface_mic = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="microphone",
-            label="Use Microphone to Upload Your Song",
-            description="Upload the song you want to convert using your microphone.",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio from Microphone, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface_file = gr.Interface(
-    predict,
-    inputs=[
-        gr.Dropdown(
-            choices=repo_ids,
-            label="Select Pre-trained Model",
-            default=repo_ids[0],
-            description="Choose from different pre-trained models.",
-        ),
-        gr.Dropdown(
-            choices=speakers,
-            label="AI Singer Selection",
-            description="Choose your favorite AI singer.",
-        ),
-        gr.Audio(
-            type="file",
-            label="Upload Audio File",
-            description="Upload the audio file you want to convert. (Voice only, no background music)",
-        ),
-        # ... (existing code)
-        # Rest of the inputs, no changes made, so skipping the code
-    ],
-    outputs=gr.Audio(label="Converted Audio"),
-    title="🌊💕🎶 - Upload Audio File, No Need to Separate Background Music",
-    description=description,
-    article=article,
-    cache_examples=False,
-)
-interface = gr.TabbedInterface(
-    [interface_yt, interface_mic, interface_file],
-    ["📺 - Upload Audio from Bilibili ⭐Recommended⭐", "🎙️ - Upload Audio from Microphone", "🎵 - Upload Audio File"],
-)
 if __name__ == "__main__":
     interface.launch(show_error=True)

 from so_vits_svc_fork.hparams import HParams
 from so_vits_svc_fork.inference.core import Svc
 ###################################################################
 # REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
 ###################################################################
+# The Hugging Face Hub repo IDs - edit repo_ids here; any already-trained model repo can be substituted!
+repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay"]
 # If None, Uses latest ckpt in the repo
 ckpt_name = None
 # Limit on duration of audio at inference time. increase if you can
 # In this parent app, we set the limit with an env var to 30 seconds
+# If you didn't set the env var and you go OOM, try changing 9e9 to <=300ish
 duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
 ###################################################################
+interfaces = []
+for repo_id in repo_ids:
+    # Figure out the latest generator by taking the one with the highest step number.
+    # Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
     if ckpt_name is None:
         latest_id = sorted(
             [
     hparams = HParams(**json.loads(Path(config_path).read_text()))
     speakers = list(hparams.spk.keys())
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
     demucs_model = get_model(DEFAULT_MODEL)
+    def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
+        wav, sr = librosa.load(filename, mono=False, sr=sr)
+        wav = torch.tensor(wav)
+        ref = wav.mean(0)
+        wav = (wav - ref.mean()) / ref.std()
+        sources = apply_model(
+            model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
+        )[0]
+        sources = sources * ref.std() + ref.mean()
+        # We take just the vocals stem. I know the vocals for this model are at index -1
+        # If using a different model, check model.sources.index('vocals')
+        vocal_wav = sources[-1]
+        # I did this because it's the same normalization the so-vits model required
+        vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
+        vocal_wav = vocal_wav.numpy()
+        vocal_wav = librosa.to_mono(vocal_wav)
+        vocal_wav = vocal_wav.T
+        instrumental_wav = sources[:-1].sum(0).numpy().T
+        return vocal_wav, instrumental_wav
+    def download_youtube_clip(
+        video_identifier,
+        start_time,
+        end_time,
+        output_filename,
+        num_attempts=5,
+        url_base="https://www.youtube.com/watch?v=",
+        quiet=False,
+        force=False,
+    ):
+        output_path = Path(output_filename)
+        if output_path.exists():
+            if not force:
+                return output_path
+            else:
+                output_path.unlink()
+        quiet = "--quiet --no-warnings" if quiet else ""
+        command = f"""
+            yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
+        """.strip()
+        attempts = 0
+        while True:
+            try:
+                _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError:
+                attempts += 1
+                if attempts == num_attempts:
+                    return None
+            else:
+                break
+        if output_path.exists():
             return output_path
         else:
+            return None
+    def predict(
+        speaker,
+        audio,
+        transpose: int = 0,
+        auto_predict_f0: bool = False,
+        cluster_infer_ratio: float = 0,
+        noise_scale: float = 0.4,
+        f0_method: str = "crepe",
+        db_thresh: int = -40,
+        pad_seconds: float = 0.5,
+        chunk_seconds: float = 0.5,
+        absolute_thresh: bool = False,
+    ):
+        audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
+        audio = model.infer_silence(
+            audio.astype(np.float32),
+            speaker=speaker,
+            transpose=transpose,
+            auto_predict_f0=auto_predict_f0,
+            cluster_infer_ratio=cluster_infer_ratio,
+            noise_scale=noise_scale,
+            f0_method=f0_method,
+            db_thresh=db_thresh,
+            pad_seconds=pad_seconds,
+            chunk_seconds=chunk_seconds,
+            absolute_thresh=absolute_thresh,
+        )
+        return model.target_sample, audio
+    def predict_song_from_yt(
         ytid_or_url,
         start,
         end,
+        speaker=speakers[0],
+        transpose: int = 0,
+        auto_predict_f0: bool = False,
+        cluster_infer_ratio: float = 0,
+        noise_scale: float = 0.4,
+        f0_method: str = "dio",
+        db_thresh: int = -40,
+        pad_seconds: float = 0.5,
+        chunk_seconds: float = 0.5,
+        absolute_thresh: bool = False,
+    ):
+        end = min(start + duration_limit, end)
+        original_track_filepath = download_youtube_clip(
+            ytid_or_url,
+            start,
+            end,
+            "track.wav",
+            force=True,
+            url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
+        )
+        vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
+        if transpose != 0:
+            inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
+        cloned_vox = model.infer_silence(
+            vox_wav.astype(np.float32),
+            speaker=speaker,
+            transpose=transpose,
+            auto_predict_f0=auto_predict_f0,
+            cluster_infer_ratio=cluster_infer_ratio,
+            noise_scale=noise_scale,
+            f0_method=f0_method,
+            db_thresh=db_thresh,
+            pad_seconds=pad_seconds,
+            chunk_seconds=chunk_seconds,
+            absolute_thresh=absolute_thresh,
+        )
+        full_song = inst_wav + np.expand_dims(cloned_vox, 1)
+        return (model.target_sample, full_song), (model.target_sample, cloned_vox)
+    description = f"""
+    <center>💡 - 如何使用此程序：在页面上方选择“从B站视频上传”模块，填写视频网址和视频起止时间后，点击“submit”按键即可！您还可以点击页面最下方的示例快速预览效果</center>
+    """.strip()
+    article = """
+    <p style='text-align: center'> 注意❗：请不要生成会对个人以及组织造成侵害的内容，此程序仅供科研、学习及个人娱乐使用。
+    </p>
+    """.strip()
+    interface = gr.Interface(
+        predict,
+        inputs=[
+            gr.Dropdown(speakers, label="🎤AI歌手选择🎶"),
+            gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲"),
+            gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0；有正负值，+2为升高两个key)"),
+            gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启；配合聚类模型f0预测效果更好，仅限语音转换时使用", visible=False),
+            gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间，0即不启用聚类。使用聚类模型能提升音色相似度，但会导致咬字下降 (如果使用，建议0.5左右)"),
+            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False),
+            gr.Dropdown(
+                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
+                value=default_f0_method,
+                label="模型推理方法 (crepe推理效果最好)", visible=False
+            ),
+        ],
+        outputs="audio",
+        cache_examples=False,
+        title=f"🌊💕🎶 - 滔滔AI+音乐：可从B站直接上传素材，无需分离背景音 ({repo_id})",
+        description=description,
+        article=article,
+    )
+    interfaces.append(interface)
+# Combine the interfaces using a TabbedInterface
+interface = gr.TabbedInterface(interfaces, [f"Model {i+1}" for i in range(len(interfaces))])
 if __name__ == "__main__":
     interface.launch(show_error=True)