Create app._use_multi_repo_ids_.py

#10
by nijisakai - opened
Files changed (1)
  1. app._use_multi_repo_ids_.py +298 -0
app._use_multi_repo_ids_.py ADDED
@@ -0,0 +1,298 @@
import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files

from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc


###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################
# The Hugging Face Hub repo IDs - edit repo_ids here; any repo that already
# holds a trained model can be dropped in.
repo_ids = ["nijisakai/sunyanzi", "kevinwang676/jay", "nijisakai/Eric_Cartman"]

# If None, uses the latest ckpt in the repo
ckpt_name = None

# If None, uses "kmeans.pt" if it exists in the repo
cluster_model_name = None

# Set the default f0 method to use - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"

# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5

# Limit on the duration of audio at inference time. Increase it if you can.
# In this parent app, the limit is set to 30 seconds via an env var.
# If you did not set the env var and you hit OOM, try changing 9e9 to <= 300 or so.
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
###################################################################

models = []
speakers = []
for repo_id in repo_ids:
    # Figure out the latest generator checkpoint by taking the highest-numbered one.
    # E.g. if the repo has G_0.pth, G_100.pth, G_200.pth, we use G_200.pth.
    if ckpt_name is None:
        latest_id = sorted(
            [
                int(Path(x).stem.split("_")[1])
                for x in list_repo_files(repo_id)
                if x.startswith("G_") and x.endswith(".pth")
            ]
        )[-1]
        ckpt_name = f"G_{latest_id}.pth"

    cluster_model_name = cluster_model_name or "kmeans.pt"
    if cluster_model_name in list_repo_files(repo_id):
        print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
        cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
    else:
        print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
        cluster_model_path = None
    default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

    generator_path = hf_hub_download(repo_id, ckpt_name)
    config_path = hf_hub_download(repo_id, "config.json")
    hparams = HParams(**json.loads(Path(config_path).read_text()))
    repo_speakers = list(hparams.spk.keys())
    speakers.extend(repo_speakers)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
    # Register the model once per speaker so that models[speakers.index(s)] stays
    # aligned even if a repo defines more than one speaker.
    models.extend([model] * len(repo_speakers))

    # Reset ckpt_name and cluster_model_name for the next iteration
    ckpt_name = None
    cluster_model_name = None

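For reference, a quick way to check what a candidate repo exposes before adding it to repo_ids above. This is only an illustrative sketch reusing the same list_repo_files call as the loop, and it assumes network access to the Hub:

files = list_repo_files("nijisakai/sunyanzi")
print([f for f in files if f.startswith("G_") or f in ("config.json", "kmeans.pt")])
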
demucs_model = get_model(DEFAULT_MODEL)

def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(
        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
    )[0]
    sources = sources * ref.std() + ref.mean()
    # The last Demucs stem is the vocals; the remaining stems sum to the accompaniment.
    vocal_wav = sources[-1]
    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
    vocal_wav = vocal_wav.numpy()
    vocal_wav = librosa.to_mono(vocal_wav)
    vocal_wav = vocal_wav.T
    instrumental_wav = sources[:-1].sum(0).numpy().T
    return vocal_wav, instrumental_wav

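A minimal usage sketch for the separator above; the filename song.wav is illustrative and assumed to exist locally:

vocals, accompaniment = extract_vocal_demucs(demucs_model, "song.wav")
# vocals: mono float array at 44.1 kHz; accompaniment: (samples, channels) array summed over the other stems
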
def download_youtube_clip(
    video_identifier,
    start_time,
    end_time,
    output_filename,
    num_attempts=5,
    url_base="https://www.youtube.com/watch?v=",
    quiet=False,
    force=False,
):
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        else:
            output_path.unlink()

    quiet = "--quiet --no-warnings" if quiet else ""
    command = f"""
        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"  # noqa: E501
    """.strip()

    attempts = 0
    while True:
        try:
            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            attempts += 1
            if attempts == num_attempts:
                return None
        else:
            break

    if output_path.exists():
        return output_path
    else:
        return None

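An illustrative call to the downloader, assuming yt-dlp is available on the PATH; "VIDEO_ID" is a placeholder, not a real clip:

clip_path = download_youtube_clip("VIDEO_ID", start_time=0, end_time=15, output_filename="clip.wav")
if clip_path is None:
    print("Download failed after all retry attempts.")
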
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    model = models[speakers.index(speaker)]
    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = model.infer_silence(
        audio.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, audio

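A direct call to predict outside the Gradio UI, assuming the models above have loaded and that my_vocals.wav (an illustrative name) is a local vocals-only recording:

sr, converted = predict(speakers[0], "my_vocals.wav", transpose=0, f0_method=default_f0_method)
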

def predict_song_from_yt(
    ytid_or_url,
    start,
    end,
    speaker=speakers[0],
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "dio",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    model = models[speakers.index(speaker)]
    end = min(start + duration_limit, end)
    original_track_filepath = download_youtube_clip(
        ytid_or_url,
        start,
        end,
        "track.wav",
        force=True,
        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
    )
    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
    if transpose != 0:
        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
    cloned_vox = model.infer_silence(
        vox_wav.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    # Mix the (possibly pitch-shifted) accompaniment back with the cloned vocal.
    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
    return (model.target_sample, full_song), (model.target_sample, cloned_vox)

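An end-to-end sketch of the video path above, again with a placeholder video id and the first configured speaker:

(sr, full_mix), (_, vocals_only) = predict_song_from_yt(
    "VIDEO_ID", start=0, end=15, speaker=speakers[0], f0_method=default_f0_method
)
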

description = f"""
<center>💡 - How to use this app: open the "Upload from a Bilibili video" tab at the top of the page, fill in the video URL and the start/end times, then click "Submit". You can also click the examples at the bottom of the page for a quick preview.</center>
""".strip()

article = """
<p style='text-align: center'>Note ❗: please do not generate content that could harm any individual or organization. This app is intended for research, learning, and personal entertainment only.
</p>
""".strip()

interface_mic = gr.Interface(
    predict,
    inputs=[
        gr.Dropdown(speakers, value=speakers[0], label="🎤 Choose an AI singer 🎶"),
        gr.Audio(type="filepath", source="microphone", label="Record the song you want to convert with your microphone"),
        gr.Slider(-12, 12, value=0, step=1, label="Transpose (default 0; positive or negative, +2 raises the pitch by two keys)"),
        gr.Checkbox(False, label="Enable automatic f0 prediction", info="Check to enable; works better together with the cluster model, and is only meant for plain voice conversion", visible=False),
        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="Cluster model mix ratio", info="Between 0 and 1; 0 disables clustering. The cluster model improves timbre similarity but weakens articulation (around 0.5 is recommended if used)"),
        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (recommended to leave unchanged)", visible=False),
        gr.Dropdown(
            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
            value=default_f0_method,
            label="Model inference method (crepe gives the best results)", visible=False
        ),
    ],
    outputs="audio",
    cache_examples=False,
    title="🌊💕🎶 - 滔滔AI + Music: upload material straight from Bilibili, no need to separate the backing track",
    description=description,
    article=article,
)
interface_file = gr.Interface(
    predict,
    inputs=[
        gr.Dropdown(speakers, value=speakers[0], label="🎤 Choose an AI singer 🎶"),
        gr.Audio(type="filepath", source="upload", label="Upload the song you want to convert (vocals only)"),
        gr.Slider(-12, 12, value=0, step=1, label="Transpose (default 0; positive or negative, +2 raises the pitch by two keys)"),
        gr.Checkbox(False, label="Enable automatic f0 prediction", info="Check to enable; works better together with the cluster model, and is only meant for plain voice conversion", visible=False),
        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="Cluster model mix ratio", info="Between 0 and 1; 0 disables clustering. The cluster model improves timbre similarity but weakens articulation (around 0.5 is recommended if used)"),
        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (recommended to leave unchanged)", visible=False),
        gr.Dropdown(
            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
            value=default_f0_method,
            label="Model inference method (crepe gives the best results)", visible=False
        ),
    ],
    outputs="audio",
    cache_examples=False,
    title="🌊💕🎶 Upload material straight from Bilibili, no need to separate the backing track",
    description=description,
    article=article,
)
interface_yt = gr.Interface(
    predict_song_from_yt,
    inputs=[
        gr.Textbox(
            label="Bilibili URL", info="Paste the Bilibili URL of the song you like; the BV id alone also works"
        ),
        gr.Number(value=0, label="Start time (seconds)"),
        gr.Number(value=15, label="End time (seconds)"),
        gr.Dropdown(speakers, value=speakers[0], label="🎤 Choose an AI singer 🎶"),
        gr.Slider(-12, 12, value=0, step=1, label="Transpose (default 0; positive or negative, +2 raises the pitch by two keys)"),
        gr.Checkbox(False, label="Enable automatic f0 prediction", info="Check to enable; works better together with the cluster model, and is only meant for plain voice conversion", visible=False),
        gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="Cluster model mix ratio", info="Between 0 and 1; 0 disables clustering. The cluster model improves timbre similarity but weakens articulation"),
        gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (recommended to leave unchanged)", visible=False),
        gr.Dropdown(
            choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
            value=default_f0_method,
            label="Model inference method (crepe gives the best results)", visible=False
        ),
    ],
    outputs=[gr.Audio(label="AI singer + accompaniment 🎵"), gr.Audio(label="AI singer vocals only 🎤")],
    title="🌊💕🎶 - Upload material straight from Bilibili, no need to separate the backing track",
    description=description,
    article=article,
    cache_examples=False,
)
interface = gr.TabbedInterface(
    [interface_yt, interface_mic, interface_file],
    ["📺 - Upload from a Bilibili video ⭐Recommended⭐", "🎙️ - Upload from the microphone", "🎵 - Upload from a file"],
)

if __name__ == "__main__":
    interface.launch(show_error=True)
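When testing outside a Space, the same entry point can be launched with a temporary public link; share=True is a standard Gradio launch option, shown here only as a possible variant:

interface.launch(show_error=True, share=True)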